# Investigate randomness of NMF and LSA

In [121]:
import pandas as pd
import numpy as np
import pickle
import matplotlib.pyplot as plt
import gensim
import time

from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from gensim.models.coherencemodel import CoherenceModel

In [122]:
# import NSF data
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load .sav files that come from a trusted source.
# To use the entire dataset instead, open this path:
#   '/project/biocomplexity/sdad/projects_data/ncses/prd/RND Topic Modelling/lda_data_stanford_lemma.sav'
# The context manager closes the file even if unpickling raises.
with open('../../data/prd/RND Topic Modelling/nsf_stanford_lemma.sav', 'rb') as f:
    [corpus, id2word, docs] = pickle.load(f)

# corpus - word frequency in docs
# id2word - dictionary
# docs - lemmatized abstracts

In [123]:
# input needed here is one string per document (not a list of strings)

# Join each document's tokens into a single space-separated string.
text = [" ".join(doc) for doc in docs]

In [124]:
# Build the document-term TF-IDF matrix from the joined document strings.
# The vocabulary size is capped at half the number of documents.
max_vocab_size = int(len(docs) / 2)

tfidf_vectorizer = TfidfVectorizer(
    max_df=0.4,        # float: drop terms whose document frequency exceeds this proportion
    min_df=3,          # int: drop terms appearing in fewer than this many documents
    lowercase=False,
    max_features=max_vocab_size,
)
tf_idf = tfidf_vectorizer.fit_transform(text)

In [102]:
# Fit a 5-component NMF on the TF-IDF matrix with a fixed seed.
nmf_model = NMF(
    n_components=5,
    random_state=5,
)
W = nmf_model.fit_transform(tf_idf)  # transformed data returned by the fit
H = nmf_model.components_            # fitted components of the same model

In [114]:
# Fit a 6-component NMF on the same TF-IDF matrix, keeping the same seed (5).
nmf_model2 = NMF(
    n_components=6,
    random_state=5,
)
W2 = nmf_model2.fit_transform(tf_idf)  # transformed data returned by the fit
H2 = nmf_model2.components_            # fitted components of the same model

In [115]:
# Show W, the array returned by nmf_model.fit_transform (5-component NMF).
W

array([[0.02012892, 0.01615863, 0.        , 0.        , 0.        ],
       [0.00622214, 0.03097469, 0.003554  , 0.        , 0.00362708],
       [0.00395395, 0.01747157, 0.        , 0.00976837, 0.01302049],
       ...,
       [0.00254561, 0.        , 0.        , 0.02852103, 0.        ],
       [0.002533  , 0.        , 0.        , 0.02837237, 0.        ],
       [0.0241984 , 0.        , 0.0130397 , 0.        , 0.        ]])

In [116]:
# Show W2, the array returned by nmf_model2.fit_transform (6-component NMF).
W2

array([[1.61829526e-02, 1.55547866e-02, 0.00000000e+00, 0.00000000e+00,
        0.00000000e+00, 1.09278123e-02],
       [9.02402473e-03, 2.83482114e-02, 3.40100632e-03, 0.00000000e+00,
        5.73195524e-03, 0.00000000e+00],
       [5.60817441e-03, 1.60504396e-02, 0.00000000e+00, 9.46092935e-03,
        1.43719338e-02, 0.00000000e+00],
       ...,
       [2.14654110e-03, 0.00000000e+00, 0.00000000e+00, 2.85288628e-02,
        7.76560061e-05, 7.08583167e-04],
       [2.13611212e-03, 0.00000000e+00, 0.00000000e+00, 2.83801799e-02,
        7.75415973e-05, 7.04574223e-04],
       [2.61325813e-02, 0.00000000e+00, 1.38863813e-02, 0.00000000e+00,
        0.00000000e+00, 0.00000000e+00]])

In [117]:
# Compare the 5-component W with the first five columns of the 6-component W2:
# Frobenius norm of their element-wise difference.
diffW = W - W2[:, :5]
np.linalg.norm(diffW, ord='fro')

2.262035151051824

In [118]:
# Show H, the components_ array of the 5-component NMF model.
H

array([[4.40438386e-04, 3.58146248e-04, 3.19243421e-04, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.65885981e-04, 0.00000000e+00, 0.00000000e+00, ...,
        2.50389808e-03, 1.41319987e-04, 9.16197417e-04],
       [4.35694172e-04, 1.22889673e-03, 0.00000000e+00, ...,
        0.00000000e+00, 4.79349966e-04, 8.73525278e-06],
       [0.00000000e+00, 0.00000000e+00, 1.43772768e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [119]:
# Show H2, the components_ array of the 6-component NMF model.
H2

array([[3.66185699e-04, 6.58070844e-04, 3.48850936e-04, ...,
        2.92115443e-04, 0.00000000e+00, 6.20818137e-05],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.44168726e-04, 0.00000000e+00, 0.00000000e+00, ...,
        2.51936864e-03, 1.60770033e-04, 9.45296778e-04],
       [4.55559772e-04, 1.15546881e-03, 0.00000000e+00, ...,
        0.00000000e+00, 4.79295443e-04, 2.56014253e-07],
       [0.00000000e+00, 0.00000000e+00, 1.49132109e-03, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [1.59283543e-04, 0.00000000e+00, 9.94401949e-05, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [120]:
# Compare the 5-component H with the first five rows of the 6-component H2:
# Frobenius norm of their element-wise difference.
diffH = H - H2[:5, :]
np.linalg.norm(diffH, ord='fro')

2.54087908775569

In [135]:
# Fit a 5-component truncated SVD (LSA) on the TF-IDF matrix with a fixed seed.
lsa_model = TruncatedSVD(
    n_components=5,
    random_state=10,
)
USigma = lsa_model.fit_transform(tf_idf)  # transformed data returned by the fit
Vtrans = lsa_model.components_            # fitted components of the same model

In [141]:
# Fit a 6-component truncated SVD (LSA) on the same TF-IDF matrix, keeping the same seed (10).
lsa_model2 = TruncatedSVD(
    n_components=6,
    random_state=10,
)
USigma2 = lsa_model2.fit_transform(tf_idf)  # transformed data returned by the fit
Vtrans2 = lsa_model2.components_            # fitted components of the same model

In [136]:
# Show USigma, the array returned by lsa_model.fit_transform (5-component TruncatedSVD).
USigma

array([[ 0.14082944,  0.01739611,  0.05835951, -0.06836009, -0.04442716],
       [ 0.15471062,  0.09889438,  0.00387851, -0.04489336, -0.05556939],
       [ 0.13669578,  0.07566367,  0.0261642 ,  0.01835999,  0.00865936],
       ...,
       [ 0.09866348, -0.02580425,  0.02604446,  0.0869921 , -0.01569363],
       [ 0.09866348, -0.02580425,  0.02604446,  0.0869921 , -0.01569363],
       [ 0.1505098 , -0.08536802,  0.01429919, -0.0773828 , -0.06871124]])

In [142]:
# Show USigma2, the array returned by lsa_model2.fit_transform (6-component TruncatedSVD).
USigma2

array([[ 0.14082943,  0.01743983,  0.05779215, -0.06888674, -0.03958534,
        -0.07379267],
       [ 0.1547106 ,  0.0988097 ,  0.00332828, -0.04465528, -0.05262656,
         0.01110419],
       [ 0.13669578,  0.0756351 ,  0.02625105,  0.01736296,  0.01035576,
         0.0077622 ],
       ...,
       [ 0.0986634 , -0.02579484,  0.02593745,  0.08667647, -0.01453376,
         0.0172314 ],
       [ 0.0986634 , -0.02579484,  0.02593745,  0.08667647, -0.01453376,
         0.0172314 ],
       [ 0.15050973, -0.08535227,  0.01390051, -0.0781688 , -0.06768319,
         0.04046016]])

In [144]:
# Compare the 5-component USigma with the first five columns of the 6-component USigma2:
# Frobenius norm of their element-wise difference.
diffU = USigma - USigma2[:, :5]
np.linalg.norm(diffU, ord='fro')

0.7559543545785241

In [139]:
# Show Vtrans, the components_ array of the 5-component TruncatedSVD model.
Vtrans

array([[ 5.75109568e-05,  7.26446972e-05,  6.50207328e-05, ...,
         1.66712976e-04,  2.52948106e-05,  6.13206285e-05],
       [-8.22549670e-05, -1.09435907e-05,  1.02640729e-04, ...,
        -2.41435151e-04, -3.84924696e-05, -9.51282321e-05],
       [ 3.57767199e-05,  2.35890927e-04,  9.97228735e-05, ...,
        -3.82333078e-04, -4.63698876e-05, -1.36729305e-04],
       [ 2.09199055e-05,  2.54512284e-04, -8.25894898e-05, ...,
        -1.92640584e-04,  1.13191998e-04, -3.83117021e-05],
       [-3.63200274e-05, -3.77534143e-05,  2.40397979e-04, ...,
        -1.73557597e-04, -1.77387945e-05, -5.05994835e-05]])

In [143]:
# Show Vtrans2, the components_ array of the 6-component TruncatedSVD model.
Vtrans2

array([[ 5.75109557e-05,  7.26446918e-05,  6.50207231e-05, ...,
         1.66712960e-04,  2.52948094e-05,  6.13206232e-05],
       [-8.22548065e-05, -1.10915548e-05,  1.02734132e-04, ...,
        -2.41631040e-04, -3.84984150e-05, -9.50978162e-05],
       [ 3.58144572e-05,  2.36339730e-04,  9.99844844e-05, ...,
        -3.82456948e-04, -4.59434074e-05, -1.37176951e-04],
       [ 2.01437665e-05,  2.55606305e-04, -8.24582757e-05, ...,
        -1.93873449e-04,  1.13669593e-04, -3.88767345e-05],
       [-3.59445991e-05, -4.44656952e-05,  2.41969902e-04, ...,
        -1.76767807e-04, -1.60779953e-05, -5.02223962e-05],
       [-2.21111340e-06,  9.16994248e-05,  1.49867338e-04, ...,
         2.76411346e-04,  3.58740297e-05,  8.17480220e-05]])

In [145]:
# Compare the 5-component Vtrans with the first five rows of the 6-component Vtrans2:
# Frobenius norm of their element-wise difference.
diffV = Vtrans - Vtrans2[:5, :]
np.linalg.norm(diffV, ord='fro')

0.0231981646456215