# Scratch work for model selection

In [1]:
import pandas as pd
#import numpy as np
import pickle
import time
import gc

from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from gensim.models.coherencemodel import CoherenceModel

In [2]:
# data needed for coherence calculation

# import entire dataset
# Context manager guarantees the handle is closed even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('../../../data/prd/Paper/coherence_vars.sav', 'rb') as f:
    [id2word, docs] = pickle.load(f)

# corpus - word frequency in docs, not needed for coherence function
# id2word - dictionary
# docs - df["final_tokens"]

In [9]:
len(docs)

1143869

In [3]:
# Function to format topics as a "list of list of strings".
# Needed for topic coherence function in Gensim

# function modified from https://nlpforhackers.io/topic-modeling/

def list_topics(topic_term_dist, vectorizer, top_n=10):
    """Return the highest-weighted words of every topic as a list of lists of strings.

    Parameters
    ----------
    topic_term_dist : iterable of 1-D arrays
        One row per topic; each row must support ``argsort()`` and holds one
        weight per vocabulary term.
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` (newer
        scikit-learn) or ``get_feature_names()`` (older scikit-learn).
    top_n : int, default 10
        How many words to list per topic.  If -1, then list all words.

    Returns
    -------
    list of list of str
        For each topic, its words ordered from highest to lowest weight.
    """
    # Fetch the vocabulary once.  The original looked it up again for every
    # single word that was emitted, rebuilding the full vocabulary each time.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocab = vectorizer.get_feature_names_out()
    else:
        vocab = vectorizer.get_feature_names()
    # Normalise to plain Python strings (get_feature_names_out returns an array).
    vocab = [str(word) for word in vocab]

    topic_words = []

    for topic in topic_term_dist:  # one row of the topic-term matrix per topic
        ranked = topic.argsort()[::-1]  # term indices, highest weight first
        if top_n != -1:
            ranked = ranked[:top_n]
        topic_words.append([vocab[i] for i in ranked])

    return topic_words

In [4]:
# input needed for LDA, NMF (from Scikit-Learn) is one string per document (not a list of strings)

# Collapse each tokenised abstract into a single space-separated string.
text = [" ".join(abstract) for abstract in docs]

In [5]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(topic_term_mat, vectorizer, top_n=10):
    """Print the ``top_n`` highest-weighted ``(word, weight)`` pairs of every topic.

    Parameters
    ----------
    topic_term_mat : iterable of 1-D arrays
        One row per topic; each row must support ``argsort()`` and indexing.
    vectorizer : object
        Fitted vectorizer exposing ``get_feature_names_out()`` (newer
        scikit-learn) or ``get_feature_names()`` (older scikit-learn).
    top_n : int, default 10
        Number of words printed per topic.

    Returns
    -------
    None
        Output goes to stdout: a blank line, a ``Topic <idx>:`` header, then
        one tuple per line.
    """
    # Fetch the vocabulary once instead of once per printed word.
    if hasattr(vectorizer, "get_feature_names_out"):
        vocab = vectorizer.get_feature_names_out()
    else:
        vocab = vectorizer.get_feature_names()
    # Plain strings keep the printed tuples free of array-scalar reprs.
    vocab = [str(word) for word in vocab]

    for idx, topic in enumerate(topic_term_mat):  # idx = row index, topic = row of weights
        print("\nTopic %d:" % (idx))
        # Indices of the top_n largest weights, highest first.
        for i in topic.argsort()[:-top_n - 1:-1]:
            print((vocab[i], topic[i]))
        

#### LDA

In [6]:
# create document-term matrix

# Extra stop words passed to the vectorizer in addition to the max_df / min_df pruning.
stop_wds = ['research', 'aim', 'project']  # study will be eliminated by max_df

# Raw term counts.  lowercase=False leaves token case untouched
# (presumably the tokens were already normalised upstream - TODO confirm).
# max_df / min_df prune terms by document frequency; see the scikit-learn docs for exact semantics.
vectorizer = CountVectorizer(max_df=0.6, min_df=20, lowercase=False, stop_words=stop_wds)
doc_term_matrix = vectorizer.fit_transform(text)

In [7]:
# create model
# Fits a single LDA model on doc_term_matrix and reports the wall-clock fit time.
num_topics = 5

t1 = time.time()
# doc_topic_prior is tied to the number of topics (1/num_topics); random_state is fixed.
# NOTE(review): n_jobs=19 is hard-coded - presumably sized for a specific machine; confirm before reuse.
lda_model = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics,
                                      topic_word_prior=0.1, n_jobs=19, random_state = 0)
doc_topic = lda_model.fit_transform(doc_term_matrix)
t2 = time.time()
# Elapsed seconds for model construction plus fit_transform.
print(f"  Model time: {t2-t1}")

# Fitted components_ matrix; this is what gets passed to print_topics in a later cell.
topic_term = lda_model.components_

  Model time: 1314.905624628067


In [10]:
1315/60

21.916666666666668

In [9]:
print_topics(topic_term, vectorizer, 10)


Topic 0:
('program', 523667.09705015284)
('core', 462229.04216242203)
('provide', 450636.7675416922)
('support', 362805.20931679086)
('new', 340100.3987703617)
('center', 297055.4796595892)
('include', 288326.0385743985)
('clinical', 284798.24958029256)
('student', 273517.20409035846)
('develop', 265742.21541210474)

Topic 1:
('health', 571990.2636541908)
('intervention', 314393.81538036565)
('program', 276429.2213613038)
('training', 270500.1748114237)
('patient', 261967.87126937282)
('care', 257389.0147080161)
('treatment', 256916.51116136395)
('use', 249882.1017584784)
('risk', 234978.40351242007)
('child', 222150.8447651066)

Topic 2:
('develop', 278615.33372304833)
('datum', 271970.30330009817)
('genetic', 241175.77406328474)
('disease', 239226.05625092413)
('gene', 232663.10147768827)
('model', 220212.59278326805)
('identify', 209517.31589152484)
('method', 207926.7360375005)
('risk', 199421.89608543157)
('analysis', 199068.17085353844)

Topic 3:
('protein', 619160.2913727224)
(

#### NMF

In [6]:
# create document-term matrix - TFIDF 

# Same extra stop words and max_df / min_df / lowercase settings as the CountVectorizer cell in the LDA section.
stop_wds = ['research', 'aim', 'project']  # study will be eliminated by max_df

# TF-IDF weighted document-term matrix built from the joined abstracts in `text`.
tfidf_vectorizer = TfidfVectorizer(max_df=0.6, min_df=20, lowercase=False, stop_words=stop_wds)
tf_idf = tfidf_vectorizer.fit_transform(text)

In [7]:
# create model
# Fits a single NMF model on the TF-IDF matrix and reports the wall-clock fit time.
# NOTE: num_topics, doc_topic and topic_term are also assigned by the LDA cell,
# so whichever of the two cells ran last determines their current values.
num_topics = 5

t1 = time.time()
nmf_model = NMF(n_components=num_topics, random_state = 0)
doc_topic = nmf_model.fit_transform(tf_idf)
t2 = time.time()
# Elapsed seconds for model construction plus fit_transform.
print(f"  Model time: {t2-t1}")

# Fitted components_ matrix; this is what gets passed to print_topics in a later cell.
topic_term = nmf_model.components_              

  Model time: 181.36343955993652


In [8]:
print_topics(topic_term, tfidf_vectorizer, 10)


Topic 0:
('cell', 5.804278653231487)
('protein', 2.5874489185198324)
('gene', 2.1913886285816706)
('mouse', 1.687654517628736)
('mechanism', 1.5966446198216455)
('signal', 1.5643679493998646)
('function', 1.5074694048657473)
('disease', 1.4439077372461109)
('expression', 1.4242018217428354)
('human', 1.4057081856526463)

Topic 1:
('program', 3.4646841659512098)
('student', 3.097402131750013)
('training', 3.0177176689205143)
('trainee', 1.890557249680917)
('faculty', 1.5434548481104549)
('science', 1.5107758361476762)
('career', 1.2774962029030215)
('graduate', 1.1258170254988398)
('mentor', 1.0529789457082235)
('university', 1.0053071972181338)

Topic 2:
('health', 1.499791620712075)
('hiv', 1.4146585128862277)
('intervention', 1.3354337696275536)
('patient', 1.2966490708641691)
('risk', 1.2830805511885388)
('care', 1.1159674224691325)
('datum', 1.1119143353819354)
('child', 1.062983138961337)
('treatment', 1.0305846741790006)
('use', 0.9386092805913568)

Topic 3:
('cancer', 6.8571033

In [11]:
list(range(5,51,5))

[5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

#### View Results

In [2]:
# Context manager closes the file even if unpickling raises.
with open('results/NMF/nmf_t7-9.pkl', 'rb') as f:
    nmf_t = pickle.load(f)

In [3]:
nmf_t

Unnamed: 0,iteration 7,iteration 8,iteration 9
5,152.779625,163.593867,157.793356
10,268.268722,243.787429,255.55407
15,493.303392,461.374807,565.622673
20,531.956342,1022.886506,1046.611888
25,1059.765064,1320.349235,1356.896534
30,1593.825921,1553.765904,1583.867026
35,1540.430326,1520.035944,1942.665938
40,1133.344528,1625.961456,1428.509186
45,1501.716046,2396.276008,2521.878382
50,2659.459015,1695.634787,2801.905075


In [4]:
nmf_t["iteration 7"].sum()/60/60

3.0374580506483713

In [5]:
# Context manager closes the file even if unpickling raises.
with open('results/NMF/nmf_topics7-9.pkl', 'rb') as f:
    nmf_top = pickle.load(f)

In [6]:
nmf_top

Unnamed: 0,iteration 7,iteration 8,iteration 9
5,"[[cell, protein, gene, mouse, mechanism, signa...","[[cell, protein, gene, mouse, mechanism, signa...","[[cell, protein, gene, mouse, mechanism, signa..."
10,"[[cell, immune, tumor, stem, response, mouse, ...","[[cell, immune, tumor, stem, response, mouse, ...","[[cell, immune, tumor, stem, response, mouse, ..."
15,"[[cell, stem, tumor, differentiation, tissue, ...","[[cell, stem, tumor, differentiation, tissue, ...","[[cell, stem, tumor, differentiation, tissue, ..."
20,"[[cell, stem, differentiation, tissue, signal,...","[[cell, stem, differentiation, tissue, signal,...","[[cell, stem, differentiation, tissue, signal,..."
25,"[[cell, stem, differentiation, signal, tissue,...","[[cell, stem, differentiation, signal, tissue,...","[[cell, stem, differentiation, signal, tissue,..."
30,"[[cell, stem, differentiation, tissue, progeni...","[[cell, stem, differentiation, tissue, progeni...","[[cell, stem, differentiation, signal, tissue,..."
35,"[[cell, stem, differentiation, tissue, progeni...","[[cell, stem, differentiation, tissue, progeni...","[[cell, antigen, type, differentiation, tissue..."
40,"[[cell, stem, differentiation, tissue, progeni...","[[cell, stem, differentiation, tissue, progeni...","[[cell, stem, differentiation, tissue, progeni..."
45,"[[cell, antigen, type, differentiation, cd4, t...","[[cell, antigen, type, differentiation, cd4, t...","[[cell, antigen, type, differentiation, cd4, t..."
50,"[[cell, stem, differentiation, tissue, progeni...","[[cell, antigen, tissue, type, cd4, differenti...","[[cell, stem, differentiation, tissue, progeni..."


In [7]:
nmf_top["iteration 7"][5]

[['cell',
  'protein',
  'gene',
  'mouse',
  'mechanism',
  'signal',
  'function',
  'disease',
  'expression',
  'human'],
 ['program',
  'student',
  'training',
  'trainee',
  'faculty',
  'science',
  'career',
  'graduate',
  'mentor',
  'university'],
 ['health',
  'hiv',
  'intervention',
  'patient',
  'risk',
  'care',
  'datum',
  'child',
  'treatment',
  'use'],
 ['cancer',
  'tumor',
  'breast',
  'prostate',
  'patient',
  'clinical',
  'therapy',
  'treatment',
  'lung',
  'metastasis'],
 ['core',
  'administrative',
  'center',
  'investigator',
  'support',
  'provide',
  'service',
  'resource',
  'analysis',
  'datum']]

In [8]:
# Context manager closes the file even if unpickling raises.
with open('results/LDA/lda_t7-9.pkl', 'rb') as f:
    lda_t = pickle.load(f)

In [9]:
lda_t

Unnamed: 0,iteration 7,iteration 8,iteration 9
5,435.932172,437.597362,433.935202
10,551.346491,567.117702,535.620089
15,583.455645,590.12704,566.917139
20,624.802642,616.598282,613.742041
25,654.983548,663.821654,635.227189
30,701.708098,664.76257,691.421715
35,738.484941,726.303918,754.729724
40,741.158344,742.826689,780.154899
45,814.057533,797.21107,812.083758
50,813.682839,845.275442,834.834809


In [10]:
lda_t["iteration 7"].sum()/60/60

1.8498922928174337

In [11]:
# Context manager closes the file even if unpickling raises.
with open('results/LDA/lda_p7-9.pkl', 'rb') as f:
    lda_p = pickle.load(f)

In [12]:
lda_p

Unnamed: 0,iteration 7,iteration 8,iteration 9
5,2696.849974,2673.91288,2671.405578
10,2406.616979,2434.804312,2393.576193
15,2239.086827,2246.643285,2240.215003
20,2156.657389,2151.528615,2136.315159
25,2074.815944,2088.627053,2082.911066
30,2050.798042,2037.732953,2018.691059
35,1985.391977,1990.737078,2001.894329
40,1943.839797,1952.872967,1936.07631
45,1923.358611,1937.834389,1926.169997
50,1916.132948,1906.847762,1881.329188


In [13]:
# Context manager closes the file even if unpickling raises.
with open('results/LDA/lda_topics7-9.pkl', 'rb') as f:
    lda_top = pickle.load(f)

In [14]:
lda_top

Unnamed: 0,iteration 7,iteration 8,iteration 9
5,"[[patient, disease, clinical, treatment, risk,...","[[brain, neuron, function, mechanism, model, d...","[[develop, system, new, method, datum, model, ..."
10,"[[disease, injury, mechanism, model, ad, brain...","[[health, risk, intervention, child, use, trea...","[[cell, cancer, tumor, target, pathway, signal..."
15,"[[core, datum, provide, analysis, support, cen...","[[risk, health, age, exposure, factor, disease...","[[environmental, change, food, plant, water, c..."
20,"[[genetic, risk, gene, identify, factor, datum...","[[cell, cancer, tumor, patient, breast, therap...","[[cell, gene, protein, signal, regulate, funct..."
25,"[[injury, heart, vascular, mechanism, cardiac,...","[[datum, model, method, develop, analysis, net...","[[structure, new, material, develop, chemical,..."
30,"[[genetic, gene, risk, identify, disease, vari...","[[cell, immune, response, vaccine, antigen, in...","[[age, function, activity, aging, memory, neur..."
35,"[[cell, mouse, induce, injury, inflammatory, v...","[[gene, expression, cell, protein, mechanism, ...","[[cell, tissue, mechanical, property, surface,..."
40,"[[protein, structure, complex, bind, cell, dna...","[[tumor, cancer, cell, target, therapy, treatm...","[[cell, stem, mouse, tissue, development, diff..."
45,"[[clinical, center, support, investigator, tri...","[[ad, age, disease, aging, mouse, alzheimer, m...","[[cell, tissue, model, stem, mechanical, regen..."
50,"[[hiv, use, risk, drug, aids, woman, preventio...","[[compound, drug, new, molecule, chemical, dev...","[[pain, injury, stroke, brain, chronic, treatm..."


In [16]:
lda_top["iteration 7"][5]

[['patient',
  'disease',
  'clinical',
  'treatment',
  'risk',
  'develop',
  'cancer',
  'therapy',
  'drug',
  'identify'],
 ['cell',
  'protein',
  'gene',
  'mechanism',
  'mouse',
  'role',
  'cancer',
  'specific',
  'expression',
  'function'],
 ['system',
  'develop',
  'new',
  'datum',
  'provide',
  'method',
  'model',
  'high',
  'technology',
  'use'],
 ['program',
  'health',
  'training',
  'core',
  'provide',
  'support',
  'clinical',
  'center',
  'include',
  'intervention'],
 ['brain',
  'mechanism',
  'effect',
  'function',
  'neuron',
  'increase',
  'determine',
  'test',
  'disorder',
  'specific']]

#### Coherence

In [2]:
# data needed for coherence calculation

# import entire dataset
# Context manager closes the file even if unpickling raises.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open('../../../data/prd/Paper/coherence_vars.sav', 'rb') as f:
    [id2word, docs] = pickle.load(f)

# import topics
# The coherence loop reads cell (i, j) of this frame as the topic list for one model.

df_topics = pd.read_pickle("./results/NMF/nmf_topics0.pkl")
nrow, ncol = df_topics.shape


print("data ingested--------------------------", flush = True)

data ingested--------------------------


In [None]:
# corpus - word frequency in docs - not needed for coherence calculation
# id2word - dictionary
# docs - df["final_tokens"]

# calculate coherence
# For every cell (i, j) of df_topics, build a c_v CoherenceModel from the stored
# topic word lists and record both the coherence score and the time it took.
# NOTE: nothing is written to disk here; the to_pickle calls after the loop are commented out.

# NOTE(review): n_topics is hard-coded; the column assignments at the end of each
# outer iteration require len(n_topics) == nrow - presumably df_topics uses the
# same index values; confirm.
n_topics = list(range(5,51,5))
# Offset added to the column labels ("iteration {i+batch}").
batch = 0

col_names = [f"iteration {i+batch}" for i in range(ncol)]
# Result frames: rows = n_topics, one column per column of df_topics.
co_val = pd.DataFrame(index = n_topics, columns = col_names)
co_t = pd.DataFrame(index = n_topics, columns = col_names)

for j in range(ncol):
    
    print(f'Iteration {j}', flush = True)
    
    # Per-column accumulators, assigned to the result frames after the inner loop.
    coherence_values = []
    coherence_time = []
    
    for i in range(nrow): 
            
        # calculate coherence
        # The timed span covers both model construction and get_coherence().
        t1 = time.time()
        cm = CoherenceModel(topics=df_topics.iloc[i,j], dictionary=id2word, texts=docs, coherence='c_v',
                            processes=10) 
        coherence_values.append(cm.get_coherence())
        t2 = time.time()
        coherence_time.append(t2-t1)
        print(f"  Coherence time: {t2-t1}", flush=True)
        
        # output completion message
        print('Number of topics =', df_topics.index[i], "complete.", flush = True)    
    
    # save results
    # Whole-column assignment of the collected lists (in-memory only).
    co_val[f"iteration {j+batch}"] = coherence_values
    co_t[f"iteration {j+batch}"] = coherence_time
    
       
        
# save results 

#co_val.to_pickle("./results/NMF/co_nmf_val0.pkl")
#co_t.to_pickle("./results/NMF/co_nmf_t0.pkl")


Iteration 0
  Coherence time: 168.43822956085205
Number of topics = 5 complete.


In [3]:
# One-off timing of a single c_v coherence computation for the topic list stored
# at row position 9, column position 0 of df_topics, using 15 worker processes.
t1 = time.time()
cm = CoherenceModel(topics=df_topics.iloc[9,0], dictionary=id2word, texts=docs, coherence='c_v',
                    processes=15) 
print(cm.get_coherence())
t2 = time.time()
# Elapsed seconds for model construction plus get_coherence().
print(f"  Coherence time: {t2-t1}", flush=True)

0.7165704142062844
  Coherence time: 472.81356859207153


In [4]:
5000/60

83.33333333333333

In [27]:
# Context manager closes the file even if unpickling raises.
with open('results/NMF/co_nmf_t1-3.pkl', 'rb') as f:
    co_nmf_t = pickle.load(f)

In [28]:
co_nmf_t

Unnamed: 0,iteration 1,iteration 2,iteration 3
5,114.861114,117.141293,118.460506
10,151.909935,154.87896,155.231587
15,186.287828,187.699493,183.993934
20,221.522196,223.751019,224.886253
25,265.845104,260.652219,258.791718
30,302.753483,294.213289,296.886941
35,340.130318,340.10169,343.74622
40,371.994711,438.338857,372.235898
45,402.504977,408.179972,398.762976
50,438.697223,460.142708,479.154986


In [29]:
co_nmf_t["iteration 1"].sum()/60

46.6084481716156

In [30]:
# Context manager closes the file even if unpickling raises.
with open('results/NMF/co_nmf_val1-3.pkl', 'rb') as f:
    co_nmf_val = pickle.load(f)

In [31]:
co_nmf_val

Unnamed: 0,iteration 1,iteration 2,iteration 3
5,0.681432,0.681432,0.681432
10,0.69306,0.69306,0.69306
15,0.696781,0.696781,0.714896
20,0.701793,0.701793,0.701793
25,0.69945,0.700078,0.701351
30,0.710418,0.712199,0.711775
35,0.715832,0.717924,0.715673
40,0.72535,0.718505,0.717401
45,0.714924,0.710819,0.723499
50,0.729898,0.727082,0.714613


In [43]:
# Context manager closes the file even if unpickling raises.
with open('results/LDA/co_lda_t7-9.pkl', 'rb') as f:
    co_lda_t = pickle.load(f)

In [44]:
co_lda_t

Unnamed: 0,iteration 7,iteration 8,iteration 9
5,132.882011,128.954055,135.533142
10,159.342826,164.274541,174.843091
15,204.871938,203.434997,214.461239
20,234.797601,248.812047,235.199594
25,276.729139,287.510907,272.260181
30,275.352768,312.833472,305.456198
35,323.491602,325.61796,377.233967
40,355.038644,358.54953,347.439366
45,372.149192,400.452527,396.407637
50,419.535974,431.629274,445.325002


In [45]:
co_lda_t["iteration 7"].sum()/60

45.90319490432739

In [46]:
# Context manager closes the file even if unpickling raises.
with open('results/LDA/co_lda_val7-9.pkl', 'rb') as f:
    co_lda_val = pickle.load(f)

In [47]:
co_lda_val

Unnamed: 0,iteration 7,iteration 8,iteration 9
5,0.520377,0.527728,0.511135
10,0.552631,0.53755,0.545996
15,0.571558,0.562548,0.580297
20,0.579274,0.581458,0.574645
25,0.587654,0.569543,0.582239
30,0.580807,0.582737,0.583099
35,0.596405,0.594299,0.592964
40,0.599051,0.59142,0.603519
45,0.599457,0.598102,0.598659
50,0.591788,0.605955,0.605674
