# Scratch work for model selection

In [1]:
import pandas as pd
#import numpy as np
import pickle
import time
import gc

from sklearn.decomposition import LatentDirichletAllocation, NMF
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from gensim.models.coherencemodel import CoherenceModel

In [2]:
# Data needed for the coherence calculation.

# Load the whole dataset. The context manager closes the file even if
# unpickling fails (the original left the handle open in that case).
# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by this project.
with open('../../../data/prd/Paper/coherence_vars.sav', 'rb') as f:
    [id2word, docs] = pickle.load(f)

# corpus  - word frequency in docs, not needed for coherence function
# id2word - dictionary
# docs    - df["final_tokens"]

In [9]:
# Number of entries in docs (one per tokenized document loaded above).
len(docs)

1143869

In [3]:
# Function to format topics as a "list of list of strings".
# Needed for topic coherence function in Gensim

# function modified from https://nlpforhackers.io/topic-modeling/

def list_topics(topic_term_dist, vectorizer, top_n=10):
    """Return the highest-weighted words of every topic as a list of lists of strings.

    Parameters
    ----------
    topic_term_dist : 2-D array
        One row per topic, one column per vocabulary term; each row must
        support ``argsort()`` (e.g. a NumPy array).
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` or the legacy
        ``get_feature_names()``; its entries are indexed by column position.
    top_n : int, default 10
        How many words to list per topic. If -1, list all words.

    Returns
    -------
    list of list of str
        For each topic, its words ordered from largest to smallest weight.
    """
    # Fetch the vocabulary once. The original rebuilt it for every single word,
    # and get_feature_names() no longer exists in recent scikit-learn releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    topic_words = []

    for topic in topic_term_dist:  # each row holds one topic's term weights
        ranked = topic.argsort()[::-1]  # term indices, largest weight first
        if top_n != -1:
            # Same selection as the original slice [:-top_n - 1:-1].
            ranked = ranked[:top_n]
        topic_words.append([str(feature_names[i]) for i in ranked])

    return topic_words

In [4]:
# Scikit-Learn's LDA and NMF vectorizers take one string per document
# (not a list of tokens), so join each document's tokens with single spaces.

text = [" ".join(abstract) for abstract in docs]

In [5]:
# function slightly modified from https://nlpforhackers.io/topic-modeling/

def print_topics(topic_term_mat, vectorizer, top_n=10):
    """Print the top_n highest-weighted (word, weight) pairs of every topic.

    Parameters
    ----------
    topic_term_mat : 2-D array
        One row per topic, one column per vocabulary term; each row must
        support ``argsort()`` (e.g. a NumPy array).
    vectorizer : fitted vectorizer
        Must expose ``get_feature_names_out()`` or the legacy
        ``get_feature_names()``; its entries are indexed by column position.
    top_n : int, default 10
        Number of words printed per topic.

    Returns
    -------
    None
        Output goes to stdout: a "Topic <idx>:" header followed by one
        ``(word, weight)`` tuple per line.
    """
    # Fetch the vocabulary once. The original rebuilt it for every single word,
    # and get_feature_names() no longer exists in recent scikit-learn releases.
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for idx, topic in enumerate(topic_term_mat):  # idx = row index, topic = row of weights
        print("\nTopic %d:" % (idx))

        # Reverse slice of the ascending argsort: indices of the top_n largest
        # weights, largest first.
        for i in topic.argsort()[:-top_n - 1:-1]:
            print((str(feature_names[i]), topic[i]))
        

#### LDA

In [6]:
# Build the raw-count document-term matrix used as LDA input.

# Extra stop words for this corpus; 'study' is left to the max_df cut-off.
stop_wds = ['research', 'aim', 'project']

vectorizer = CountVectorizer(
    stop_words=stop_wds,
    lowercase=False,
    min_df=20,
    max_df=0.6,
)
doc_term_matrix = vectorizer.fit_transform(text)

In [7]:
# create model
# Fit a single LDA model on the count matrix and report the wall-clock fit time.
num_topics = 5

# t1/t2 bracket only the model construction and fit_transform call.
t1 = time.time()
# Document-topic prior is set to 1/num_topics and the topic-word prior to 0.1;
# random_state is fixed at 0 and the fit is spread over 19 jobs.
lda_model = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics,
                                      topic_word_prior=0.1, n_jobs=19, random_state = 0)
# fit_transform returns the per-document topic representation.
doc_topic = lda_model.fit_transform(doc_term_matrix)
t2 = time.time()
print(f"  Model time: {t2-t1}")

# components_ holds one row of term weights per topic (the input to print_topics / list_topics).
topic_term = lda_model.components_

  Model time: 1314.905624628067


In [10]:
# The ~1315 s LDA model time printed above, expressed in minutes.
1315/60

21.916666666666668

In [9]:
# Print the 10 highest-weighted words of each topic, using the CountVectorizer vocabulary.
print_topics(topic_term, vectorizer, 10)


Topic 0:
('program', 523667.09705015284)
('core', 462229.04216242203)
('provide', 450636.7675416922)
('support', 362805.20931679086)
('new', 340100.3987703617)
('center', 297055.4796595892)
('include', 288326.0385743985)
('clinical', 284798.24958029256)
('student', 273517.20409035846)
('develop', 265742.21541210474)

Topic 1:
('health', 571990.2636541908)
('intervention', 314393.81538036565)
('program', 276429.2213613038)
('training', 270500.1748114237)
('patient', 261967.87126937282)
('care', 257389.0147080161)
('treatment', 256916.51116136395)
('use', 249882.1017584784)
('risk', 234978.40351242007)
('child', 222150.8447651066)

Topic 2:
('develop', 278615.33372304833)
('datum', 271970.30330009817)
('genetic', 241175.77406328474)
('disease', 239226.05625092413)
('gene', 232663.10147768827)
('model', 220212.59278326805)
('identify', 209517.31589152484)
('method', 207926.7360375005)
('risk', 199421.89608543157)
('analysis', 199068.17085353844)

Topic 3:
('protein', 619160.2913727224)
(

#### NMF

In [6]:
# Build the TF-IDF weighted document-term matrix used as NMF input.

# Extra stop words for this corpus; 'study' is left to the max_df cut-off.
stop_wds = ['research', 'aim', 'project']

tfidf_vectorizer = TfidfVectorizer(
    stop_words=stop_wds,
    lowercase=False,
    min_df=20,
    max_df=0.6,
)
tf_idf = tfidf_vectorizer.fit_transform(text)

In [7]:
# create model
# Fit a single NMF model on the TF-IDF matrix and report the wall-clock fit time.
num_topics = 5

# t1/t2 bracket only the model construction and fit_transform call.
t1 = time.time()
# random_state is fixed at 0; all other NMF settings are library defaults.
nmf_model = NMF(n_components=num_topics, random_state = 0)
# NOTE: doc_topic and topic_term reuse the names assigned in the LDA cell,
# so whichever model cell ran last determines their contents.
doc_topic = nmf_model.fit_transform(tf_idf)
t2 = time.time()
print(f"  Model time: {t2-t1}")

# components_ holds one row of term weights per topic (the input to print_topics / list_topics).
topic_term = nmf_model.components_              

  Model time: 181.36343955993652


In [8]:
# Print the 10 highest-weighted words of each topic, using the TfidfVectorizer vocabulary.
print_topics(topic_term, tfidf_vectorizer, 10)


Topic 0:
('cell', 5.804278653231487)
('protein', 2.5874489185198324)
('gene', 2.1913886285816706)
('mouse', 1.687654517628736)
('mechanism', 1.5966446198216455)
('signal', 1.5643679493998646)
('function', 1.5074694048657473)
('disease', 1.4439077372461109)
('expression', 1.4242018217428354)
('human', 1.4057081856526463)

Topic 1:
('program', 3.4646841659512098)
('student', 3.097402131750013)
('training', 3.0177176689205143)
('trainee', 1.890557249680917)
('faculty', 1.5434548481104549)
('science', 1.5107758361476762)
('career', 1.2774962029030215)
('graduate', 1.1258170254988398)
('mentor', 1.0529789457082235)
('university', 1.0053071972181338)

Topic 2:
('health', 1.499791620712075)
('hiv', 1.4146585128862277)
('intervention', 1.3354337696275536)
('patient', 1.2966490708641691)
('risk', 1.2830805511885388)
('care', 1.1159674224691325)
('datum', 1.1119143353819354)
('child', 1.062983138961337)
('treatment', 1.0305846741790006)
('use', 0.9386092805913568)

Topic 3:
('cancer', 6.8571033

In [11]:
# Topic counts 5, 10, ..., 50 (the same grid is assigned to n_topics in the coherence cell).
list(range(5,51,5))

[5, 10, 15, 20, 25, 30, 35, 40, 45, 50]

#### View Results

In [3]:
# Load the pickled object saved at results/NMF/nmf_t0.pkl.
# The context manager closes the file even if unpickling fails.
with open('results/NMF/nmf_t0.pkl', 'rb') as f:
    nmf_t = pickle.load(f)

In [4]:
# Display the table loaded from nmf_t0.pkl (indexed by number of topics;
# presumably model fit times in seconds - TODO confirm).
nmf_t

Unnamed: 0,iteration 0
5,153.353329
10,283.245558
15,640.015485
20,1121.548131
25,1449.234355
30,1743.863659
35,1747.789454
40,2026.647499
45,3113.066131
50,2424.188318


In [7]:
# Sum of the "iteration 0" column divided by 3600 (seconds -> hours, assuming the values are seconds).
nmf_t["iteration 0"].sum()/60/60

4.084153310987684

In [8]:
# Load the pickled object saved at results/NMF/nmf_topics0.pkl.
# The context manager closes the file even if unpickling fails.
with open('results/NMF/nmf_topics0.pkl', 'rb') as f:
    nmf_top = pickle.load(f)

In [9]:
# Display the table loaded from nmf_topics0.pkl: one row per topic count,
# each cell holding that model's topics as lists of words.
nmf_top

Unnamed: 0,iteration 0
5,"[[cell, protein, gene, mouse, mechanism, signa..."
10,"[[cell, immune, tumor, stem, response, mouse, ..."
15,"[[cell, stem, tumor, differentiation, tissue, ..."
20,"[[cell, stem, differentiation, tissue, signal,..."
25,"[[cell, stem, differentiation, signal, tissue,..."
30,"[[cell, stem, differentiation, tissue, progeni..."
35,"[[cell, stem, differentiation, tissue, progeni..."
40,"[[cell, stem, differentiation, tissue, progeni..."
45,"[[cell, stem, differentiation, tissue, progeni..."
50,"[[cell, antigen, type, cd4, differentiation, c..."


In [15]:
# Word lists stored under index label 5 (the 5-topic row) of the "iteration 0" column.
nmf_top["iteration 0"][5]

[['cell',
  'protein',
  'gene',
  'mouse',
  'mechanism',
  'signal',
  'function',
  'disease',
  'expression',
  'human'],
 ['program',
  'student',
  'training',
  'trainee',
  'faculty',
  'science',
  'career',
  'graduate',
  'mentor',
  'university'],
 ['health',
  'hiv',
  'intervention',
  'patient',
  'risk',
  'care',
  'datum',
  'child',
  'treatment',
  'use'],
 ['cancer',
  'tumor',
  'breast',
  'prostate',
  'patient',
  'clinical',
  'therapy',
  'treatment',
  'lung',
  'metastasis'],
 ['core',
  'administrative',
  'center',
  'investigator',
  'support',
  'provide',
  'service',
  'resource',
  'analysis',
  'datum']]

In [16]:
# Load the pickled object saved at results/LDA/lda_t0.pkl.
# The context manager closes the file even if unpickling fails.
with open('results/LDA/lda_t0.pkl', 'rb') as f:
    lda_t = pickle.load(f)

In [17]:
# Display the table loaded from lda_t0.pkl (indexed by number of topics;
# presumably model fit times in seconds - TODO confirm).
lda_t

Unnamed: 0,iteration 0
5,752.539494
10,980.929357
15,1010.966805
20,1058.943146
25,1149.526965
30,1167.401733
35,1208.69609
40,1290.929642
45,1337.202181
50,1404.108259


In [20]:
# Sum of the "iteration 0" column divided by 3600 (seconds -> hours, assuming the values are seconds).
lda_t["iteration 0"].sum()/60/60

3.1559010200368034

In [21]:
# Load the pickled object saved at results/LDA/lda_p0.pkl.
# The context manager closes the file even if unpickling fails.
with open('results/LDA/lda_p0.pkl', 'rb') as f:
    lda_p = pickle.load(f)

In [22]:
# Display the table loaded from lda_p0.pkl (indexed by number of topics;
# presumably LDA perplexity per topic count - TODO confirm what the "p" stands for).
lda_p

Unnamed: 0,iteration 0
5,2771.422337
10,2402.291834
15,2237.178713
20,2151.058233
25,2076.74459
30,2031.060075
35,1986.262786
40,1973.309246
45,1928.947377
50,1912.521474


In [23]:
# Load the pickled object saved at results/LDA/lda_topics0.pkl.
# The context manager closes the file even if unpickling fails.
with open('results/LDA/lda_topics0.pkl', 'rb') as f:
    lda_top = pickle.load(f)

In [24]:
# Display the table loaded from lda_topics0.pkl: one row per topic count,
# each cell holding that model's topics as lists of words.
lda_top

Unnamed: 0,iteration 0
5,"[[program, core, provide, support, new, center..."
10,"[[program, training, core, provide, support, c..."
15,"[[core, provide, center, support, clinical, in..."
20,"[[mouse, insulin, metabolic, increase, metabol..."
25,"[[student, program, science, training, graduat..."
30,"[[care, health, patient, improve, quality, bas..."
35,"[[dr, clinical, career, development, training,..."
40,"[[signal, cell, pathway, role, regulate, mecha..."
45,"[[imaging, image, tissue, technique, resolutio..."
50,"[[host, infection, pathogen, disease, bacteria..."


In [29]:
# Word lists stored under index label 50 (the 50-topic row) of the "iteration 0" column.
lda_top["iteration 0"][50]

[['host',
  'infection',
  'pathogen',
  'disease',
  'bacterial',
  'human',
  'specie',
  'bacteria',
  'resistance',
  'strain'],
 ['mouse',
  'model',
  'animal',
  'human',
  'core',
  'gene',
  'provide',
  'line',
  'transgenic',
  'generate'],
 ['liver',
  'cell',
  'signal',
  'mouse',
  'prostate',
  'bone',
  'pathway',
  'role',
  'intestinal',
  'growth'],
 ['training',
  'program',
  'dr',
  'career',
  'clinical',
  'mentor',
  'trainee',
  'development',
  'year',
  'investigator'],
 ['inflammatory',
  'inflammation',
  'injury',
  'induce',
  'macrophage',
  'activation',
  'mouse',
  'response',
  'model',
  'mechanism'],
 ['gene',
  'genetic',
  'identify',
  'disease',
  'mutation',
  'genome',
  'variant',
  'genomic',
  'phenotype',
  'analysis'],
 ['synaptic',
  'mechanism',
  'regulate',
  'function',
  'axon',
  'protein',
  'plasticity',
  'role',
  'migration',
  'molecular'],
 ['vaccine',
  'antibody',
  'virus',
  'human',
  'response',
  'infection',
  'an

#### Coherence

In [2]:
# Data needed for the coherence calculation.

# Load the whole dataset (dictionary + tokenized documents). The context
# manager closes the file even if unpickling fails.
# NOTE(review): pickle.load can execute arbitrary code; only load files
# produced by this project.
with open('../../../data/prd/Paper/coherence_vars.sav', 'rb') as f:
    [id2word, docs] = pickle.load(f)

# Load the saved topics table: rows are topic counts, columns are iterations.
df_topics = pd.read_pickle("./results/NMF/nmf_topics0.pkl")
nrow, ncol = df_topics.shape


print("data ingested--------------------------", flush = True)

data ingested--------------------------


In [None]:
# Compute c_v topic coherence (and the time it takes) for every cell of df_topics.
#
# Inputs taken from earlier cells:
# corpus - word frequency in docs - not needed for coherence calculation
# id2word - dictionary
# docs - df["final_tokens"]
# df_topics, nrow, ncol - topics table and its shape

# calculate coherence

# Row labels of the result tables.
# NOTE(review): assumes len(n_topics) == nrow and that these values match
# df_topics.index; otherwise the column assignments below will not line up.
n_topics = list(range(5,51,5))
# Offset added to the iteration number in the result column names.
batch = 0

col_names = [f"iteration {i+batch}" for i in range(ncol)]
# co_val collects coherence scores, co_t the seconds spent computing each one.
co_val = pd.DataFrame(index = n_topics, columns = col_names)
co_t = pd.DataFrame(index = n_topics, columns = col_names)

for j in range(ncol):  # one pass per column (iteration) of df_topics
    
    print(f'Iteration {j}', flush = True)
    
    coherence_values = []
    coherence_time = []
    
    for i in range(nrow):  # one coherence model per row (topic count)
            
        # calculate coherence; the timer covers model construction and get_coherence()
        t1 = time.time()
        cm = CoherenceModel(topics=df_topics.iloc[i,j], dictionary=id2word, texts=docs, coherence='c_v',
                            processes=10) 
        coherence_values.append(cm.get_coherence())
        t2 = time.time()
        coherence_time.append(t2-t1)
        print(f"  Coherence time: {t2-t1}", flush=True)
        
        # output completion message
        print('Number of topics =', df_topics.index[i], "complete.", flush = True)    
    
    # store this iteration's scores and timings as a column of each result table
    co_val[f"iteration {j+batch}"] = coherence_values
    co_t[f"iteration {j+batch}"] = coherence_time
    
       
        
# save results (currently disabled: the tables only live in memory after this cell)

#co_val.to_pickle("./results/NMF/co_nmf_val0.pkl")
#co_t.to_pickle("./results/NMF/co_nmf_t0.pkl")


Iteration 0
  Coherence time: 168.43822956085205
Number of topics = 5 complete.


In [3]:
# Time a single c_v coherence computation for the topics at row position 9,
# column position 0 of df_topics, this time with 15 worker processes.
t1 = time.time()
cm = CoherenceModel(topics=df_topics.iloc[9,0], dictionary=id2word, texts=docs, coherence='c_v',
                    processes=15) 
print(cm.get_coherence())
t2 = time.time()
print(f"  Coherence time: {t2-t1}", flush=True)

0.7165704142062844
  Coherence time: 472.81356859207153


In [4]:
# 5000 seconds expressed in minutes (where the 5000 s figure comes from is not recorded here).
5000/60

83.33333333333333

In [3]:
# Load the pickled object saved at results/NMF/co_nmf_t0.pkl.
# The context manager closes the file even if unpickling fails.
with open('results/NMF/co_nmf_t0.pkl', 'rb') as f:
    co_nmf_t = pickle.load(f)

In [4]:
# Display the table loaded from co_nmf_t0.pkl (indexed by number of topics;
# presumably coherence computation times in seconds - TODO confirm).
co_nmf_t

Unnamed: 0,iteration 0
5,134.90708
10,177.580023
15,207.161786
20,256.632632
25,310.156532
30,344.809261
35,393.104356
40,446.517021
45,475.443856
50,544.816709


In [7]:
# Sum of the "iteration 0" column divided by 60 (seconds -> minutes, assuming the values are seconds).
co_nmf_t["iteration 0"].sum()/60

54.852154246966045

In [8]:
# Load the pickled object saved at results/NMF/co_nmf_val0.pkl.
# The context manager closes the file even if unpickling fails.
with open('results/NMF/co_nmf_val0.pkl', 'rb') as f:
    co_nmf_val = pickle.load(f)

In [9]:
# Display the table loaded from co_nmf_val0.pkl (indexed by number of topics;
# presumably the c_v coherence score of each NMF model - TODO confirm).
co_nmf_val

Unnamed: 0,iteration 0
5,0.681432
10,0.69306
15,0.698364
20,0.692956
25,0.703036
30,0.712049
35,0.714464
40,0.721716
45,0.715991
50,0.71657
