# Filtering the AI dataset using the Eads et al. method

#### Author: Haleigh Tomlin
#### Date: 07/07/2021

Importing our data:

In [1]:
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
#from git/dspg21RnD/wheat_filtration/wheat_filtration import keywords
#from git/dspg21RnD/wheat_filtration/wheat_filtration import filter
#import keywords

In [2]:
#larger dataset

df = pd.read_pickle("../../data/dspg21RnD/smaller-final-dataset.pkl")

Steps:

1. create short, intuitive keyword list encapsulating concept
    - is this the keyword list that we have from the OECD paper?
2. Infer an LDA topic model on the full text corpus
    (Kathryn sent this code in our student chat)
      will not use the keyword list
3. Identify LDA topics relevant to concept using the keywords identified in step 1 and domain expertise

Multi-part Filtering methods:


* topic proportion
* relative entropy
* super keywords


Keyword list:

Source: OECD Selected list of AI key terms (Cierra hand-entered these)

In [39]:
#start with the core terms from the OECD paper
core_terms = ["adaboost","artificial intelligence","artificial neural network","back propagation"
,"back propagation neural network","computational intelligence","computer vision"
,"convolutional neural network","deep belief network","deep convolutional neural network"
,"deep learn","deep neural network","elman network","elman neural network"
,"expert system","fee forward neural network","inference engine","machine intelligence"
,"machine learn","machine translation","machine vision","multilayer neural network"
,"natural language process","perceptron","random forest","rbf neural network","recurrent neural network"
,"self organize map","spike neural network","supervise learn","support vector machine"
,"svm classifier","unsupervised learn","artificial_intelligence","artificial_neural_network","back_propagation"
,"back_propagation_neural_network","computational_intelligence","computer_vision"
,"convolutional_neural_network","deep_belief_network","deep_convolutional_neural_network"
,"deep_learn","deep_neural_network","elman_network","elman_neural_network"
,"expert_system","fee_forward_neural_network","inference_engine","machine_intelligence"
,"machine_learn","machine_translation","machine_vision","multilayer_neural_network"
,"natural_language_process","random_forest","rbf_neural_network","recurrent_neural_network"
,"self_organize_map","spike_neural_network","supervise_learn","support_vector_machine"
,"svm_classifier","unsupervised_learn", "machine_learning"]

In [4]:
noncore_terms = ["actor critic", "analog vlsi", "associative memory", "autonomous vehicle", "bayes classifer",
"bayesian belief network", "bioinformatics", "camera calibration", "collaborative system",
"commonsense reason", "computational biology", "datum mine", "decision tree", "description logic",
"dimensionality reduction", "discriminant analysis", "fuzzy logic", "gene ontology", "hide markov model",
"humanoid", "image alignment", "image match", "information retrieval", "kegg pathway", "knowledge base", 
"knowledge representation and reason", "linear discriminant", "markov decision process",
"mulitclass classification", "naive bayes", "name entity recognition", "near neighbor classifier", 
"neural network", "neuro fuzzy", "neuromorphic compute", "neuromorphic hardware", "non rigid registration", 
"nonmonotonic reason", "object recognition", "opinion mine", "optimal search", "pattern analysis", 
"pattern recognition", "person re identification", "principal component analysis", "question answer", "radial basis function",
"rbf kernel", "reinforcement learn", "rigid registration", "robot", "sarsa", "sensor datum fusion", 
"sensor network", "speech recognition", "stereo match", "symbolic reason", "system and control theory",
"template match", "text categorization", "text mine", "text summarization", "word sense disambiguation",
"actor_critic", "analog_vlsi", "associative_memory", "autonomous_vehicle", "bayes_classifer",
"bayesian_belief_network", "camera_calibration", "collaborative_system",
"commonsense_reason", "computational_biology", "datum_mine", "decision_tree", "description_logic",
"dimensionality_reduction", "discriminant_analysis", "fuzzy_logic", "gene_ontology", "hide_markov_model",
"image_alignment", "image_match", "information_retrieval", "kegg_pathway", "knowledge_base", 
"knowledge_representation_and_reason", "linear_discriminant", "markov_decision_process",
"mulitclass_classification", "naive_bayes", "name_entity_recognition", "near_neighbor_classifier", 
"neural_network", "neuro_fuzzy", "neuromorphic_compute", "neuromorphic_hardware", "nonrigid_registration", 
"nonmonotonic_reason", "object_recognition", "opinion_mine", "optimal_search", "pattern_analysis", 
"pattern_recognition", "person_reidentification", "principal_component_analysis", "question_answer", "radial_basis_function",
"rbf_kernel", "reinforcement_learn", "rigid_registration", "sensor_datum_fusion", 
"sensor_network", "speech_recognition", "stereo_match", "symbolic_reason", "system_and_control_theory",
"template_match", "text_categorization", "text_mine", "text_summarization", "word_sense_disambiguation"]

In [5]:
df.head()

Unnamed: 0,original index,PROJECT_ID,ABSTRACT,FY,ORG_COUNT,PI_COUNT,nchar,final_frqwds_removed,PROJECT_TERMS,PROJECT_TITLE,...,ORGANIZATION_CITY,ORGANIZATION_STATE,ORGANIZATION_ZIP,ORGANIZATION_COUNTRY,BUDGET_START_DATE,BUDGET_END_DATE,CFDA_CODE,FY.y,FY_TOTAL_COST,FY_TOTAL_COST_SUB_PROJECTS
0,17608,152242,The multiprotein complex y-secretase proteolyt...,2008,1,1,1402,"[multiprotein, y_secretase, proteolytically_cl...",Active Sites; Affect; Alzheimer's Disease; Am...,STRUCTURE OF SIGNAL PEPTIDE PEPTIDASE,...,BOSTON,MA,21156110,UNITED STATES,12/1/2007,1/1/2008,93.866,2008,3483.0,
1,111864,190316,DESCRIPTION (provided by applicant): The Kis...,2008,1,1,2553,"[kissl, gene, encode, peptide, kisspeptin, bin...",Affect; Animal Model; Axon; Behavior; Behavio...,ROLE OF KISS1 NEURONS IN THE SEASONAL AND CIRC...,...,SEATTLE,WA,981959472,UNITED STATES,9/1/2008,1/1/2009,93.865,2008,39175.0,
2,22052,154213,DESCRIPTION (provided by applicant): The objec...,2008,1,1,1414,"[biophysical, basis, thermodynamics_kinetic, m...",Agreement; Antibodies; base; Binding; Biochem...,CARBONIC ANHYDRASE AS A MODEL TO UNDERSTAND DI...,...,CAMBRIDGE,MA,21385319,UNITED STATES,1/2/2008,1/1/2009,93.859,2008,49646.0,
3,35004,159362,Obesity is the cause of many adverse pregnancy...,2008,1,1,1545,"[obesity, adverse_pregnancyoutcome, great, hea...",African; Analysis of Variance; Asians; Birth;...,OBESITY ON VAGAL TONE AND HBA1C DURING PREGNANCY,...,HOUSTON,TX,770305400,UNITED STATES,4/1/2008,1/1/2009,93.361,2008,20406.0,
4,371628,594482,Local potato advisory groups have expressed in...,2010,1,1,271,"[local, potato, advisory, express, interest, m...",cost; Health; interest; Manure; Parasitic nem...,PLANT-PARASITIC NEMATODE MANAGEMENT AS A COMPO...,...,CORVALLIS,OR,97331,UNITED STATES,,,10.203,2010,,


In [6]:
df.shape

(690814, 31)

Code for LDA:

In [6]:
# input needed for doc-term matrix creation in Scikit-Learn is one string per document (not a list of strings).  
# Original data 'ABSTRACT' is already in this form, but not the tokens in "final_frqwds_removed"

#Scikit-Learn package

tokens = df["final_frqwds_removed"]
#our cleaned text, the final column in the df

 

text = [] # text will contain the processed tokens in string form (1 string per abstract)

 

for abstract in tokens:
    text.append(" ".join(abstract))
    
text = pd.Series(text)

In [7]:
# TRY TOPIC MODELING WITH LDA


# create document-term matrix
#row = doc, col = term


lda_vectorizer = CountVectorizer(max_df=0.6, min_df=20)
#this is our way to filter out words that don't appear enough, and those that appear way too often (we want the middle set of terms)
#^this filters the size of our matrix
lda_dtm = lda_vectorizer.fit_transform(text)
#text = our abstract text, right in the dataframe
#fits our doc-term matrix to our specific text 
#this is the standard for scikit-learn
 


In [50]:
#functions for creating a topic dictionary, viewing the topics in the topic model,
#and selecting only the relevant topics based on a threshold and our keyword list.


def topic_dictionary(lda_model, lda_vectorizer, top_n=10):
    topic_ls = {} #append keys, append the values
    
    
    for idx, topic in enumerate(lda_model.components_):  # loop through each row of H.  idx = row index.  topic = actual row

        print_list = [(lda_vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        topic_ls[idx] = print_list

    return topic_ls

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)
            
            
def relevant_topics(topic_dictionary, keyword_list, threshold = 0):
    """returns a list of the topics which contain a threshold % of the
    relevant words in the keyword list"""
    relevant_topic = []
    for key in topic_dictionary:
        relevant_words = 0
        for i in range(len(topic_dictionary[key])):
            if topic_dictionary[key][i][0] in keyword_list:
                relevant_words += 1
            else: relevant_words += 0
        if (relevant_words) >= threshold :#/len(topic_dictionary[key]) >= threshold :
            relevant_topic.append(key)
    return relevant_topic  

def super_keywords(keyword_list, relevant_topic_list, topic_dictionary):
    superkeyword = keyword_list
    for key in topic_dictionary:
        if key in relevant_topic_list:
            for i in range(len(topic_dictionary[key])):
                superkeyword.append(topic_dictionary[key][i][0])
    return superkeyword

75 topics: Topic 10, 24, and 51 are possibly AI-related.  Listed the top 50 terms in each of these:

Topic 10:
('network', 53825.25965640873)
('technology', 42728.55547085485)
('algorithm', 37412.7652990366)
('software', 33573.40142077985)
('computational', 29964.848970416715)
('performance', 27450.122213215556)
('enable', 26592.30362801974)
('computer', 25201.91159619608)
('user', 23632.889066360924)
('device', 22715.543741103054)
('power', 20878.438616511026)
('modeling', 19747.06535829457)
('scale', 19051.82984926849)
('need', 18819.616576225868)
('integrate', 18650.717498136833)
('cost', 18567.237225468838)
('framework', 18251.508165255116)
('real', 18249.528870779923)
('challenge', 18134.448252316815)
('simulation', 17041.644818447272)
('environment', 16240.871533757234)
('exist', 16038.794328887796)
('set', 15421.020135240813)
('computing', 15070.883550563334)
('level', 14897.860551756186)
('create', 14573.33274679897)
('platform', 14263.175634786761)
('sensor', 13665.999482581969)
('solution', 13310.78151598151)
('communication', 13278.32803446321)
('human', 13253.981375737998)
('decision', 13237.983724861813)
('efficient', 13029.84755685139)
('source', 12589.965769866973)
('task', 12273.378325544256)
('advance', 12055.660582027756)
('statistical', 11944.76354684112)
('market', 11717.898271023516)
('build', 11478.642123837612)
('apply', 11370.13332797389)
('dynamic', 10970.32761342934)
('security', 10912.645807334626)
('infrastructure', 10906.617798904113)
('optimization', 10804.823691738788)
('energy', 10626.635308994704)
('way', 10357.627150961904)
('benefit', 10236.572994696628)
('processing', 10213.910202150646)
('available', 10176.861809304963)
('wireless', 9916.229416561884)
('mobile', 9615.18914315979)
('engineering', 9525.180031780988)
('feature', 9418.82844443452)
('error', 9359.45014147321)
('architecture', 9277.70068768171)
('code', 9264.956910485882)
('analyze', 9249.111570286308)
('open', 9230.833866482713)
('product', 9205.74678303653)
('distribute', 9097.238076993952)
('world', 9093.48116232599)
('prediction', 9063.476220248975)
('eg', 9012.825320053184)
('methodology', 8921.995092193196)
('test', 8913.823394859435)
('broader', 8903.714955399279)
('range', 8767.910649090174)
('capability', 8685.93093915866)
('industry', 8669.94192053707)
('team', 8627.45237194777)
('commercial', 8552.466253837483)
('domain', 8531.507470209368)
('science', 8374.659596131967)
('interface', 8334.313894453508)
('achieve', 8209.895381295437)



Topic 24:
('database', 27274.154279965936)
('access', 15940.050674815675)
('web', 14077.035202258385)
('user', 12599.923198898396)
('network', 9928.14861537376)
('site', 9923.85639170257)
('collection', 9654.355603563314)
('available', 9055.735100428223)
('software', 8081.864375711944)
('national', 7652.611953390629)
('repository', 7652.091276866907)
('clinical', 7550.423760987247)
('consortium', 7145.7404048700355)
('standard', 6923.379043559533)
('maintain', 6584.627469150696)
('nih', 6534.977880317629)
('need', 6330.009320615587)
('management', 6237.382031205433)
('share', 6171.5516268863785)
('create', 6079.5436263709835)
('report', 5740.988796931178)
('protocol', 5704.881759273562)
('contract', 5689.117670939465)
('informatic', 5442.046494503049)
('set', 5272.5595884109125)
('collaboration', 5157.491583155765)
('website', 5002.040010990332)
('infrastructure', 4993.334813825688)
('facilitate', 4829.821457464727)
('scientific', 4786.805670454696)
('source', 4726.32456459939)
('publication', 4717.698413933397)
('technology', 4675.668047003619)
('biomedical', 4557.079343812579)
('record', 4488.129718892659)
('public', 4479.972569543117)
('document', 4455.898603942592)
('implement', 4358.988416721788)
('collect', 4273.873165662352)
('cc', 4219.801489814593)
('file', 4053.342429939934)
('library', 3939.378953082375)
('electronic', 3895.045429622484)
('staff', 3736.6845977567273)
('nci', 3643.4810544028264)
('update', 3597.6433072335053)
('dataset', 3563.933703037277)
('search', 3534.4334422974252)
('unit', 3513.2188838044153)
('review', 3499.483621265816)

Topic 51:
('computational', 51745.08463804256)
('science', 32585.98798607243)
('engineering', 27205.324539063437)
('modeling', 26067.86361049245)
('computer', 25032.616886840162)
('student', 25010.09947731923)
('simulation', 19372.767443288685)
('integrate', 17844.566808591368)
('software', 16737.635214827085)
('experimental', 16642.95628670851)
('biological', 15873.905133726039)
('enable', 14778.494974744777)
('scientific', 14694.489168602857)
('computing', 14630.593347093729)
('biology', 14223.489313183693)
('scale', 13658.786004682379)
('technology', 12914.708630163028)
('undergraduate', 12338.934759835296)
('module', 11906.019110412388)
('advance', 11543.68693646285)
('course', 10489.910292125653)
('scientist', 10341.2137453903)
('physic', 10201.825924184639)
('interdisciplinary', 10177.268202162646)
('graduate', 10128.190282291309)
('pi', 9888.73261828072)
('algorithm', 9690.472535018882)
('education', 9462.012130288134)
('create', 8894.025759209459)
('interaction', 8697.26038222569)
('award', 8553.685566325188)
('fundamental', 8365.59858803463)
('level', 8305.875411518651)
('challenge', 8150.591897409828)
('team', 8098.603232667013)
('outreach', 7989.73590364011)
('advanced', 7966.464188646906)
('broad', 7869.385115261715)
('collaboration', 7693.356426670414)
('apply', 7688.742317350687)
('educational', 7625.600463915728)
('range', 7564.062977048656)
('environment', 7279.150555990577)
('open', 7155.46577500249)
('build', 7117.860555180971)
('visualization', 7069.386587381167)
('physical', 6996.259969300766)
('discovery', 6802.078831981655)
('framework', 6755.569842665634)
('network', 6741.766992135813)


With 100, I picked out only two topics.  These seem less relevant in a sense

Topic 10:
('network', 53825.25965640873)
('technology', 42728.55547085485)
('algorithm', 37412.7652990366)
('software', 33573.40142077985)
('computational', 29964.848970416715)
('performance', 27450.122213215556)
('enable', 26592.30362801974)
('computer', 25201.91159619608)
('user', 23632.889066360924)
('device', 22715.543741103054)
('power', 20878.438616511026)
('modeling', 19747.06535829457)
('scale', 19051.82984926849)
('need', 18819.616576225868)
('integrate', 18650.717498136833)
('cost', 18567.237225468838)
('framework', 18251.508165255116)
('real', 18249.528870779923)
('challenge', 18134.448252316815)
('simulation', 17041.644818447272)
('environment', 16240.871533757234)
('exist', 16038.794328887796)
('set', 15421.020135240813)
('computing', 15070.883550563334)
('level', 14897.860551756186)
('create', 14573.33274679897)
('platform', 14263.175634786761)
('sensor', 13665.999482581969)
('solution', 13310.78151598151)
('communication', 13278.32803446321)
('human', 13253.981375737998)
('decision', 13237.983724861813)
('efficient', 13029.84755685139)
('source', 12589.965769866973)
('task', 12273.378325544256)
('advance', 12055.660582027756)
('statistical', 11944.76354684112)
('market', 11717.898271023516)
('build', 11478.642123837612)
('apply', 11370.13332797389)
('dynamic', 10970.32761342934)
('security', 10912.645807334626)
('infrastructure', 10906.617798904113)
('optimization', 10804.823691738788)
('energy', 10626.635308994704)
('way', 10357.627150961904)
('benefit', 10236.572994696628)
('processing', 10213.910202150646)
('available', 10176.861809304963)
('wireless', 9916.229416561884)

Topic 24:
('database', 24986.78413608181)
('access', 12008.609682644008)
('collection', 9722.288186695594)
('web', 9580.36291265811)
('user', 8601.558515048888)
('clinical', 7187.879954026213)
('sci', 7153.484498222936)
('available', 6953.14552060346)
('repository', 6567.306807796877)
('software', 6373.446374422193)
('contract', 6312.998231590412)
('national', 5533.251701258085)
('biomedical', 5222.594894934696)
('standard', 4992.114638790753)
('site', 4901.321929459208)
('record', 4653.421492286103)
('maintain', 4623.886647756022)
('need', 4326.967043724859)
('share', 4289.987339413143)
('nih', 4124.822383921171)
('collect', 4099.012768476088)
('create', 4074.226847730064)
('technology', 3924.0365734950565)
('public', 3849.251828126874)
('source', 3722.529665217681)
('facilitate', 3687.804237218813)
('collaboration', 3628.3997895034795)
('library', 3625.635523084248)
('cc', 3605.1873197957384)
('file', 3593.8525559975833)
('document', 3582.074611169476)
('scientific', 3576.3491505355382)
('product', 3483.845362251037)
('publication', 3478.70316109251)
('management', 3441.6165034572655)
('informatic', 3406.33415889893)
('testing', 3402.071382981103)
('rehabilitation', 3400.95914991887)
('website', 3320.0999207130862)
('store', 3273.279914664719)
('set', 3192.871719850713)
('report', 3170.2469467472874)
('contractor', 3161.6551997351585)
('electronic', 3150.027428625577)
('member', 3127.06633941713)
('dataset', 3040.7978829728977)
('storage', 3030.524215761676)
('archive', 2998.5148922637118)
('interface', 2843.7100084418275)
('protocol', 2824.989589617557)


Now, trying with 150

In [18]:
# create model

num_topics = 150
lda_model_150 = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                      topic_word_prior=0.1, n_jobs=29, random_state = 0)

doc_top_dist_150 = lda_model_150.fit_transform(lda_dtm)
top_term_dist_150 = lda_model_150.components_


In [24]:
dic150 = topic_dictionary(lda_model_150, lda_vectorizer, 64)

200:

In [11]:
num_topics = 200
lda_model_200 = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                      topic_word_prior=0.1, n_jobs=29, random_state = 0)

doc_top_dist_200 = lda_model_200.fit_transform(lda_dtm)
top_term_dist_200 = lda_model_200.components_


In [12]:
dic200 = topic_dictionary(lda_model_200, lda_vectorizer, 64)

In [15]:
df200 = pd.DataFrame.from_dict(dic200, orient = 'index')

In [16]:
df200.to_pickle("./dic200.pkl")

250:

In [14]:
num_topics = 250
lda_model_250 = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                      topic_word_prior=0.1, n_jobs=29, random_state = 0)

doc_top_dist_250 = lda_model_250.fit_transform(lda_dtm)
top_term_dist_250 = lda_model_250.components_


In [31]:
dic250 = topic_dictionary(lda_model_250, lda_vectorizer, 64)

In [32]:
#df250 = pd.DataFrame.from_dict(dic250, orient = 'index')

In [33]:
#df250.to_pickle("./dic250.pkl")

300:

In [None]:
num_topics = 300
lda_model_300 = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                      topic_word_prior=0.1, n_jobs=29, random_state = 0)

doc_top_dist_300 = lda_model_300.fit_transform(lda_dtm)
top_term_dist_300 = lda_model_300.components_


In [11]:
dic300 = topic_dictionary(lda_model_300, lda_vectorizer, 64)

In [None]:
#print_topics(lda_model_300, lda_vectorizer, 10)

When using k = 300 topics, I picked out topic 152, 171, 216, and 264 as potentially AI-related topics.

In [3]:
df300 = pd.read_pickle("dic300.pkl")

In [4]:
df300.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54,55,56,57,58,59,60,61,62,63
0,"(college, 14257.433245254824)","(mexico, 5403.941036277713)","(case, 5187.418995572763)","(cl, 3677.2807010985935)","(university, 3281.9038700850497)","(native_american, 3273.027006279184)","(student, 2371.9498093531574)","(tribal, 2283.496322675257)","(inbre, 2256.4129382640263)","(aas, 1984.4442977167105)",...,"(wyoming, 215.27592676368357)","(indian, 213.4770234103525)","(largest, 211.71006798706344)","(welding, 211.59253138261735)","(osctr, 211.0999181429873)","(north_dakota, 210.91886640337623)","(institution, 208.91425510813934)","(oral_cleft, 195.88415668379315)","(colorado, 195.74341086636153)","(england, 195.35572985036544)"
1,"(min, 2861.692178936905)","(ml, 2200.2531340003557)","(collection, 1467.16217610542)","(psd, 1234.787993756266)","(needle, 1190.899349447796)","(dry, 1086.7414717792715)","(fibroid, 1073.09988661631)","(uwccc, 993.9316667031538)","(promis, 975.0505479220886)","(uf, 949.8617452640069)",...,"(contain, 184.19060048297226)","(nitrogen, 178.60083159443744)","(serve, 172.43664251435817)","(centrifuge, 169.92544525828342)","(cna, 165.91175722818016)","(possible, 163.84175091465536)","(overnight, 163.68679484467168)","(place, 162.048178177806)","(solution, 161.40078868354996)","(hysterectomy, 159.21195458577716)"
2,"(ci, 4198.738474260232)","(chromosome, 4117.343314610552)","(mdsc, 2931.1999305612685)","(pair, 2530.8473150143186)","(meiosis, 2485.3656136516165)","(ald, 2347.0998421245295)","(meiotic, 2258.9552774034673)","(cd36, 1251.0653856290578)","(aneuploidy, 1218.072741442271)","(etoh, 914.4680360829552)",...,"(birth_defect, 211.74950983555289)","(cpf, 206.98922637587435)","(topo_ii, 205.66114454216975)","(il8, 203.57504267412975)","(suppressor, 199.93742185504587)","(ensure, 198.5284171896345)","(d_j_recombination, 198.42845048884047)","(level, 193.22494161966898)","(aldh, 192.12765892530868)","(rala, 189.84308803519093)"
3,"(bladder, 9331.55122064817)","(catheter, 3614.149481865032)","(anesthesia, 1835.3853477099772)","(mit, 1825.671204466944)","(urinary, 1788.2825023371713)","(hai, 913.8001599220423)","(urinary_tract, 874.4935209887101)","(ref, 870.3631263824083)","(urologic, 794.7271082570174)","(sponge, 752.811639673635)",...,"(ebl, 140.38477922399912)","(hpp, 135.04766918080347)","(urinary_catheter, 132.0999302955195)","(dnm1, 132.099654192364)","(tmf, 129.09987100625077)","(anesthesiologist, 128.05576934203424)","(permethrin, 123.5323599644022)","(million, 121.83032336076012)","(oad, 115.09981356329311)","(overactive_bladder, 114.05871329208954)"
4,"(drug, 119630.27530551182)","(efficacy, 23770.480464471737)","(clinical, 21298.23511249266)","(agent, 20490.828620518547)","(therapeutic, 20159.11015803471)","(preclinical, 13737.465056946607)","(human, 13056.063564451428)","(therapy, 10316.451182769813)","(candidate, 9165.246970649845)","(need, 8626.009870543849)",...,"(molecule, 2661.9303308093436)","(pk, 2615.8932752924024)","(proof_concept, 2405.7512603497607)","(represent, 2370.0296326317425)","(public, 2333.6522286915633)","(clinically, 2324.401885374857)","(platform, 2263.5837671821655)","(potent, 2254.9600528593273)","(unmet, 2243.748645082756)","(pharmacological, 2199.5681924902515)"


In [11]:
df300.loc[152][0:50]

0         (computing, 18927.409227124044)
1          (computer, 17501.801783403396)
2       (performance, 13247.105234782885)
3          (software, 12326.823870465189)
4            (enable, 11991.590493335996)
5           (platform, 9691.161696298339)
6      (computational, 8675.980873611225)
7     (infrastructure, 8533.830298083809)
8       (architecture, 7947.116473424327)
9              (scale, 7912.625835948814)
10       (technology, 7231.8468296130595)
11        (simulation, 6784.426785300783)
12              (user, 6278.770719807697)
13            (science, 6244.02935074988)
14     (visualization, 6171.368913908819)
15         (challenge, 5839.640475771118)
16               (big, 5830.777057458608)
17              (need, 5692.184665722128)
18              (open, 5671.467135397384)
19         (algorithm, 5624.321379536211)
20          (hardware, 5596.394139310277)
21           (virtual, 5540.012770839843)
22       (computation, 5461.889261834698)
23        (distribute, 5250.681132

In [12]:
df300.loc[171][0:50]

0       (computational, 32377.793827063593)
1            (modeling, 21161.874319051214)
2           (framework, 16333.547533952897)
3               (human, 12892.648835820853)
4            (integrate, 12042.30879965073)
5          (interaction, 9388.426880805617)
6                  (set, 8994.491809312716)
7              (dataset, 8814.245612206867)
8                (scale, 8677.675367543574)
9                (apply, 8519.693060737525)
10          (prediction, 8478.482820022733)
11             (predict, 7923.531291842897)
12             (pattern, 7877.448192714734)
13          (biological, 7829.386631404324)
14           (challenge, 7316.546266281028)
15       (experimental, 6985.6412325628735)
16             (network, 6683.333517342877)
17            (algorithm, 6559.06051947889)
18             (advance, 6526.222878687529)
19          (predictive, 5934.597282411869)
20             (feature, 5787.775420321881)
21             (dynamic, 5701.664242260573)
22           (inference, 5367.37

^^ this one has machine learning

relative entropy = more likely to appear in the relevant topics than those that are nonrelevant

In [13]:
df300.loc[216][0:50]

0         (device, 33003.546793449364)
1         (sensor, 23101.196924462758)
2       (technology, 19148.2630727912)
3          (mobile, 10475.62785943063)
4             (real, 9268.92059351035)
5      (monitoring, 8590.403819148374)
6           (sense, 7888.050352262176)
7             (user, 7885.99595469803)
8      (prototype, 6782.6417849763275)
9           (robot, 6366.215960827606)
10    (environment, 6228.133174998938)
11         (enable, 5537.687930997408)
12          (human, 5488.635124870321)
13        (monitor, 5237.700868639142)
14           (need, 4686.433691165775)
15           (home, 4402.902206619098)
16           (cost, 4317.491111605655)
17          (smart, 4271.397776014852)
18      (interface, 3950.154671288099)
19       (robotic, 3826.3187950327383)
20      (platform, 3578.4084414641147)
21        (people, 3531.5576695969976)
22           (low, 3382.5601529269006)
23          (hand, 3312.0616559096134)
24            (ii, 3227.9499390332803)
25    (performance, 2923.

In [14]:
df300.loc[264][0:50]

0             (software, 27843.135403217428)
1             (algorithm, 20821.52797667128)
2                 (code, 10950.011688577157)
3                (search, 9707.233942297735)
4              (database, 6433.333877780078)
5                 (source, 6283.12795384033)
6                   (set, 6078.036434112337)
7            (automated, 5358.3924638808185)
8           (programming, 3944.533441054564)
9                 (exist, 3681.337582531593)
10           (available, 3343.7072969508536)
11       (automatically, 2725.8472664542983)
12         (verification, 2617.640303494259)
13                 (open, 2592.582722943779)
14           (implement, 2585.8483110706584)
15              (analyze, 2549.945020220972)
16              (feature, 2465.717562827795)
17               (apply, 2338.8603854157186)
18             (computer, 2153.849821395666)
19               (create, 2052.843212977921)
20               (error, 2051.9211553352156)
21                (user, 2020.6643593946203)
22        

In [45]:
relevant_topics(dic300, core_terms)

[]

In [13]:
#df300 = pd.DataFrame.from_dict(dic300, orient = 'index')

With 500 topics, because 300 had some that might be related.

In [10]:
num_topics = 500
lda_model_500 = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                      topic_word_prior=0.1, n_jobs=39, random_state = 0)

doc_top_dist_500 = lda_model_500.fit_transform(lda_dtm)
top_term_dist_500 = lda_model_500.components_


started at 10:23 am finished around 11:16.

In [11]:
dic500 = topic_dictionary(lda_model_500, lda_vectorizer, 64)

In [12]:
df500 = pd.DataFrame.from_dict(dic500, orient = 'index')

In [13]:
df500.to_pickle("./dic500.pkl")

In [27]:
#print_topics(lda_model_500, lda_vectorizer, 10)

I think that topic 340 in the 500-topic model definitely relates the most to AI.  The top 10 terms are:

[('algorithm', 35577.2256234737),
 ('computational', 13757.606248093996),
 ('optimization', 9400.910957512326),
 ('computer', 6438.71942210624),
 ('framework', 6303.723634458724),
 ('efficient', 6024.254793135752),
 ('task', 5838.480489531702),
 ('complexity', 5468.976260050804),
 ('machine_learning', 5261.699439484569),
 ('learn', 5025.700196861855)]

In [35]:
dic500[340][0:10]

[('algorithm', 35577.2256234737),
 ('computational', 13757.606248093996),
 ('optimization', 9400.910957512326),
 ('computer', 6438.71942210624),
 ('framework', 6303.723634458724),
 ('efficient', 6024.254793135752),
 ('task', 5838.480489531702),
 ('complexity', 5468.976260050804),
 ('machine_learning', 5261.699439484569),
 ('learn', 5025.700196861855)]

In [40]:
core_terms

['adaboost',
 'artificial intelligence',
 'artificial neural network',
 'back propagation',
 'back propagation neural network',
 'computational intelligence',
 'computer vision',
 'convolutional neural network',
 'deep belief network',
 'deep convolutional neural network',
 'deep learn',
 'deep neural network',
 'elman network',
 'elman neural network',
 'expert system',
 'fee forward neural network',
 'inference engine',
 'machine intelligence',
 'machine learn',
 'machine translation',
 'machine vision',
 'multilayer neural network',
 'natural language process',
 'perceptron',
 'random forest',
 'rbf neural network',
 'recurrent neural network',
 'self organize map',
 'spike neural network',
 'supervise learn',
 'support vector machine',
 'svm classifier',
 'unsupervised learn',
 'artificial_intelligence',
 'artificial_neural_network',
 'back_propagation',
 'back_propagation_neural_network',
 'computational_intelligence',
 'computer_vision',
 'convolutional_neural_network',
 'deep_be

In [45]:
dic500[340][0][0]

'algorithm'

In [41]:
relevant_topics(dic500, core_terms, 1)

[340]

In [51]:
superkeytest = super_keywords(core_terms, [340], dic500)

In [52]:
superkeytest

['adaboost',
 'artificial intelligence',
 'artificial neural network',
 'back propagation',
 'back propagation neural network',
 'computational intelligence',
 'computer vision',
 'convolutional neural network',
 'deep belief network',
 'deep convolutional neural network',
 'deep learn',
 'deep neural network',
 'elman network',
 'elman neural network',
 'expert system',
 'fee forward neural network',
 'inference engine',
 'machine intelligence',
 'machine learn',
 'machine translation',
 'machine vision',
 'multilayer neural network',
 'natural language process',
 'perceptron',
 'random forest',
 'rbf neural network',
 'recurrent neural network',
 'self organize map',
 'spike neural network',
 'supervise learn',
 'support vector machine',
 'svm classifier',
 'unsupervised learn',
 'artificial_intelligence',
 'artificial_neural_network',
 'back_propagation',
 'back_propagation_neural_network',
 'computational_intelligence',
 'computer_vision',
 'convolutional_neural_network',
 'deep_be

In [62]:
relevant_topics(dic500, superkeytest, 13)

[151, 154, 171, 301, 325, 340, 358]

In [65]:
df500.loc[301]


0        (engineering, 65667.62553237865)
1            (student, 12507.30958364166)
2          (engineer, 11219.630760262753)
3         (technology, 8717.763607962494)
4          (education, 8246.056986812668)
                     ...                 
59        (curriculum, 1700.038789846317)
60    (transformative, 1671.735099095695)
61      (environment, 1631.4516996349776)
62       (capability, 1624.4321728152904)
63           (career, 1621.7869660406914)
Name: 301, Length: 64, dtype: object

In [None]:
df500.loc[151]

In [17]:
#df300.to_pickle("./dic300.pkl")

In [26]:
test = print_topics(lda_model, lda_vectorizer, 5)

In [14]:
#print(test)

In [32]:
print(test[0])

[('student', 298031.21981209115), ('science', 149888.35299074094), ('graduate', 84311.5759278494), ('education', 79857.29822755138), ('school', 77530.94222855203)]


In [33]:
test_rel = relevant_topics(test, core_terms)

In [34]:
test_rel

[]

In [None]:
#relevant_words = 0
#relevant_topic = []
#for each key in dictionary
    #for each word in lists
        #is the word in core_terms?
            #if yes, relevant += 1
    #if (relevant)/top_n >= .02, then relevant_topic.append(key)
#return relevant_topic


In [29]:
def relevant_topic(topic_dictionary, keyword_list, threshold = 0.05):
    """returns a list of the topics which contain a threshold % of the
    relevant words in the keyword list"""
    relevant_topic = []
    for key in topic_dictionary:
        relevant_words = 0
        for i in range(len(topic_dictionary[key])):
            if topic_dictionary[key][i] in keyword_list:
                relevant_words += 1
            else: relevant_words += 0
        if (relevant_words)/len(topic_dictionary[key]) >= threshold :
            relevant_topic.append(key)
    return relevant_topic


In [15]:
#print_topics(lda_model, lda_vectorizer, 50)


In [21]:
# create model

num_topics = 200
#10 to start
lda_model = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                      topic_word_prior=0.1, n_jobs=29, random_state = 0)

doc_top_dist = lda_model.fit_transform(lda_dtm)
top_term_dist = lda_model.components_
#these output gives us the document topic distribution and topic term distributions
#since its LDA we will get probabilities 

#the n_jobs parameter in the LDA function is set to the number of cores you request minus 1.  
#LDA can be run in parallel so it is much faster than NMF 

doc_top_dist and top_term_dist are matrices


In [11]:
topic = 50

In [12]:
# create model

#from sklearn.decomposition import LatentDirichletAllocation

num_topics = 50
#10 to start
lda_model = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                      topic_word_prior=0.1, n_jobs=29, random_state = 0)

doc_top_dist = lda_model.fit_transform(lda_dtm)
top_term_dist = lda_model.components_
#these output gives us the document topic distribution and topic term distributions
#since its LDA we will get probabilities 

#the n_jobs parameter in the LDA function is set to the number of cores you request minus 1.  
#LDA can be run in parallel so it is much faster than NMF 

In [13]:
#function to print out top terms in a topic
 

# function slightly modified from https://nlpforhackers.io/topic-modeling/


#model, lda_vectorizer
def print_topics(lda_model, lda_vectorizer, top_n=10):
    for idx, topic in enumerate(lda_model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
            
        print_list = [(lda_vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)
            
# print_topics(lda_model, lda_vectorizer, 10)

In [14]:
print_topics(lda_model, lda_vectorizer, 50)


Topic 0:
('student', 210620.33172059836)
('science', 73863.07691266914)
('undergraduate', 60883.37299586223)
('graduate', 55919.44242344513)
('engineering', 49251.770315713045)
('education', 39353.54416758851)
('university', 38365.244162673735)
('stem', 36170.273769110936)
('faculty', 32384.786475231438)
('school', 29572.017201481918)
('career', 29391.067233216585)
('college', 29054.105523514027)
('opportunity', 24432.18116275873)
('institution', 23624.61907949045)
('course', 23058.74324353523)
('summer', 18134.308190794243)
('educational', 16692.562993777374)
('outreach', 16400.241909432196)
('scientific', 15000.860943936004)
('technology', 14988.27467695259)
('underrepresented', 14952.499316893005)
('participate', 14548.499719568388)
('professional', 13788.633288272587)
('minority', 13528.572764029717)
('academic', 12902.783938078306)
('national', 12870.190340621859)
('nsf', 11737.287680205916)
('biomedical', 11582.191207149226)
('participation', 11466.731182750022)
('award', 11164.

moving up to 75 topics:

In [16]:
#function to print out top terms in a topic
 

# function slightly modified from https://nlpforhackers.io/topic-modeling/


#model, lda_vectorizer
def print_topics(lda_model, lda_vectorizer, top_n=75):
    for idx, topic in enumerate(lda_model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
            
        print_list = [(lda_vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)
            
# print_topics(lda_model, lda_vectorizer, 10)

In [17]:
print_topics(lda_model, lda_vectorizer, 75)


Topic 0:
('student', 210620.33172059836)
('science', 73863.07691266914)
('undergraduate', 60883.37299586223)
('graduate', 55919.44242344513)
('engineering', 49251.770315713045)
('education', 39353.54416758851)
('university', 38365.244162673735)
('stem', 36170.273769110936)
('faculty', 32384.786475231438)
('school', 29572.017201481918)
('career', 29391.067233216585)
('college', 29054.105523514027)
('opportunity', 24432.18116275873)
('institution', 23624.61907949045)
('course', 23058.74324353523)
('summer', 18134.308190794243)
('educational', 16692.562993777374)
('outreach', 16400.241909432196)
('scientific', 15000.860943936004)
('technology', 14988.27467695259)
('underrepresented', 14952.499316893005)
('participate', 14548.499719568388)
('professional', 13788.633288272587)
('minority', 13528.572764029717)
('academic', 12902.783938078306)
('national', 12870.190340621859)
('nsf', 11737.287680205916)
('biomedical', 11582.191207149226)
('participation', 11466.731182750022)
('award', 11164.

I think that we should use k=100 like they did in the sociology corpus, which was 14,000 sociology articles. We have like 690,000 rows.  ?? Any thoughts?

Using code from wheat_filtration github:

In [7]:
#filter.py
def keyword_proportion(document, keyword_list):
    """Return percentage of words in the given doc that are present in keyword_list."""
    doc_tokens = document.split()
    num_keywords = sum(
        [1 if word in keyword_list else 0 for word in doc_tokens])
    return float(num_keywords)/len(doc_tokens)

In [8]:
key_prop = keyword_proportion(df["ABSTRACT"], core_terms)

AttributeError: 'Series' object has no attribute 'split'