# Using the LDA topic model on the NSF - cfda = 47.070 on the final tokens column instead of frq words removed


7/28/21

In [1]:
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import numpy as np
import filter

In [2]:
def topic_dictionary(lda_model, lda_vectorizer, top_n = 10):
    topic_ls = {} #append keys, append the values
    
    for idx, topic in enumerate(lda_model.components_):  # loop through each row of H.  idx = row index.  topic = actual row

        print_list = [(lda_vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        topic_ls[idx] = print_list

    return topic_ls

def print_topics(model, vectorizer, top_n=10):
    for idx, topic in enumerate(model.components_):  # loop through each row of H.  idx = row index.  topic = actual row
        print("\nTopic %d:" % (idx))
            
        print_list = [(vectorizer.get_feature_names()[i], topic[i])  
                        for i in topic.argsort()[:-top_n - 1:-1]]
        for item in print_list:
            print(item)
            
            
def relevant_topics(topic_dictionary, keyword_list, threshold = 0.15):
    """returns a list of the topics which contain a threshold % of the
    relevant words in the keyword list"""
    relevant_topic = []
    for key in topic_dictionary:
        relevant_words = 0
        for i in range(len(topic_dictionary[key])):
            if topic_dictionary[key][i][0] in keyword_list:
                relevant_words += 1
        if (relevant_words) / len(topic_dictionary[key]) >= threshold :
            relevant_topic.append(key)
    return relevant_topic  

def rel_ent_key_list(topic_model, vectorizer, n_top_keywords, relevant_topics):
    """Returns a list of the top n keywords based on relative entropy score
     Arguments:
       topic_model (TopicModel): a topic by vocabulary word matrix where each entry
       is the total word count for that word in that topic
       n_top_words (int): the number of keywords the method will return
       relevant_topics (iterable of int)
     Returns:
       keyword_list (iterable of str): list of the top n keywords, sorted
     """
    topic_word_matrix = topic_model.components_
    lda_vectorizer = vectorizer
    
    vocab_logs = np.log(topic_word_matrix.sum(
        axis=0) / topic_word_matrix.sum())

    topic_logs = np.log(topic_word_matrix[relevant_topics, :].sum(
        axis=0) / topic_word_matrix[relevant_topics, :].sum())

    unsorted_props = np.asarray(topic_word_matrix.sum(axis=0) /
                                topic_word_matrix.sum()) * np.asarray(topic_logs - vocab_logs)

    unsorted_props = np.matrix.flatten(unsorted_props)

    sorted_props_and_voc = sorted([(unsorted_props[i], lda_vectorizer.get_feature_names()[i]) for i in list(
        np.argpartition(unsorted_props, len(lda_vectorizer.get_feature_names()) - n_top_keywords))[-n_top_keywords:]], reverse=True)
    ordered_vocab = []
    for (_, voc) in sorted_props_and_voc:
        ordered_vocab.append(voc)
    return ordered_vocab

In [3]:
#start with the core terms from the OECD paper
core_terms = ["adaboost","artificial intelligence","artificial neural network","back propagation"
,"back propagation neural network","computational intelligence","computer vision"
,"convolutional neural network","deep belief network","deep convolutional neural network"
,"deep learn","deep neural network","elman network","elman neural network"
,"expert system","fee forward neural network","inference engine","machine intelligence"
,"machine learn","machine translation","machine vision","multilayer neural network"
,"natural language process","perceptron","random forest","rbf neural network","recurrent neural network"
,"self organize map","spike neural network","supervise learn","support vector machine"
,"svm classifier","unsupervised learn","artificial_intelligence","artificial_neural_network","back_propagation"
,"back_propagation_neural_network","computational_intelligence","computer_vision"
,"convolutional_neural_network","deep_belief_network","deep_convolutional_neural_network"
,"deep_learn","deep_neural_network","elman_network","elman_neural_network"
,"expert_system","fee_forward_neural_network","inference_engine","machine_intelligence"
,"machine_learn","machine_translation","machine_vision","multilayer_neural_network"
,"natural_language_process","random_forest","rbf_neural_network","recurrent_neural_network"
,"self_organize_map","spike_neural_network","supervise_learn","support_vector_machine"
,"svm_classifier","unsupervised_learn", "machine_learning"]

In [7]:
df = pd.read_pickle("../../data/dspg21RnD/smaller-final-dataset2.pkl")
nsf = df[df["AGENCY"] == "NSF"]
# filter where cfda = 47.070

nsf_csci = nsf[nsf["CFDA_CODE"] == "47.070"]

In [8]:
nsf_csci.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16416 entries, 1996 to 689833
Data columns (total 33 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   original index              16416 non-null  int64  
 1   PROJECT_ID                  16416 non-null  object 
 2   ABSTRACT                    16416 non-null  object 
 3   FY                          16416 non-null  object 
 4   ORG_COUNT                   16416 non-null  int64  
 5   PI_COUNT                    16416 non-null  int64  
 6   nchar                       16416 non-null  int64  
 7   stopwds_removed             16416 non-null  object 
 8   final_tokens                16416 non-null  object 
 9   final_frqwds_removed        16416 non-null  object 
 10  PROJECT_TERMS               16416 non-null  object 
 11  PROJECT_TITLE               16416 non-null  object 
 12  DEPARTMENT                  16416 non-null  object 
 13  AGENCY                     

In [9]:
tokens = nsf_csci["final_tokens"]

text = [] # text will contain the processed tokens in string form (1 string per abstract)


for abstract in tokens:
    text.append(" ".join(abstract))
    
text = pd.Series(text)

In [12]:
# TRY TOPIC MODELING WITH LDA

lda_vectorizer = CountVectorizer(max_df=0.6, min_df=20)


lda_dtm2 = lda_vectorizer2.fit_transform(text)


In [14]:
num_topics = 100
finaltokens_100 = LatentDirichletAllocation(n_components=num_topics, doc_topic_prior = 1/num_topics, 
                                      topic_word_prior=0.1, n_jobs=39, random_state = 0)

doc_top_dist_fin = finaltokens_100.fit_transform(lda_dtm2)
top_term_dist_fin = finaltokens_100.components_

In [15]:
finaltokens_dic100 = topic_dictionary(finaltokens_100, lda_vectorizer2, 50)

In [19]:
relevant_topics(finaltokens_dic100, core_terms, 0.02)

[50, 57, 72]

In [17]:
relevant_topics_HT = [] #what I pick out plus the relevant ones that they pick out 

In [20]:
finaltokens_dic100[50][0:10]

[('data', 3799.49587610123),
 ('model', 2712.0832124784124),
 ('machine_learning', 1551.5406775943125),
 ('algorithm', 1496.7178725082042),
 ('learn', 1474.9685760889865),
 ('learning', 1471.1542760051777),
 ('method', 1411.0247584668),
 ('statistical', 1068.5772487038707),
 ('new', 1019.4587428056642),
 ('analysis', 762.4975788704131)]

In [24]:
relevant_topics_HT.append(50)

In [25]:
finaltokens_dic100[57][0:20]

[('problem', 973.8440078186798),
 ('decision', 639.4997822653079),
 ('planning', 635.8923302381274),
 ('plan', 455.1674258944493),
 ('solving', 409.21721669752134),
 ('task', 248.1730783158241),
 ('solve', 238.00025606574465),
 ('constraint', 207.18339407677223),
 ('domain', 190.70420300735924),
 ('automated', 165.3032805126656),
 ('technique', 160.2471885077985),
 ('new', 143.88514994560734),
 ('diagram', 134.98904203711305),
 ('algorithm', 132.22349469447929),
 ('world', 130.97770250604756),
 ('result', 130.22663565463856),
 ('programming', 118.92809566811958),
 ('improve', 113.64581399065618),
 ('approach', 111.13060517035956),
 ('complex', 107.53403556489796)]

In [26]:
finaltokens_dic100[72][0:20]

[('computational', 808.2946829044535),
 ('ai', 306.30626854615315),
 ('model', 286.3469825769834),
 ('thinking', 218.67066487936896),
 ('narrative', 218.41494570462936),
 ('problem', 213.04476967733518),
 ('natural', 168.351319297549),
 ('language', 163.09342570696242),
 ('think', 96.668350482223),
 ('way', 91.1185842370393),
 ('domain', 89.13639275647448),
 ('work', 87.552578682618),
 ('processing', 87.02630773140982),
 ('author', 85.40450046738906),
 ('structure', 83.02452435282217),
 ('linguistic', 82.29755988711075),
 ('sustainability', 81.55825349693268),
 ('approach', 78.69717897750392),
 ('system', 77.81189713923115),
 ('organize', 77.16405515249406)]

In [27]:
relevant_topics_HT.append(72)

In [29]:
print_topics(finaltokens_100, lda_vectorizer2, 15)


Topic 0:
('team', 239.56614982467713)
('design', 217.64005082298974)
('competition', 194.94806473430285)
('student', 181.00613991991872)
('technology', 168.25901500292815)
('robotics', 140.8464478419777)
('disability', 134.89169613989174)
('japan', 124.11877584269413)
('engineering', 120.24706778742284)
('mesh', 112.4324761201544)
('conference', 102.01185139878534)
('individual', 101.63421430519563)
('year', 99.61068252251088)
('sdc', 86.09989185159486)
('assistive', 78.96950654678481)

Topic 1:
('algorithm', 927.901800109486)
('structure', 668.4234300665344)
('sparse', 507.5492081059952)
('application', 450.19782706976457)
('dimensional', 442.04868806142133)
('large', 422.3673884196459)
('problem', 413.81893548687145)
('high', 379.8127496714741)
('method', 374.86900789077737)
('matrix', 328.2596260574378)
('linear', 315.70697333884675)
('analysis', 309.71576754185594)
('representation', 306.0751524056334)
('data', 299.02983351443135)
('code', 267.6165364197405)

Topic 2:
('computing'

maybe: 1,14,45,54,75

In [34]:
finaltokens_dic100[54]

[('language', 2194.6862403033265),
 ('speech', 1090.7339812592747),
 ('text', 870.4834213188653),
 ('natural', 628.3403632686441),
 ('model', 455.3666454375083),
 ('word', 432.2392279854847),
 ('processing', 425.8530788229367),
 ('linguistic', 407.4476615984387),
 ('translation', 320.48028865690054),
 ('annotation', 296.2572609495197),
 ('document', 250.00431840745188),
 ('representation', 238.1075171672555),
 ('information', 231.7379940543126),
 ('recognition', 230.7751303298012),
 ('automatic', 225.64262594289016),
 ('machine', 223.20500049661032),
 ('improve', 222.43395079127018),
 ('human', 220.46370044757265),
 ('system', 220.29179915980626),
 ('speaker', 219.57135388199185),
 ('technology', 201.13615887541965),
 ('nlp', 199.71013117442544),
 ('work', 189.58630017408413),
 ('corpus', 180.3805418254766),
 ('task', 177.31307427671865),
 ('english', 172.1635208957508),
 ('resource', 170.6458623771006),
 ('base', 159.15426128883826),
 ('new', 158.05760161747196),
 ('technique', 154.20

In [18]:
rel_ent_fin = rel_ent_key_list(finaltokens_100, lda_vectorizer2, 1000, relevant_topics_HT)

  topic_logs = np.log(topic_word_matrix[relevant_topics, :].sum(
