In [8]:
import pandas as pd

data = pd.read_csv('NSF_awardtopics.txt', error_bad_lines=False)
data.columns = ['NSF Award Title Non-phrases']
data['index'] = data.index
documents = data
print(data)

                            NSF Award Title Non-phrases  index
0     application applications basic premise data ba...      0
1     algorithm implementation algorithms analysis a...      1
2     assumption basic problems certain special case...      2
3     active updating amount analysis analytical res...      3
4     act act theory computer computer simulator com...      4
5     artificial intelligence basic query languages ...      5
6     abstract prosidic categories abstract prosodic...      6
7      dynamical  processes  energy -processing  inf...      7
8     ability analytically derived concepts approach...      8
9     acquisition ambiguous examples classification ...      9
10    advance applications artificial intelligence r...     10
11    addition arm assembly capabilities complex rob...     11
12    active experimentation additional knowledge ap...     12
13    addition areas cad/cam commercial reality comp...     13
14    ability addition analogical processing boilers...

b'Skipping line 946: expected 1 fields, saw 2\nSkipping line 1957: expected 1 fields, saw 2\nSkipping line 2826: expected 1 fields, saw 2\nSkipping line 4376: expected 1 fields, saw 2\nSkipping line 5810: expected 1 fields, saw 3\nSkipping line 5811: expected 1 fields, saw 3\n'


In [9]:
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
np.random.seed(2018)

In [10]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ywu6\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [11]:
stemmer = SnowballStemmer('english')
def lemmatize_stemming(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) >= 3:
            result.append(lemmatize_stemming(token))
    return result

In [12]:
doc_sample = documents[documents['index'] == 250].values[0][0]
print('original document: ')
words = []
for word in doc_sample.split(' '):
    words.append(word)
print(words)
print('\n\n tokenized and lemmatized document: ')
print(preprocess(doc_sample))

original document: 
['collision-free', 'motion', 'completeness', 'complex', 'ideas', 'computer', 'science', 'cornell', 'degrees', 'dr.', 'donald', 'error', 'detection', 'excellent', 'speaker', 'exceptional', 'work', 'freedom', 'influential', 'contribution', 'inspiring', 'graduate', 'advisor', 'kinodynamic', 'approximation', 'algorithms', 'master', 'mathematics', 'mit', 'new', 'graduate', 'course', 'optimal', 'robotic', 'motion', 'planning', 'planning', 'system', 'presidential', 'young', 'investigator', 'award', 'presidential', 'young', 'investigator', 'award', 'project', 'promise', 'recovery', 'research', 'rigorous', 'proof', 'robot', 'arms', 'robot', 'motion', 'planning', 'robotics', 'russian', 'literature', 'straightforward', 'manner', 'uncertain', 'domains', 'undergraduate', 'degree', 'work', 'yale', '']


 tokenized and lemmatized document: 
['collis', 'free', 'motion', 'complet', 'complex', 'idea', 'scienc', 'cornel', 'degre', 'donald', 'error', 'detect', 'excel', 'speaker', 'exce

In [13]:
processed_docs = documents['NSF Award Title Non-phrases'].map(preprocess)

In [14]:
processed_docs[:10]

0    [applic, applic, basic, premis, data, base, su...
1    [algorithm, implement, algorithm, analysi, app...
2    [assumpt, basic, problem, certain, special, ca...
3    [activ, updat, analysi, analyt, result, archiv...
4    [act, act, theori, simul, teach, system, effec...
5    [artifici, intellig, basic, queri, languag, sc...
6    [abstract, prosid, categori, abstract, prosod,...
7    [dynam, process, energi, process, inform, mech...
8    [abil, analyt, deriv, concept, approach, artif...
9    [acquisit, ambigu, exampl, classif, classif, d...
Name: NSF Award Title Non-phrases, dtype: object

In [15]:
dictionary = gensim.corpora.Dictionary(processed_docs)

In [16]:
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

In [17]:
bow_doc_200 = bow_corpus[200]

for i in range(len(bow_doc_200)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_200[i][0], 
                                                     dictionary[bow_doc_200[i][0]], 
                                                     bow_doc_200[i][1]))

Word 19 ("studi") appears 1 time.
Word 22 ("system") appears 1 time.
Word 24 ("analysi") appears 1 time.
Word 72 ("imag") appears 3 time.
Word 77 ("local") appears 1 time.
Word 87 ("recognit") appears 1 time.
Word 97 ("stereo") appears 1 time.
Word 98 ("structur") appears 1 time.
Word 103 ("vision") appears 1 time.
Word 158 ("theori") appears 1 time.
Word 178 ("match") appears 1 time.
Word 234 ("represent") appears 1 time.
Word 254 ("biolog") appears 1 time.
Word 271 ("energi") appears 1 time.
Word 343 ("multipl") appears 1 time.
Word 403 ("scale") appears 1 time.
Word 410 ("set") appears 1 time.
Word 650 ("tempor") appears 1 time.
Word 735 ("grant") appears 1 time.
Word 819 ("compress") appears 1 time.
Word 857 ("textur") appears 1 time.
Word 1066 ("cross") appears 1 time.
Word 1215 ("filter") appears 1 time.
Word 1266 ("discontinu") appears 1 time.
Word 1390 ("transform") appears 1 time.
Word 1898 ("discrimin") appears 1 time.
Word 1899 ("wavelet") appears 3 time.
Word 1900 ("zero") 

In [18]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

In [19]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break

[(0, 0.1339125775909705),
 (1, 0.1386083656890697),
 (2, 0.15253811750038598),
 (3, 0.14035240208466113),
 (4, 0.04010163575335318),
 (5, 0.20298870938999058),
 (6, 0.24931968873366708),
 (7, 0.1795533691438295),
 (8, 0.14737521421206454),
 (9, 0.5120353116075402),
 (10, 0.20537297139559396),
 (11, 0.2936340106937988),
 (12, 0.08054858824528914),
 (13, 0.1364277375974575),
 (14, 0.3209511913532448),
 (15, 0.09377233559606106),
 (16, 0.28886412678393164),
 (17, 0.19580488634904608),
 (18, 0.016019650289156705),
 (19, 0.07540109644963205),
 (20, 0.2779899971764848),
 (21, 0.0989745687829497),
 (22, 0.13665896037909775)]


In [20]:
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=20, id2word=dictionary, passes=2, workers=2)

In [21]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.021*"research" + 0.013*"behavior" + 0.010*"project" + 0.009*"design" + 0.009*"technolog" + 0.008*"comput" + 0.007*"new" + 0.007*"data" + 0.007*"student" + 0.006*"learn"
Topic: 1 
Words: 0.022*"research" + 0.016*"student" + 0.013*"project" + 0.009*"technolog" + 0.009*"new" + 0.009*"inform" + 0.009*"human" + 0.008*"imag" + 0.008*"field" + 0.007*"collabor"
Topic: 2 
Words: 0.045*"research" + 0.029*"student" + 0.023*"confer" + 0.013*"doctor" + 0.012*"particip" + 0.012*"workshop" + 0.009*"intern" + 0.008*"consortium" + 0.008*"field" + 0.008*"communiti"
Topic: 3 
Words: 0.020*"research" + 0.013*"data" + 0.012*"inform" + 0.011*"project" + 0.009*"collabor" + 0.007*"work" + 0.007*"knowledg" + 0.007*"web" + 0.007*"develop" + 0.006*"student"
Topic: 4 
Words: 0.015*"research" + 0.010*"model" + 0.008*"student" + 0.008*"project" + 0.008*"level" + 0.008*"inform" + 0.007*"energi" + 0.007*"agent" + 0.007*"video" + 0.006*"time"
Topic: 5 
Words: 0.019*"data" + 0.017*"algorithm" + 0.014

In [22]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=20, id2word=dictionary, passes=2, workers=4)

for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.003*"data" + 0.003*"imag" + 0.003*"visual" + 0.003*"languag" + 0.003*"model" + 0.003*"learn" + 0.002*"speech" + 0.002*"network" + 0.002*"algorithm" + 0.002*"interact"
Topic: 1 
Words: 0.003*"confer" + 0.003*"data" + 0.003*"student" + 0.003*"network" + 0.003*"doctor" + 0.002*"languag" + 0.002*"robot" + 0.002*"learn" + 0.002*"system" + 0.002*"web"
Topic: 2 
Words: 0.003*"data" + 0.003*"imag" + 0.002*"databas" + 0.002*"robot" + 0.002*"model" + 0.002*"learn" + 0.002*"visual" + 0.002*"comput" + 0.002*"queri" + 0.002*"agent"
Topic: 3 
Words: 0.004*"data" + 0.003*"robot" + 0.003*"visual" + 0.003*"human" + 0.003*"databas" + 0.003*"queri" + 0.002*"learn" + 0.002*"user" + 0.002*"vision" + 0.002*"inform"
Topic: 4 
Words: 0.002*"data" + 0.002*"robot" + 0.002*"model" + 0.002*"human" + 0.002*"object" + 0.002*"agent" + 0.002*"network" + 0.002*"technolog" + 0.002*"user" + 0.002*"system"
Topic: 5 
Words: 0.003*"robot" + 0.002*"workflow" + 0.002*"data" + 0.002*"human" + 0.002*"model" 

In [23]:
test_title = 'Computer Vision'
bow_vector = dictionary.doc2bow(preprocess(test_title))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.28191640973091125	 Topic: 0.032*"robot" + 0.014*"research" + 0.012*"human" + 0.010*"control" + 0.009*"task"
Score: 0.2680550813674927	 Topic: 0.017*"human" + 0.015*"research" + 0.014*"model" + 0.012*"system" + 0.010*"robot"
Score: 0.025001585483551025	 Topic: 0.022*"research" + 0.016*"student" + 0.013*"project" + 0.009*"technolog" + 0.009*"new"
Score: 0.025001585483551025	 Topic: 0.015*"research" + 0.010*"model" + 0.008*"student" + 0.008*"project" + 0.008*"level"
Score: 0.025001585483551025	 Topic: 0.019*"data" + 0.017*"algorithm" + 0.014*"research" + 0.014*"imag" + 0.010*"applic"
Score: 0.025001585483551025	 Topic: 0.020*"research" + 0.013*"model" + 0.011*"interact" + 0.009*"design" + 0.009*"project"
Score: 0.025001585483551025	 Topic: 0.015*"research" + 0.014*"inform" + 0.012*"robot" + 0.011*"algorithm" + 0.010*"learn"
Score: 0.025001585483551025	 Topic: 0.021*"research" + 0.012*"algorithm" + 0.011*"data" + 0.010*"new" + 0.009*"learn"
Score: 0.025001585483551025	 Topic: 0.02

In [24]:
test_title = 'Machine Learning'
bow_vector = dictionary.doc2bow(preprocess(test_title))

for index, score in sorted(lda_model_tfidf[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model_tfidf.print_topic(index, 5)))

Score: 0.6833124756813049	 Topic: 0.003*"data" + 0.002*"model" + 0.002*"learn" + 0.002*"inform" + 0.002*"databas"
Score: 0.016667766496539116	 Topic: 0.004*"robot" + 0.003*"imag" + 0.003*"data" + 0.003*"network" + 0.003*"learn"
Score: 0.016667764633893967	 Topic: 0.003*"data" + 0.003*"imag" + 0.003*"visual" + 0.003*"languag" + 0.003*"model"
Score: 0.016667764633893967	 Topic: 0.003*"confer" + 0.003*"data" + 0.003*"student" + 0.003*"network" + 0.003*"doctor"
Score: 0.016667764633893967	 Topic: 0.003*"data" + 0.003*"imag" + 0.002*"databas" + 0.002*"robot" + 0.002*"model"
Score: 0.016667764633893967	 Topic: 0.004*"data" + 0.003*"robot" + 0.003*"visual" + 0.003*"human" + 0.003*"databas"
Score: 0.016667764633893967	 Topic: 0.002*"data" + 0.002*"robot" + 0.002*"model" + 0.002*"human" + 0.002*"object"
Score: 0.016667764633893967	 Topic: 0.003*"robot" + 0.002*"workflow" + 0.002*"data" + 0.002*"human" + 0.002*"model"
Score: 0.016667764633893967	 Topic: 0.003*"robot" + 0.003*"data" + 0.002*"desi

In [25]:
test_title = 'Models'
bow_vector = dictionary.doc2bow(preprocess(test_title))

for index, score in sorted(lda_model[bow_vector], key=lambda tup: -1*tup[1]):
    print("Score: {}\t Topic: {}".format(score, lda_model.print_topic(index, 5)))

Score: 0.5249949097633362	 Topic: 0.017*"human" + 0.015*"research" + 0.014*"model" + 0.012*"system" + 0.010*"robot"
Score: 0.02500026859343052	 Topic: 0.020*"research" + 0.013*"model" + 0.011*"interact" + 0.009*"design" + 0.009*"project"
Score: 0.02500026673078537	 Topic: 0.021*"research" + 0.013*"behavior" + 0.010*"project" + 0.009*"design" + 0.009*"technolog"
Score: 0.02500026673078537	 Topic: 0.022*"research" + 0.016*"student" + 0.013*"project" + 0.009*"technolog" + 0.009*"new"
Score: 0.02500026673078537	 Topic: 0.045*"research" + 0.029*"student" + 0.023*"confer" + 0.013*"doctor" + 0.012*"particip"
Score: 0.02500026673078537	 Topic: 0.020*"research" + 0.013*"data" + 0.012*"inform" + 0.011*"project" + 0.009*"collabor"
Score: 0.02500026673078537	 Topic: 0.015*"research" + 0.010*"model" + 0.008*"student" + 0.008*"project" + 0.008*"level"
Score: 0.02500026673078537	 Topic: 0.019*"data" + 0.017*"algorithm" + 0.014*"research" + 0.014*"imag" + 0.010*"applic"
Score: 0.02500026673078537	 Top