In [1]:
import os 
import numpy as np
import pandas as pd

In [2]:
import urllib.request

url = 'https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'
filename = 'nips12raw_str602'
urllib.request.urlretrieve(url, filename)

('nips12raw_str602', <http.client.HTTPMessage at 0x7fe75b8f9100>)

In [3]:
!tar -xzf nips12raw_str602

In [4]:
DATA_PATH = 'nipstxt/'
print(os.listdir(DATA_PATH))

['orig', 'nips06', 'nips08', 'nips00', 'nips04', 'idx', 'nips02', 'README_yann', 'nips10', 'nips11', 'nips01', 'nips03', 'nips09', 'nips12', 'nips07', 'MATLAB_NOTES', 'nips05', 'RAW_DATA_NOTES']


# Load and View Dataset

In [5]:
folders = ['nips{0:02}'.format(i) for i in range(0, 13)]
# Read all texts into a list
papers = []
for folder in folders:
    file_names = os.listdir(DATA_PATH + folder)
    for file_name in file_names:
        with open(DATA_PATH + folder + '/' + file_name, encoding='utf-8', errors='ignore', mode='r+') as f:#seperate 'em with /
            data = f.read()
        papers.append(data)
len(papers)        

print(papers[0][:1000])

804 
INTRODUCTION TO A SYSTEM FOR IMPLEMENTING NEURAL NET 
CONNECTIONS ON SIMD ARCHITECTURES 
Sherryl Tomboulian 
Institute for Computer Applications in Science and Engineering 
NASA Langley Research Center, Hampton VA 23665 
ABSTRACT 
Neural networks have attracted much interest recently, and using parallel 
architectures to simulate neural networks is a natural and necessary applica- 
tion. The SIMD model of parallel computation is chosen, because systems of 
this type can be built with large numbers of processing elements. However, 
such systems are not naturally suited to generalized communication. A method 
is proposed that allows an implementation of neural network connections on 
massively parallel SIMD architectures. The key to this system is an algorithm 
that allows the formation of arbitrary connections between the 'neurons '. A 
feature is the ability to add new connections quickly. It also has error recov- 
ery ability and is robust over a variety of network topologies. Si

# Basic Text Wrangling

In [6]:
import nltk

stop_words = nltk.corpus.stopwords.words('english')
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
wnl = nltk.stem.wordnet.WordNetLemmatizer()

def normalize_corpus(papers):
    norm_papers = []
    for paper in papers:
        paper = paper.lower()
        paper_tokens = [token.strip() for token in wtk.tokenize(paper)]# word tokenization
        paper_tokens = [wnl.lemmatize(token) for token in paper_tokens if not token.isnumeric()]
        paper_tokens = [token for token in paper_tokens if len(token) > 1]
        paper_tokens = [token for token in paper_tokens if token not in stop_words]
        paper_tokens = list(filter(None, paper_tokens))
        if paper_tokens:
            norm_papers.append(paper_tokens)
            
    return norm_papers

norm_papers = normalize_corpus(papers)
print(len(norm_papers))

1740


In [7]:
# Viewing a processed paper
print(norm_papers[0][:50])

['introduction', 'system', 'implementing', 'neural', 'net', 'connection', 'simd', 'architecture', 'sherryl', 'tomboulian', 'institute', 'computer', 'application', 'science', 'engineering', 'nasa', 'langley', 'research', 'center', 'hampton', 'va', 'abstract', 'neural', 'network', 'attracted', 'much', 'interest', 'recently', 'using', 'parallel', 'architecture', 'simulate', 'neural', 'network', 'natural', 'necessary', 'applica', 'tion', 'simd', 'model', 'parallel', 'computation', 'chosen', 'system', 'type', 'built', 'large', 'number', 'processing', 'element']


# Text Representation with Feature Engineering

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

cv=CountVectorizer(min_df=20, max_df=0.6,ngram_range=(1, 2),token_pattern=None,tokenizer=lambda doc:doc,preprocessor=lambda doc:doc)
cv_features = cv.fit_transform(norm_papers)
cv_features.shape

(1740, 14408)

In [9]:
# validating vocaublary size
vocabulary = np.array(cv.get_feature_names())
print('Total Vocabulary Size:', len(vocabulary))

Total Vocabulary Size: 14408


# Non-Negative Matrix Factorization

NMF:
    <br>
    1. which is another matrix decomposition technique similar to SVD but operates on non-negative matrices and works well for multlivairate data
    2. the objective of NMF is to find 2 non-negative matrix factors, W and H, such that when they are multiplied,they can approximaltey reconstruct V

we can build an NMF based topic model using the following snippet on out toy corpus,whihc gives us the feature names and their weights just like in LDA

In [11]:
from sklearn.decomposition import NMF

TOTAL_TOPICS = 20
nmf_model = NMF(n_components=TOTAL_TOPICS,solver='cd', max_iter=500,random_state=42, alpha=0.1,l1_ratio=0.85)
document_topics = nmf_model.fit_transform(cv_features)



Now that we have our model trained, we can look at the generated topics using the following code:

Generated topics from our NMF model

In [13]:
top_terms = 20
topic_terms = nmf_model.components_
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :top_terms]
topic_keyterms = vocabulary[topic_key_term_idxs]
topics = [', '.join(topic) for topic in topic_keyterms]
pd.set_option('display.max_colwidth', -1)
topics_df = pd.DataFrame(topics, columns = ['Terms per Topic'], index=['Topic'+str(t) for t in range(1, TOTAL_TOPICS+1)])
topics_df

  pd.set_option('display.max_colwidth', -1)


Unnamed: 0,Terms per Topic
Topic1,"bound, generalization, size, let, optimal, solution, equation, theorem, approximation, gradient, class, xi, loss, rate, matrix, convergence, theory, dimension, sample, minimum"
Topic2,"neuron, synaptic, connection, potential, dynamic, activity, synapsis, excitatory, layer, simulation, synapse, inhibitory, delay, biological, equation, state, et, et al, fig, activation"
Topic3,"state, action, policy, step, optimal, reinforcement, transition, reinforcement learning, probability, reward, dynamic, value function, markov, machine, task, agent, finite, iteration, sequence, decision"
Topic4,"image, face, pixel, recognition, local, distance, scale, digit, texture, filter, scene, vision, facial, pca, edge, transformation, representation, visual, surface, database"
Topic5,"hidden, layer, net, hidden unit, task, hidden layer, architecture, back, trained, propagation, connection, back propagation, activation, representation, generalization, output unit, neural net, training set, test, learn"
Topic6,"cell, firing, head, direction, response, rat, layer, cortex, activity, spatial, synaptic, inhibitory, synapsis, simulation, cue, property, complex, active, lot, cortical"
Topic7,"word, recognition, speech, context, hmm, speaker, speech recognition, character, phoneme, probability, frame, sequence, rate, level, test, acoustic, experiment, letter, segmentation, state"
Topic8,"signal, noise, source, filter, component, frequency, channel, speech, matrix, independent, separation, sound, ica, phase, eeg, blind, auditory, dynamic, delay, fig"
Topic9,"control, controller, trajectory, motor, dynamic, movement, task, forward, feedback, arm, inverse, position, robot, architecture, hand, force, adaptive, change, command, plant"
Topic10,"circuit, chip, current, analog, voltage, vlsi, gate, threshold, transistor, pulse, design, implementation, synapse, bit, digital, device, analog vlsi, pp, cmos, element"


Based on the topics depicted in the Figure:
    <br>
    1. there are no major repitions of topics and each topic talks about a clear and distinct theme
    2. the results from the NMF topic model are definitely better than what we obtained from LDA in Scikit-Learn
    3. we can determine the dominance of topics in each research paper but, in case of NMF these are determined by absolute scores and not percenteges

In [14]:
pd.options.display.float_format = '{:,.3f}'.format
dt_df = pd.DataFrame(document_topics, columns=['T'+str(i) for i in range(1, TOTAL_TOPICS+1)])
dt_df.head(10)

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10,T11,T12,T13,T14,T15,T16,T17,T18,T19,T20
0,0.095,1.598,0.0,0.01,0.372,0.0,0.0,0.0,0.156,0.469,0.0,0.188,0.623,0.207,0.064,0.0,0.212,0.0,0.0,1.14
1,0.32,1.468,0.554,0.0,0.0,0.0,0.11,0.281,0.0,0.208,0.0,0.0,0.598,0.0,0.109,0.153,0.089,0.0,0.059,0.86
2,0.852,0.148,0.694,0.038,0.0,0.017,0.0,0.234,0.244,0.332,0.0,0.123,0.0,0.145,0.0,0.127,0.374,0.032,0.026,0.437
3,0.115,0.139,0.0,0.0,0.607,0.013,0.013,0.018,0.033,0.0,0.0,0.075,0.808,0.013,0.059,0.274,0.009,0.028,0.0,0.0
4,0.341,0.003,0.0,0.001,0.365,0.0,0.0,0.061,0.053,0.0,0.0,0.597,0.0,0.037,0.039,0.0,0.068,0.0,0.297,0.0
5,0.265,0.0,0.147,0.684,0.0,0.0,0.0,0.127,0.052,1.039,0.0,0.083,0.399,0.032,0.0,0.089,0.793,0.034,0.0,0.0
6,0.0,0.563,0.674,0.0,0.279,0.0,0.0,0.046,0.0,0.024,0.0,0.0,0.0,0.0,0.095,0.0,0.0,0.0,0.0,2.707
7,0.266,0.0,0.0,0.039,0.551,0.0,0.043,0.466,0.0,0.0,0.018,0.063,0.0,0.466,0.07,0.13,0.0,0.007,0.202,0.0
8,0.0,0.267,0.0,0.01,0.04,0.073,0.365,0.914,0.311,0.402,0.137,0.051,0.06,0.065,0.013,0.0,0.03,0.029,0.188,0.21
9,0.403,0.004,0.041,1.261,0.132,0.021,0.0,0.364,0.071,0.062,0.0,0.043,0.0,0.03,0.0,0.054,0.345,0.131,0.005,0.008


leveraging the document_topic matrix, we can determine the most relevant paper for each topic based on the topic dominance scores by using the following code

In [15]:
pd.options.display.float_format = '{:,.5f}'.format
pd.set_option('display.max_colwidth', 200)

max_score_topics = dt_df.max(axis=0)
dominant_topics = max_score_topics.index
term_score = max_score_topics.values
document_numbers = [dt_df[dt_df[t] == max_score_topics.loc[t]].index[0] for t in dominant_topics]
documents = [papers[i] for i in document_numbers]

results_df = pd.DataFrame({'Dominant Topic': dominant_topics, 'Max Score': term_score,
                          'Paper Num': document_numbers, 'Topic': topics_df['Terms per Topic'], 
                          'Paper Name': documents})
results_df

Unnamed: 0,Dominant Topic,Max Score,Paper Num,Topic,Paper Name
Topic1,T1,1.66103,1128,"bound, generalization, size, let, optimal, solution, equation, theorem, approximation, gradient, class, xi, loss, rate, matrix, convergence, theory, dimension, sample, minimum","A Bound on the Error of Cross Validation Using \nthe Approximation and Estimation Rates, with \nConsequences for the Training-Test Split \nMichael Kearns \nAT&T Research \nABSTRACT\n1 INTRODUCTION..."
Topic2,T2,3.56069,1681,"neuron, synaptic, connection, potential, dynamic, activity, synapsis, excitatory, layer, simulation, synapse, inhibitory, delay, biological, equation, state, et, et al, fig, activation","Predictive Sequence Learning in Recurrent \nNeocortical Circuits* \nR. P. N. Rao \nComputational Neurobiology Lab and \nSloan Center for Theoretical Neurobiology \nThe Salk Institute, La Jolla, CA..."
Topic3,T3,5.85746,1286,"state, action, policy, step, optimal, reinforcement, transition, reinforcement learning, probability, reward, dynamic, value function, markov, machine, task, agent, finite, iteration, sequence, de...","Reinforcement Learning for Mixed \nOpen-loop and Closed-loop Control \nEric A. Hansen, Andrew G. Barto, and Shlomo Zilbersteln \nDepartment of Computer Science \nUniversity of Massachusetts \nAmhe..."
Topic4,T4,3.94621,1645,"image, face, pixel, recognition, local, distance, scale, digit, texture, filter, scene, vision, facial, pca, edge, transformation, representation, visual, surface, database",Image representations for facial expression \ncoding \nMarian Stewart Bartlett* \nU.C. San Diego \nmarnisalk. edu \nJavier R. Movellan \nU.C. San Diego \nmovellancogsc. ucsd. edu \nPaul Ekman \n...
Topic5,T5,3.01659,86,"hidden, layer, net, hidden unit, task, hidden layer, architecture, back, trained, propagation, connection, back propagation, activation, representation, generalization, output unit, neural net, tr...","5O5 \nCONNECTING TO THE PAST \nBruce A. MacDonald, Assistant Professor \nKnowledge Sciences Laboratory, Computer Science Department \nThe University of Calgary, 2500 University Drive NW \nCalgary,..."
Topic6,T6,7.56565,66,"cell, firing, head, direction, response, rat, layer, cortex, activity, spatial, synaptic, inhibitory, synapsis, simulation, cue, property, complex, active, lot, cortical","317 \nPARTITIONING OF SENSORY DATA BY A COPTICAI, NETWOPK  \nRichard Granger, Jos Ambros-Ingerson, Howard Henry, Gary Lynch \nCenter for the Neurobiology of Learning and Memory \nUniversity of..."
Topic7,T7,4.93454,1374,"word, recognition, speech, context, hmm, speaker, speech recognition, character, phoneme, probability, frame, sequence, rate, level, test, acoustic, experiment, letter, segmentation, state","Comparison of Human and Machine Word \nRecognition \nM. Schenkel \nDept of Electrical Eng. \nUniversity of Sydney \nSydney, NSW 2006, Australia \nschenkel@sedal.usyd.edu.au \nC. Latimer \nDept of ..."
Topic8,T8,3.60644,275,"signal, noise, source, filter, component, frequency, channel, speech, matrix, independent, separation, sound, ica, phase, eeg, blind, auditory, dynamic, delay, fig","232 Sejnowski, Yuhas, Goldstein and Jenkins \nCombining Visual and \nwith a Neural Network \nAcoustic Speech Signals \nImproves Intelligibility \nT.J. Sejnowski \nThe Salk Institute \nand \nDepart..."
Topic9,T9,4.83627,886,"control, controller, trajectory, motor, dynamic, movement, task, forward, feedback, arm, inverse, position, robot, architecture, hand, force, adaptive, change, command, plant","An Integrated Architecture of Adaptive Neural Network \nControl for Dynamic Systems \nLiu Ke '2 Robert L. Tokaf Brian D.McVey z \nCenter for Nonlinear Studies, 2Applied Theoretical Physics Divis..."
Topic10,T10,2.96307,1689,"circuit, chip, current, analog, voltage, vlsi, gate, threshold, transistor, pulse, design, implementation, synapse, bit, digital, device, analog vlsi, pp, cmos, element","Kirchoff Law Markov Fields for Analog \nCircuit Design \nRichard M. Golden * \nRMG Consulting Inc. \n2000 Fresno Road, Plano, Texas 75074 \nRMG CONS UL T@A OL. COM, \nwww. neural-network. corn \nA..."


The outputs depicted in Figure 6-18 clearly show that the NMF model is much better than the LDA model:
    <br>
    1. with each topic being strongly correlated as the central theme of the research paper where it has maximum dominance
    2. What we have observed is that non-negative matrix factorization works the best even with small corpora, with few documents compared to the other methods
    3. . But again, this depends on the type of data you are dealing with.

# Predicting Topics for New Research papers

In [16]:
import glob
# papers manually downloaded from NIPS 16
# https://papers.nips.cc/book/advances-in-neural-information-processing-systems-29-2016

new_paper_files = glob.glob('./test_data/nips16*.txt')
new_papers = []
for fn in new_paper_files:
    with open(fn, encoding='utf-8', errors='ignore', mode='r+') as f:
        data = f.read()
        new_papers.append(data)
              
print('Total New Papers:', len(new_papers))

Total New Papers: 4


In [17]:
norm_new_papers = normalize_corpus(new_papers)
cv_new_features = cv.transform(norm_new_papers)
cv_new_features.shape

(4, 14408)

In [18]:
topic_predictions = nmf_model.transform(cv_new_features)
best_topics = [[(topic, round(sc, 3)) 
                    for topic, sc in sorted(enumerate(topic_predictions[i]), 
                                            key=lambda row: -row[1])[:2]] 
                        for i in range(len(topic_predictions))]
best_topics

[[(3, 2.149), (1, 1.343)],
 [(0, 1.127), (15, 0.836)],
 [(3, 3.066), (6, 2.212)],
 [(2, 4.14), (0, 0.87)]]

In [19]:
results_df = pd.DataFrame()
results_df['Papers'] = range(1, len(new_papers)+1)
results_df['Dominant Topics'] = [[topic_num+1 for topic_num, sc in item] for item in best_topics]
res = results_df.set_index(['Papers'])['Dominant Topics'].apply(pd.Series).stack().reset_index(level=1, drop=True)
results_df = pd.DataFrame({'Dominant Topics': res.values}, index=res.index)
results_df['Topic Score'] = [topic_sc for topic_list in 
                                        [[round(sc*100, 2) 
                                              for topic_num, sc in item] 
                                                 for item in best_topics] 
                                    for topic_sc in topic_list]

results_df['Topic Desc'] = [topics_df.iloc[t-1]['Terms per Topic'] for t in results_df['Dominant Topics'].values]
results_df['Paper Desc'] = [new_papers[i-1][:200] for i in results_df.index.values]

results_df

Unnamed: 0_level_0,Dominant Topics,Topic Score,Topic Desc,Paper Desc
Papers,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,4,214.9,"image, face, pixel, recognition, local, distance, scale, digit, texture, filter, scene, vision, facial, pca, edge, transformation, representation, visual, surface, database","Automated scalable segmentation of neurons from\nmultispectral images\nUygar Sümbül\nGrossman Center for the Statistics of Mind\nand Dept. of Statistics, Columbia University\nDouglas Roossien Jr.\..."
1,2,134.3,"neuron, synaptic, connection, potential, dynamic, activity, synapsis, excitatory, layer, simulation, synapse, inhibitory, delay, biological, equation, state, et, et al, fig, activation","Automated scalable segmentation of neurons from\nmultispectral images\nUygar Sümbül\nGrossman Center for the Statistics of Mind\nand Dept. of Statistics, Columbia University\nDouglas Roossien Jr.\..."
2,1,112.7,"bound, generalization, size, let, optimal, solution, equation, theorem, approximation, gradient, class, xi, loss, rate, matrix, convergence, theory, dimension, sample, minimum","Cooperative Graphical Models\nJosip Djolonga\nDept. of Computer Science, ETH Zurich ¨\njosipd@inf.ethz.ch\nStefanie Jegelka\nCSAIL, MIT\nstefje@mit.edu\nSebastian Tschiatschek\nDept. of Computer S..."
2,16,83.6,"distribution, probability, gaussian, mixture, variable, density, likelihood, prior, bayesian, component, posterior, em, log, estimate, sample, approximation, estimation, matrix, conditional, maximum","Cooperative Graphical Models\nJosip Djolonga\nDept. of Computer Science, ETH Zurich ¨\njosipd@inf.ethz.ch\nStefanie Jegelka\nCSAIL, MIT\nstefje@mit.edu\nSebastian Tschiatschek\nDept. of Computer S..."
3,4,306.6,"image, face, pixel, recognition, local, distance, scale, digit, texture, filter, scene, vision, facial, pca, edge, transformation, representation, visual, surface, database","Unsupervised Learning of Spoken Language with\nVisual Context\nDavid Harwath, Antonio Torralba, and James R. Glass\nComputer Science and Artificial Intelligence Laboratory\nMassachusetts Institute..."
3,7,221.2,"word, recognition, speech, context, hmm, speaker, speech recognition, character, phoneme, probability, frame, sequence, rate, level, test, acoustic, experiment, letter, segmentation, state","Unsupervised Learning of Spoken Language with\nVisual Context\nDavid Harwath, Antonio Torralba, and James R. Glass\nComputer Science and Artificial Intelligence Laboratory\nMassachusetts Institute..."
4,3,414.0,"state, action, policy, step, optimal, reinforcement, transition, reinforcement learning, probability, reward, dynamic, value function, markov, machine, task, agent, finite, iteration, sequence, de...","PAC Reinforcement Learning with Rich Observations\nAkshay Krishnamurthy\nUniversity of Massachusetts, Amherst\nAmherst, MA, 01003\nakshay@cs.umass.edu\nAlekh Agarwal\nMicrosoft Research\nNew York,..."
4,1,87.0,"bound, generalization, size, let, optimal, solution, equation, theorem, approximation, gradient, class, xi, loss, rate, matrix, convergence, theory, dimension, sample, minimum","PAC Reinforcement Learning with Rich Observations\nAkshay Krishnamurthy\nUniversity of Massachusetts, Amherst\nAmherst, MA, 01003\nakshay@cs.umass.edu\nAlekh Agarwal\nMicrosoft Research\nNew York,..."


# Persisting Model and Transformers

In [None]:
import dill

with open('nmf_model.pkl', 'wb') as f:
    dill.dump(nmf_model, f)
with open('cv_features.pkl', 'wb') as f:
    dill.dump(cv_features, f)
with open('cv.pkl', 'wb') as f:
    dill.dump(cv, f)