In [2]:
import os, nltk
import numpy as np
import pandas as pd

# Data Retrieval

In [3]:
import urllib.request

url = 'https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'
filename = 'nips12raw_str602'
urllib.request.urlretrieve(url, filename)

('nips12raw_str602', <http.client.HTTPMessage at 0x7f86860fb490>)

In [4]:
!tar -xzf nips12raw_str602

In [5]:
DATA_PATH = 'nipstxt/'
print(os.listdir(DATA_PATH))

['orig', 'nips06', 'nips08', 'nips00', 'nips04', 'idx', 'nips02', 'README_yann', 'nips10', 'nips11', 'nips01', 'nips03', 'nips09', 'nips12', 'nips07', 'MATLAB_NOTES', 'nips05', 'RAW_DATA_NOTES']


# Load and View Dataset

In [6]:
folders = ['nips{0:02}'.format(i) for i in range(0, 13)]
# Read all texts into a list
papers = []
for folder in folders:
    file_names = os.listdir(DATA_PATH + folder)
    for file_name in file_names:
        with open(DATA_PATH + folder + '/' + file_name, encoding='utf-8', errors='ignore', mode='r+') as f:#seperate 'em with /
            data = f.read()
        papers.append(data)
len(papers)        

1740

 However, it looks like the OCR hasn’t worked perfectly and we have
some missing characters here and there. This is expected, but also makes this task more
challenging!

In [56]:
print(papers[0][:1000])

804 
INTRODUCTION TO A SYSTEM FOR IMPLEMENTING NEURAL NET 
CONNECTIONS ON SIMD ARCHITECTURES 
Sherryl Tomboulian 
Institute for Computer Applications in Science and Engineering 
NASA Langley Research Center, Hampton VA 23665 
ABSTRACT 
Neural networks have attracted much interest recently, and using parallel 
architectures to simulate neural networks is a natural and necessary applica- 
tion. The SIMD model of parallel computation is chosen, because systems of 
this type can be built with large numbers of processing elements. However, 
such systems are not naturally suited to generalized communication. A method 
is proposed that allows an implementation of neural network connections on 
massively parallel SIMD architectures. The key to this system is an algorithm 
that allows the formation of arbitrary connections between the 'neurons '. A 
feature is the ability to add new connections quickly. It also has error recov- 
ery ability and is robust over a variety of network topologies. Si

# Basic Text Wrangling

In [57]:
import nltk

stop_words = nltk.corpus.stopwords.words('english')
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
wnl = nltk.stem.wordnet.WordNetLemmatizer()

def normalize_corpus(papers):
    norm_papers = []
    for paper in papers:
        paper = paper.lower()
        paper_tokens = [token.strip() for token in wtk.tokenize(paper)]# word tokenization
        paper_tokens = [wnl.lemmatize(token) for token in paper_tokens if not token.isnumeric()]
        paper_tokens = [token for token in paper_tokens if len(token) > 1]
        paper_tokens = [token for token in paper_tokens if token not in stop_words]
        paper_tokens = list(filter(None, paper_tokens))
        if paper_tokens:
            norm_papers.append(paper_tokens)
            
    return norm_papers

norm_papers = normalize_corpus(papers)
print(len(norm_papers))

1740


In [58]:
# Viewing a processed paper
print(norm_papers[0][:50])

['introduction', 'system', 'implementing', 'neural', 'net', 'connection', 'simd', 'architecture', 'sherryl', 'tomboulian', 'institute', 'computer', 'application', 'science', 'engineering', 'nasa', 'langley', 'research', 'center', 'hampton', 'va', 'abstract', 'neural', 'network', 'attracted', 'much', 'interest', 'recently', 'using', 'parallel', 'architecture', 'simulate', 'neural', 'network', 'natural', 'necessary', 'applica', 'tion', 'simd', 'model', 'parallel', 'computation', 'chosen', 'system', 'type', 'built', 'large', 'number', 'processing', 'element']


# Text Representation with Feature Engineering¶

we present out text data in thr form of a Bag of Words model with uni-gram and bi-gram, similar to our analyses in the previous section

In [75]:
from sklearn.feature_extraction.text import CountVectorizer

cv=CountVectorizer(min_df=20, max_df=0.6,ngram_range=(1, 2),token_pattern=None,tokenizer=lambda doc:doc,preprocessor=lambda doc:doc)
cv_features = cv.fit_transform(norm_papers)
cv_features.shape

(1740, 14408)

In [76]:
# validating vocaublary size
vocabulary = np.array(cv.get_feature_names())
print('Total Vocabulary Size:', len(vocabulary))

Total Vocabulary Size: 14408


# Latent Dirichlet Allocation

In [77]:
from sklearn.decomposition import LatentDirichletAllocation

lda_model = LatentDirichletAllocation(n_components =TOTAL_TOPICS, max_iter=500, max_doc_update_iter=50, learning_method='online'
                                      , batch_size=1740, learning_offset=50., random_state=42, n_jobs=16)
document_topics = lda_model.fit_transform(cv_features)

then obtain the topic-term matrix and build a dataframe from it to showcase
the topics and terms in an easy-to-interpret format.

In [78]:
top_terms = 20
topic_terms = lda_model.components_
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :top_terms]
topic_keyterms = vocabulary[topic_key_term_idxs]
topics = [', '.join(topic) for topic in topic_keyterms]
pd.set_option('display.max_colwidth', -1)
topics_df = pd.DataFrame(topics,columns = ['Terms per Topic'],index=['Topic'+str(t) for t in range(1, TOTAL_TOPICS+1)])
topics_df

  pd.set_option('display.max_colwidth', -1)


Unnamed: 0,Terms per Topic
Topic1,"neuron, circuit, chip, analog, current, signal, voltage, channel, vlsi, implementation, bit, noise, pulse, processor, design, synapse, parallel, fig, line, digital"
Topic2,"image, feature, structure, state, layer, neuron, distribution, local, cell, motion, recognition, node, net, matrix, object, gaussian, sequence, line, size, hidden"
Topic3,"motor, frequency, auditory, sound, template, command, spectrum, acoustic, syllable, control, feedback, amplitude, motor command, song, production, representation, onset, harmonic, temporal, phase"
Topic4,"cell, neuron, response, visual, stimulus, activity, spike, field, synaptic, motion, direction, firing, cortex, signal, orientation, spatial, eye, rate, map, fig"
Topic5,"image, feature, recognition, layer, hidden, task, object, speech, representation, trained, test, classification, net, classifier, level, architecture, class, experiment, rule, node"
Topic6,"state, dynamic, matrix, equation, rule, recurrent, gradient, fixed, neuron, solution, node, signal, hidden, sequence, net, activation, attractor, source, step, connection"
Topic7,"sequence, chain, region, structure, protein, prediction, hmms, site, receptor, gene, class, human, positive, distance, length, mouse, negative, cell, domain, sheet"
Topic8,"memory, word, context, similarity, item, recall, phoneme, activation, probability, representation, short, list, association, address, short term, state, serial, store, storage, phone"
Topic9,"ii, activation, winner, region, take, winner take, texture, ii ii, behavior, self, saliency, competitive, wta, connection, binding, iii, edge, feedback, search, sensor"
Topic10,"state, cell, distribution, neuron, probability, control, response, signal, task, layer, rate, architecture, random, hidden, test, image, fig, change, field, generalization"


In [79]:
pd.options.display.float_format = '{:,.3f}'.format
dt_df = pd.DataFrame(document_topics,  columns=['T'+str(i) for i in range(1, TOTAL_TOPICS+1)])
dt_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1730,1731,1732,1733,1734,1735,1736,1737,1738,1739
T1,0.58,0.305,0.134,0.016,0.0,0.642,0.239,0.003,0.444,0.172,...,0.601,0.006,0.0,0.0,0.0,0.006,0.0,0.0,0.0,0.0
T2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
T3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.022,0.0,...,0.026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.017,0.0
T4,0.0,0.0,0.079,0.002,0.0,0.079,0.0,0.013,0.232,0.21,...,0.315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.043,0.0
T5,0.054,0.032,0.068,0.444,0.38,0.027,0.129,0.546,0.282,0.355,...,0.014,0.374,0.0,0.199,0.453,0.038,0.025,0.224,0.334,0.029
T6,0.164,0.511,0.448,0.227,0.41,0.114,0.422,0.138,0.0,0.067,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.136,0.01
T7,0.0,0.02,0.0,0.0,0.0,0.0,0.0,0.0,0.011,0.0,...,0.0,0.0,0.0,0.0,0.09,0.0,0.0,0.0,0.055,0.0
T8,0.005,0.005,0.0,0.0,0.0,0.0,0.072,0.0,0.008,0.0,...,0.0,0.0,0.004,0.0,0.006,0.0,0.0,0.006,0.0,0.0
T9,0.006,0.0,0.0,0.0,0.004,0.0,0.053,0.0,0.0,0.0,...,0.025,0.0,0.0,0.011,0.064,0.0,0.0,0.0,0.004,0.0
T10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


we can see some repetition in similar
themes among the topics, which might be an indication that this model is not as good
as our MALLET LDA model

We can now view the research papers having the maximum
contribution of each of the 20 topics

In [80]:
pd.options.display.float_format = '{:,.5f}'.format
pd.set_option('display.max_colwidth', 200)

max_contrib_topics = dt_df.max(axis=0)
dominant_topics = max_contrib_topics.index
contrib_perc = max_contrib_topics.values
document_numbers = [dt_df[dt_df[t] == max_contrib_topics.loc[t]].index[0] for t in dominant_topics]
documents = [papers[i] for i in document_numbers]

results_df = pd.DataFrame({'Dominant Topic': dominant_topics, 'Contribution %': contrib_perc,
                          'Paper Num': document_numbers, 'Topic': topics_df['Terms per Topic'], 
                          'Paper Name': documents})
results_df

Unnamed: 0,Dominant Topic,Contribution %,Paper Num,Topic,Paper Name
Topic1,T1,0.9993,942,"neuron, circuit, chip, analog, current, signal, voltage, channel, vlsi, implementation, bit, noise, pulse, processor, design, synapse, parallel, fig, line, digital","Single Transistor Learning Synapses \nPaul Hasler, Chris Diorio, Bradley A. Minch, Carver Mead \nCalifornia Institute of Technology \nPasadena, CA 91125 \n(SlS) 95- 2S12 \npaul@hobiecat.pcmp.calt..."
Topic2,T2,0.00033,108,"image, feature, structure, state, layer, neuron, distribution, local, cell, motion, recognition, node, net, matrix, object, gaussian, sequence, line, size, hidden",794 \nNEURAL ARCHITECTURE \nValentino Braitenberg \nMax Planck Institute \nFederal Republic of Germany \nABSTRACT\nWhile we are waiting for the ultimate biophysics of cell membranes and synapses \...
Topic3,T3,0.76997,149,"motor, frequency, auditory, sound, template, command, spectrum, acoustic, syllable, control, feedback, amplitude, motor command, song, production, representation, onset, harmonic, temporal, phase",795 \nSONG LEARNING IN BIRDS \nM. Konishi \nDivision of Biology \nCalifornia Institute of Technology \nABSTRACT\nBirds sing to communicate. Male birds use song to advertise their territories and \...
Topic4,T4,0.99945,1103,"cell, neuron, response, visual, stimulus, activity, spike, field, synaptic, motion, direction, firing, cortex, signal, orientation, spatial, eye, rate, map, fig","Simulation of a Thalamocortical Circuit for \nComputing Directional Heading in the Rat \nHugh T. Blair* \nDepartment of Psychology \nYale University \nNew Haven, CT 06520-8205 \ntadb @minerva. cis..."
Topic5,T5,0.99949,213,"image, feature, recognition, layer, hidden, task, object, speech, representation, trained, test, classification, net, classifier, level, architecture, class, experiment, rule, node","266 Zemel, Mozer and Hinton \nTRAFFIC: Recognizing Objects Using \nHierarchical Reference Frame Transformations \nRichard S. Zemel \nComputer Science Dept. \nUniversity of Toronto \nToronto, ONT M..."
Topic6,T6,0.98993,1084,"state, dynamic, matrix, equation, rule, recurrent, gradient, fixed, neuron, solution, node, signal, hidden, sequence, net, activation, attractor, source, step, connection","Harmony Networks Do Not Work \nRen5 Gourley \nSchool of Computing Science \nSimon Fraser University \nBurnaby, B.C., V5A 1S6, Canada \ngourley@mprgate.mpr.ca \nAbstract \nHarmony networks have be..."
Topic7,T7,0.99956,266,"sequence, chain, region, structure, protein, prediction, hmms, site, receptor, gene, class, human, positive, distance, length, mouse, negative, cell, domain, sheet","A Neural Network to Detect \nHomologies in Proteins \nYoshua Bengio \nSchool of Computer Science \nMcGill University \nMontreal, Canada H3A 2A7 \nSamy Bengio \nDepartement d'Informatique \nUnivers..."
Topic8,T8,0.97427,889,"memory, word, context, similarity, item, recall, phoneme, activation, probability, representation, short, list, association, address, short term, state, serial, store, storage, phone","A solvable connectionist model of \nimmediate recall of ordered lists \nNell Burgess \nDepartment of Anatomy, University College London \nLondon WCiE 6BT, England \n(e-mail: n .burgessucl. ac. uk..."
Topic9,T9,0.99929,281,"ii, activation, winner, region, take, winner take, texture, ii ii, behavior, self, saliency, competitive, wta, connection, binding, iii, edge, feedback, search, sensor","44 Beer and Chiei \nNeural \nImplementation of Motivated Behavior: \nFeeding in an Artificial Insect \nRandall D. Beer t,2 and Hillel J. Chiel 2 \nDepartments of t Computer Engineering and Science..."
Topic10,T10,0.00033,108,"state, cell, distribution, neuron, probability, control, response, signal, task, layer, rate, architecture, random, hidden, test, image, fig, change, field, generalization",794 \nNEURAL ARCHITECTURE \nValentino Braitenberg \nMax Planck Institute \nFederal Republic of Germany \nABSTRACT\nWhile we are waiting for the ultimate biophysics of cell membranes and synapses \...


can see that some topics have a very
poor representation of almost 0% in the corpus and so we see the same paper

The topics with a good
contribution (almost 100% dominance) showcase papers that are closely correlated
with the theme conveyed by the corresponding topic, including reinforcement learning,
Bayesian and Gaussian mixture models, neural models on VLSI, and transistors.