In [2]:
import os, nltk
import numpy as np
import pandas as pd

# Data Retrieval

In [3]:
import urllib.request

url = 'https://cs.nyu.edu/~roweis/data/nips12raw_str602.tgz'
filename = 'nips12raw_str602'
urllib.request.urlretrieve(url, filename)

('nips12raw_str602', <http.client.HTTPMessage at 0x7f86860fb490>)

In [4]:
!tar -xzf nips12raw_str602

In [5]:
DATA_PATH = 'nipstxt/'
print(os.listdir(DATA_PATH))

['orig', 'nips06', 'nips08', 'nips00', 'nips04', 'idx', 'nips02', 'README_yann', 'nips10', 'nips11', 'nips01', 'nips03', 'nips09', 'nips12', 'nips07', 'MATLAB_NOTES', 'nips05', 'RAW_DATA_NOTES']


# Load and View Dataset

In [6]:
folders = ['nips{0:02}'.format(i) for i in range(0, 13)]
# Read all texts into a list
papers = []
for folder in folders:
    file_names = os.listdir(DATA_PATH + folder)
    for file_name in file_names:
        with open(DATA_PATH + folder + '/' + file_name, encoding='utf-8', errors='ignore', mode='r+') as f:#seperate 'em with /
            data = f.read()
        papers.append(data)
len(papers)        

1740

 However, it looks like the OCR hasn’t worked perfectly and we have
some missing characters here and there. This is expected, but also makes this task more
challenging!

In [56]:
print(papers[0][:1000])

804 
INTRODUCTION TO A SYSTEM FOR IMPLEMENTING NEURAL NET 
CONNECTIONS ON SIMD ARCHITECTURES 
Sherryl Tomboulian 
Institute for Computer Applications in Science and Engineering 
NASA Langley Research Center, Hampton VA 23665 
ABSTRACT 
Neural networks have attracted much interest recently, and using parallel 
architectures to simulate neural networks is a natural and necessary applica- 
tion. The SIMD model of parallel computation is chosen, because systems of 
this type can be built with large numbers of processing elements. However, 
such systems are not naturally suited to generalized communication. A method 
is proposed that allows an implementation of neural network connections on 
massively parallel SIMD architectures. The key to this system is an algorithm 
that allows the formation of arbitrary connections between the 'neurons '. A 
feature is the ability to add new connections quickly. It also has error recov- 
ery ability and is robust over a variety of network topologies. Si

# Basic Text Wrangling

In [57]:
import nltk

stop_words = nltk.corpus.stopwords.words('english')
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
wnl = nltk.stem.wordnet.WordNetLemmatizer()

def normalize_corpus(papers):
    norm_papers = []
    for paper in papers:
        paper = paper.lower()
        paper_tokens = [token.strip() for token in wtk.tokenize(paper)]# word tokenization
        paper_tokens = [wnl.lemmatize(token) for token in paper_tokens if not token.isnumeric()]
        paper_tokens = [token for token in paper_tokens if len(token) > 1]
        paper_tokens = [token for token in paper_tokens if token not in stop_words]
        paper_tokens = list(filter(None, paper_tokens))
        if paper_tokens:
            norm_papers.append(paper_tokens)
            
    return norm_papers

norm_papers = normalize_corpus(papers)
print(len(norm_papers))

1740


In [58]:
# Viewing a processed paper
print(norm_papers[0][:50])

['introduction', 'system', 'implementing', 'neural', 'net', 'connection', 'simd', 'architecture', 'sherryl', 'tomboulian', 'institute', 'computer', 'application', 'science', 'engineering', 'nasa', 'langley', 'research', 'center', 'hampton', 'va', 'abstract', 'neural', 'network', 'attracted', 'much', 'interest', 'recently', 'using', 'parallel', 'architecture', 'simulate', 'neural', 'network', 'natural', 'necessary', 'applica', 'tion', 'simd', 'model', 'parallel', 'computation', 'chosen', 'system', 'type', 'built', 'large', 'number', 'processing', 'element']


# Text Representation with Feature Engineering¶

we present out text data in thr form of a Bag of Words model with uni-gram and bi-gram, similar to our analyses in the previous section

In [59]:
from sklearn.feature_extraction.text import CountVectorizer

cv=CountVectorizer(min_df=20, max_df=0.6,ngram_range=(1, 2),token_pattern=None,tokenizer=lambda doc:doc,preprocessor=lambda doc:doc)
cv_feature = cv.fit_transform(norm_papers)
cv_feature.shape

(1740, 14408)

In [60]:
# validating vocaublary size
vocabulary = np.array(cv.get_feature_names())
print('Total Vocabulary Size:', len(vocabulary))

Total Vocabulary Size: 14408


# Latent Semantic Indexing

In [61]:
from sklearn.decomposition import TruncatedSVD

TOTAL_TOPICS = 20
lsi_model = TruncatedSVD(n_components=TOTAL_TOPICS, n_iter=500, random_state=42)
document_topics = lsi_model.fit_transform(cv_feature)

In [62]:
topic_terms = lsi_model.components_
topic_terms.shape

(20, 14408)

In [63]:
top_terms = 20
topic_key_term_idxs = np.argsort(-np.absolute(topic_terms), axis=1)[:, :top_terms]
topic_keyterm_weights = np.array([topic_terms[row, columns] for row, columns in list(zip(np.arange(TOTAL_TOPICS), topic_key_term_idxs))])
topic_keyterms = vocabulary[topic_key_term_idxs]
topic_keyterms_weights = list(zip(topic_keyterms, topic_keyterm_weights))
for n in range(TOTAL_TOPICS):
    print('Topic #'+str(n+1)+':')
    print('='*50)
    d1 = []
    d2 = []
    terms, weights = topic_keyterms_weights[n]
    term_weights = sorted([(t, w) for t, w in zip(terms, weights)], key=lambda row: -abs(row[1]))
    for term, wt in term_weights:
        if wt >= 0:
            d1.append((term, round(wt, 3)))
        else:
            d2.append((term, round(wt, 3)))

    print('Direction 1:', d1)
    print('-'*50)
    print('Direction 2:', d2)
    print('-'*50)
    print()

Topic #1:
Direction 1: [('state', 0.221), ('neuron', 0.169), ('image', 0.138), ('cell', 0.13), ('layer', 0.13), ('feature', 0.127), ('probability', 0.121), ('hidden', 0.114), ('distribution', 0.105), ('rate', 0.098), ('signal', 0.095), ('task', 0.093), ('class', 0.092), ('noise', 0.09), ('net', 0.089), ('recognition', 0.089), ('representation', 0.088), ('field', 0.082), ('rule', 0.082), ('step', 0.08)]
--------------------------------------------------
Direction 2: []
--------------------------------------------------

Topic #2:
Direction 1: [('cell', 0.417), ('neuron', 0.39), ('response', 0.175), ('stimulus', 0.155), ('visual', 0.131), ('spike', 0.13), ('firing', 0.117), ('synaptic', 0.11), ('activity', 0.104), ('cortex', 0.097), ('field', 0.085), ('frequency', 0.085), ('direction', 0.082), ('circuit', 0.082), ('motion', 0.082)]
--------------------------------------------------
Direction 2: [('state', -0.289), ('probability', -0.109), ('hidden', -0.098), ('class', -0.091), ('policy',

In [64]:
dt_df = pd.DataFrame(np.round(document_topics, 3), columns=['T'+str(i) for i in range(1, TOTAL_TOPICS+1)])
dt_df.T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1730,1731,1732,1733,1734,1735,1736,1737,1738,1739
T1,42.714,44.822,46.477,20.943,17.548,32.834,34.128,23.13,25.54,30.974,...,38.755,31.059,36.184,36.118,43.202,22.319,19.539,29.186,60.692,51.97
T2,17.77,6.649,-8.457,-5.107,-3.635,5.514,-0.007,-4.835,8.066,1.37,...,27.5,-8.146,-21.41,-12.73,-8.751,-7.387,-11.271,-9.614,-25.865,-47.552
T3,8.193,18.312,12.215,-4.495,-2.611,-5.521,18.723,-7.784,0.556,-15.393,...,-2.196,-3.255,34.469,-7.267,-39.015,-3.767,-4.953,-18.183,32.383,74.061
T4,-9.513,-13.303,-1.709,-9.058,-4.445,8.966,1.518,-5.905,-1.812,12.518,...,26.063,-7.138,16.593,-12.591,27.755,-6.691,-8.819,-1.926,21.709,38.714
T5,22.028,8.986,-10.586,7.615,2.215,-10.962,16.546,3.375,8.839,-12.691,...,-6.27,1.579,-1.06,-8.808,-18.84,-7.65,-7.03,-7.298,-3.606,-9.818
T6,-16.731,-20.833,-8.04,3.771,2.4,-14.726,-11.58,0.267,-4.417,-13.965,...,-2.659,4.988,6.211,4.504,-22.516,-0.643,0.352,0.723,12.865,1.262
T7,-13.66,2.279,-12.092,-8.912,-9.673,-9.546,-5.158,-5.809,4.53,-5.678,...,-12.561,-3.196,2.442,-1.409,12.851,-2.714,1.234,12.992,16.353,14.963
T8,-18.427,-9.052,4.081,-3.682,0.327,3.658,-13.321,4.941,12.25,3.065,...,13.246,-0.025,0.823,-5.276,-19.979,-4.121,1.603,-7.423,12.649,-16.091
T9,1.199,-5.04,6.762,-1.345,1.153,17.199,-5.996,-1.381,12.249,2.538,...,33.609,4.35,-6.278,2.288,0.588,7.977,4.682,-4.812,-16.777,-20.378
T10,5.723,7.673,2.857,6.169,-4.514,19.623,9.756,-1.09,1.13,4.474,...,12.802,-8.582,3.458,10.627,4.01,-5.835,-6.096,-11.09,18.002,17.049


In [65]:
document_numbers = [13, 250, 500]
for document_number in document_numbers:
    top_topics = list(dt_df.columns[np.argsort(-np.absolute(dt_df.iloc[document_number].values))[:3]])
    print('Document#'+str(document_number)+':')
    print('Dominant Topics (top 3): ', top_topics)
    print('Paper Summary: ')
    print(papers[document_number][:500])
    print()

Document#13:
Dominant Topics (top 3):  ['T1', 'T5', 'T9']
Paper Summary: 
534 
The Performance of Convex Set Projection Based Neural Networks 
Robert J. Marks II, Les E. Atlas, Seho Oh and James A. Ritcey 
Interactive Systems Design Lab, FT-10 
University of Washington, Seattle, Wa 98195. 
ABSTRACT 
We donsider a class of neural networks whose performance can be 
analyzed and geometrically visualized in a signal space 
environment. Alternating projection neural networks (APNN's) 
perform by alternately projecting between two or more constraint 
sets. Criteria for desi

Document#250:
Dominant Topics (top 3):  ['T1', 'T7', 'T13']
Paper Summary: 
558 Rohwer 
The 'Moving Targets' Training Algorithm 
Richard Rohwer 
Centre for Speech Technology Research 
Edinburgh University 
80, South Bridge 
Edinburgh EH1 1HN SCOTLAND 
ABSTRACT 
A simple method for training the dynamical behavior of a neu- 
ral network is derived. It is applicable to any training problem 
in discrete-time networks with ar