In [18]:
%load_ext autoreload
%autoreload 2
from config import CONFIG

import sys
import random
import os

import numpy as np
import pandas as pd
from pathlib import Path

sys.path.append(str(CONFIG.src_path / 'data' / 'sequential'))

from proxyManager import ProxyManager



The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
folders = (CONFIG.data_path / 'external' / 'nipstxt').glob('nips*')
papers = []

for folder in folders:
    files = folder.glob('*')
    for file in files:
        with open(str(file), encoding='utf8', errors='ignore', mode='r+') as f:
            data = f.read()
        papers.append(data)

len(papers)

1740

In [4]:
print(papers[0][:1000])

1 
CONNECTIVITY VERSUS ENTROPY 
Yaser S. Abu-Mostafa 
California Institute of Technology 
Pasadena, CA 91125 
ABSTRACT 
How does the connectivity of a neural network (number of synapses per 
neuron) relate to the complexity of the problems it can handle (measured by 
the entropy)? Switching theory would suggest no relation at all, since all Boolean 
functions can be implemented using a circuit with very low connectivity (e.g., 
using two-input NAND gates). However, for a network that learns a problem 
from examples using a local learning rule, we prove that the entropy of the 
problem becomes a lower bound for the connectivity of the network. 
INTRODUCTION 
The most distinguishing feature of neural networks is their ability to spon- 
taneously learn the desired function from 'training' samples, i.e., their ability 
to program themselves. Clearly, a given neural network cannot just learn any 
function, there must be some restrictions on which networks can learn which 
functions. One obv

## Basic Text Wrangling

In [5]:
%%time
import nltk
nltk.download('wordnet')
stop_words = nltk.corpus.stopwords.words('english')
wtk = nltk.tokenize.RegexpTokenizer(r'\w+')
wnl = nltk.stem.wordnet.WordNetLemmatizer()

# def normalize_corpus(papers):
#     norm_papers = []
#     for paper in papers:
#         paper = paper.lower()
#         paper_tokens = [token.strip() for token in wtk.tokenize(paper)]
#         paper_tokens = [wln.lemmatize(token) for token in paper_tokens if not token.isnumeric()]
#         paper_tokens = [token for token in paper_tokens if len(token) > 1]
#         paper_tokens = [token for token in paper_tokens if token not in stop_words]
#         paper_tokens = list(filter(None, paper_tokens))
#         if paper_tokens:
#             norm_papers.append(papers)
#     return norm_papers
def normalize_corpus(papers):
    norm_papers = []
    for paper in papers:
        paper = paper.lower()
        paper_tokens = [token.strip() for token in wtk.tokenize(paper)]
        paper_tokens = [wnl.lemmatize(token) for token in paper_tokens if not token.isnumeric()]
        paper_tokens = [token for token in paper_tokens if len(token) > 1]
        paper_tokens = [token for token in paper_tokens if token not in stop_words]
        paper_tokens = list(filter(None, paper_tokens))
        if paper_tokens:
            norm_papers.append(paper_tokens)
    return norm_papers
norm_papers = normalize_corpus(papers)
print(len(norm_papers))

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/tnguyen2921/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
1740
CPU times: user 32.6 s, sys: 37.7 ms, total: 32.7 s
Wall time: 33.6 s


In [6]:
norm_papers[0][:10]

['connectivity',
 'versus',
 'entropy',
 'yaser',
 'abu',
 'mostafa',
 'california',
 'institute',
 'technology',
 'pasadena']

In [7]:
import gensim

bigram = gensim.models.Phrases(norm_papers, min_count=20, threshold=20, delimiter=b"_")

bigram_model = gensim.models.phrases.Phraser(bigram)

bigram_model[norm_papers[0]][:10]

['connectivity',
 'versus',
 'entropy',
 'yaser',
 'abu_mostafa',
 'california_institute',
 'technology_pasadena',
 'ca_abstract',
 'doe',
 'connectivity']

In [8]:
norm_corpus_bigrams = [bigram_model[doc] for doc in norm_papers]

dictionary = gensim.corpora.Dictionary(norm_corpus_bigrams)
print("Sample word to number mappings:", list(dictionary.items())[:15])
print("Total Vocabulary Size:", len(dictionary))



Sample word to number mappings: [(0, '0a'), (1, '2h'), (2, '2h2'), (3, '2he'), (4, '2n'), (5, '__c'), (6, '_c'), (7, '_k'), (8, 'a2'), (9, 'ability'), (10, 'abu_mostafa'), (11, 'access'), (12, 'accommodate'), (13, 'according'), (14, 'accumulated')]
Total Vocabulary Size: 78892


In [9]:
dictionary.filter_extremes(no_below=20, no_above=0.6)
print("Total Vocabulary Size:", len(dictionary))

Total Vocabulary Size: 7756


In [10]:
bow_corpus = [dictionary.doc2bow(text) for text in norm_corpus_bigrams]
print(bow_corpus[1][:30])

[(3, 1), (12, 3), (14, 1), (15, 1), (16, 1), (17, 16), (20, 1), (24, 1), (26, 1), (31, 3), (35, 1), (36, 1), (40, 3), (41, 5), (42, 1), (48, 1), (53, 3), (55, 1), (56, 2), (58, 1), (60, 3), (63, 5), (64, 4), (65, 2), (73, 1), (74, 1), (75, 1), (76, 1), (77, 3), (82, 1)]


In [11]:
print([(dictionary[idx] , freq) for idx, freq in bow_corpus[1][:50]])

[('ability', 1), ('aip', 3), ('although', 1), ('american_institute', 1), ('amount', 1), ('analog', 16), ('appears', 1), ('architecture', 1), ('aspect', 1), ('available', 3), ('become', 1), ('becomes', 1), ('binary', 3), ('biological', 5), ('bit', 1), ('cannot', 1), ('circuit', 3), ('collective', 1), ('compare', 2), ('complex', 1), ('computing', 3), ('conference', 5), ('connected', 4), ('connectivity', 2), ('define', 1), ('defined', 1), ('defines', 1), ('definition', 1), ('denker', 3), ('designed', 1), ('desired', 4), ('diagonal', 1), ('difference', 1), ('directly', 2), ('ed', 1), ('el', 2), ('element', 3), ('equivalent', 1), ('eventually', 1), ('feature', 2), ('final', 4), ('find', 2), ('fixed', 2), ('frequency', 1), ('furthermore', 1), ('generating', 1), ('get', 1), ('global', 6), ('go', 1), ('hence', 1)]


In [12]:
print('Total number of papers:', len(bow_corpus))

Total number of papers: 1740


In [None]:
%%time

TOTAL_TOPICS = 10

lsi_bow = gensim.models.LsiModel(bow_corpus, id2word=dictionay, num_topics=TOTAL_TOPICS, onepass=True, chunksize=1740, power_iters=1000)



### LDA

In [12]:
%%time
TOTAL_TOPICS = 10
lda_model = gensim.models.LdaModel(corpus=bow_corpus, id2word=dictionary, chunksize=1740, alpha="auto", eta="auto", random_state=42, iterations=500, num_topics=TOTAL_TOPICS, passes=20, eval_every=None)


CPU times: user 8min 20s, sys: 19.2 s, total: 8min 39s
Wall time: 2min 28s


In [15]:
for topic_id, topic in lda_model.print_topics(num_topics=10, num_words=20):
    print('Topic #'+str(topic_id+1)+':')
    print(topic)
    print()

Topic #1:
0.013*"circuit" + 0.012*"chip" + 0.008*"neuron" + 0.008*"analog" + 0.007*"current" + 0.007*"bit" + 0.006*"voltage" + 0.005*"node" + 0.005*"word" + 0.005*"vector" + 0.005*"processor" + 0.004*"implementation" + 0.004*"threshold" + 0.004*"computation" + 0.004*"element" + 0.004*"signal" + 0.004*"pattern" + 0.004*"design" + 0.004*"memory" + 0.004*"parallel"

Topic #2:
0.030*"image" + 0.012*"object" + 0.011*"feature" + 0.006*"pixel" + 0.006*"visual" + 0.005*"representation" + 0.005*"recognition" + 0.005*"unit" + 0.005*"motion" + 0.005*"face" + 0.005*"task" + 0.004*"view" + 0.004*"layer" + 0.004*"human" + 0.004*"training" + 0.004*"position" + 0.004*"location" + 0.004*"region" + 0.004*"character" + 0.003*"vector"

Topic #3:
0.020*"neuron" + 0.017*"cell" + 0.012*"response" + 0.010*"stimulus" + 0.007*"spike" + 0.007*"signal" + 0.006*"activity" + 0.006*"synaptic" + 0.005*"firing" + 0.005*"frequency" + 0.005*"pattern" + 0.004*"current" + 0.004*"effect" + 0.004*"neural" + 0.004*"change" +

In [16]:
cv_coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, corpus=bow_corpus, texts=norm_corpus_bigrams, dictionary=dictionary, coherence='c_v')

avg_coherence_cv = cv_coherence_model_lda.get_coherence()

umass_coherernce_model_lda = gensim.models.CoherenceModel(model=lda_model, corpus=bow_corpus, texts=norm_corpus_bigrams, dictionary=dictionary, coherence='u_mass')

avg_coherence_umass = umass_coherernce_model_lda.get_coherence()

perplexity = lda_model.log_perplexity(bow_corpus)
print('Avg. Coherence Score (Cv):', avg_coherence_cv)
print('Avg. Coherence Score (UMass):', avg_coherence_umass)
print('Model Perplexity:', perplexity)

Avg. Coherence Score (Cv): 0.4930044902277785
Avg. Coherence Score (UMass): -0.9858031202745918
Model Perplexity: -7.787864315295923


In [21]:
MALLET_PATH

PosixPath('/mnt/d/personal_projects/JobPostsAnalysis/JobPostsAnalysis/models/mallet-2.0.8/bin/mallet')

In [22]:
from tqdm import tqdm

def topic_model_coherernce_generator(corpus, texts, dictionary, start_topic_count=2, end_topic_count=10, step=1, cpus=1):
    models = []
    coherence_scores = []
    for topic_nums in tqdm(range(start_topic_count, end_topic_count+1, step)):
        lda_model = gensim.models.LdaModel(corpus=corpus, id2word=dictionary, chunksize=1740, alpha="auto", eta="auto", random_state=42, iterations=500, num_topics=topic_nums, passes=20, eval_every=None)
        cv_coherence_model_lda = gensim.models.CoherenceModel(model=lda_model, corpus=bow_corpus, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_score = cv_coherence_model_lda.get_coherence()
        coherence_scores.append(coherence_score)
        models.append(lda_model)
    return models, coherence_scores


In [23]:
lda_models, coherence_scores = topic_model_coherernce_generator(corpus=bow_corpus, texts=norm_corpus_bigrams, dictionary=dictionary, start_topic_count=2, end_topic_count=30, step=1, cpus=7)

100%|██████████| 29/29 [1:12:39<00:00, 150.33s/it]


In [13]:
topics_coherences = lda_model.top_topics(bow_corpus, topn=20)
avg_coherence_score = np.mean([item[1] for item in topics_coherences])

print("Avg. Coherence Score:", avg_coherence_score)

Avg. Coherence Score: -0.9858031202745918


In [None]:
topics_with_wts = [item[0] for item in topics_coherences]

print("LDA topics with weights")
print("="*50)
for idx, topic in enumerate(topics_with_wts):
    print("Topic #" + str(idx + 1) + ":")
    print([(term, round(wt, 3)) for wt, term in topic])