In [1]:
from typing import *
import pandas as pd

In [2]:
papers = pd.read_csv('data/nips/papers.csv')
papers = papers.drop(columns=['id', 'event_type', 'pdf_name', 'abstract'], axis=1)
papers.head()

Unnamed: 0,year,title,paper_text
0,1987,Self-Organization of Associative Database and ...,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,1987,A Mean Field Theory of Layer IV of Visual Cort...,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
2,1988,Storing Covariance by the Associative Long-Ter...,394\n\nSTORING COVARIANCE BY THE ASSOCIATIVE\n...
3,1994,Bayesian Query Construction for Neural Network...,Bayesian Query Construction for Neural\nNetwor...
4,1994,"Neural Network Ensembles, Cross Validation, an...","Neural Network Ensembles, Cross\nValidation, a..."


In [3]:
import re

def preprocess_text(text: str) -> str:
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'[\d_]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = text.strip()
    text = text.lower()
    return text

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words = set(map(preprocess_text, stop_words))
    
    
def text_to_words(text: str) -> List[str]:
    return text.split(" ")

def filter_stop_words(text: List[str]) -> List[str]:
    return [word for word in text if word not in stop_words]

import nltk
sno = nltk.stem.SnowballStemmer('english')

def stem_words(text: List[str]) -> List[str]:
    return [sno.stem(word) for word in text]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Maksim.Zuev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
preprocessed_texts = papers['paper_text'].map(preprocess_text).to_list()
preprocessed_documents = [text_to_words(document) for document in preprocessed_texts]
list(map(lambda t: ", ".join(t[:10]) + ", ...", preprocessed_documents[:5]))

['selforganization, of, associative, database, and, its, applications, hisashi, suzuki, and, ...',
 'a, mean, field, theory, of, layer, iv, of, visual, cortex, ...',
 'storing, covariance, by, the, associative, longterm, potentiation, and, depression, of, ...',
 'bayesian, query, construction, for, neural, network, models, gerhard, paass, jorg, ...',
 'neural, network, ensembles, cross, validation, and, active, learning, anders, krogh, ...']

In [5]:
filtered_documents = [filter_stop_words(document) for document in preprocessed_documents]
list(map(lambda t: ", ".join(t[:10]) + ", ...", filtered_documents[:5]))

['selforganization, associative, database, applications, hisashi, suzuki, suguru, arimoto, osaka, university, ...',
 'mean, field, theory, layer, iv, visual, cortex, application, artificial, neural, ...',
 'storing, covariance, associative, longterm, potentiation, depression, synaptic, strengths, hippocampus, patric, ...',
 'bayesian, query, construction, neural, network, models, gerhard, paass, jorg, kindermann, ...',
 'neural, network, ensembles, cross, validation, active, learning, anders, krogh, nordita, ...']

In [6]:
from wordcloud import WordCloud

# all_words = ' '.join([' '.join(text) for text in filtered_documents])
# wordcloud = WordCloud(background_color="white", max_words=100, contour_width=3, contour_color='steelblue')
# wordcloud.generate(all_words)
# wordcloud.to_image()

In [7]:
stemmed_documents = [stem_words(document) for document in filtered_documents]
list(map(lambda t: ", ".join(t[:10]) + ", ...", stemmed_documents[:5]))

['selforgan, associ, databas, applic, hisashi, suzuki, suguru, arimoto, osaka, univers, ...',
 'mean, field, theori, layer, iv, visual, cortex, applic, artifici, neural, ...',
 'store, covari, associ, longterm, potenti, depress, synapt, strength, hippocampus, patric, ...',
 'bayesian, queri, construct, neural, network, model, gerhard, paass, jorg, kindermann, ...',
 'neural, network, ensembl, cross, valid, activ, learn, ander, krogh, nordita, ...']

In [16]:
documents = stemmed_documents

In [17]:
import gensim.corpora as corpora
id2word = corpora.Dictionary(documents)

In [18]:
word_count = len(id2word.cfs)
print(f"word count = {word_count}")
documents_count = id2word.num_docs
print(f"documents count = {documents_count}")
topics_count = 20
print(f"topics count = {topics_count}")

word count = 249310
documents count = 7241
topics count = 20


In [11]:
word_count_in_doc = [id2word.doc2bow(doc) for doc in documents]
", ".join(map(lambda t: f"{id2word[t[0]]} -> {t[1]}", word_count_in_doc[0][:20]))

'abolish -> 1, abstract -> 1, acceler -> 1, accept -> 2, accomplish -> 1, accord -> 6, achiev -> 2, actual -> 3, adap -> 1, add -> 2, address -> 2, adjac -> 1, admitt -> 1, adopt -> 1, advanc -> 2, al -> 4, algorithm -> 8, almighti -> 1, alreadi -> 1, also -> 2'

In [12]:
import numpy as np

phi = np.random.rand(word_count, topics_count)
theta = np.random.rand(topics_count, documents_count)
n_word_topic = np.zeros((word_count, topics_count))
n_topic_document = np.zeros((topics_count, documents_count))

for _ in range(100):
    n_word_topic.fill(0)
    n_topic_document.fill(0)
    for d in range(documents_count):
        for (w, wc) in word_count_in_doc[d]:
            z = np.dot(phi[w, :], theta[:, d])
            for t in range(topics_count):
                k = phi[w, t] * theta[t, d]
                if k > 0:
                    delta = wc * k / z
                    n_word_topic[w, t] += delta
                    n_topic_document[t, d] += delta
    for t in range(topics_count):
        n_t = np.sum(n_word_topic[:, t])
        phi[:, t] += n_word_topic[:, t] / n_t
    for d in range(documents_count):
        n_d = np.sum(n_topic_document[:, d])
        theta[:, d] += n_topic_document[:, d] / n_d


In [15]:
for t in range(topics_count):
    top_words = np.argsort(phi[:, t])[::-1][:10]
    top_words = [id2word[i] for i in top_words]
    print(f"Top words for topic {t}: {', '.join(top_words)}")

Top words for topic 0: model, use, data, algorithm, time, system, e, function, one, figur
Top words for topic 1: use, x, function, j, input, process, problem, differ, k, system
Top words for topic 2: model, network, use, algorithm, output, n, case, problem, condit, class
Top words for topic 3: use, e, input, network, c, function, p, valu, comput, one
Top words for topic 4: learn, use, k, estim, l, figur, inform, problem, b, number
Top words for topic 5: network, use, learn, function, algorithm, input, perform, p, system, model
Top words for topic 6: use, n, j, e, figur, data, neural, state, vector, one
Top words for topic 7: use, comput, unit, algorithm, neural, valu, n, learn, process, experi
Top words for topic 8: set, model, n, data, problem, f, result, paramet, algorithm, x
Top words for topic 9: learn, train, input, data, model, j, use, valu, r, differ
Top words for topic 10: use, x, train, result, two, method, valu, one, function, problem
Top words for topic 11: x, input, set, re