In [34]:
import pandas as pd
import gensim
from nltk.stem.porter import PorterStemmer

In [3]:
df = pd.read_csv("data/Papers.csv")

# Exploration of reviews

In [4]:
df.describe()

Unnamed: 0,Id
count,403.0
mean,5834.0
std,116.480327
min,5633.0
25%,5733.5
50%,5834.0
75%,5934.5
max,6035.0


In [5]:
df.head()

Unnamed: 0,Id,Title,EventType,PdfName,Abstract,PaperText
0,5677,Double or Nothing: Multiplicative Incentive Me...,Poster,5677-double-or-nothing-multiplicative-incentiv...,Crowdsourcing has gained immense popularity in...,Double or Nothing: Multiplicative\nIncentive M...
1,5941,Learning with Symmetric Label Noise: The Impor...,Spotlight,5941-learning-with-symmetric-label-noise-the-i...,Convex potential minimisation is the de facto ...,Learning with Symmetric Label Noise: The\nImpo...
2,6019,Algorithmic Stability and Uniform Generalization,Poster,6019-algorithmic-stability-and-uniform-general...,One of the central questions in statistical le...,Algorithmic Stability and Uniform Generalizati...
3,6035,Adaptive Low-Complexity Sequential Inference f...,Poster,6035-adaptive-low-complexity-sequential-infere...,We develop a sequential low-complexity inferen...,Adaptive Low-Complexity Sequential Inference f...
4,5978,Covariance-Controlled Adaptive Langevin Thermo...,Poster,5978-covariance-controlled-adaptive-langevin-t...,Monte Carlo sampling for Bayesian posterior in...,Covariance-Controlled Adaptive Langevin\nTherm...


In [6]:
df["PaperText"].str.len().describe()

count      403.000000
mean     33776.503722
std       3266.834148
min      23590.000000
25%      31469.000000
50%      33990.000000
75%      36009.500000
max      42533.000000
Name: PaperText, dtype: float64

# Basic gensim topic modeling

## Create a 2d array with the words in each document

In [128]:
documents_clean = df["Abstract"].str.replace("\.|\?|-|!|'|,|:|\+|-|=|(|)", "")
stoplist = list(gensim.parsing.preprocessing.STOPWORDS) + \
            ["hes", "think", "weve", "thats", "said", "want", "look", 
             "youve", "youre", "model", "models", "method", "matrix"] + \
            list(range(0,9))
porter_stemmer = PorterStemmer()
texts = [[word for word in document.lower().split() if word not in stoplist and len(word) > 2] for document in documents_clean]

## Remove words that only appear once

In [129]:
from collections import defaultdict
frequency =  defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

texts = [[token for token in text if frequency[token] > 1] for text in texts]

## Generate a dictionary (word to id mapping)

In [130]:
dictionary = gensim.corpora.Dictionary(texts)
print(dictionary)

Dictionary(3104 unique tokens: ['crowdsourcing', 'gained', 'popularity', 'machine', 'learning']...)


In [131]:
dictionary.filter_n_most_frequent(50)

## Generate the bag of words corpus

In [132]:
corpus = [dictionary.doc2bow(text) for text in texts]

## LDA Model

In [133]:
lda = gensim.models.LdaModel(corpus, id2word=dictionary, num_topics=10)

In [134]:
top_words = [[word for word, _ in lda.show_topic(topicno, topn=50)] for topicno in range(lda.num_topics)]

In [135]:
for topics,i in zip(top_words, range(0,len(top_words))):
    print(str(i) + ": " + " ".join(topics[:5]))

0: gradient approximation bounds regret known
1: information space visual previous computational
2: output bounds search functions gradient
3: graph bounds tensor causal bound
4: estimators statistical high deep network
5: sparse bounds deep estimation convex
6: network guarantees case convolutional constraints
7: input network decision approaches classification
8: setting space feature bounds functions
9: image prediction questions network random


In [136]:
for i in range(0,100):
    doc = i
    print(lda.get_document_topics(corpus[doc]))
    print(df.loc[doc]["Title"])
    print("----------------------------------------")

[(3, 0.9844801406773237)]
Double or Nothing: Multiplicative Incentive Mechanisms for Crowdsourcing
----------------------------------------
[(9, 0.98420808514939861)]
Learning with Symmetric Label Noise: The Importance of Being Unhinged
----------------------------------------
[(8, 0.98860588683921413)]
Algorithmic Stability and Uniform Generalization
----------------------------------------
[(3, 0.98333043007784415)]
Adaptive Low-Complexity Sequential Inference for Dirichlet Process Mixture Models
----------------------------------------
[(7, 0.98363189291490438)]
Covariance-Controlled Adaptive Langevin Thermostat for Large-Scale Bayesian Sampling
----------------------------------------
[(4, 0.98448015154219104)]
Robust Portfolio Optimization
----------------------------------------
[(7, 0.98732203089806936)]
Logarithmic Time Online Multiclass prediction
----------------------------------------
[(0, 0.9678500381385039)]
Planar Ultrametrics for Image Segmentation
---------------------