In [1]:
import pandas as pd
import re
import numpy as np
from time import time
import string
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.cross_validation import cross_val_score
#from nltk import FreqDist
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
import gensim
from gensim import corpora
from gensim import models
from gensim.corpora.dictionary import Dictionary
import logging
logging.basicConfig(format='%(levelname)s : %(message)s', level=logging.INFO)

[nltk_data] Downloading package stopwords to
[nltk_data]     D:\userdata\lianos\Application Data\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!




In [2]:
en_stop = set(stopwords.words("english"))
digits = ['one','two','three','four','five','six','seven','eight','nine','ten','computer', 'will', 'develop', 'development',
                            'project', 'research', 'new', 'use', 
                            'europe', 'european','would','one','i']
for x in digits:
    en_stop.add(str(x))

In [3]:
list=["biology","cooking","crypto","diy","robotics","travel"]

Read each csv file individually and return dataframes cleaned up (no NA values, no symbols such as new paragraph or new line) 
Test dataframe is the Psysics subject

In [4]:
def get_df(x):
    path = 'stackexchange/{}.csv'.format(str(x))
    dataframe = pd.read_csv(path,usecols=['title','content','tags'],index_col=0)
    dataframe = dataframe.reset_index()
    dataframe.dropna()
    dataframe = dataframe.replace(re.compile('<.*?>'), ' ')
    dataframe = dataframe.replace(re.compile('\n'), ' ')
    return dataframe

biology = get_df('biology')
cooking = get_df('cooking')
crypto = get_df('crypto')
diy = get_df('diy')
robotics = get_df('robotics')
travel = get_df('travel')
test = get_df('test')   #physics set

Make a new dataframe called training_set consisting of the training set data. (all datasets concatenated)

In [5]:
training_list=[]
for x in list:
    path = 'stackexchange/{}.csv'.format(str(x))
    training_list.append(pd.read_csv(path,usecols=['title','content','tags'],index_col=0))
complete_set = pd.concat(training_list, axis=0 , join='outer')
complete_set.dropna()
complete_set= complete_set.reset_index()
complete_set.head(10)
complete_set = complete_set.replace(re.compile('<.*?>'), ' ')
complete_set = complete_set.replace(re.compile('\n'), ' ')

In [6]:
complete_set.shape

(87000, 3)

In [7]:
print(complete_set['tags'].values)

['ribosome binding-sites translation synthetic-biology' 'rna biochemistry'
 'immunology cell-biology hematology' ...,
 'customs-and-immigration officials registration macedonia' 'visas austria'
 'untagged']


Combine all columns of complete_set to a 3-tuple 

In [8]:
subset = complete_set[['title','content','tags']]
tuples = [tuple(x) for x in subset.values]
print(len(tuples))

87000


In [9]:
test.shape

(81926, 3)

In [10]:
test.head(20)

Unnamed: 0,id,title,content
0,What is spin as it relates to subatomic partic...,I often hear about subatomic particles having...,
1,What is your simplest explanation of the strin...,How would you explain string theory to non ph...,
2,"Lie theory, Representations and particle physics",This is a question that has been posted at ma...,
3,Will Determinism be ever possible?,What are the main problems that we need to so...,
4,Hamilton's Principle,Hamilton's principle states that a dynamic sy...,
5,What is sound and how is it produced?,"I've been using the term ""sound"" all my life,...",
6,What experiment would disprove string theory?,I know that there's big controversy between t...,
7,Why does the sky change color? Why the sky is ...,Why does the sky change color? Why the sky is...,
8,How's the energy of particle collisions calcul...,Physicists often refer to the energy of colli...,
9,Monte Carlo use,Where is the Monte Carlo method used in physi...,


In [11]:
print(' Cooking: ')
print(cooking.shape)
print(' Biology: ')
print(biology.shape)
print(' DIY: ')
print(diy.shape)
print(' Robotics: ')
print(robotics.shape)
print(' Travel: ')
print(travel.shape)
print(' Crypto: ')
print(crypto.shape)


 Cooking: 
(15404, 3)
 Biology: 
(13196, 3)
 DIY: 
(25918, 3)
 Robotics: 
(2771, 3)
 Travel: 
(19279, 3)
 Crypto: 
(10432, 3)


Get the titles from the complete set

In [12]:
titles_only=complete_set['title']
titles_only.shape

(87000,)

Let's get back to the test dataframe (consisting of physics-related stackexchange threads) and see if we can extract some common tags 

In [13]:
test.head(10)


Unnamed: 0,id,title,content
0,What is spin as it relates to subatomic partic...,I often hear about subatomic particles having...,
1,What is your simplest explanation of the strin...,How would you explain string theory to non ph...,
2,"Lie theory, Representations and particle physics",This is a question that has been posted at ma...,
3,Will Determinism be ever possible?,What are the main problems that we need to so...,
4,Hamilton's Principle,Hamilton's principle states that a dynamic sy...,
5,What is sound and how is it produced?,"I've been using the term ""sound"" all my life,...",
6,What experiment would disprove string theory?,I know that there's big controversy between t...,
7,Why does the sky change color? Why the sky is ...,Why does the sky change color? Why the sky is...,
8,How's the energy of particle collisions calcul...,Physicists often refer to the energy of colli...,
9,Monte Carlo use,Where is the Monte Carlo method used in physi...,


In [14]:
test['title'].shape

(81926,)

In [15]:
test_objectives= test['title'].str.lower().str.replace('|'.join([re.escape(x) for x in string.punctuation]), ' ')
test_objectives_split=test_objectives.str.split()
test_objectives_split.head(20)

0     [i, often, hear, about, subatomic, particles, ...
1     [how, would, you, explain, string, theory, to,...
2     [this, is, a, question, that, has, been, poste...
3     [what, are, the, main, problems, that, we, nee...
4     [hamilton, s, principle, states, that, a, dyna...
5     [i, ve, been, using, the, term, sound, all, my...
6     [i, know, that, there, s, big, controversy, be...
7     [why, does, the, sky, change, color, why, the,...
8     [physicists, often, refer, to, the, energy, of...
9     [where, is, the, monte, carlo, method, used, i...
10    [i, think, it, s, clear, enough, that, if, you...
11    [i, am, wondering, if, someone, could, provide...
12    [we, ve, learned, that, the, wave, function, o...
13    [i, recently, encountered, a, puzzle, where, a...
14    [what, is, einstein, s, theory, of, special, r...
15    [there, is, a, common, myth, that, water, flow...
16    [if, i, separate, two, magnets, whose, opposit...
17    [physicists, studying, the, grounds, of, p

In [16]:
#stopwords_current = set(en_stop) | additional_stopwords
test_objectives_split= test_objectives_split.apply(lambda tokens: [token for token in tokens if token not in en_stop])

Now we construct a dictionary from the above using gensim lib

In [17]:
objectives_dictionary = Dictionary(test_objectives_split)

INFO : adding document #0 to Dictionary(0 unique tokens: [])
INFO : adding document #10000 to Dictionary(27610 unique tokens: ['practicable', 'dover', 'planet', 'outdoor', 'krauss']...)
INFO : adding document #20000 to Dictionary(37860 unique tokens: ['hookean', 'practicable', 'dover', 'planet', 'outdoor']...)
INFO : adding document #30000 to Dictionary(46292 unique tokens: ['practicable', 'disposing', 'finnish', '1101001', 'montonen']...)
INFO : adding document #40000 to Dictionary(53386 unique tokens: ['d’s', 'practicable', 'disposing', 'finnish', 'centrialfugal']...)
INFO : adding document #50000 to Dictionary(59437 unique tokens: ['d’s', 'practicable', 'parahydrogen', 'disposing', 'finnish']...)
INFO : adding document #60000 to Dictionary(65336 unique tokens: ['d’s', 'practicable', 'parahydrogen', 'disposing', 'finnish']...)
INFO : adding document #70000 to Dictionary(71351 unique tokens: ['d’s', 'practicable', 'parahydrogen', 'disposing', 'untapped']...)
INFO : adding document #80

Now the corpus

In [18]:
class ObjectivesCorpus(object):
    def __init__(self, documents, dictionary):
        self.documents = documents
        self.dictionary = dictionary
    def __iter__(self):
        for document in self.documents:
            yield self.dictionary.doc2bow(document)
            
objective_corpus = ObjectivesCorpus(test_objectives_split,objectives_dictionary)

Run the LDA using gensim lib

In [19]:
t0 = time()
lda = gensim.models.ldamulticore.LdaMulticore(corpus=objective_corpus, 
                                              id2word=objectives_dictionary, 
                                              num_topics=10, 
                                              iterations=50,
                                              passes=5)
print("done in %0.3fs." % (time() - t0))

INFO : using symmetric alpha at 0.1
INFO : using symmetric eta at 1.2819198030971182e-05
INFO : using serial LDA version on this node
INFO : running online LDA training, 10 topics, 5 passes over the supplied corpus of 81926 documents, updating every 6000 documents, evaluating every ~60000 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 3 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #2000/81926, outstanding queue size 1
INFO : PROGRESS: pass 0, dispatched chunk #1 = documents up to #4000/81926, outstanding queue size 2
INFO : PROGRESS: pass 0, dispatched chunk #2 = documents up to #6000/81926, outstanding queue size 3
INFO : PROGRESS: pass 0, dispatched chunk #3 = documents up to #8000/81926, outstanding queue size 4
INFO : PROGRESS: pass 0, dispatched chunk #4 = documents up to #10000/81926, outstanding queue size 5
INFO : PROGRESS: pass 0, dispatched chunk #5 = documents up to #12000/81926, outstanding queue

done in 361.287s.


In [20]:
lda.print_topics(10)

INFO : topic #0 (0.100): 0.012*"quantum" + 0.009*"physics" + 0.008*"question" + 0.007*"theory" + 0.007*"time" + 0.007*"like" + 0.006*"know" + 0.006*"mechanics" + 0.006*"system" + 0.005*"understand"
INFO : topic #1 (0.100): 0.042*"field" + 0.017*"magnetic" + 0.017*"charge" + 0.014*"electric" + 0.008*"point" + 0.008*"force" + 0.008*"current" + 0.008*"space" + 0.007*"fields" + 0.006*"potential"
INFO : topic #2 (0.100): 0.049*"mu" + 0.044*"partial" + 0.022*"nu" + 0.022*"phi" + 0.021*"gamma" + 0.019*"x" + 0.019*"alpha" + 0.018*"g" + 0.017*"frac" + 0.014*"lambda"
INFO : topic #3 (0.100): 0.022*"water" + 0.021*"temperature" + 0.018*"pressure" + 0.014*"heat" + 0.012*"air" + 0.010*"gas" + 0.009*"energy" + 0.007*"entropy" + 0.007*"system" + 0.006*"flow"
INFO : topic #4 (0.100): 0.028*"energy" + 0.020*"electron" + 0.015*"electrons" + 0.009*"e" + 0.009*"state" + 0.008*"current" + 0.007*"atom" + 0.007*"voltage" + 0.007*"spin" + 0.006*"potential"
INFO : topic #5 (0.100): 0.021*"force" + 0.014*"veloc

[(0,
  '0.012*"quantum" + 0.009*"physics" + 0.008*"question" + 0.007*"theory" + 0.007*"time" + 0.007*"like" + 0.006*"know" + 0.006*"mechanics" + 0.006*"system" + 0.005*"understand"'),
 (1,
  '0.042*"field" + 0.017*"magnetic" + 0.017*"charge" + 0.014*"electric" + 0.008*"point" + 0.008*"force" + 0.008*"current" + 0.008*"space" + 0.007*"fields" + 0.006*"potential"'),
 (2,
  '0.049*"mu" + 0.044*"partial" + 0.022*"nu" + 0.022*"phi" + 0.021*"gamma" + 0.019*"x" + 0.019*"alpha" + 0.018*"g" + 0.017*"frac" + 0.014*"lambda"'),
 (3,
  '0.022*"water" + 0.021*"temperature" + 0.018*"pressure" + 0.014*"heat" + 0.012*"air" + 0.010*"gas" + 0.009*"energy" + 0.007*"entropy" + 0.007*"system" + 0.006*"flow"'),
 (4,
  '0.028*"energy" + 0.020*"electron" + 0.015*"electrons" + 0.009*"e" + 0.009*"state" + 0.008*"current" + 0.007*"atom" + 0.007*"voltage" + 0.007*"spin" + 0.006*"potential"'),
 (5,
  '0.021*"force" + 0.014*"velocity" + 0.012*"mass" + 0.007*"acceleration" + 0.007*"object" + 0.007*"0" + 0.006*"f" + 0

LDA with TF-IDF

In [21]:
tf_idf = models.TfidfModel(objective_corpus)
tf_idf_corpus = tf_idf[objective_corpus]


INFO : collecting document frequencies
INFO : PROGRESS: processing document #0
INFO : PROGRESS: processing document #10000
INFO : PROGRESS: processing document #20000
INFO : PROGRESS: processing document #30000
INFO : PROGRESS: processing document #40000
INFO : PROGRESS: processing document #50000
INFO : PROGRESS: processing document #60000
INFO : PROGRESS: processing document #70000
INFO : PROGRESS: processing document #80000
INFO : calculating IDF weights for 81926 documents and 78007 features (3787773 matrix non-zeros)


In [22]:
t0 = time()
lda_tf_idf = gensim.models.ldamulticore.LdaMulticore(corpus=objective_corpus, 
                                                     id2word=objectives_dictionary, 
                                                     num_topics=10, 
                                                     iterations=50,
                                                     passes=5)
print("done in %0.3fs." % (time() - t0))

INFO : using symmetric alpha at 0.1
INFO : using symmetric eta at 1.2819198030971182e-05
INFO : using serial LDA version on this node
INFO : running online LDA training, 10 topics, 5 passes over the supplied corpus of 81926 documents, updating every 6000 documents, evaluating every ~60000 documents, iterating 50x with a convergence threshold of 0.001000
INFO : training LDA model using 3 processes
INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #2000/81926, outstanding queue size 1
INFO : PROGRESS: pass 0, dispatched chunk #1 = documents up to #4000/81926, outstanding queue size 2
INFO : PROGRESS: pass 0, dispatched chunk #2 = documents up to #6000/81926, outstanding queue size 3
INFO : PROGRESS: pass 0, dispatched chunk #3 = documents up to #8000/81926, outstanding queue size 4
INFO : PROGRESS: pass 0, dispatched chunk #4 = documents up to #10000/81926, outstanding queue size 5
INFO : PROGRESS: pass 0, dispatched chunk #5 = documents up to #12000/81926, outstanding queue

done in 396.836s.


In [23]:
lda_tf_idf.print_topics(20)

INFO : topic #0 (0.100): 0.020*"light" + 0.015*"time" + 0.015*"energy" + 0.009*"speed" + 0.008*"wave" + 0.008*"question" + 0.007*"space" + 0.006*"photon" + 0.006*"waves" + 0.006*"like"
INFO : topic #1 (0.100): 0.021*"temperature" + 0.015*"heat" + 0.012*"pressure" + 0.011*"energy" + 0.011*"gas" + 0.009*"water" + 0.006*"volume" + 0.005*"liquid" + 0.005*"system" + 0.004*"change"
INFO : topic #2 (0.100): 0.059*"2" + 0.052*"1" + 0.028*"mu" + 0.026*"frac" + 0.023*"n" + 0.020*"0" + 0.016*"e" + 0.014*"k" + 0.014*"alpha" + 0.013*"j"
INFO : topic #3 (0.100): 0.026*"2" + 0.022*"force" + 0.017*"1" + 0.016*"mass" + 0.015*"velocity" + 0.013*"v" + 0.011*"0" + 0.011*"f" + 0.008*"acceleration" + 0.006*"10"
INFO : topic #4 (0.100): 0.026*"black" + 0.021*"hole" + 0.013*"universe" + 0.010*"mass" + 0.010*"space" + 0.010*"time" + 0.009*"star" + 0.008*"spacetime" + 0.008*"horizon" + 0.008*"holes"
INFO : topic #5 (0.100): 0.028*"field" + 0.018*"charge" + 0.017*"magnetic" + 0.014*"electric" + 0.011*"current" +

[(0,
  '0.020*"light" + 0.015*"time" + 0.015*"energy" + 0.009*"speed" + 0.008*"wave" + 0.008*"question" + 0.007*"space" + 0.006*"photon" + 0.006*"waves" + 0.006*"like"'),
 (1,
  '0.021*"temperature" + 0.015*"heat" + 0.012*"pressure" + 0.011*"energy" + 0.011*"gas" + 0.009*"water" + 0.006*"volume" + 0.005*"liquid" + 0.005*"system" + 0.004*"change"'),
 (2,
  '0.059*"2" + 0.052*"1" + 0.028*"mu" + 0.026*"frac" + 0.023*"n" + 0.020*"0" + 0.016*"e" + 0.014*"k" + 0.014*"alpha" + 0.013*"j"'),
 (3,
  '0.026*"2" + 0.022*"force" + 0.017*"1" + 0.016*"mass" + 0.015*"velocity" + 0.013*"v" + 0.011*"0" + 0.011*"f" + 0.008*"acceleration" + 0.006*"10"'),
 (4,
  '0.026*"black" + 0.021*"hole" + 0.013*"universe" + 0.010*"mass" + 0.010*"space" + 0.010*"time" + 0.009*"star" + 0.008*"spacetime" + 0.008*"horizon" + 0.008*"holes"'),
 (5,
  '0.028*"field" + 0.018*"charge" + 0.017*"magnetic" + 0.014*"electric" + 0.011*"current" + 0.008*"electrons" + 0.007*"surface" + 0.006*"wire" + 0.006*"charges" + 0.006*"directio

Now time for some LDA using scikit learn

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
n_topics = 10


In [25]:
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2,
                                stop_words='english')
t0 = time()
tf = tf_vectorizer.fit_transform(test['title'])
print("done in %0.3fs." % (time() - t0))

done in 7.700s.


In [26]:
tf.shape

(81926, 37390)

In [27]:
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online',
                                random_state=0,
                                n_jobs=-1)

t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

done in 312.321s.


In [28]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        topic_sum = topic.sum()
        print("Topic #%d:" % topic_idx)
        print(" + ".join(["%0.3f*'%s'" % (topic[i] / topic_sum, feature_names[i])
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()
    


In [29]:
print("Topics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words=10)

Topics in LDA model:
Topic #0:
0.047*'frac' + 0.027*'mu' + 0.023*'partial' + 0.023*'right' + 0.023*'vec' + 0.023*'left' + 0.021*'amp' + 0.021*'rangle' + 0.019*'phi' + 0.018*'mathbf'
Topic #1:
0.034*'frac' + 0.015*'equation' + 0.014*'theta' + 0.013*'mathrm' + 0.013*'velocity' + 0.011*'10' + 0.010*'constant' + 0.010*'problem' + 0.009*'text' + 0.009*'frame'
Topic #2:
0.041*'force' + 0.030*'mass' + 0.026*'energy' + 0.018*'object' + 0.016*'earth' + 0.014*'black' + 0.014*'gravity' + 0.013*'speed' + 0.012*'gravitational' + 0.012*'hole'
Topic #3:
0.026*'water' + 0.025*'temperature' + 0.022*'pressure' + 0.022*'energy' + 0.018*'air' + 0.017*'heat' + 0.013*'gas' + 0.011*'flow' + 0.008*'fluid' + 0.008*'liquid'
Topic #4:
0.035*'space' + 0.027*'time' + 0.011*'theory' + 0.009*'relativity' + 0.009*'general' + 0.008*'number' + 0.008*'equations' + 0.008*'point' + 0.007*'information' + 0.007*'spacetime'
Topic #5:
0.016*'question' + 0.013*'like' + 0.013*'know' + 0.011*'just' + 0.011*'physics' + 0.008*'und