In [1]:
import pandas as pd
import numpy as np

# Split Data

In [2]:
# Load the corpus table. `DataFrame.from_csv` was deprecated in pandas 0.21 and
# removed in 1.0; `read_csv` with `index_col=0, parse_dates=True` reproduces the
# defaults that `from_csv` applied (first column as index, index dates parsed).
datatidy = pd.read_csv('datatidy2.csv', index_col=0, parse_dates=True)
# Column of document strings that is split in half and vectorized below.
tidytext = datatidy['TidiedText']

In [3]:
def split_document(doc):
    """Split a document into two parts at a space near its midpoint.

    The cut is placed on the last space found in the first ``len(doc) // 2``
    characters, so no word is broken in two. The space itself stays at the
    start of the second part, hence ``first + second == doc`` always holds.

    If the first half contains no space (e.g. ``"hello world"``), the cut moves
    forward to the first space at or after the midpoint. If there is no usable
    space at all (a single token or an empty string), the whole document is
    returned as the first part and the second part is empty.

    Parameters
    ----------
    doc : str
        The document text.

    Returns
    -------
    tuple of (str, str)
        ``(first_part, second_part)``.
    """
    midpoint = len(doc) // 2
    # Same search as ``doc[:midpoint].rfind(' ')`` without building the slice.
    cut = doc.rfind(' ', 0, midpoint)
    if cut == -1:
        # Previously the -1 "not found" sentinel was used directly as a slice
        # index, which chopped off only the last character of the document.
        cut = doc.find(' ', midpoint)
    if cut == -1:
        return doc, ''
    return doc[:cut], doc[cut:]

In [4]:
# Split every document once, then unpack the (first, second) pairs into two
# parallel lists that keep the ordering of `tidytext`.
halves = [split_document(text) for text in tidytext]
tidy1 = [pair[0] for pair in halves]
tidy2 = [pair[1] for pair in halves]

# LDA

## Preliminaries

In [5]:
from time import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

In [6]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object
        Anything exposing ``components_``, iterated row by row; each row must
        support ``argsort()`` (one weight per vocabulary entry).
    feature_names : sequence of str
        Vocabulary, indexed by the positions returned from ``argsort()``.
    n_top_words : int
        Number of words to print per topic, in descending weight order.

    Output is a ``Topic #k:`` header line followed by the space-separated
    words for each topic, then a single blank line after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        best_first = weights.argsort()[::-1][:n_top_words]
        words = [feature_names[pos] for pos in best_first]
        print("Topic #%d:" % topic_idx)
        print(" ".join(words))
    print()

In [49]:
# Vocabulary cap: passed to CountVectorizer as `max_features`.
n_features = 1000
# Number of topics each LatentDirichletAllocation model is fitted with.
n_topics = 50
# How many highest-weighted words print_top_words shows per topic.
n_top_words = 20

In [50]:
# Learn one shared vocabulary from the full documents so that the full-text and
# half-text count matrices built later use identical columns.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(
    max_df=0.95,                # ignore terms present in more than 95% of documents
    min_df=2,                   # ignore terms present in fewer than 2 documents
    max_features=n_features,    # keep at most n_features terms
)
t0 = time()
tf_vectorizer.fit(tidytext)
elapsed = time() - t0
print("done in %0.3fs." % elapsed)

Extracting tf features for LDA...
done in 6.323s.


## Fit All

In [51]:
# Term-count matrix for the full (unsplit) documents, using the vocabulary
# already fitted on `tidytext`.
tfall = tf_vectorizer.transform(tidytext)

In [52]:
# Fit the reference LDA model on the full documents.
print("Fitting LDA models with tf features and n_features=%d..."
      % (n_features))
# The `n_topics` keyword was renamed to `n_components` in scikit-learn 0.19 and
# removed in 0.21; passing `n_topics=` raises a TypeError on current releases.
# random_state is fixed so repeated runs give the same topics.
ldaall = LatentDirichletAllocation(n_components=n_topics, max_iter=10,
                                   learning_method='online', learning_offset=50.,
                                   random_state=0)
t0 = time()
ldaall.fit(tfall)
print("done in %0.3fs." % (time() - t0))


Fitting LDA models with tf features and n_features=1000...
done in 213.533s.


In [53]:
print("\nTopics in LDA model:")
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer the replacement when it exists and fall back on older installations.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(ldaall, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0:
women men ms sexual woman abort sex girl young one equal right partner like said show say year violenc victim
Topic #1:
gun shoot kill violenc safeti weapon law death peopl carri shot ban nation arm public check state mass control use
Topic #2:
worker job wage work labor employ minimum pay employe hour rais state union would increas new higher forc benefit paid
Topic #3:
compani busi use would servic like rule corpor regul internet govern industri inform data could market agenc make technolog execut
Topic #4:
black polic white offic racial race depart forc minor communiti civil right protest justic arrest kill case enforc peopl system
Topic #5:
muslim attack french kill islam franc terrorist peopl terror pari death murder violenc fear bomb victim one mani threat year
Topic #6:
prison crime sentenc crimin state peopl convict year drug law arrest punish releas justic system search charg case victim mani
Topic #7:
like get peopl time dont one make thing go 

In [43]:
# Per-document topic distribution of the full documents under the model fitted
# on those same documents (`ldaall`).
doc_topic_distrib_all = ldaall.transform(tfall)

## Fit Half

In [44]:
# Term-count matrix for the first halves only, built with the vocabulary that
# was fitted on the full documents so columns line up with `tfall`.
tf1 = tf_vectorizer.transform(tidy1)

In [15]:
# Fit a second LDA model on the first halves only, with the same settings as
# the full-document model.
print("Fitting LDA models with tf features and n_features=%d..."
      % (n_features))
# The `n_topics` keyword was renamed to `n_components` in scikit-learn 0.19 and
# removed in 0.21; passing `n_topics=` raises a TypeError on current releases.
lda1 = LatentDirichletAllocation(n_components=n_topics, max_iter=10,
                                 learning_method='online', learning_offset=50.,
                                 random_state=0)
t0 = time()
lda1.fit(tf1)
print("done in %0.3fs." % (time() - t0))


Fitting LDA models with tf features and n_features=1000...
done in 114.110s.


In [16]:
print("\nTopics in LDA model:")
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2.
# Prefer the replacement when it exists and fall back on older installations.
if hasattr(tf_vectorizer, "get_feature_names_out"):
    tf_feature_names = tf_vectorizer.get_feature_names_out()
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda1, tf_feature_names, n_top_words)


Topics in LDA model:
Topic #0:
women men sexual woman sex young girl one ms equal boy said show say man black bodi partner forc also
Topic #1:
loss peopl made ive three said lose last lost year long oil chang come parent deal thing ensur key money
Topic #2:
new citi york state de mayor mr public build park local hous bill resid year plan neighborhood governor system communiti
Topic #3:
would citizen like campaign unit decis trump money group limit north father court polit back candid could state way us
Topic #4:
vote peopl would sander attack feder support realli bring polic court report offic dont new clinton matter threat present presid
Topic #5:
one year day time peopl like would live first world two even mani place could us life ago long work
Topic #6:
prison crime sentenc crimin convict year death law punish justic releas peopl drug system execut case charg feder state correct
Topic #7:
gun clinton money campaign hillari polit spend speech group sander corpor contribut shoot mill

## Predict Half

In [45]:
# Topic distribution of the first halves as inferred by the FULL-document model
# (`ldaall`), not by `lda1`, which is not used in this section.
doc_topic_distrib_pred_1 = ldaall.transform(tf1)

In [46]:
# Entry [i, j] is the dot product of full document i's topic distribution with
# first-half document j's topic distribution.
confusion = doc_topic_distrib_all.dot(doc_topic_distrib_pred_1.T)

In [47]:
# Sanity check: rows index the full documents, columns the first halves, so the
# matrix should be square with one row and one column per document.
confusion.shape

(9447, 9447)

In [48]:
# Write through a context manager so the file handle is flushed and closed; the
# bare `open(...)` previously passed to np.save was never closed explicitly.
# Passing a file object (not a path) keeps the exact file name, with no ".npy"
# suffix appended.
# NOTE(review): the name says "150" while n_topics is 50 in the config cell --
# confirm which run this file is meant to capture.
with open('confusion150.out', 'wb') as out_file:
    np.save(out_file, confusion)

In [18]:
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib qt

In [34]:
# Show only the top-left 500x500 corner of the matrix (the full matrix is
# 9447x9447 according to the shape output).
ax = sns.heatmap(confusion[:500, :500])
ax.set_xlabel("document index (first half)")
ax.set_ylabel("document index (full text)")
ax.set_title("Topic-distribution similarity: full documents vs. first halves")
# `sns.plt` was an accidental re-export that newer seaborn releases removed;
# use the pyplot module that this notebook already imports as `plt`.
plt.show()