In [46]:
import numpy as np
import pandas as pd
import text_processing
import sys
import unicodedata
import matplotlib as mpl
import matplotlib.pyplot as plt
import topic_weights
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickleizer
import os

In [4]:
%load_ext autoreload
%autoreload 2

# PreProcessing

## Load

In [45]:
# Sub-directory that `dataall.csv` is read from; it is also passed to
# topic_weights.save_topic_weights further down.
subdir = 'final_csvs2'

In [47]:
# Load the full dataset; the first CSV column becomes the index.
# `DataFrame.from_csv` is deprecated (and removed in newer pandas), so use
# `read_csv` with from_csv's defaults spelled out (index_col=0, parse_dates=True).
dataall = pd.read_csv(os.path.join(subdir, 'dataall.csv'),
                      index_col=0, parse_dates=True, encoding='utf-8')

## Tidy

In [48]:
# Word-normalisation option forwarded to text_processing.tidy_text
# (reused later when the topic weights are saved).
wordoption = 'stem'
text = dataall['full_text']
# tidy_text appears to return a sequence of tokens per document (the results
# are joined with spaces below) -- TODO confirm against text_processing.
tidiedtext = [text_processing.tidy_text(item, wordoption=wordoption) for item in text]
# Use a distinct loop name: reusing `text` here shadows the column selected
# above and, under Python 2 (comprehension variables leak), overwrites it.
tidiedtextjoined = [' '.join(tokens) for tokens in tidiedtext]

## Save

In [49]:
# One-column frame holding the space-joined tidied documents, aligned on the
# index of `dataall`, then written to 'datatidy.csv' in the working directory.
datatidy = pd.DataFrame({'TidiedText': tidiedtextjoined}, index=dataall.index)
datatidy.to_csv('datatidy.csv', encoding='utf-8')

# NLTK/Sk-Learn

In [50]:
# Settings shared by the modelling cells below.
n_topics = 50        # number of topics requested from the LDA model
n_features = 1000    # passed to CountVectorizer as max_features
n_top_words = 20     # words printed per topic by print_top_words

In [51]:
# Use tf (raw term count) features for LDA.
# The tidied documents are re-read from `datatidy`, so this cell does not
# depend on the in-memory list built in the Tidy section.
tidiedtextjoined = datatidy['TidiedText']
vectorizer_options = dict(max_df=0.95, min_df=2, max_features=n_features)
tf_vectorizer = CountVectorizer(**vectorizer_options)
tf = tf_vectorizer.fit_transform(tidiedtextjoined)

In [52]:
# Fit an LDA model with n_topics topics on the term-count matrix `tf`.
# scikit-learn renamed the `n_topics` argument to `n_components` (the old name
# was later removed), so try the new name first and fall back for old releases
# whose constructor only knows `n_topics`.
try:
    lda = LatentDirichletAllocation(n_components=n_topics, max_iter=10,
                                    random_state=0)
except TypeError:
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
                                    random_state=0)
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_jobs=1, n_topics=50, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [53]:
# Vocabulary terms of the fitted vectorizer, used to label the topic words.
# Newer scikit-learn replaces get_feature_names() with get_feature_names_out();
# prefer the new method when it exists and keep a plain list either way.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
topic_weights.print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
women men sexual woman sex abort girl marriag young one equal man partner right like year show percent also bodi
Topic #1:
get say like one peopl dont go said want thing know think would make way even us that good look
Topic #2:
one year day time two new last car week first month could still back three show even turn end anoth
Topic #3:
one live like peopl life world year us day time work love friend would see never place mani first feel
Topic #4:
peopl like studi one experi research differ work may person less use make found way might often brain effect find
Topic #5:
court law rule state judg case feder legal suprem right lawyer appeal decis constitut would requir violat could district hear
Topic #6:
obama presid american administr america state unit washington polici year said congress hous white nation bush countri foreign last john
Topic #7:
countri world unit global trade oil intern state nation agreement develop would econom foreign like africa economi deal could impor

In [54]:
# Topic weights of the fitted LDA model for the documents in `tf`
# (the rows are rescaled to sum to 1 in the next cell).
doc_topic_distrib = lda.transform(tf)

In [55]:
# Rescale every row in place so that it sums to 1.
doc_topic_distrib /= doc_topic_distrib.sum(axis=1, keepdims=True)

In [56]:
# Grey-scale image of the normalised document-topic matrix.
fig = plt.figure(1)
ax = fig.gca()
# Keep `ax` bound to the Axes (matshow returns the image artist, not an Axes)
# and pass the aspect ratio as a number rather than as a numeric string.
image = ax.matshow(doc_topic_distrib, cmap=plt.cm.gray, aspect=0.005)
plt.show()

## Save

### Topic Weights

In [59]:
# Hand the normalised document-topic weights to topic_weights for saving,
# tagged with the word option and the package that produced them.
topic_weights.save_topic_weights(
    datatidy,
    doc_topic_distrib,
    stem=wordoption,
    package='sklearn',
    rows='all',
    subdir=subdir,
)

### LDA

In [60]:
# Pass the fitted LDA model together with its vectorizer to pickleizer for
# saving (where and how it is stored is decided inside pickleizer -- not
# visible from this notebook).
pickleizer.save_topic_analyzer(lda,tf_vectorizer)

# GenSim

In [21]:
from gensim import corpora, models, similarities

## Dictionary

In [22]:
# NOTE(review): `texts` is not defined anywhere in the notebook as shown; it
# is presumably a list of token lists, one per document -- confirm and define
# it explicitly before this cell.
dictionary = corpora.Dictionary(texts)
# `.items()` works on both Python 2 and Python 3 (`.iteritems()` is Python 2 only).
bad_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq <= 10]
dictionary.filter_tokens(bad_ids)  # drop tokens that occur in 10 or fewer documents
dictionary.compactify()  # remove gaps in the id sequence left by the dropped tokens
print(dictionary)

Dictionary(13603 unique tokens: [u'yellow', u'narcotic', u'four', u'jihad', u'hanging']...)


## Corpus

In [23]:
# Bag-of-words representation of every document in `texts`.
# The loop name is `doc` rather than `text`, so that (under Python 2, where
# comprehension variables leak) the `text` column selected earlier is not overwritten.
corpus = [dictionary.doc2bow(doc) for doc in texts]

## LDA Model

In [24]:
# Train gensim's LDA on the bag-of-words corpus.
# Reuse n_topics instead of a second hard-coded 50, and fix the seed so the
# fitted topics are reproducible (mirrors random_state=0 on the scikit-learn model).
ldacount = models.ldamodel.LdaModel(corpus, id2word=dictionary,
                                    num_topics=n_topics, random_state=0)

In [38]:
# Run the model's inference over the whole corpus; the first element of the
# result is taken as the document-topic weights in the next cell.
topicweights = ldacount.inference(corpus)

In [40]:
# Document-topic weights from the gensim model (rescaled row-wise in the next
# cell). No copy is made here: the name is bound to the object held in
# topicweights[0], so the in-place rescaling below affects that object too.
doc_topic_distrib = topicweights[0]

In [85]:
# Rescale every row in place so that it sums to 1.
doc_topic_distrib /= doc_topic_distrib.sum(axis=1, keepdims=True)

In [86]:
# Grey-scale image of the normalised gensim document-topic matrix.
fig = plt.figure(1)
ax = fig.gca()
# Keep `ax` bound to the Axes (matshow returns the image artist, not an Axes)
# and pass the aspect ratio as a number rather than as a numeric string.
image = ax.matshow(doc_topic_distrib, cmap=plt.cm.gray, aspect=0.05)
plt.show()

In [87]:
# Hand the gensim document-topic weights to topic_weights for saving.
# NOTE(review): `datareg` is not defined in the notebook as shown, and
# stem='lemma' is hard-coded although `wordoption` above is 'stem' -- confirm
# both before re-running. Unlike the sklearn save, no `subdir` is passed here.
topic_weights.save_topic_weights(datareg,doc_topic_distrib,stem='lemma',package='gensim',rows='reg')

# Doc2Vec

In [79]:
class LabeledLineSentence(object):
    """Iterable that pairs each document with its label.

    Iterating yields one ``models.doc2vec.TaggedDocument`` per
    (document, label) pair, with the label as the document's only tag.
    """

    def __init__(self, doc_list, labels_list):
        """Keep references to the documents and their labels.

        The two sequences are paired positionally with ``zip`` on iteration.
        """
        self.doc_list = doc_list
        self.labels_list = labels_list

    def __iter__(self):
        """Yield a TaggedDocument for every (document, label) pair."""
        for tokens, tag in zip(self.doc_list, self.labels_list):
            yield models.doc2vec.TaggedDocument(words=tokens, tags=[tag])

In [80]:
# One string label per document: its position in `texts` ('0', '1', ...).
# range(len(...)) avoids binding `_`, which IPython uses for the last cell
# output (and which a Python 2 comprehension would overwrite).
labels = [str(i) for i in range(len(texts))]
sentences = LabeledLineSentence(texts, labels)

In [81]:
# Doc2Vec model configuration; the vocabulary is built and the model trained
# in the following cells. size=100 is the length of each document vector
# (the result matrix built further down has 100 columns).
model = models.Doc2Vec(
    dm=1,
    dm_concat=1,
    size=100,
    window=5,
    negative=5,
    hs=0,
    min_count=5,
)

In [82]:
# Build the model's vocabulary from the tagged documents before training.
model.build_vocab(sentences)

In [83]:
# Train on the tagged documents; the cell output is train()'s return value.
# NOTE(review): newer gensim releases expect total_examples/epochs to be
# passed explicitly -- confirm against the installed version.
model.train(sentences)

11051150

In [87]:
# Infer one vector per document into an (n_documents, vector_size) array.
# The width comes from the model instead of repeating the literal 100 used at
# construction, so the two cannot drift apart. The loop name is `doc` rather
# than `text`, to avoid rebinding the `text` column selected earlier.
doc2vecres = np.zeros((len(texts), model.vector_size))
for i, doc in enumerate(texts):
    doc2vecres[i, :] = model.infer_vector(doc)

In [95]:
# Spot-check a single entry (row 0, column 10) of the inferred-vector matrix.
doc2vecres[0,10]

-0.09356486052274704

In [93]:
# Hand the Doc2Vec document vectors to topic_weights for saving.
# NOTE(review): `idxauthor` is not defined in the notebook as shown; it is
# used here to index `dataall` -- confirm where it comes from and that the
# selection lines up with the rows of `doc2vecres` (one row per entry of `texts`).
topic_weights.save_topic_weights(dataall[idxauthor],doc2vecres,stem=None,package='doc2vec',rows='reg')

In [85]:
# Words most similar to the three query words taken together, with scores.
# NOTE(review): newer gensim exposes this as model.wv.most_similar -- confirm
# against the installed version.
model.most_similar(['obama','sander','clinton'])

[(u'hasinas', 0.7176132202148438),
 (u'fiorina', 0.6785053014755249),
 (u'fiorinas', 0.6547480821609497),
 (u'obamas', 0.6400838494300842),
 (u'cuomo', 0.6099336743354797),
 (u'kirchner', 0.6086978912353516),
 (u'johnson', 0.601860523223877),
 (u'mccullen', 0.6011120080947876),
 (u'pathetically', 0.6010297536849976),
 (u'reagan', 0.593456506729126)]

In [134]:
# Display the inferred vector for the first document in `texts`.
model.infer_vector(texts[0])

array([-0.10193164, -0.04376977, -0.13766503,  0.0823018 , -0.02074894,
        0.20367937,  0.21111564, -0.05945964,  0.37171602,  0.00507934,
       -0.09356486,  0.10768178, -0.07886085, -0.00511813, -0.00253258,
        0.04099534,  0.08943591,  0.20622362, -0.02915514, -0.26442051,
        0.12644652,  0.04535156,  0.27022526, -0.12068559,  0.03253226,
        0.04558017,  0.18684271,  0.19145858,  0.13811885,  0.23620991,
        0.19093932, -0.10907025, -0.0671132 ,  0.09404733,  0.01839629,
        0.05211755, -0.12075779,  0.19123247,  0.11745412, -0.2026743 ,
       -0.02890574,  0.015503  , -0.1583606 ,  0.00218541,  0.00916504,
       -0.07925086, -0.28470385, -0.0842441 ,  0.28052595, -0.25641757,
       -0.08566134,  0.08883084, -0.2375012 ,  0.12778269, -0.28805211,
       -0.08686749,  0.11690425, -0.13369317,  0.2372302 ,  0.01629356,
        0.14770885,  0.07089438, -0.2603536 ,  0.16426481, -0.02648615,
        0.10391705,  0.0802548 , -0.21442699,  0.20937781, -0.20

# T-SNE

In [44]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
%matplotlib qt

In [64]:
# Side-by-side scatter plots of the first two columns of X_tsne (left) and
# X_pca (right), coloured by iris.target.
# NOTE(review): `X_pca` and `iris` are never defined in the notebook as shown,
# and `X_tsne` is only assigned in a later cell -- this looks like a leftover
# from an iris demo and will fail on a fresh top-to-bottom run; confirm/remove.
plt.figure(figsize=(10, 5))
plt.subplot(121)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=iris.target)
plt.subplot(122)
plt.scatter(X_pca[:, 0], X_pca[:, 1], c=iris.target)
plt.show()

In [89]:
# t-SNE embedding of the Doc2Vec vectors selected by `idxauthor2`.
# `.values` replaces `.as_matrix()`, which is deprecated and removed in newer
# pandas; random_state pins the otherwise random embedding so re-runs agree.
# NOTE(review): `idxauthor2` is not defined in the notebook as shown -- confirm.
X_tsne = TSNE(learning_rate=100, verbose=1, random_state=0).fit_transform(
    doc2vecres[idxauthor2.values, :])

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 3010
[t-SNE] Computed conditional probabilities for sample 2000 / 3010
[t-SNE] Computed conditional probabilities for sample 3000 / 3010
[t-SNE] Computed conditional probabilities for sample 3010 / 3010
[t-SNE] Mean sigma: 0.354876
[t-SNE] Error after 100 iterations with early exaggeration: 2.142301
[t-SNE] Error after 350 iterations: 1.969357


In [90]:
# Author ids for the selected documents.
# NOTE(review): `dataauthor`, `idxauthor` and `idxauthor2` are not defined in
# the notebook as shown. The two selections are applied one after the other
# (chained indexing) -- confirm that `idxauthor2` is aligned with the result
# of the first selection.
authorid = dataauthor['authorid'][idxauthor][idxauthor2]

In [91]:
# Scatter plot of the t-SNE embedding, coloured by author id.
# NOTE(review): `authorid` was already selected with idxauthor (and idxauthor2)
# in the previous cell; indexing by idxauthor again here looks unintended --
# the colour array needs exactly one entry per row of X_tsne. Confirm.
plt.figure(figsize=(10, 5))
plt.subplot(121)
# `.values` replaces `.as_matrix()`, which is deprecated and removed in newer pandas.
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=authorid[idxauthor].values)
plt.show()