In [2]:
import numpy as np
import pandas as pd
import text_processing
import sys
import unicodedata
import matplotlib as mpl
import matplotlib.pyplot as plt
import topic_weights
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pickleizer
import os

In [3]:
%load_ext autoreload
%autoreload 2

# PreProcessing

## Load

In [88]:
# Directory (relative to the working directory) that holds the CSV files
# read and written by the cells below.
subdir = 'final_csvs2'

In [89]:
# Load the full data set; the first CSV column becomes the index.
# pd.read_csv replaces the deprecated DataFrame.from_csv; index_col=0 and
# parse_dates=True reproduce from_csv's defaults.
dataall = pd.read_csv(os.path.join(subdir, 'dataall.csv'),
                      index_col=0, parse_dates=True, encoding='utf-8')

## Tidy

In [86]:
# Word-normalisation option forwarded to text_processing.tidy_text_noun_adj.
wordoption = 'lemma'

In [92]:
# Tidy every document's raw text with text_processing.tidy_text_noun_adj,
# then join each document's tokens into one space-separated string.
text = dataall['full_text']
tidiedtext = []
for i, item in enumerate(text):
    tidiedtext.append(
        text_processing.tidy_text_noun_adj(item, wordoption=wordoption))
    if i % 100 == 0:
        print(i)  # coarse progress indicator
# Use a distinct comprehension variable: under Python 2 the loop variable of a
# list comprehension leaks into the enclosing namespace, so reusing `text`
# here would silently replace the Series above with the last document's tokens.
tidiedtextjoined = [' '.join(tokens) for tokens in tidiedtext]

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000
8100
8200
8300
8400
8500
8600
8700
8800
8900
9000
9100
9200
9300
9400
9500
9600


In [95]:
# Pass every joined document string through text_processing.remove_punctuation.
tidiedtextjoined2 = [text_processing.remove_punctuation(text) for text in tidiedtextjoined]

## Save

In [97]:
# Wrap the cleaned strings in a single-column frame that shares dataall's
# index, then write it to CSV so the tidying step need not be repeated.
datatidy = pd.DataFrame({'TidiedText': tidiedtextjoined2}, index=dataall.index)
tidy_csv_path = os.path.join(subdir, 'datatidynounadj.csv')
datatidy.to_csv(tidy_csv_path, encoding='utf-8')

## Load

In [38]:
# Reload the tidied text written by the Save cell above. That cell writes
# 'datatidynounadj.csv'; loading 'datatidy.csv' here would silently swap in a
# different file on a fresh top-to-bottom run.
# pd.read_csv with index_col=0 and parse_dates=True matches the defaults of
# the deprecated DataFrame.from_csv.
datatidy = pd.read_csv(os.path.join(subdir, 'datatidynounadj.csv'),
                       index_col=0, parse_dates=True, encoding='utf-8')

# NLTK/Sk-Learn

In [98]:
# Topic-model settings used by the cells below.
n_features = 1000  # passed to CountVectorizer as max_features
n_topics = 50  # number of topics requested from LatentDirichletAllocation
n_top_words = 20  # passed to topic_weights.print_top_words

In [99]:
# Use tf (raw term count) features for LDA.
# Terms found in more than 95% of the documents or in fewer than 2 documents
# are dropped, and the vocabulary is capped at n_features terms.
# Note: `tidiedtextjoined` is rebound here to the 'TidiedText' column.
tidiedtextjoined = datatidy['TidiedText']
tf_vectorizer = CountVectorizer(max_df=0.95,min_df=2,
                                max_features=n_features)
tf = tf_vectorizer.fit_transform(tidiedtextjoined)

In [100]:
# Fit an LDA topic model on the term-count matrix; random_state=0 fixes the seed.
# NOTE(review): newer scikit-learn releases name this argument `n_components`
# rather than `n_topics` — confirm against the installed version before upgrading.
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=10,
                                random_state=0)
lda.fit(tf)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_jobs=1, n_topics=50, perp_tol=0.1, random_state=0,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [101]:
# Fetch the vocabulary terms from the vectorizer and hand them, with the fitted
# model and n_top_words, to topic_weights.print_top_words.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() — confirm against the installed version.
tf_feature_names = tf_vectorizer.get_feature_names()
topic_weights.print_top_words(lda, tf_feature_names, n_top_words)

Topic #0:
sexual sex men assault abuse woman discrimination behavior identity social relationship man young facility treatment partner adult right body report
Topic #1:
health care patient doctor medical hospital cancer disease ebola case insurance treatment people cost center service many month new death
Topic #2:
school student college education university teacher high percent class year program many public state job kid rate degree child parent
Topic #3:
drug treatment use mental people health disease year pain medical high state new risk trial many rate death american study
Topic #4:
european europe union french france germany german country paris britain minister member crisis le national leader year state new front
Topic #5:
water climate energy gas oil environmental change emission year state global carbon plant power new fuel air natural river industry
Topic #6:
food animal park land human year scientist area mile site space village many science place new nature field natural c

In [102]:
# Topic weights of every training document (rows are rescaled in the next cell).
doc_topic_distrib = lda.transform(tf)

In [103]:
# Rescale each row in place so that a document's topic weights sum to 1.
doc_topic_distrib /= doc_topic_distrib.sum(axis=1, keepdims=True)

In [104]:
# Heat map of the document-topic matrix (rows: documents, columns: topics).
# A fresh figure is created instead of reusing figure number 1, so this plot
# cannot be drawn on top of another cell's figure, and `ax` keeps referring to
# the Axes rather than being rebound to the image returned by matshow.
fig, ax = plt.subplots()
image = ax.matshow(doc_topic_distrib, cmap=plt.cm.gray, aspect=0.005)
ax.set_xlabel('topic')
ax.set_ylabel('document')
plt.show()

## Save

### Topic Weights

In [105]:
# Hand the normalised document-topic matrix, together with the tidy frame,
# to topic_weights.save_topic_weights for storage under `subdir`.
topic_weights.save_topic_weights(datatidy,doc_topic_distrib,
                   stem='lemma_noun_adj',package='sklearn',rows='all',subdir=subdir)

### LDA

In [60]:
# Persist the fitted LDA model and its vectorizer via pickleizer.
pickleizer.save_topic_analyzer(lda,tf_vectorizer)

## Misc Analysis

In [3]:
# Restore a previously saved (lda, tf_vectorizer) pair.
# NOTE(review): presumably pickle-backed — only load files from a trusted source.
lda, tf_vectorizer = pickleizer.load_topic_analyzer()

In [4]:
# Vocabulary terms of the restored vectorizer, used below to label topics.
tf_feature_names = tf_vectorizer.get_feature_names()

In [5]:
# Sample passage used to try the restored model on a single piece of text.
text = u'WHEN I moved to Europe 12 years ago, my biggest concern was whether I’d ever speak decent French. Practically every American I knew came to visit, many saying they dreamed of living here, too. I didn’t worry much about far-right political parties, or the European Union. I certainly didn’t fret about terrorism.'

In [6]:
# Echo the sample passage.
print(text)

WHEN I moved to Europe 12 years ago, my biggest concern was whether I’d ever speak decent French. Practically every American I knew came to visit, many saying they dreamed of living here, too. I didn’t worry much about far-right political parties, or the European Union. I certainly didn’t fret about terrorism.


In [7]:
# Tidy the sample passage with text_processing.tidy_text_and_join.
# NOTE(review): the corpus above was tidied with
# tidy_text_noun_adj(wordoption='lemma'); confirm this helper matches the
# preprocessing used to train the model loaded from disk.
tidiedtext = text_processing.tidy_text_and_join(text)

In [111]:
# Show the tidied sample.
tidiedtext

u'move europ year ago biggest concern whether id ever speak decent french practic everi american knew came visit mani say dream live didnt worri much farright polit parti european union certain didnt fret terror'

In [8]:
# Vectorise the tidied sample with the fitted vocabulary (one-row matrix),
# then compute its topic weights with the fitted LDA model.
document_term_matrix = tf_vectorizer.transform([tidiedtext])
weights = lda.transform(document_term_matrix)

In [9]:
# Inspect the sparse term-count row built for the sample.
document_term_matrix

<1x1000 sparse matrix of type '<type 'numpy.int64'>'
	with 28 stored elements in Compressed Sparse Row format>

In [113]:
# Scale the sample's topic weights so that they sum to 1.
weightsnorm = weights / weights.sum()

In [114]:
# For every topic whose normalised weight for the sample exceeds 0.05, print
# the weight followed by that topic's ten highest-weighted vocabulary terms.
for topic, weight in zip(lda.components_, weightsnorm[0]):
    if not weight > 0.05:
        continue
    print('Weight', weight)
    # Indices of the ten largest term weights, largest first.
    top_term_ids = np.argsort(topic)[::-1][:10]
    print(" ".join(tf_feature_names[term_id] for term_id in top_term_ids))


('Weight', 0.30488753477496194)
get say like one peopl dont go said want thing
('Weight', 0.35152753792222802)
one live like peopl life world year us day time
('Weight', 0.093731672333597207)
parti elect polit minist leader nation govern vote presid prime
('Weight', 0.13031220950545552)
european union europ britain british franc germani minist member nation
('Weight', 0.089541045463752925)
french attack franc pari terrorist terror secur kill bomb threat


# GenSim

In [21]:
from gensim import corpora, models, similarities

## Dictionary

In [22]:
# Build the gensim vocabulary and drop tokens found in 10 or fewer documents.
# NOTE(review): `texts` is not defined in any visible cell — presumably a list
# of token lists; define it before this cell so a fresh kernel can run it.
dictionary = corpora.Dictionary(texts)
# dict.items() works on both Python 2 and 3, unlike the Python-2-only iteritems().
bad_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq <= 10]
dictionary.filter_tokens(bad_ids)  # remove words with low document frequency
dictionary.compactify()  # remove gaps in id sequence after words that were removed
print(dictionary)

Dictionary(13603 unique tokens: [u'yellow', u'narcotic', u'four', u'jihad', u'hanging']...)


## Corpus

In [23]:
# Convert every document in `texts` to its bag-of-words form with the dictionary.
corpus = [dictionary.doc2bow(text) for text in texts]

## Vectorizer

In [24]:
# Train a 50-topic gensim LDA model on the bag-of-words corpus.
# NOTE(review): no seed is passed here (the sklearn model above uses
# random_state=0) — presumably results vary between runs; confirm.
ldacount = models.ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=50)

In [38]:
# Run the model's inference step over the whole corpus; the first element of
# the result is used below as the document-topic matrix.
topicweights = ldacount.inference(corpus)

In [40]:
# First element of the inference result; rows are rescaled in the next cell.
# Note: this rebinds the name used for the sklearn matrix earlier.
doc_topic_distrib = topicweights[0]

In [85]:
# Rescale each row in place so that a document's topic weights sum to 1.
doc_topic_distrib /= doc_topic_distrib.sum(axis=1, keepdims=True)

In [86]:
# Heat map of the gensim document-topic matrix (rows: documents, columns: topics).
# A fresh figure is created instead of reusing figure number 1, so this plot
# cannot be drawn on top of another cell's figure, and `ax` keeps referring to
# the Axes rather than being rebound to the image returned by matshow.
fig, ax = plt.subplots()
image = ax.matshow(doc_topic_distrib, cmap=plt.cm.gray, aspect=0.05)
ax.set_xlabel('topic')
ax.set_ylabel('document')
plt.show()

In [87]:
# Hand the normalised gensim document-topic matrix to topic_weights.save_topic_weights.
# NOTE(review): `datareg` is not defined in any visible cell — confirm where it
# comes from before running on a fresh kernel.
topic_weights.save_topic_weights(datareg,doc_topic_distrib,stem='lemma',package='gensim',rows='reg')

# Doc2Vec

In [79]:
class LabeledLineSentence(object):
    """Iterable that pairs each document with its label as a TaggedDocument.

    Both sequences are stored as given and zipped afresh on every iteration;
    iteration stops at the shorter of the two.
    """

    def __init__(self, doc_list, labels_list):
        """Keep references to the documents and their labels."""
        self.doc_list = doc_list
        self.labels_list = labels_list

    def __iter__(self):
        """Yield one TaggedDocument per (document, label) pair."""
        pairs = zip(self.doc_list, self.labels_list)
        for words, tag in pairs:
            yield models.doc2vec.TaggedDocument(words=words, tags=[tag])

In [80]:
# Tag each document with its position in `texts` (as a string), then wrap the
# documents and tags in the iterable defined above.
labels = [str(position) for position in range(len(texts))]
sentences = LabeledLineSentence(texts, labels)

In [81]:
# Configure Doc2Vec: dm=1 with dm_concat=1, 100-dimensional vectors,
# window of 5, negative sampling (negative=5) with hs=0, and min_count=5.
model = models.Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=5)

In [82]:
# Build the model vocabulary from the tagged documents.
model.build_vocab(sentences)

In [83]:
# Train the model on the tagged documents.
# NOTE(review): newer gensim releases presumably require explicit
# total_examples/epochs arguments here — confirm against the installed version.
model.train(sentences)

11051150

In [87]:
# Infer one vector per document and store it as a row of doc2vecres.
# The width 100 must stay equal to the `size=100` passed to Doc2Vec above.
doc2vecres = np.zeros((len(texts),100))
for i, text in enumerate(texts):
    doc2vecres[i,:] = model.infer_vector(text)

In [95]:
# Spot-check a single entry of the inferred matrix.
doc2vecres[0,10]

-0.09356486052274704

In [93]:
# Hand the inferred document vectors to topic_weights.save_topic_weights.
# NOTE(review): `idxauthor` is not defined in any visible cell — confirm where
# it comes from, and that dataall[idxauthor] has one row per row of doc2vecres.
topic_weights.save_topic_weights(dataall[idxauthor],doc2vecres,stem=None,package='doc2vec',rows='reg')

In [85]:
# Vocabulary words most similar to the three given words taken together.
model.most_similar(['obama','sander','clinton'])

[(u'hasinas', 0.7176132202148438),
 (u'fiorina', 0.6785053014755249),
 (u'fiorinas', 0.6547480821609497),
 (u'obamas', 0.6400838494300842),
 (u'cuomo', 0.6099336743354797),
 (u'kirchner', 0.6086978912353516),
 (u'johnson', 0.601860523223877),
 (u'mccullen', 0.6011120080947876),
 (u'pathetically', 0.6010297536849976),
 (u'reagan', 0.593456506729126)]

In [134]:
# Inferred vector for the first document in `texts`.
model.infer_vector(texts[0])

array([-0.10193164, -0.04376977, -0.13766503,  0.0823018 , -0.02074894,
        0.20367937,  0.21111564, -0.05945964,  0.37171602,  0.00507934,
       -0.09356486,  0.10768178, -0.07886085, -0.00511813, -0.00253258,
        0.04099534,  0.08943591,  0.20622362, -0.02915514, -0.26442051,
        0.12644652,  0.04535156,  0.27022526, -0.12068559,  0.03253226,
        0.04558017,  0.18684271,  0.19145858,  0.13811885,  0.23620991,
        0.19093932, -0.10907025, -0.0671132 ,  0.09404733,  0.01839629,
        0.05211755, -0.12075779,  0.19123247,  0.11745412, -0.2026743 ,
       -0.02890574,  0.015503  , -0.1583606 ,  0.00218541,  0.00916504,
       -0.07925086, -0.28470385, -0.0842441 ,  0.28052595, -0.25641757,
       -0.08566134,  0.08883084, -0.2375012 ,  0.12778269, -0.28805211,
       -0.08686749,  0.11690425, -0.13369317,  0.2372302 ,  0.01629356,
        0.14770885,  0.07089438, -0.2603536 ,  0.16426481, -0.02648615,
        0.10391705,  0.0802548 , -0.21442699,  0.20937781, -0.20

# T-SNE

In [44]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
%matplotlib qt

In [64]:
# Side-by-side 2-D embeddings of the iris data set, coloured by class label:
# t-SNE on the left, PCA on the right.
# This cell used to read X_tsne, X_pca and iris although no earlier cell
# defines them, so it failed on a fresh kernel. It now computes its own inputs
# under iris-specific names, which also keeps it independent of the X_tsne
# computed from the doc2vec vectors in the next cell.
# NOTE(review): learning_rate=100 mirrors the t-SNE call in the next cell;
# adjust if the original embeddings were computed differently.
iris = load_iris()
iris_tsne = TSNE(learning_rate=100).fit_transform(iris.data)
iris_pca = PCA().fit_transform(iris.data)
plt.figure(figsize=(10, 5))
plt.subplot(121)
plt.scatter(iris_tsne[:, 0], iris_tsne[:, 1], c=iris.target)
plt.subplot(122)
plt.scatter(iris_pca[:, 0], iris_pca[:, 1], c=iris.target)
plt.show()

In [89]:
# Embed the doc2vec vectors of the rows selected by idxauthor2 in 2-D with t-SNE.
# `.values` replaces the deprecated `.as_matrix()` and returns the same ndarray.
# NOTE(review): `idxauthor2` is not defined in any visible cell — presumably a
# pandas row selector; confirm.
X_tsne = TSNE(learning_rate=100, verbose=1).fit_transform(doc2vecres[idxauthor2.values, :])

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 3010
[t-SNE] Computed conditional probabilities for sample 2000 / 3010
[t-SNE] Computed conditional probabilities for sample 3000 / 3010
[t-SNE] Computed conditional probabilities for sample 3010 / 3010
[t-SNE] Mean sigma: 0.354876
[t-SNE] Error after 100 iterations with early exaggeration: 2.142301
[t-SNE] Error after 350 iterations: 1.969357


In [90]:
# Author ids restricted first by idxauthor and then by idxauthor2.
# NOTE(review): `dataauthor`, `idxauthor` and `idxauthor2` are not defined in
# any visible cell — confirm their origin before running on a fresh kernel.
authorid = dataauthor['authorid'][idxauthor][idxauthor2]

In [91]:
# Scatter plot of the t-SNE embedding, coloured by author id.
# `.values` replaces the deprecated `.as_matrix()` and returns the same ndarray.
# NOTE(review): authorid was already restricted by idxauthor in the previous
# cell; confirm that indexing by idxauthor again is intended and that the
# result has one colour value per row of X_tsne.
plt.figure(figsize=(10, 5))
plt.subplot(121)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=authorid[idxauthor].values)
plt.show()