In [11]:
import numpy as np
import pandas as pd
import text_processing
import sys
import unicodedata
import matplotlib as mpl
import matplotlib.pyplot as plt
import topic_weights

In [12]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Load!

In [13]:
# Load the two CSV inputs.
# DataFrame.from_csv is deprecated (removed in pandas 1.0); read_csv with
# index_col=0 and parse_dates=True reproduces from_csv's defaults: the first
# column becomes the index and date parsing is attempted on that index.
dataall = pd.read_csv('dataall2.csv', index_col=0, parse_dates=True)
dataauthor = pd.read_csv('dataauthor2_38.csv', index_col=0, parse_dates=True)

In [14]:
# Boolean row masks over the author table.
# idxauthor: True for rows whose 'author0' value equals 0.
idxauthor = dataauthor['author0'].eq(0)
# idxauthor2: computed only over the idxauthor rows; True where 'author12'
# also equals 0 (so it is indexed by the filtered rows, not the full table).
idxauthor2 = dataauthor.loc[idxauthor, 'author12'].eq(0)

In [15]:
# Rows of dataall selected by the idxauthor mask.
# NOTE(review): the mask is built from dataauthor, so this assumes both frames
# share the same index -- confirm.
datareg = dataall.loc[idxauthor]

# Text-Tidying Functions

In [16]:
def gen_tidied_text(dataall,wordoption=None):
    """Lazily yield the tidied form of every text in ``dataall['full_text']``.

    Parameters
    ----------
    dataall : object supporting ``dataall['full_text']``
        The 'full_text' entry is iterated; each item is one raw text.
    wordoption : optional
        Forwarded unchanged to ``text_processing.tidy_text``.

    Yields
    ------
    The return value of ``text_processing.tidy_text`` for each text, in order.
    """
    for text in dataall['full_text']:
        # Decode only byte strings. Calling .decode() on text that is already
        # unicode triggers an implicit ASCII encode on Python 2 (which fails on
        # non-ASCII characters) and does not exist at all on Python 3.
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        yield text_processing.tidy_text(text,wordoption=wordoption)

# Texts

## Trial Texts

In [17]:
# Quick trial run: tidy (with the 'lemma' word option) only the first n rows.
n = 1000
textstrial = list(
    gen_tidied_text(dataall.head(n), wordoption='lemma')
)

## All Texts

In [18]:
# Tidy every row of dataall (word option 'lemma') and materialise the results in a list.
textsall = list(gen_tidied_text(dataall,wordoption='lemma'))

## Just Regular Contributors

In [19]:
# Same tidying, restricted to datareg (the rows of dataall selected by idxauthor).
textsreg = list(gen_tidied_text(datareg,wordoption='lemma'))

In [137]:
# Sanity check: number of tidied documents in the full set.
# NOTE(review): this cell's execution count (137) is out of sequence with its
# neighbours, so the shown value came from a later run -- re-run in order.
len(textsall)

9447

## Assign

In [20]:
# Select which tidied text set every later section operates on.
texts = textsreg

# GenSim

In [21]:
from gensim import corpora, models, similarities

## Dictionary

In [22]:
# Build the gensim token <-> id dictionary from the tidied texts.
dictionary = corpora.Dictionary(texts)
# Collect the ids of tokens whose document frequency is 10 or less.
# .items() replaces the Python-2-only .iteritems() so the cell runs unchanged
# on both Python 2 and Python 3.
bad_ids = [tokenid for tokenid, docfreq in dictionary.dfs.items() if docfreq <= 10]
dictionary.filter_tokens(bad_ids) # remove words with low frequencies
dictionary.compactify() # remove gaps in id sequence after words that were removed
print(dictionary)

Dictionary(13603 unique tokens: [u'yellow', u'narcotic', u'four', u'jihad', u'hanging']...)


## Corpus

In [23]:
# Convert every tidied text to its bag-of-words representation under the
# filtered dictionary, keeping the order of `texts`.
corpus = list(map(dictionary.doc2bow, texts))

## LDA Topic Model

In [24]:
# Fit a 50-topic LDA model on the bag-of-words corpus.
# NOTE(review): no seed is set here, so topics presumably differ between runs -- confirm.
ldacount = models.ldamodel.LdaModel(
    corpus,
    num_topics=50,
    id2word=dictionary,
)

In [38]:
# Run LDA inference over the whole corpus. The result is indexed below, with
# element 0 taken as the per-document topic weights.
topicweights = ldacount.inference(corpus)

In [40]:
# First element of the inference result; used from here on as the
# document-by-topic weight matrix (this is a reference, not a copy).
doc_topic_distrib = topicweights[0]

In [85]:
# Scale each row so it sums to 1. This divides in place, so the array held in
# topicweights[0] is modified as well.
doc_topic_distrib /= doc_topic_distrib.sum(axis=1, keepdims=True)

In [86]:
# Show the document-by-topic weight matrix as a grayscale image.
fig = plt.figure(1)
ax = fig.gca()
# matshow returns the image artist, so it gets its own name instead of
# overwriting the Axes handle `ax`. The aspect ratio is passed as a number
# (the documented form) rather than as the string '0.05'.
im = ax.matshow(doc_topic_distrib, cmap=plt.cm.gray, aspect=0.05)
plt.show()

In [87]:
# Hand the row-normalised LDA weights, together with datareg, to the
# topic_weights helper for saving.
# NOTE(review): the stem/package/rows keywords presumably label the output -- check topic_weights.
topic_weights.save_topic_weights(datareg,doc_topic_distrib,stem='lemma',package='gensim',rows='reg')

# Doc2Vec

In [79]:
class LabeledLineSentence(object):
    """Iterable that pairs each document with a single label.

    Every iteration starts a fresh pass and yields one
    ``models.doc2vec.TaggedDocument`` per (document, label) pair, with
    ``words`` set to the document and ``tags`` set to a one-element list
    holding its label.
    """

    def __init__(self, doc_list, labels_list):
        """Store the documents and their labels; they are paired by position."""
        self.doc_list = doc_list
        self.labels_list = labels_list

    def __iter__(self):
        """Yield a TaggedDocument for each document/label pair, in order.

        Pairing uses ``zip``, so iteration stops at the shorter sequence.
        """
        make_tagged = models.doc2vec.TaggedDocument
        for words, tag in zip(self.doc_list, self.labels_list):
            yield make_tagged(words=words, tags=[tag])

In [80]:
# One string label per document: its position in `texts` ('0', '1', ...).
labels = list(map(str, range(len(texts))))
# Wrap texts and labels so gensim can stream them as tagged documents.
sentences = LabeledLineSentence(doc_list=texts, labels_list=labels)

In [81]:
# Configure (but do not yet train) a Doc2Vec model with 100-dimensional vectors.
# NOTE(review): these keyword names are those of the gensim release used when
# this was run; later releases rename `size` -- confirm before upgrading.
model = models.Doc2Vec(dm=1, dm_concat=1, size=100, window=5, negative=5, hs=0, min_count=5)

In [82]:
# Build the model vocabulary from one pass over the tagged documents.
model.build_vocab(sentences)

In [83]:
# Train the model on the tagged documents.
# NOTE(review): newer gensim releases presumably require explicit
# total_examples/epochs arguments here -- confirm before upgrading.
model.train(sentences)

11051150

In [87]:
# Infer one vector per tidied document, stored row by row.
# The row width is read from the trained model rather than repeating the
# literal 100 used when the model was constructed, so the two cannot drift apart.
doc2vecres = np.zeros((len(texts), model.vector_size))
for i, text in enumerate(texts):
    doc2vecres[i, :] = model.infer_vector(text)

In [95]:
# Spot check: the entry at row 0, column 10 of the inferred document vectors.
doc2vecres[0,10]

-0.09356486052274704

In [93]:
# Hand the doc2vec vectors to the topic_weights helper for saving.
# datareg (defined earlier as dataall[idxauthor]) is passed instead of
# re-deriving the same selection inline, matching the earlier LDA save call
# and the rows that `texts` was built from.
topic_weights.save_topic_weights(datareg,doc2vecres,stem=None,package='doc2vec',rows='reg')

In [85]:
# Qualitative check of the learned word vectors: nearest words to the three query tokens.
# NOTE(review): 'sander' is presumably the lemmatised form of 'sanders' -- confirm.
model.most_similar(['obama','sander','clinton'])

[(u'hasinas', 0.7176132202148438),
 (u'fiorina', 0.6785053014755249),
 (u'fiorinas', 0.6547480821609497),
 (u'obamas', 0.6400838494300842),
 (u'cuomo', 0.6099336743354797),
 (u'kirchner', 0.6086978912353516),
 (u'johnson', 0.601860523223877),
 (u'mccullen', 0.6011120080947876),
 (u'pathetically', 0.6010297536849976),
 (u'reagan', 0.593456506729126)]

In [134]:
# Display the inferred vector for the first tidied document.
model.infer_vector(texts[0])

array([-0.10193164, -0.04376977, -0.13766503,  0.0823018 , -0.02074894,
        0.20367937,  0.21111564, -0.05945964,  0.37171602,  0.00507934,
       -0.09356486,  0.10768178, -0.07886085, -0.00511813, -0.00253258,
        0.04099534,  0.08943591,  0.20622362, -0.02915514, -0.26442051,
        0.12644652,  0.04535156,  0.27022526, -0.12068559,  0.03253226,
        0.04558017,  0.18684271,  0.19145858,  0.13811885,  0.23620991,
        0.19093932, -0.10907025, -0.0671132 ,  0.09404733,  0.01839629,
        0.05211755, -0.12075779,  0.19123247,  0.11745412, -0.2026743 ,
       -0.02890574,  0.015503  , -0.1583606 ,  0.00218541,  0.00916504,
       -0.07925086, -0.28470385, -0.0842441 ,  0.28052595, -0.25641757,
       -0.08566134,  0.08883084, -0.2375012 ,  0.12778269, -0.28805211,
       -0.08686749,  0.11690425, -0.13369317,  0.2372302 ,  0.01629356,
        0.14770885,  0.07089438, -0.2603536 ,  0.16426481, -0.02648615,
        0.10391705,  0.0802548 , -0.21442699,  0.20937781, -0.20

# T-SNE

In [44]:
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from sklearn.datasets import load_iris
%matplotlib qt

In [63]:
# Demo on the iris data: 2-D t-SNE embedding next to a PCA projection.
iris = load_iris()
# t-SNE is randomly initialised; a fixed random_state makes the embedding
# reproducible across kernel restarts.
X_tsne = TSNE(learning_rate=100, verbose=1, random_state=0).fit_transform(iris.data)
X_pca = PCA().fit_transform(iris.data)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 150 / 150
[t-SNE] Mean sigma: 0.511967
[t-SNE] Error after 100 iterations with early exaggeration: 0.163522
[t-SNE] Error after 125 iterations: 0.156081


In [64]:
# Side-by-side scatter plots of the first two components: t-SNE on the left,
# PCA on the right, both coloured by iris class.
plt.figure(figsize=(10, 5))
for position, embedding in ((121, X_tsne), (122, X_pca)):
    plt.subplot(position)
    plt.scatter(embedding[:, 0], embedding[:, 1], c=iris.target)
plt.show()

In [89]:
# 2-D t-SNE embedding of the doc2vec vectors for the rows selected by idxauthor2.
# `.values` replaces the deprecated Series.as_matrix() (removed in pandas 1.0)
# and yields the same boolean array; random_state makes the embedding reproducible.
X_tsne = TSNE(learning_rate=100, verbose=1, random_state=0).fit_transform(doc2vecres[idxauthor2.values, :])

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 3010
[t-SNE] Computed conditional probabilities for sample 2000 / 3010
[t-SNE] Computed conditional probabilities for sample 3000 / 3010
[t-SNE] Computed conditional probabilities for sample 3010 / 3010
[t-SNE] Mean sigma: 0.354876
[t-SNE] Error after 100 iterations with early exaggeration: 2.142301
[t-SNE] Error after 350 iterations: 1.969357


In [90]:
# Author ids for the rows that pass both masks: idxauthor first, then
# idxauthor2 (which is defined over the idxauthor rows only).
authorid = dataauthor.loc[idxauthor, 'authorid'].loc[idxauthor2]

In [91]:
# Scatter the t-SNE embedding of the doc2vec vectors, coloured by author id.
# authorid was already restricted by idxauthor (and idxauthor2) when it was
# built, so filtering it by idxauthor again was redundant and is dropped.
# `.values` replaces the deprecated Series.as_matrix().
plt.figure(figsize=(10, 5))
plt.subplot(121)
plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=authorid.values)
plt.show()