In [1]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import topic_weights as tw
import os
%matplotlib inline

In [2]:
%load_ext autoreload
%autoreload 2

# Load Data

In [3]:
# Directory (relative to the notebook) holding the input CSVs read by the
# loading cells below.
subdir = 'final_csvs2'

In [4]:
# pd.DataFrame.from_csv was deprecated and later removed from pandas; read_csv
# with index_col=0 and parse_dates=True reproduces its defaults (first column
# becomes the index, and the index is parsed as dates where possible).
dataall = pd.read_csv(os.path.join(subdir, 'dataall.csv'),
                      index_col=0, parse_dates=True, encoding='utf-8')
datanonother = pd.read_csv(os.path.join(subdir, 'datanonother.csv'),
                           index_col=0, parse_dates=True, encoding='utf-8')

In [5]:
# Settings that select which topic-weights CSV to load; they are handed to
# tw.topic_weights_csv, whose return value is used as the file name.
n_topics = 50
stem = 'stem'
package = 'sklearn'
rows = 'all'
twcsv = tw.topic_weights_csv(n_topics, stem, package, rows)
# pd.DataFrame.from_csv was deprecated and later removed from pandas; read_csv
# with index_col=0 and parse_dates=True reproduces its defaults.
topicweights = pd.read_csv(os.path.join(subdir, twcsv),
                           index_col=0, parse_dates=True)

# T-SNE

In [6]:
# t-SNE is stochastic: pin random_state so the 2-D embedding (and everything
# exported from it further down) is reproducible after a kernel restart.
X_tsne = TSNE(learning_rate=100, verbose=1, random_state=0).fit_transform(topicweights)

[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 9662
[t-SNE] Computed conditional probabilities for sample 2000 / 9662
[t-SNE] Computed conditional probabilities for sample 3000 / 9662
[t-SNE] Computed conditional probabilities for sample 4000 / 9662
[t-SNE] Computed conditional probabilities for sample 5000 / 9662
[t-SNE] Computed conditional probabilities for sample 6000 / 9662
[t-SNE] Computed conditional probabilities for sample 7000 / 9662
[t-SNE] Computed conditional probabilities for sample 8000 / 9662
[t-SNE] Computed conditional probabilities for sample 9000 / 9662
[t-SNE] Computed conditional probabilities for sample 9662 / 9662
[t-SNE] Mean sigma: 0.107976
[t-SNE] Error after 50 iterations with early exaggeration: 1.553268
[t-SNE] Error after 75 iterations: 1.553248


# Topic IDs

In [7]:
from collections import Counter
import operator as op
from model_predict import LDA, TF

In [8]:
# Vocabulary terms of TF, used below to turn topic indices into top words.
# NOTE(review): if TF is a scikit-learn vectorizer, get_feature_names() was
# removed in scikit-learn 1.2 in favour of get_feature_names_out() -- confirm
# against model_predict before upgrading.
tf_feature_names = TF.get_feature_names()

In [9]:
# Dominant topic per row: the column position of the largest weight.
# DataFrame.as_matrix() was deprecated and later removed from pandas; .values
# returns the same ndarray and is available in old and new versions alike.
topicids = np.argmax(topicweights.values, axis=1)

In [10]:
# Tally how many rows have each topic as their dominant one, then list the
# (topic id, row count) pairs from most to least frequent.
count = Counter(topicids)
sortedcounts = count.most_common()

In [11]:
# Print the eight top words for each of the 15 most frequent dominant topics.
for idx, n_rows in sortedcounts[:15]:
    print(idx, tw.top_words(LDA,tf_feature_names,n_top_words=8,topic_idx=idx))

(1, u'get say like one peopl dont go said')
(3, u'one live like peopl life world year us')
(33, u'polic offic prison crime crimin sentenc year arrest')
(12, u'state militari unit war forc govern turkey secur')
(18, u'health drug patient medic doctor diseas hospit cancer')
(47, u'govern polit power peopl democraci protest countri corrupt')
(21, u'bank financi chang economi polici would econom govern')
(11, u'parti elect polit minist leader nation govern vote')
(4, u'peopl like studi one experi research differ work')
(43, u'republican democrat voter polit candid parti elect campaign')
(5, u'court law rule state judg case feder legal')
(42, u'percent rate econom american incom growth year social')
(22, u'citi new york state mayor de public plan')
(41, u'tax money would million spend fund billion budget')
(10, u'bill state would senat law congress republican legisl')


In [12]:
# Hand-written display labels keyed by topic id. The keys are ten of the ids
# printed by the top-15 listing above; ids 1, 3 and 4 from that listing are
# left unlabelled, as are all ids not listed here.
besttopics = {33: 'Police and crime',
              12: 'Foreign policy',
              18: 'Health care',
              47: 'Government',
              21: 'Banks and finance',
              11: 'Elections',
              43: 'Domestic politics',
              5: 'Courts and law',
              42: 'Economy',
              22: 'New York'}

In [13]:
# Map every dominant topic id to its label; ids without a label get the
# placeholder string 'None' (a string, not the None object).
topicidsnew = [besttopics.get(tid, 'None') for tid in topicids]

In [14]:
# Empty frame sharing topicweights' index, to be filled column by column in
# the next cell. Note that 'Title' and 'Share Count' are not pre-declared here;
# they are appended after 'URL' when they are assigned below.
df = pd.DataFrame(index=topicweights.index,
                  columns=['Component 1','Component 2','Topic ID','Author','URL'])

In [15]:
# The two t-SNE coordinates and the topic labels are plain arrays/lists, so
# they are assigned by position.
df['Component 1'] = X_tsne[:,0]
df['Component 2'] = X_tsne[:,1]
df['Topic ID'] = topicidsnew
# The metadata columns come from dataall and are matched on index labels --
# assumes dataall and topicweights share the same index; TODO confirm.
df['Author'] = dataall['author']
df['Title'] = dataall['title']
df['URL'] = dataall['url']
df['Share Count'] = dataall['share_count']

In [16]:
# Drop every row whose dominant topic has no label, i.e. whose 'Topic ID' is
# the placeholder string 'None'.
dffinal = df.loc[df['Topic ID'].ne('None')]

In [80]:
# Write the labelled rows to a tab-separated, UTF-8 encoded file in the
# working directory; index=False leaves the frame's index out of the file.
dffinal.to_csv('topics.tsv',encoding='utf-8',index=False,sep='\t')

In [None]:
# NOTE(review): bare literal in a never-executed cell; presumably a pasted
# sample of the record layout the cells below try to emit -- confirm, and
# move it to a markdown cell or delete it.
[
    {"a":"-1.14","b":"4.14"},
    {"a":"-0.13","b":"1.38"},
    {"a":"-4.19","b":"1.43"},
    {"a":"-0.21","b":"3.34"}
]

In [44]:
def gen_row(row,keys1,keys2):
    """Render selected fields of one record as a ``{label: value, ...}`` string.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row[key]`` lookup (e.g. a pandas Series or dict).
    keys1 : sequence
        Keys to look up in ``row``, in output order.
    keys2 : sequence
        Output labels, paired positionally with ``keys1``. If the two
        sequences differ in length, the extra entries of the longer one are
        ignored (``zip`` semantics).

    Returns
    -------
    str
        The ``label: value`` pairs joined by ``', '`` and wrapped in braces;
        ``'{}'`` when there are no pairs. Labels are emitted unquoted and
        values are rendered with ``str.format``.

    Raises
    ------
    KeyError
        If a key from ``keys1`` is missing in ``row`` (propagated from the
        lookup).
    """
    # The previous format literal had mismatched quotes and did not parse;
    # '{0}: {1}' matches the unquoted-key output recorded in this notebook.
    pairs = ['{0}: {1}'.format(key2, row[key1])
             for key1, key2 in zip(keys1, keys2)]
    return '{{{0}}}'.format(', '.join(pairs))

In [45]:
# keys1: the dffinal columns to export; keys2: the output label used for each
# of them, paired by position when both lists are passed to gen_row.
keys1 = ['Component 1','Component 2','Share Count']
keys2 = ['one','two','shares']

In [46]:
# `s` is not read in this cell; it is kept because a later cell may use it.
s = []
# Print the first 20 labelled rows, one brace-wrapped record per line with a
# trailing comma.
for label, record in dffinal.head(20).iterrows():
    print(gen_row(record, keys1, keys2) + ',')

{one: 0.000916395266272, two: -0.0027222379161, shares: 135},
{one: -0.00104158193887, two: 0.000245404376291, shares: 145},
{one: 0.00185864842539, two: -0.000597570293171, shares: 466},
{one: -0.000496173225977, two: -0.0025408805305, shares: 1494},
{one: 0.00302073107182, two: -0.000727708550212, shares: 114},
{one: -0.000779370797775, two: -0.00303686885503, shares: 211},
{one: 0.000452177735513, two: 0.00207540008723, shares: 413},
{one: 0.000118978878752, two: 0.000564002286909, shares: 8816},
{one: 0.000548660807326, two: 0.000290071406657, shares: 88},
{one: -2.74087616495e-05, two: 0.000655443146176, shares: 675},
{one: -0.00198309160081, two: -0.000482106330015, shares: 1429},
{one: 0.000594292652241, two: 0.000831509489862, shares: 936},
{one: 0.00245959218863, two: 0.00220004383402, shares: 476},
{one: -0.00125666961239, two: -0.00168854829257, shares: 487},
{one: -0.000511307618529, two: -0.00240663711751, shares: 499},
{one: -0.00445963544528, two: 0.000304057512728, shar