### Imports

In [3]:
import pandas as pd
import numpy as np

import random
import re
from collections import Counter
import time
import pickle
import operator

from pymongo import MongoClient

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD
from sklearn.preprocessing import Normalizer
from textblob import TextBlob
from nltk.corpus import stopwords
import nltk

In [4]:
# timing function
def timefunc(f):
    def f_timer(*args, **kwargs):
        start = time.time()
        result = f(*args, **kwargs)
        end = time.time()
        print(f.__name__, 'took', end - start, 'seconds')
        return result
    return f_timer

### Get pickled sample data

In [5]:
with open('master.pkl','rb') as p:
    com_trans = pickle.load(p)

In [6]:
print(com_trans.shape)
com_trans.head()

(100000, 4)


Unnamed: 0,comment,transcript,com_tran_list,TED=1
0,Transgenderism is a mental fucking disease ! N...,"(Music) (Applause) Trevor Copp: When ""Dancing ...",[Transgenderism is a mental fucking disease ! ...,0
1,This is a wonderfully informative and hopeful ...,"Thank you so much, Chris. And it's truly a gre...",[This is a wonderfully informative and hopeful...,1
2,"As you are a Climate Change denier, I feel obl...","Thank you so much, Chris. And it's truly a gre...","[As you are a Climate Change denier, I feel ob...",1
3,if this started migrating itself into a normal...,"(Music) (Applause) Trevor Copp: When ""Dancing ...",[if this started migrating itself into a norma...,0
4,We are applauding our own absence is a powerfu...,"(Music) (Applause) Trevor Copp: When ""Dancing ...",[We are applauding our own absence is a powerf...,0


In [7]:
# df['TED=1'][230000:280000].sum()

In [8]:
all_comments = list(com_trans.comment)

In [9]:
comments = all_comments
len(comments)

100000

### Transcripts - pre-processing

In [10]:
names = nltk.corpus.names
male_names = names.words('male.txt')
female_names = names.words('female.txt')
male_names = [w.lower() for w in male_names]
male_names_plur = [(w.lower() + "s") for w in male_names]
female_names_plur = [(w.lower() + "s") for w in female_names]
female_names = [w.lower() for w in female_names]
stopwords = nltk.corpus.stopwords.words('english')

In [11]:
selected_words = ['',
                  'laughter',
                  'applause',
                  'aa',
                  'aaa',
                  'aaaaa',
                  'aaaaaah',
                  'aaaah',
                  'aah',
                  'ab',
                  'ababa',
                  'abacha',
                  'aback',]

In [12]:
stoppers = set(list(stopwords) + list(selected_words) + list(ENGLISH_STOP_WORDS) + 
               list(female_names) + list(male_names) + list(female_names_plur) + list(male_names_plur))
stoppers = list(stoppers)

In [13]:
# import csv

# with open('stoppers.csv', 'w') as myfile:
#     wr = csv.writer(myfile, quoting=csv.QUOTE_ALL)
#     wr.writerow([stoppers])

In [14]:
# load nltk's SnowballStemmer as variabled 'stemmer'
from nltk.stem.snowball import SnowballStemmer
stemmer = SnowballStemmer("english")

In [15]:
# # Use regular expressions to do a find-and-replace
# def tokenize_and_stem(text):
#     # first tokenize by sentence, then by word to ensure that punctuation is caught as it's own token
#     tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
#     filtered_tokens = []
#     # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
#     for token in tokens:
#         if re.search('[a-zA-Z]', token):
#             filtered_tokens.append(token)
#     stems = [stemmer.stem(t) for t in filtered_tokens]
#     return stems

In [16]:
tv = TfidfVectorizer(strip_accents='ascii',
                     max_df=0.8, 
                     max_features=200000,
                     min_df=5,
                     analyzer='word',
                     stop_words=stoppers)

In [17]:
@timefunc
def vectorize(sample):
    ts_vec = tv.fit_transform(sample)
    df_ts = pd.DataFrame(ts_vec.todense(), columns=[tv.get_feature_names()])
    return df_ts

In [18]:
ts_vec = vectorize(comments)
ts_vec.head()
ts_vec.shape

vectorize took 6.74557900428772 seconds


(100000, 23701)

In [19]:
# with open('final_comments_vec.pkl','wb') as picklefile:
#     pickle.dump(ts_vec, picklefile)

In [20]:
# with open('final_comments_vec.pkl','rb') as picklefile:
#     ts_vec = pickle.load(picklefile)

#### Model with SVD

In [21]:
def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i]
                        for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

In [22]:
topics = 10
top_words = 40

lsa = TruncatedSVD(n_components=topics,algorithm='randomized').fit(ts_vec)

print("\nTopics in LSA model:")
tfidf_feature_names = tv.get_feature_names()
print("\nExplained variance ratio", lsa.explained_variance_ratio_)
print(print_top_words(lsa, tfidf_feature_names, top_words))


Topics in LSA model:

Explained variance ratio [ 0.00295218  0.00482     0.0042314   0.00418687  0.00337323  0.00310548
  0.00298784  0.00266102  0.00259735  0.00253535]
Topic #0:
people talk like think great good really thank world know life time make need things want work amazing video point right better idea agree believe thing new thanks human said interesting years feel change use lot understand best help going
Topic #1:
thank great talk amazing inspiring wow thanks awesome sharing beautiful wonderful brilliant speech video nice presentation story loved excellent inspirational fantastic interesting inspiration enjoyed inspired absolutely truly informative talks message favorite fascinating best incredible powerful sir speaker mr watched brave
Topic #2:
amazing thank wow people like beautiful world life know sharing need god want human simply said think truly absolutely things technology story make woman right believe time person better wish wonderful live inspiring change words h

In [27]:
with open('lsa_model.pkl','wb') as p:
    pickle.dump(lsa,p)

### pick up pickle

In [28]:
with open('lsa_model.pkl','rb') as p:
    lsa = pickle.load(p)

In [29]:
lsa

TruncatedSVD(algorithm='randomized', n_components=10, n_iter=5,
       random_state=None, tol=0.0)

In [30]:
truncated_matrix = lsa.transform(ts_vec)
trunc_features = pd.DataFrame(truncated_matrix)
trunc_features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.016524,-0.006253,0.002719,4e-06,-0.003015,0.006835,-0.004977,-0.00446,-0.003085,-0.004221
1,0.161739,0.053766,0.060058,-0.112828,0.12443,-0.086072,-0.071893,-0.037568,-0.0223,-0.015814
2,0.10855,-0.04727,0.012682,-0.010828,-0.026289,-0.005126,0.007296,-0.025012,-0.0308,-0.008216
3,0.07003,-0.021546,0.007528,-0.000834,-0.007299,0.003508,-0.005931,-0.004316,-0.003534,-0.007534
4,0.014035,-0.000321,0.001313,-0.002434,-0.00132,0.002695,0.002846,0.000123,-0.0049,0.003657


In [31]:
trunc_features.shape

(100000, 10)

In [32]:
with open('svd_comments.pkl','wb') as picklefile:
    pickle.dump(trunc_features, picklefile)

### pick up pickle

In [33]:
with open('svd_comments.pkl','rb') as picklefile:
    trunc_features = pickle.load(picklefile)

In [34]:
with open('target.pkl','rb') as picklefile:
    target = pickle.load(picklefile)

In [35]:
trunc_features['target'] = target

In [46]:
print(trunc_features.shape)
trunc_features.head()

(100000, 11)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,target
0,0.016524,-0.006253,0.002719,4e-06,-0.003015,0.006835,-0.004977,-0.00446,-0.003085,-0.004221,0
1,0.161739,0.053766,0.060058,-0.112828,0.12443,-0.086072,-0.071893,-0.037568,-0.0223,-0.015814,1
2,0.10855,-0.04727,0.012682,-0.010828,-0.026289,-0.005126,0.007296,-0.025012,-0.0308,-0.008216,0
3,0.07003,-0.021546,0.007528,-0.000834,-0.007299,0.003508,-0.005931,-0.004316,-0.003534,-0.007534,0
4,0.014035,-0.000321,0.001313,-0.002434,-0.00132,0.002695,0.002846,0.000123,-0.0049,0.003657,0


In [55]:
TED_dist = trunc_features[trunc_features['target'] == 1]
YT_dist = trunc_features[trunc_features['target'] == 0]

In [56]:
print(TED_dist.shape)
print(YT_dist.shape)

(60295, 11)
(39705, 11)


In [57]:
TED_dist = TED_dist.drop('target',axis=1)
YT_dist = YT_dist.drop('target',axis=1)

In [58]:
print(TED_dist.mean())
print(YT_dist.mean())

0    0.080702
1   -0.003586
2    0.003508
3   -0.000470
4   -0.001641
5    0.002763
6   -0.001242
7    0.002664
8   -0.002860
9   -0.000450
dtype: float64
0    0.080418
1   -0.003315
2    0.004130
3   -0.000334
4   -0.001779
5    0.002483
6   -0.001337
7    0.001947
8   -0.002370
9   -0.000314
dtype: float64


In [74]:
x = np.array(TED_dist.mean())
y = np.array(YT_dist.mean())

In [78]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline



array([ 0.08070192, -0.00358598,  0.00350844, -0.00047027, -0.00164102,
        0.00276283, -0.0012423 ,  0.00266448, -0.00285986, -0.00045005])