In [126]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import nltk

import gensim
from gensim.models.word2vec import Word2Vec # the word2vec model gensim class

from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

from tokenizer import tokenizer # Reddit version tokenizer

%matplotlib inline

In [191]:
# read the comment df
df_comment = pd.read_csv('Data/reddit.csv',
                         index_col = 0)

  interactivity=interactivity, compiler=compiler, result=result)


filter the controversial data 

In [192]:
# eliminate the controversial data 
df_text = df_comment[df_comment['controversiality'] == 0.0] 

In [193]:
# convert the datetime
df_text['created_utc'] = pd.to_datetime(df_text['created_utc'],
                                        unit = 's')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [41]:
# keep the body part
df_text = df_text[['created_utc', 'body']]

In [42]:
# filter out the [removed] and [deleted]
df_text_final = df_text[(df_text != '[removed]') & (df_text != '[deleted]')] 

In [50]:
# change the column name
df_text_final.columns = ['date', 'text']

# set the index
df_text_final.set_index('date', inplace= True)

In [3]:
df_text_final = pd.read_csv('text.csv',
                            index_col= 0,
                            parse_dates= True)

### Preprocessing the comment text

In [7]:
ReditToken = tokenizer.RedditTokenizer()

def tokenize(reddit):
    # reddit tokenize
    try:
        reddit = reddit.lower()
        tokens = ReditToken.tokenize(reddit)

        tokens = filter(lambda t: not t.startswith('@'), tokens)
        tokens = filter(lambda t: not t.startswith('#'), tokens)
        tokens = filter(lambda t: not t.startswith('&'), tokens)
        tokens = filter(lambda t: not t.startswith('http'), tokens)
        # not single letter
        tokens = filter(lambda t: len(t) > 1, tokens) 
        # alphabetic
        tokens = filter(lambda t: t.isalpha(), tokens) 
        return list(tokens)
    except:
        return None

In [5]:
df_text_final.dropna(inplace= True)

In [8]:
df_token = df_text_final.progress_applymap(tokenize)

progress-bar: 100%|██████████| 1045633/1045633 [07:20<00:00, 59.79it/s]  


In [9]:
# drop the NaN value
empty = df_token.iloc[4]

# filter out the empty item
df_token_final = df_token[df_token != empty]

# drop the NaN value
df_token_final.dropna(inplace = True)

In [11]:
df_token_final.to_csv('token.csv')

remove the stopwords

In [13]:
from nltk.corpus import stopwords

# Load stop words
stop_words = stopwords.words('english')

In [14]:
def stop_remove(words_array):
    filtor = filter(lambda x : x not in stop_words, words_array)
    return list(filtor)

In [15]:
df_token_fin = df_token_final.progress_applymap(stop_remove)

progress-bar: 100%|██████████| 1045632/1045632 [01:20<00:00, 283.66it/s]  


In [16]:
df_token_fin.to_csv('token.csv')

filter out into 5 minute

In [17]:
df_token_final_5min = df_token_fin.groupby(pd.Grouper(freq='5Min'))\
                                  .aggregate(sum)

In [18]:
df_token_final_1min = df_token_fin.groupby(pd.Grouper(freq='1Min'))\
                                  .aggregate(sum)

In [19]:
df_token_final_1min.to_csv('comment_1min.csv')
df_token_final_5min.to_csv('comment_5min.csv')

### Building the word2vec model

In [20]:
LabeledSentence = gensim.models.doc2vec.TaggedDocument

def labelizeComment(tweets, label_type):
    labelized = []
    for i,v in tqdm(enumerate(tweets)):
        label = '%s_%s'%(label_type,i)
        labelized.append(LabeledSentence(v, [label]))
    return labelized

In [26]:
token_arry = df_token_fin['text']

In [27]:
token_train = labelizeComment(token_arry, 'TRAIN')

1045632it [00:09, 115392.13it/s]


In [84]:
token_5min = df_token_final_5min['text']
token_5min_lab =labelizeComment(token_5min, 'TRAIN')

25920it [00:00, 592779.54it/s]


initialize model with parameters of 200 dimention

In [28]:
# set up the model
redit_w2v = Word2Vec(size= 200, min_count=10)

redit_w2v.build_vocab([x.words for x in tqdm(token_train)])

100%|██████████| 1045632/1045632 [00:00<00:00, 2545057.15it/s]


train the model

In [29]:
redit_w2v.train([x.words for x in tqdm(token_train)],
                epochs = 5,
                total_examples=redit_w2v.corpus_count)

100%|██████████| 1045632/1045632 [00:00<00:00, 2518569.54it/s]


(70036843, 73921410)

In [33]:
redit_w2v.most_similar('good')

  if __name__ == '__main__':
  if np.issubdtype(vec.dtype, np.int):


[('great', 0.6828938722610474),
 ('bad', 0.6567687392234802),
 ('decent', 0.634770929813385),
 ('solid', 0.5886099338531494),
 ('terrible', 0.5533065795898438),
 ('best', 0.5105733275413513),
 ('promising', 0.48172563314437866),
 ('fantastic', 0.47255516052246094),
 ('interesting', 0.46857166290283203),
 ('excellent', 0.46548014879226685)]

In [37]:
redit_w2v.most_similar('bitcoin')

  if __name__ == '__main__':
  if np.issubdtype(vec.dtype, np.int):


[('btc', 0.6342315673828125),
 ('litecoin', 0.6041586399078369),
 ('bch', 0.5904101133346558),
 ('bcash', 0.5653424263000488),
 ('bitcoins', 0.5540469288825989),
 ('legacy', 0.5360366106033325),
 ('altcoin', 0.53162682056427),
 ('altcoins', 0.5059296488761902),
 ('segwitcoin', 0.4716957211494446),
 ('usefulness', 0.46944889426231384)]

In [30]:
redit_w2v.most_similar('stock')

  if __name__ == '__main__':
  if np.issubdtype(vec.dtype, np.int):


[('stocks', 0.7257131338119507),
 ('bond', 0.6196423768997192),
 ('equity', 0.5903027057647705),
 ('equities', 0.5822152495384216),
 ('aapl', 0.5743989944458008),
 ('nasdaq', 0.5655592679977417),
 ('bonds', 0.5514918565750122),
 ('nyse', 0.5396426916122437),
 ('shares', 0.5338280200958252),
 ('forex', 0.5302181243896484)]

### Visualization

In [127]:
# importing bokeh library for interactive dataviz
import bokeh.plotting as bp
from bokeh.models import HoverTool, BoxSelectTool
from bokeh.plotting import figure, show, output_notebook

# defining the chart
output_notebook()
plot_tfidf = bp.figure(plot_width=700, plot_height=600, title="A map of 5000 word vectors",
    tools="pan,wheel_zoom,box_zoom,reset,hover,previewsave",
    x_axis_type=None, y_axis_type=None, min_border=1)

# getting a list of word vectors. limit to 500. each is of 200 dimensions
word_vectors = [redit_w2v[w] for w in list(redit_w2v.wv.vocab.keys())[:5000]]

In [39]:
# dimensionality reduction. converting the vectors to 2d vectors
from sklearn.manifold import TSNE
tsne_model = TSNE(n_components=2, verbose=1, random_state=0)
tsne_w2v = tsne_model.fit_transform(word_vectors)


[t-SNE] Computing pairwise distances...
[t-SNE] Computing 91 nearest neighbors...
[t-SNE] Computed conditional probabilities for sample 1000 / 5000
[t-SNE] Computed conditional probabilities for sample 2000 / 5000
[t-SNE] Computed conditional probabilities for sample 3000 / 5000
[t-SNE] Computed conditional probabilities for sample 4000 / 5000
[t-SNE] Computed conditional probabilities for sample 5000 / 5000
[t-SNE] Mean sigma: 0.934060
[t-SNE] KL divergence after 75 iterations with early exaggeration: 1.159792
[t-SNE] Error after 100 iterations: 1.159792


In [41]:
# putting everything in a dataframe
tsne_df = pd.DataFrame(tsne_w2v, columns=['x', 'y'])
tsne_df['words'] = list(redit_w2v.wv.vocab.keys())[:5000]

In [129]:
tsne_df.to_csv('Word-STNE-Plot.csv')

In [130]:
tsne_df

Unnamed: 0,x,y,words
0,-9.166865e+21,-2.173735e+21,brilliant
1,-6.596381e+21,3.516646e+21,wish
2,-5.037790e+21,5.029152e+21,guys
3,-4.615990e+22,-1.921408e+22,best
4,-1.714985e+22,2.243076e+21,luck
5,1.631067e+21,8.038850e+21,see
6,8.639972e+22,-9.798248e+21,name
7,-2.071796e+21,-2.971526e+21,buying
8,-3.431117e+21,-4.367504e+20,doge
9,-3.017575e+21,3.785327e+21,ripple


In [128]:
# plotting. the corresponding word appears when you hover on the data point.
plot_tfidf.scatter(x='x', y='y', source=tsne_df)
hover = plot_tfidf.select(dict(type=HoverTool))
hover.tooltips={"word": "@words"}
show(plot_tfidf)

In [73]:
redit_w2v.save('reddit-word2vec')

### Feature Engineer

In [74]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [122]:
#vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=10)
#matrix = vectorizer.fit_transform([x.words for x in token_5min_lab])
#tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

In [77]:
def buildWordVector(tokens, size):
    vec = np.zeros(size).reshape((1, size))
    count = 0.
    for word in tokens:
        try:
            vec += redit_w2v[word].reshape((1, size)) * tfidf[word]
            count += 1.
        except KeyError: # handling the case where the token is not
                         # in the corpus. useful for testing.
            continue
    if count != 0:
        vec /= count
    return vec

In [80]:
from sklearn.preprocessing import scale

In [121]:
# gen the vector
#vecs_w2v = np.concatenate([buildWordVector(z, 200) \
#                           for z in tqdm(map(lambda x: x.words,
#                                             token_5min_lab))])

In [None]:
# scaling 
vecs_w2v = scale(vecs_w2v)

In [103]:
df_feat = pd.DataFrame(vecs_w2v)

In [117]:
df_feature = df_feat.groupby(pd.Grouper(freq='5Min'))\
                    .aggregate('mean')

In [120]:
df_feature.to_csv('comment_vector_space.csv')