In [1]:
import pandas as pd  
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [3]:
csv = 'clean_tweet.csv'
my_df = pd.read_csv(csv,index_col=0)
my_df.head()

Unnamed: 0,text,target
0,awww that bummer you shoulda got david carr of...,0
1,is upset that he can not update his facebook b...,0
2,dived many times for the ball managed to save ...,0
3,my whole body feels itchy and like its on fire,0
4,no it not behaving at all mad why am here beca...,0


In [4]:
my_df.dropna(inplace=True)
my_df.reset_index(drop=True,inplace=True)
my_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1596019 entries, 0 to 1596018
Data columns (total 2 columns):
text      1596019 non-null object
target    1596019 non-null int64
dtypes: int64(1), object(1)
memory usage: 24.4+ MB


## Train / Dev / Test Split

In [5]:
x = my_df.text
y = my_df.target

In [7]:
from sklearn.cross_validation import train_test_split
SEED = 2000
x_train, x_validation_and_test, y_train, y_validation_and_test = train_test_split(x, y, test_size=.02, random_state=SEED)
x_validation, x_test, y_validation, y_test = train_test_split(x_validation_and_test, y_validation_and_test, test_size=.5, random_state=SEED)

In [8]:
print "Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_train),
                                                                             (len(x_train[y_train == 0]) / (len(x_train)*1.))*100,
                                                                            (len(x_train[y_train == 1]) / (len(x_train)*1.))*100)
print "Validation set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_validation),
                                                                             (len(x_validation[y_validation == 0]) / (len(x_validation)*1.))*100,
                                                                            (len(x_validation[y_validation == 1]) / (len(x_validation)*1.))*100)
print "Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(x_test),
                                                                             (len(x_test[y_test == 0]) / (len(x_test)*1.))*100,
                                                                            (len(x_test[y_test == 1]) / (len(x_test)*1.))*100)

Train set has total 1564098 entries with 50.00% negative, 50.00% positive
Validation set has total 15960 entries with 50.40% negative, 49.60% positive
Test set has total 15961 entries with 50.26% negative, 49.74% positive


In [175]:
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence
import multiprocessing
from sklearn import utils
from sklearn.linear_model import LogisticRegression

In [1]:
def get_vectors(model, corpus, size):
    vecs = np.zeros((len(corpus), size))
    n = 0
    for i in corpus.index:
        prefix = 'all_' + str(i)
        vecs[n] = model.docvecs[prefix]
        n += 1
    return vecs

In [15]:
def get_concat_vectors(model1,model2, corpus, size):
    vecs = np.zeros((len(corpus), size))
    n = 0
    for i in corpus.index:
        prefix = 'all_' + str(i)
        vecs[n] = np.append(model1.docvecs[prefix],model2.docvecs[prefix])
        n += 1
    return vecs

## Phrase Modeling

Another thing that can be implemented with Gensim library is phrase detection. It is similar to n-gram, but instead of getting all the n-gram by sliding the window, it detects frequently-used phrases and sticks them together.

Patrick Harrison has provided a nice explanation of Gensim's phrase modelling in PyData DC 2016.

$$\frac {{count(A B)}-{count_{min}}} {{count(A)} \times {count(B)}} \times {N} > {threshold}$$

where:

- count(A) is the number of times token A appears in the corpus
- count(B) is the number of times token B appears in the corpus
- count(A B) is the number of times the tokens A B appear in the corpus in order
- N is the total size of the corpus vocabulary
- count_{min} is a user-defined parameter to ensure that accepted phrases occur a minimum number of times
- threshold is a user-defined parameter to control how strong of a relationship between two tokens the model requires before accepting them as a phrase (default threshold used in Gensim's Phrases function is 10.0)

OK let's see how this actually works.

In [17]:
from gensim.models.phrases import Phrases
from gensim.models.phrases import Phraser

In [18]:
tokenized_train = [t.split() for t in x_train]

By feeding all the tokenized tweets corpus, it will detect the frequently used phrase and connect them together with underbar in the middle.

In [19]:
%%time
phrases = Phrases(tokenized_train)
bigram = Phraser(phrases)

CPU times: user 3min 9s, sys: 5.78 s, total: 3min 14s
Wall time: 3min 13s


In [20]:
sent = [u'the', u'mayor', u'of', u'new', u'york', u'was', u'there']
print(bigram[sent])

[u'the', u'mayor', u'of', u'new_york', u'was', u'there']


As you can see from the above example, with the tweets corpus it has learned "New York" as a frequently used phrase. So now feeding the "bigram" with tokens separated "new" and "york", it will automatically put them together into one word as "new_york".

In [21]:
x_train[627092]

'last time with nutella and vanilla ice cream sadface'

In [22]:
bigram[x_train[627092].split()]

[u'last',
 u'time',
 u'with',
 u'nutella',
 u'and',
 u'vanilla_ice',
 u'cream',
 u'sadface']

If we check with one of the tweets from the corpus, we can see that the bigram model has recognised "vanilla_ice" as a phrase. This is interesting, and I will come back to this later.

Now let's transform our corpus with this bigram model.

In [26]:
def labelize_tweets_bg(tweets,label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(LabeledSentence(bigram[t.split()], [prefix + '_%s' % i]))
    return result

In [27]:
all_x = pd.concat([x_train,x_validation,x_test])
all_x_w2v_bg = labelize_tweets_bg(all_x, 'all')

After I get the corpus with bigram phrases detected, I went over the same process of Doc2Vec I did with unigram.

## DBOW Bigram

In [45]:
cores = multiprocessing.cpu_count()
model_bg_dbow = Doc2Vec(dm=0, size=100, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_bg_dbow.build_vocab([x for x in tqdm(all_x_w2v_bg)])

100%|██████████| 1596019/1596019 [00:01<00:00, 1043062.41it/s]


In [46]:
%%time
for epoch in range(30):
    model_bg_dbow.train(utils.shuffle([x for x in tqdm(all_x_w2v_bg)]), total_examples=len(all_x_w2v_bg), epochs=1)
    model_bg_dbow.alpha -= 0.002
    model_bg_dbow.min_alpha = model_bg_dbow.alpha

100%|██████████| 1596019/1596019 [00:01<00:00, 1316773.39it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1418144.08it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1335872.41it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1364295.81it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1321912.54it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1313759.16it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1145635.32it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1396075.76it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1365013.55it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1260398.87it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1416309.33it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1304179.97it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1358849.28it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1246421.60it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1298254.09it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1272354.

CPU times: user 39min 55s, sys: 17min 8s, total: 57min 4s
Wall time: 36min 18s


In [47]:
train_vecs_dbow_bg = get_vectors(model_bg_dbow, x_train, 100)
validation_vecs_dbow_bg = get_vectors(model_bg_dbow, x_validation, 100)

In [48]:
%%time
clf = LogisticRegression()
clf.fit(train_vecs_dbow_bg, y_train)

CPU times: user 23.3 s, sys: 7.28 s, total: 30.6 s
Wall time: 32.5 s


In [49]:
clf.score(validation_vecs_dbow_bg, y_validation)

0.73790726817042607

In [50]:
model_bg_dbow.save('d2v_model_bg_dbow.doc2vec')
model_bg_dbow = Doc2Vec.load('d2v_model_bg_dbow.doc2vec')
model_bg_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

## DMC Bigram

In [51]:
cores = multiprocessing.cpu_count()
model_bg_dmc = Doc2Vec(dm=1, dm_concat=1, size=100, window=2, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_bg_dmc.build_vocab([x for x in tqdm(all_x_w2v_bg)])

100%|██████████| 1596019/1596019 [00:01<00:00, 1248968.03it/s]


In [52]:
%%time
for epoch in range(30):
    model_bg_dmc.train(utils.shuffle([x for x in tqdm(all_x_w2v_bg)]), total_examples=len(all_x_w2v_bg), epochs=1)
    model_bg_dmc.alpha -= 0.002
    model_bg_dmc.min_alpha = model_bg_dmc.alpha

100%|██████████| 1596019/1596019 [00:01<00:00, 1203649.63it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1350325.81it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1350213.87it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1297423.50it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1385883.24it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1244686.46it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1425682.66it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1312333.11it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1295467.84it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1287446.27it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1355128.59it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1261521.40it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1364672.94it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1339324.22it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1403120.92it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1253678.

CPU times: user 48min 8s, sys: 17min 9s, total: 1h 5min 17s
Wall time: 36min 18s


In [181]:
model_bg_dmc.most_similar('new_york')

[(u'ny', 0.7682234644889832),
 (u'chicago', 0.7522180676460266),
 (u'berlin', 0.7467372417449951),
 (u'texas', 0.7409263253211975),
 (u'paris', 0.7380017638206482),
 (u'nashville', 0.7352598309516907),
 (u'nyc', 0.7345788478851318),
 (u'london', 0.7340636253356934),
 (u'boston', 0.7281099557876587),
 (u'florida', 0.726203203201294)]

Since now we have bigram phrase detected corpus, if we look for the most similar words to "new_york", the most similar word for 'new_york' is 'ny' which is pretty amazing, and you can also see other city names as 'chicago', 'berlin', etc.

In [53]:
train_vecs_dmc_bg = get_vectors(model_bg_dmc, x_train, 100)
validation_vecs_dmc_bg = get_vectors(model_bg_dmc, x_validation, 100)

In [54]:
%%time
clf = LogisticRegression()
clf.fit(train_vecs_dmc_bg, y_train)

CPU times: user 14.1 s, sys: 7.93 s, total: 22 s
Wall time: 25.1 s


In [55]:
clf.score(validation_vecs_dmc_bg, y_validation)

0.64974937343358397

In [56]:
model_bg_dmc.save('d2v_model_bg_dmc.doc2vec')
model_bg_dmc = Doc2Vec.load('d2v_model_bg_dmc.doc2vec')
model_bg_dmc.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

## DMM Bigram

In [57]:
cores = multiprocessing.cpu_count()
model_bg_dmm = Doc2Vec(dm=1, dm_mean=1, size=100, window=4, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_bg_dmm.build_vocab([x for x in tqdm(all_x_w2v_bg)])

100%|██████████| 1596019/1596019 [00:01<00:00, 1015076.49it/s]


In [58]:
%%time
for epoch in range(30):
    model_bg_dmm.train(utils.shuffle([x for x in tqdm(all_x_w2v_bg)]), total_examples=len(all_x_w2v_bg), epochs=1)
    model_bg_dmm.alpha -= 0.002
    model_bg_dmm.min_alpha = model_bg_dms.alpha

100%|██████████| 1596019/1596019 [00:01<00:00, 1177680.86it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1298185.36it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1351305.73it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1407373.19it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1402785.14it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1301223.06it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1299343.96it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1342313.83it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1390038.74it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1315512.42it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1342472.92it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1242250.79it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1194568.54it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1254366.95it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1241731.86it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1280165.

CPU times: user 52min 40s, sys: 21min 27s, total: 1h 14min 7s
Wall time: 46min 30s


In [59]:
train_vecs_dmm_bg = get_vectors(model_bg_dmm, x_train, 100)
validation_vecs_dmm_bg = get_vectors(model_bg_dmm, x_validation, 100)

In [60]:
%%time
clf = LogisticRegression()
clf.fit(train_vecs_dmm_bg, y_train)

CPU times: user 24.3 s, sys: 8.41 s, total: 32.7 s
Wall time: 36.2 s


In [61]:
clf.score(validation_vecs_dmm_bg, y_validation)

0.72863408521303263

In [62]:
model_bg_dmm.save('d2v_model_bg_dmm.doc2vec')
model_bg_dmm = Doc2Vec.load('d2v_model_bg_dmm.doc2vec')
model_bg_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [68]:
train_vecs_dbow_dmc_bg = get_concat_vectors(model_bg_dbow,model_bg_dmc, x_train, 200)
validation_vecs_dbow_dmc_bg = get_concat_vectors(model_bg_dbow,model_bg_dmc, x_validation, 200)

In [69]:
%%time
clf = LogisticRegression()
clf.fit(train_vecs_dbow_dmc_bg, y_train)

CPU times: user 1min 30s, sys: 5min 26s, total: 6min 56s
Wall time: 12min 48s


In [70]:
clf.score(validation_vecs_dbow_dmc_bg, y_validation)

0.74517543859649127

In [71]:
train_vecs_dbow_dmm_bg = get_concat_vectors(model_bg_dbow,model_bg_dmm, x_train, 200)
validation_vecs_dbow_dmm_bg = get_concat_vectors(model_bg_dbow,model_bg_dmm, x_validation, 200)

In [72]:
%%time
clf = LogisticRegression()
clf.fit(train_vecs_dbow_dmm_bg, y_train)

CPU times: user 57.3 s, sys: 1min 24s, total: 2min 21s
Wall time: 3min 59s


In [73]:
clf.score(validation_vecs_dbow_dmm_bg, y_validation)

0.75369674185463664

## Trigram

And if we run the same phrase detection again on bigram detected corpus, now it will detect trigram phrases.

In [75]:
%%time
tg_phrases = Phrases(bigram[tokenized_train])
trigram = Phraser(tg_phrases)

CPU times: user 5min 59s, sys: 14.8 s, total: 6min 14s
Wall time: 6min 9s


In [27]:
trigram[bigram[x_train[627092].split()]]

[u'last',
 u'time',
 u'with',
 u'nutella',
 u'and',
 u'vanilla_ice_cream',
 u'sadface']

Do you remember that we saw the bigram model detected "vanilla_ice" with the above data entry? Now the trigram phrase modelling has detected "vanilla_ice_cream" as one word!

Below Doc2Vec implementation is again same as unigram or bigram.

In [76]:
def labelize_tweets_tg(tweets,label):
    result = []
    prefix = label
    for i, t in zip(tweets.index, tweets):
        result.append(LabeledSentence(trigram[bigram[t.split()]], [prefix + '_%s' % i]))
    return result

In [77]:
all_x = pd.concat([x_train,x_validation,x_test])
all_x_w2v_tg = labelize_tweets_tg(all_x, 'all')

## DBOW Trigram

In [84]:
model_tg_dbow = Doc2Vec(dm=0, size=100, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_tg_dbow.build_vocab([x for x in tqdm(all_x_w2v_tg)])

100%|██████████| 1596019/1596019 [00:01<00:00, 964801.43it/s]


In [85]:
%%time
for epoch in range(30):
    model_tg_dbow.train(utils.shuffle([x for x in tqdm(all_x_w2v_tg)]), total_examples=len(all_x_w2v_tg), epochs=1)
    model_tg_dbow.alpha -= 0.002
    model_tg_dbow.min_alpha = model_tg_dbow.alpha

100%|██████████| 1596019/1596019 [00:01<00:00, 1140064.75it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1411727.82it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1400758.88it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1366421.73it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1370121.48it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1339086.85it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1122056.65it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1352223.71it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1356885.71it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1359952.95it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1394955.73it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1285871.94it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1269809.89it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1382792.32it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1291266.98it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1264850.

CPU times: user 39min 9s, sys: 17min 47s, total: 56min 56s
Wall time: 35min 16s


In [86]:
train_vecs_dbow_tg = get_vectors(model_tg_dbow, x_train, 100)
validation_vecs_dbow_tg = get_vectors(model_tg_dbow, x_validation, 100)

In [87]:
%%time
clf = LogisticRegression()
clf.fit(train_vecs_dbow_tg, y_train)

CPU times: user 24 s, sys: 8.83 s, total: 32.8 s
Wall time: 37.2 s


In [88]:
clf.score(validation_vecs_dbow_tg, y_validation)

0.73684210526315785

In [89]:
model_tg_dbow.save('d2v_model_tg_dbow.doc2vec')
model_tg_dbow = Doc2Vec.load('d2v_model_tg_dbow.doc2vec')
model_tg_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

## DMC Trigram

In [90]:
cores = multiprocessing.cpu_count()
model_tg_dmc = Doc2Vec(dm=1, dm_concat=1, size=100, window=2, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_tg_dmc.build_vocab([x for x in tqdm(all_x_w2v_tg)])

100%|██████████| 1596019/1596019 [00:01<00:00, 1078282.64it/s]


In [91]:
%%time
for epoch in range(30):
    model_tg_dmc.train(utils.shuffle([x for x in tqdm(all_x_w2v_tg)]), total_examples=len(all_x_w2v_tg), epochs=1)
    model_tg_dmc.alpha -= 0.002
    model_tg_dmc.min_alpha = model_tg_dmc.alpha

100%|██████████| 1596019/1596019 [00:01<00:00, 1145472.61it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1300630.20it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1256518.95it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1209643.28it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1193364.50it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1319026.26it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1281172.21it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1255106.84it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1317832.32it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1416197.57it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1511951.37it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1418413.92it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1469892.19it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1452020.76it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1497197.89it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1518422.

CPU times: user 50min 45s, sys: 16min 55s, total: 1h 7min 41s
Wall time: 39min 17s


In [92]:
train_vecs_dmc_tg = get_vectors(model_tg_dmc, x_train, 100)
validation_vecs_dmc_tg = get_vectors(model_tg_dmc, x_validation, 100)

In [93]:
%%time
clf = LogisticRegression()
clf.fit(train_vecs_dmc_tg, y_train)

CPU times: user 16.8 s, sys: 13 s, total: 29.7 s
Wall time: 36.9 s


In [94]:
clf.score(validation_vecs_dmc_tg, y_validation)

0.65507518796992481

In [95]:
model_tg_dmc.save('d2v_model_tg_dmc.doc2vec')
model_tg_dmc = Doc2Vec.load('d2v_model_tg_dmc.doc2vec')
model_tg_dmc.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

## DMM Trigram

In [96]:
cores = multiprocessing.cpu_count()
model_tg_dmm = Doc2Vec(dm=1, dm_mean=1, size=100, window=4, negative=5, min_count=2, workers=cores, alpha=0.065, min_alpha=0.065)
model_tg_dmm.build_vocab([x for x in tqdm(all_x_w2v_tg)])

100%|██████████| 1596019/1596019 [00:01<00:00, 884827.48it/s]


In [97]:
%%time
for epoch in range(30):
    model_tg_dmm.train(utils.shuffle([x for x in tqdm(all_x_w2v_tg)]), total_examples=len(all_x_w2v_tg), epochs=1)
    model_tg_dmm.alpha -= 0.002
    model_tg_dmc.min_alpha = model_tg_dmc.alpha

100%|██████████| 1596019/1596019 [00:01<00:00, 1012449.86it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1502780.28it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1470238.58it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1473524.71it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1525361.41it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1527416.60it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1470776.42it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1486240.76it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1272797.17it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1473264.95it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1486890.10it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1486375.07it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1498297.03it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1459908.90it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1541045.44it/s]
100%|██████████| 1596019/1596019 [00:01<00:00, 1510792.

CPU times: user 52min 31s, sys: 20min 33s, total: 1h 13min 4s
Wall time: 47min 52s


In [98]:
train_vecs_dmm_tg = get_vectors(model_tg_dmm, x_train, 100)
validation_vecs_dmm_tg = get_vectors(model_tg_dmm, x_validation, 100)

In [99]:
%%time
clf = LogisticRegression()
clf.fit(train_vecs_dmm_tg, y_train)

CPU times: user 24.5 s, sys: 12.6 s, total: 37.2 s
Wall time: 43.7 s


In [100]:
clf.score(validation_vecs_dmm_tg, y_validation)

0.73840852130325818

In [101]:
model_tg_dmm.save('d2v_model_tg_dmm.doc2vec')
model_tg_dmm = Doc2Vec.load('d2v_model_tg_dmm.doc2vec')
model_tg_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [105]:
train_vecs_dbow_dmc_tg = get_concat_vectors(model_tg_dbow,model_tg_dmc, x_train, 200)
validation_vecs_dbow_dmc_tg = get_concat_vectors(model_tg_dbow,model_tg_dmc, x_validation, 200)

In [106]:
%%time
clf = LogisticRegression()
clf.fit(train_vecs_dbow_dmc_tg, y_train)

CPU times: user 53.2 s, sys: 1min 3s, total: 1min 56s
Wall time: 3min 20s


In [107]:
clf.score(validation_vecs_dbow_dmc_tg, y_validation)

0.7461152882205514

In [108]:
train_vecs_dbow_dmm_tg = get_concat_vectors(model_tg_dbow,model_tg_dmm, x_train, 200)
validation_vecs_dbow_dmm_tg = get_concat_vectors(model_tg_dbow,model_tg_dmm, x_validation, 200)

In [109]:
%%time
clf = LogisticRegression()
clf.fit(train_vecs_dbow_dmm_tg, y_train)

CPU times: user 2min 58s, sys: 15min 27s, total: 18min 26s
Wall time: 39min 19s


In [110]:
clf.score(validation_vecs_dbow_dmm_tg, y_validation)

0.75657894736842102

Since we now have all the result from unigram to trigram and different Doc2Vec models, we can take a look at these results in a table format.

**Validation set accuracy comparison of different Doc2Vec modeling:**
*(classifier used for validation: logistic regression with default setting)*

|            | unigram | bigram | trigram | best result      |
|------------|---------|--------|---------|------------------|
| DBOW       |  73.89% | 73.79% |  73.68% | 73.89% (unigram) |
|  DMC       |  66.47% | 64.97% |  65.50% | 66.47% (unigram) |
|  DMM       |  72.56% | 72.86% |  73.84% | 73.84% (trigram) |
| dbow + dmc |  74.58% | 74.52% |  74.61% | 74.61% (trigram) |
| dbow + dmm |  75.51% | 75.37% |  75.65% | 75.65% (trigram) |

The best validation accuracy I can get was from dbow+dmm model.

DMM model tends to perform better with increased n-gram, while pure DBOW model tends to perform worse with increased n-gram. In terms of a joint model, two models performance got lower with bigram and got higher with trigram.

Before I move on to next step, I would like to try one more thing, which is creating joint vectors across different n-grams. By looking at the above table, for DBOW model unigram performed the best, so I will use vectors from unigram DBOW model and join this together with trigram DMM vectors.

In [16]:
model_ug_dbow = Doc2Vec.load('d2v_model_ug_dbow.doc2vec')
model_tg_dmm = Doc2Vec.load('d2v_model_tg_dmm.doc2vec')
model_ug_dbow.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)
model_tg_dmm.delete_temporary_training_data(keep_doctags_vectors=True, keep_inference=True)

In [18]:
train_vecs_ugdbow_tgdmm = get_concat_vectors(model_ug_dbow,model_tg_dmm, x_train, 200)
validation_vecs_ugdbow_tgdmm = get_concat_vectors(model_ug_dbow,model_tg_dmm, x_validation, 200)

In [19]:
%%time
clf = LogisticRegression()
clf.fit(train_vecs_ugdbow_tgdmm, y_train)

CPU times: user 1min 10s, sys: 43.6 s, total: 1min 54s
Wall time: 2min 34s


In [20]:
clf.score(validation_vecs_ugdbow_tgdmm, y_validation)

0.75764411027568923

The result is,

unigram DBOW + trigram DMM: 75.76%

In [21]:
from sklearn.preprocessing import MinMaxScaler

mmscaler = MinMaxScaler()
d2v_ugdbow_tgdmm_mm = mmscaler.fit_transform(train_vecs_ugdbow_tgdmm)
d2v_ugdbow_tgdmm_mm_val = mmscaler.fit_transform(validation_vecs_ugdbow_tgdmm)

In [25]:
names1 = ["Logistic Regression", "Multinomial NB", 
         "Bernoulli NB", "Ridge Classifier", "Perceptron","Passive-Aggresive", "Nearest Centroid"]
classifiers1 = [
    LogisticRegression(),
    MultinomialNB(),
    BernoulliNB(),
    RidgeClassifier(),
    Perceptron(),
    PassiveAggressiveClassifier(),
    NearestCentroid()
    ]
zipped_clf1 = zip(names1,classifiers1)

In [26]:
def classifier_comparator_d2v(train_vectors,validation_vectors, classifier=zipped_clf1):
    result = []
    for n,c in classifier:
        checker_pipeline = Pipeline([
            ('classifier', c)
        ])
        print "Validation result for {}".format(n)
        print c
        clf_accuracy,tt_time = accuracy_summary(checker_pipeline, train_vectors, y_train, validation_vectors, y_validation)
        result.append((n,clf_accuracy,tt_time))
    return result

In [27]:
classifier_comparator_d2v(d2v_ugdbow_tgdmm_mm,d2v_ugdbow_tgdmm_mm_val)

Validation result for Logistic Regression
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)
null accuracy: 50.40%
accuracy score: 75.68%
model is 25.28% more accurate than null accuracy
train and test time: 154.56s
--------------------------------------------------------------------------------
Validation result for Multinomial NB
MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)
null accuracy: 50.40%
accuracy score: 73.04%
model is 22.64% more accurate than null accuracy
train and test time: 8.22s
--------------------------------------------------------------------------------
Validation result for Bernoulli NB
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True)
null accuracy: 50.40%
accuracy score: 50.41%
model is 0.01% more accurate than null a

[('Logistic Regression', 0.75676691729323309, 154.56316304206848),
 ('Multinomial NB', 0.73038847117794481, 8.222593069076538),
 ('Bernoulli NB', 0.50407268170426067, 16.526820182800293),
 ('Ridge Classifier', 0.7555764411027569, 20.15756106376648),
 ('Perceptron', 0.67919799498746869, 7.3240861892700195),
 ('Passive-Aggresive', 0.62161654135338351, 7.885602951049805),
 ('Nearest Centroid', 0.72850877192982455, 1.8390729427337646)]