In [125]:
import pandas as pd
import gensim
import ast
import nltk
import multiprocessing
import numpy as np

from gensim.models import Doc2Vec
from gensim.test.test_doc2vec import ConcatenatedDoc2Vec
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import TaggedDocument
from nltk.corpus import stopwords
from tqdm import tqdm
from sklearn import utils
from sklearn.metrics import confusion_matrix, classification_report

In [42]:
# Load the pre-tokenised speeches. The `tokenised` column is stored in the CSV as
# the text form of a Python list, so it is parsed back with ast.literal_eval.
# The converter is keyed by column name rather than by position so that it keeps
# targeting the right column if columns are ever added or reordered.
uk_pol_tokens = pd.read_csv('uk_pol_tokens.csv', converters={'tokenised': ast.literal_eval})

In [43]:
# Display the first row of the loaded frame as a quick sanity check of its columns.
uk_pol_tokens.head(1)

Unnamed: 0,speaker,party,date,subject,tokenised
0,Mark Isherwood,Conservative,2006-01-10,Fuel Poverty,"[chartered, institute, housing, states, reason..."


In [45]:
# Hold out 30% of the speeches for testing. Stratifying on `party` keeps the party
# proportions the same in both splits; random_state fixes the split.
train_df, test_df = train_test_split(
    uk_pol_tokens,
    test_size=0.3,
    random_state=1,
    stratify=uk_pol_tokens['party'],
)

In [46]:
def _to_tagged_document(row):
    """Wrap one speech row as a TaggedDocument tagged with the speaker's party."""
    return TaggedDocument(words=row.tokenised, tags=[row.party])


# One TaggedDocument per speech, for each split.
train_tagged = train_df.apply(_to_tagged_document, axis=1)
test_tagged = test_df.apply(_to_tagged_document, axis=1)

In [57]:
# Number of CPUs reported by the system; passed as `workers` when building model_dbow.
cores = multiprocessing.cpu_count()

First, a distributed bag-of-words (DBOW) model:

In [59]:
# Distributed bag-of-words Doc2Vec (dm=0) with 300-dimensional vectors,
# negative sampling (5 noise words, hierarchical softmax off), words seen fewer
# than 2 times dropped, and no down-sampling of frequent words.
model_dbow = Doc2Vec(
    dm=0,
    vector_size=300,
    negative=5,
    hs=0,
    min_count=2,
    sample=0,
    workers=cores,
)
# Build the vocabulary from the training documents only.
model_dbow.build_vocab(list(tqdm(train_tagged.values)))

100%|██████████| 1675/1675 [00:00<00:00, 878841.53it/s]


In [62]:
%%time
for epoch in range(30):
    model_dbow.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dbow.alpha -= 0.002
    model_dbow.min_alpha = model_dbow.alpha

100%|██████████| 1675/1675 [00:00<00:00, 635155.88it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1349492.74it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1548139.97it/s]
100%|██████████| 1675/1675 [00:00<00:00, 897134.36it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1360731.98it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1494142.75it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1604352.41it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1575921.76it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1315629.06it/s]
100%|██████████| 1675/1675 [00:00<00:00, 871969.62it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1334623.71it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1519675.36it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1360731.98it/s]
100%|██████████| 1675/1675 [00:00<00:00, 884039.16it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1301251.94it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1344328.21it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1193182.61it/s]
100%|██████████| 1675/1675 [00:00<0

CPU times: user 2min 37s, sys: 1.62 s, total: 2min 38s
Wall time: 48.6 s


In [120]:
def vec_for_learning(model, tagged_docs, epochs=20):
    """Infer a feature vector for every tagged document.

    Parameters
    ----------
    model
        Object exposing ``infer_vector(words, epochs=...)``.
    tagged_docs
        Object whose ``.values`` is an iterable of documents, each with a
        ``words`` attribute and a non-empty ``tags`` sequence.
    epochs : int, default 20
        Number of inference passes per document. Passed as ``epochs`` because
        gensim deprecated the older ``steps`` keyword and removed it in 4.0.

    Returns
    -------
    tuple
        ``(targets, regressors)``: two equally long tuples holding each
        document's first tag and its inferred vector. Both are empty when
        there are no documents.
    """
    targets, regressors = [], []
    for doc in tagged_docs.values:
        targets.append(doc.tags[0])
        regressors.append(model.infer_vector(doc.words, epochs=epochs))
    # Tuples keep the return shape of the original zip(*...) implementation.
    return tuple(targets), tuple(regressors)

In [106]:
# Party labels (y) and DBOW-inferred document vectors (X) for each split.
y_train, X_train = vec_for_learning(model_dbow, train_tagged)
y_test, X_test = vec_for_learning(model_dbow, test_tagged)

In [107]:
# A very large C means very weak regularisation (C is the inverse of the
# regularisation strength). fit() returns the estimator, so it can be chained.
logreg = LogisticRegression(C=100000).fit(X_train, y_train)
y_pred = logreg.predict(X_test)



In [108]:
# Predict the party of every test document with the fitted classifier.
y_pred = logreg.predict(X_test)

In [109]:
# Distinct party labels in order of first appearance in y_test; used as the
# row/column order of the confusion matrices.
labels = pd.Series(y_test).unique()

In [110]:
# Train accuracy, test accuracy, then a blank line: one value per line.
print(logreg.score(X_train, y_train), logreg.score(X_test, y_test), "", sep="\n")
print(classification_report(y_test, y_pred))

# Confusion matrix with rows and columns both ordered by `labels`.
pd.DataFrame(
    data=confusion_matrix(y_test, y_pred, labels=labels),
    index=labels,
    columns=labels,
)

0.751044776119403
0.6518105849582173

              precision    recall  f1-score   support

Conservative       0.72      0.84      0.78       517
      Labour       0.28      0.16      0.20       201

   micro avg       0.65      0.65      0.65       718
   macro avg       0.50      0.50      0.49       718
weighted avg       0.60      0.65      0.62       718



Unnamed: 0,Conservative,Labour
Conservative,436,81
Labour,169,32


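Results are pretty poor: the test accuracy of 0.65 is below the 0.72 that always predicting Conservative would achieve (517 of the 718 test speeches), and Labour recall is only 0.16. Try a distributed memory model: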

In [116]:
# Distributed-memory Doc2Vec (dm=1) that averages context vectors (dm_mean=1).
# `workers` uses the detected CPU count, matching model_dbow, instead of a
# hard-coded 5 that may not suit the machine.
# alpha and min_alpha are set equal, so a train() call that does not override
# them applies a constant learning rate of 0.065.
model_dmm = Doc2Vec(
    dm=1,
    dm_mean=1,
    vector_size=300,
    window=10,
    negative=5,
    min_count=1,
    workers=cores,
    alpha=0.065,
    min_alpha=0.065,
)
# Build the vocabulary from the training documents only.
model_dmm.build_vocab(list(tqdm(train_tagged.values)))

100%|██████████| 1675/1675 [00:00<00:00, 755832.08it/s]


In [117]:
%%time
for epoch in range(30):
    model_dmm.train(utils.shuffle([x for x in tqdm(train_tagged.values)]), total_examples=len(train_tagged.values), epochs=1)
    model_dmm.alpha -= 0.002
    model_dmm.min_alpha = model_dmm.alpha

100%|██████████| 1675/1675 [00:00<00:00, 828278.61it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1230593.66it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1541008.82it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1169740.13it/s]
100%|██████████| 1675/1675 [00:00<00:00, 878182.40it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1461505.97it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1269049.71it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1686784.92it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1639164.54it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1266076.63it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1246090.67it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1673525.30it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1638782.18it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1702316.26it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1460594.43it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1606186.37it/s]
100%|██████████| 1675/1675 [00:00<00:00, 1215477.37it/s]
100%|██████████| 1675/1675 [00:00

CPU times: user 4min 39s, sys: 2.83 s, total: 4min 42s
Wall time: 1min 32s


In [118]:
# Party labels (y) and distributed-memory document vectors (X) for each split.
y_train, X_train = vec_for_learning(model_dmm, train_tagged)
y_test, X_test = vec_for_learning(model_dmm, test_tagged)

# Very weakly regularised logistic regression on the inferred vectors.
logreg = LogisticRegression(C=100000).fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Train accuracy, test accuracy, then a blank line: one value per line.
print(logreg.score(X_train, y_train), logreg.score(X_test, y_test), "", sep="\n")
print(classification_report(y_test, y_pred))

# Confusion matrix with rows and columns both ordered by `labels`.
pd.DataFrame(
    data=confusion_matrix(y_test, y_pred, labels=labels),
    index=labels,
    columns=labels,
)

1.0
0.8398328690807799

              precision    recall  f1-score   support

Conservative       0.88      0.91      0.89       517
      Labour       0.74      0.67      0.70       201

   micro avg       0.84      0.84      0.84       718
   macro avg       0.81      0.79      0.80       718
weighted avg       0.84      0.84      0.84       718





Unnamed: 0,Conservative,Labour
Conservative,469,48
Labour,67,134


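Results are much better (test accuracy 0.84), although still not better than the count-vectorised logistic regression model, and the training accuracy of 1.0 suggests the classifier is overfitting. Try concatenating the two models: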

In [119]:
# Combine the DBOW and DM models through gensim's ConcatenatedDoc2Vec helper
# (imported from gensim.test.test_doc2vec), so each document can be represented
# by both models together.
new_model = ConcatenatedDoc2Vec([model_dbow, model_dmm])

In [121]:
# Party labels (y) and combined-model document vectors (X) for each split.
y_train, X_train = vec_for_learning(new_model, train_tagged)
y_test, X_test = vec_for_learning(new_model, test_tagged)

# Very weakly regularised logistic regression on the inferred vectors.
logreg = LogisticRegression(C=100000).fit(X_train, y_train)
y_pred = logreg.predict(X_test)

# Train accuracy, test accuracy, then a blank line: one value per line.
print(logreg.score(X_train, y_train), logreg.score(X_test, y_test), "", sep="\n")
print(classification_report(y_test, y_pred))

# Confusion matrix with rows and columns both ordered by `labels`.
pd.DataFrame(
    data=confusion_matrix(y_test, y_pred, labels=labels),
    index=labels,
    columns=labels,
)



1.0
0.841225626740947

              precision    recall  f1-score   support

Conservative       0.89      0.89      0.89       517
      Labour       0.71      0.72      0.72       201

   micro avg       0.84      0.84      0.84       718
   macro avg       0.80      0.80      0.80       718
weighted avg       0.84      0.84      0.84       718



Unnamed: 0,Conservative,Labour
Conservative,459,58
Labour,56,145


Using doc2vec (best test accuracy 0.84, from the concatenated model) hasn't given us any stronger results than a logistic regression on individual word counts.