In [1]:
import pandas as pd
import numpy as np
import ast

from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix, classification_report

from keras.models import Sequential
from keras.layers import Dense

from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
uk_pol_tokens = pd.read_csv('uk_pol_tokens.csv', converters={5:ast.literal_eval})

In [3]:
train_df, test_df = train_test_split(uk_pol_tokens, stratify=uk_pol_tokens['party'], test_size=0.3, random_state=1)

In [4]:
stop = stopwords.words('english')
# add additional stopwords based on the strongest predictors from the weak model:
stop = stop + ['conservative','conservatives','tory','tories','labour','jeremy','corbyn','george','may','pdf','.']

In [5]:
X_train, y_train = train_df.tokenised, train_df.party
X_test, y_test = test_df.tokenised, test_df.party

In [6]:
y_train_bin = y_train.map(lambda x: 1 if x=='Conservative' else 0)
y_test_bin = y_test.map(lambda x: 1 if x=='Conservative' else 0)

In [7]:
vectorizer = CountVectorizer(binary=True, stop_words=stopwords.words('english'), 
                             lowercase=True, min_df=3, max_df=0.9, max_features=5000)

X_train_onehot = vectorizer.fit_transform(X_train)

In [23]:
model = Sequential()
 
model.add(Dense(units=1000, activation='relu', input_dim=len(vectorizer.get_feature_names())))
model.add(Dense(units=1000, activation='relu', input_dim=len(vectorizer.get_feature_names())))
model.add(Dense(units=1, activation='sigmoid'))
 
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_10 (Dense)             (None, 1000)              5001000   
_________________________________________________________________
dense_11 (Dense)             (None, 1000)              1001000   
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 1001      
Total params: 6,003,001
Trainable params: 6,003,001
Non-trainable params: 0
_________________________________________________________________


In [24]:
model.fit(X_train_onehot[:-100], y_train_bin[:-100], 
          epochs=2, batch_size=128, verbose=1, 
          validation_data=(X_train_onehot[-100:], y_train_bin[-100:]))

Train on 1575 samples, validate on 100 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x129ce43c8>

In [25]:
scores = model.evaluate(vectorizer.transform(X_test), y_test_bin, verbose=1)
print("Accuracy:", scores[1]) 

Accuracy: 0.7743732585548359


In [26]:
predictions = pd.DataFrame(model.predict(vectorizer.transform(X_test))).applymap(lambda x: 'Conservative' if x>=0.5 else 'Labour')[0]

In [27]:
print(classification_report(test_df.party, predictions))

pd.DataFrame(confusion_matrix(test_df.party, predictions,
                              labels=test_df.party.unique()),
             columns=test_df.party.unique(),
             index=test_df.party.unique())

              precision    recall  f1-score   support

Conservative       0.92      0.75      0.83       517
      Labour       0.57      0.83      0.67       201

   micro avg       0.77      0.77      0.77       718
   macro avg       0.74      0.79      0.75       718
weighted avg       0.82      0.77      0.78       718



Unnamed: 0,Conservative,Labour
Conservative,389,128
Labour,34,167


Not sure I know enough about keras to improve on this model, will leave it here for now.