RNN with bidirectional GRU. Run on collab with TPU for faster speed.

In [0]:
import io
import time
import re
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import classification_report
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
import keras.backend as K
import tensorflow as tf
from tensorflow.keras import layers

!pip install tensorflow==2.0.0-beta1

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
df = pd.read_csv('data/prepped/prepped.csv')
df['postText'] = df['postText'].transform(lambda x: np.str_(x))

In [0]:
# Encode documents
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['postText'])
max_length = 100 #max(len(p) for p in df['postText'])
vocab_size = 9042 #len(tokenizer.word_index) + 1
encoded_docs = [one_hot(d, vocab_size) for d in df['postText']]
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(f'max length {max_length}')
print(f'vocab size {vocab_size}')

max length 100
vocab size 9042


In [0]:
embeddings_index = dict()
f = open('/content/drive/My Drive/ipython notebooks/Embeddings/GloVe/glove.6B.100d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [0]:
embedding_matrix = np.zeros((vocab_size, 100))

for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

X_train, X_test, y_train, y_test = train_test_split(padded_docs, df['truthMean'], test_size=0.3, random_state=42, stratify=np.round(df['truthMean']))
y_train_bool = np.round(y_train)
y_test_bool = np.round(y_test)

In [0]:
# bidirection GRU with pretrained 100d Glove embedding
model_gru_glove = tf.keras.Sequential()
model_gru_glove.add(layers.Embedding(vocab_size, 100, weights=[embedding_matrix], trainable=True, input_length=max_length))
model_gru_glove.add(layers.Bidirectional(layers.GRU(16, return_sequences=False)))
model_gru_glove.add(layers.Flatten())
model_gru_glove.add(layers.Dense(1, activation='sigmoid'))
model_gru_glove.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse', 'accuracy'])
print(model_gru_glove.summary())


Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 100, 100)          904200    
_________________________________________________________________
bidirectional_10 (Bidirectio (None, 32)                11328     
_________________________________________________________________
flatten_10 (Flatten)         (None, 32)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 33        
Total params: 915,561
Trainable params: 915,561
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
model_gru_glove.fit(X_train, y_train, epochs = 5, validation_data=[X_test, y_test])

Train on 15397 samples, validate on 6600 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7feca66f7240>

In [0]:
y_pred_bool = np.round(model_gru_glove.predict(X_test))
print(classification_report(y_test_bool, y_pred_bool))
print(f'Accuracy: {accuracy_score(y_test_bool, y_pred_bool)}')


              precision    recall  f1-score   support

         0.0       0.86      0.90      0.88      4965
         1.0       0.65      0.57      0.61      1635

    accuracy                           0.82      6600
   macro avg       0.76      0.74      0.74      6600
weighted avg       0.81      0.82      0.81      6600

Accuracy: 0.8172727272727273


In [0]:
# bidirection GRU
model_gru_no_glove = tf.keras.Sequential()
model_gru_no_glove.add(layers.Embedding(vocab_size, 100, input_length=max_length))
model_gru_no_glove.add(layers.Bidirectional(layers.GRU(16, return_sequences=False)))
model_gru_no_glove.add(layers.Flatten())
model_gru_no_glove.add(layers.Dense(1, activation='sigmoid'))
model_gru_no_glove.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse', 'accuracy'])
print(model_gru_no_glove.summary())


Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 100, 100)          904200    
_________________________________________________________________
bidirectional_7 (Bidirection (None, 32)                11328     
_________________________________________________________________
flatten_7 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 33        
Total params: 915,561
Trainable params: 915,561
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
model_gru_no_glove.fit(X_train, y_train, epochs = 3, validation_data=[X_test, y_test])


Train on 15397 samples, validate on 6600 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7feca40df940>

In [0]:
y_pred_bool = np.round(model_gru_no_glove.predict(X_test))
print(classification_report(y_test_bool, y_pred_bool))
print(f'Accuracy: {accuracy_score(y_test_bool, y_pred_bool)}')


              precision    recall  f1-score   support

         0.0       0.85      0.90      0.87      4965
         1.0       0.63      0.51      0.57      1635

    accuracy                           0.81      6600
   macro avg       0.74      0.71      0.72      6600
weighted avg       0.79      0.81      0.80      6600

Accuracy: 0.805


In [0]:
# bidirection LSTM glove embedding
lstm_glove = tf.keras.Sequential()
lstm_glove.add(layers.Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=True))
lstm_glove.add(layers.Bidirectional(layers.LSTM(16, return_sequences=False)))
lstm_glove.add(layers.Flatten())
lstm_glove.add(layers.Dense(1, activation='sigmoid'))
lstm_glove.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse', 'accuracy'])
print(lstm_glove.summary())

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 100)          904200    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 32)                14976     
_________________________________________________________________
flatten_2 (Flatten)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 919,209
Trainable params: 919,209
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
lstm_glove.fit(X_train, y_train, epochs = 3, validation_data=[X_test, y_test])

Train on 15397 samples, validate on 6600 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fec98da74e0>

In [0]:
y_pred_bool = np.round(lstm_glove.predict(X_test))
print(classification_report(y_test_bool, y_pred_bool))
print(f'Accuracy: {accuracy_score(y_test_bool, y_pred_bool)}')


              precision    recall  f1-score   support

         0.0       0.86      0.89      0.87      4965
         1.0       0.63      0.58      0.60      1635

    accuracy                           0.81      6600
   macro avg       0.74      0.73      0.74      6600
weighted avg       0.80      0.81      0.81      6600

Accuracy: 0.8093939393939394


In [0]:
# bidirection LSTM
lstm_no_glove = tf.keras.Sequential()
lstm_no_glove.add(layers.Embedding(vocab_size, 100, input_length=max_length))
lstm_no_glove.add(layers.Bidirectional(layers.LSTM(16, return_sequences=False)))
lstm_no_glove.add(layers.Flatten())
lstm_no_glove.add(layers.Dense(1, activation='sigmoid'))
lstm_no_glove.compile(optimizer='adam', loss='mean_squared_error', metrics=['mse', 'accuracy'])
print(lstm_no_glove.summary())


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 100)          904200    
_________________________________________________________________
bidirectional (Bidirectional (None, 32)                14976     
_________________________________________________________________
flatten (Flatten)            (None, 32)                0         
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 919,209
Trainable params: 919,209
Non-trainable params: 0
_________________________________________________________________
None


In [0]:
lstm_no_glove.fit(X_train, y_train, epochs = 3, validation_data=[X_test, y_test])

Train on 15397 samples, validate on 6600 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7efd490210f0>

In [0]:
y_pred_bool = np.round(lstm_no_glove.predict(X_test))
print(classification_report(y_test_bool, y_pred_bool))
print(f'Accuracy: {accuracy_score(y_test_bool, y_pred_bool)}')


ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-11-ffd5b2a4cd45>", line 1, in <module>
    y_pred_bool = np.round(lstm_no_glove.predict(X_test))
NameError: name 'X_test' is not defined

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/interactiveshell.py", line 1823, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'NameError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.6/dist-packages/IPython/core/ultratb.py", line 1132, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/usr/local/lib/python3.6/dist-pa

NameError: ignored