In [16]:

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer, one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D, MaxPooling1D, Flatten


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
model = Sequential()
model.add(Embedding(vocab_size, 100, input_length=max_length))
model.add(Conv1D(filters=32, kernel_size=8, activation='relu'))
model.add(Conv1D(filters=16, kernel_size=8, activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dense(10, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
print(model.summary())

Model: "sequential_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 100, 100)          904200    
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 93, 32)            25632     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 46, 32)            0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 1472)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 10)                14730     
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 11        
Total params: 944,573
Trainable params: 944,573
Non-trainable params: 0
________________________________________________

In [0]:
df = pd.read_csv('data/prepped/prepped.csv')
df['postText'] = df['postText'].transform(lambda x: np.str_(x))

In [3]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['postText'])
max_length = 100 #max(len(p) for p in df['postText'])
vocab_size = 9042 #len(tokenizer.word_index) + 1
encoded_docs = [one_hot(d, vocab_size) for d in df['postText']]
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(f'max length {max_length}')
print(f'vocab size {vocab_size}')

max length 100
vocab size 9042


In [4]:
embeddings_index = dict()
f = open('/content/drive/My Drive/ipython notebooks/Embeddings/GloVe/glove.6B.100d.txt')
for line in f:
	values = line.split()
	word = values[0]
	coefs = np.asarray(values[1:], dtype='float32')
	embeddings_index[word] = coefs
f.close()
print('Loaded %s word vectors.' % len(embeddings_index))

Loaded 400000 word vectors.


In [0]:
embedding_matrix = np.zeros((vocab_size, 100))

for word, i in tokenizer.word_index.items():
    if i >= vocab_size:
        break
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

X_train, X_test, y_train, y_test = train_test_split(padded_docs, df['truthMean'], test_size=0.3, random_state=42, stratify=np.round(df['truthMean']))
y_train_bool = np.round(y_train)
y_test_bool = np.round(y_test)

In [20]:
# set parameters:
batch_size = 64
embedding_dims = 100
filters = 250
kernel_size = 3
hidden_dims = 100
epochs = 5

filters_2 = 125

cnn_glove = Sequential()

cnn_glove.add(Embedding(vocab_size, embedding_dims, weights=[embedding_matrix], trainable=True, input_length=max_length))

cnn_glove.add(Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1))
cnn_glove.add(Conv1D(filters_2, kernel_size, padding='valid', activation='relu', strides=1))
cnn_glove.add(GlobalMaxPooling1D())

cnn_glove.add(Dense(hidden_dims))
cnn_glove.add(Dropout(0.2))
cnn_glove.add(Activation('relu'))

cnn_glove.add(Dense(1))
cnn_glove.add(Activation('sigmoid'))
cnn_glove.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error', 'accuracy'])
print(cnn_glove.summary())

Model: "sequential_10"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 100, 100)          904200    
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 98, 250)           75250     
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 96, 125)           93875     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 125)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 100)               12600     
_________________________________________________________________
dropout_6 (Dropout)          (None, 100)               0         
_________________________________________________________________
activation_5 (Activation)    (None, 100)             

In [0]:
cnn_glove.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=[X_test, y_test])

Train on 15397 samples, validate on 6600 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f130a17dcf8>

In [0]:
y_pred_bool = np.round(cnn_glove.predict(X_test))
print(classification_report(y_test_bool, y_pred_bool))
print(f'Accuracy: {accuracy_score(y_test_bool, y_pred_bool)}')

              precision    recall  f1-score   support

         0.0       0.90      0.72      0.80      4965
         1.0       0.47      0.76      0.58      1635

    accuracy                           0.73      6600
   macro avg       0.69      0.74      0.69      6600
weighted avg       0.79      0.73      0.75      6600

Accuracy: 0.7289393939393939


In [0]:
cnn_no_glove = Sequential()
cnn_no_glove.add(Embedding(vocab_size, embedding_dims, weights=[embedding_matrix], trainable=True, input_length=max_length))
cnn_no_glove.add(Dropout(0.2))
cnn_no_glove.add(Conv1D(filters_2, kernel_size, padding='valid', activation='relu', strides=1))
cnn_no_glove.add(GlobalMaxPooling1D())
cnn_no_glove.add(Dense(hidden_dims))
cnn_no_glove.add(Dropout(0.2))
cnn_no_glove.add(Activation('relu'))
cnn_no_glove.add(Dense(1))
cnn_no_glove.add(Activation('sigmoid'))
cnn_no_glove.compile(loss='mean_squared_error', optimizer='adam', metrics=['mean_squared_error', 'accuracy'])
print(cnn_no_glove.summary())

Model: "sequential_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 100)          904200    
_________________________________________________________________
dropout_7 (Dropout)          (None, 100, 100)          0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 98, 125)           37625     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 125)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 100)               12600     
_________________________________________________________________
dropout_8 (Dropout)          (None, 100)               0         
_________________________________________________________________
activation_7 (Activation)    (None, 100)              

In [0]:
cnn_no_glove.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=[X_test, y_test])

Train on 15397 samples, validate on 6600 samples
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


<keras.callbacks.History at 0x7f13110a9630>

In [0]:
y_pred_bool = np.round(cnn_no_glove.predict(X_test))
print(classification_report(y_test_bool, y_pred_bool))
print(f'Accuracy: {accuracy_score(y_test_bool, y_pred_bool)}')

              precision    recall  f1-score   support

         0.0       0.90      0.68      0.78      4965
         1.0       0.45      0.77      0.57      1635

    accuracy                           0.71      6600
   macro avg       0.67      0.73      0.67      6600
weighted avg       0.79      0.71      0.73      6600

Accuracy: 0.7059090909090909
