In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.utils import class_weight

In [2]:
df = pd.read_csv('labeled_data.csv', index_col=0)
df.head(5)

Unnamed: 0,sentence,prediction
0,twitter prior musk takeover talking directly n...,1
1,article say imply states feature turns crashes...,1
2,og musk duck lives wall,1
3,dare_speak way great powerful musk obviously d...,1
4,cannot_wait finally excuse shower douche,1


In [3]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

In [4]:
for label, count in dict(df['prediction'].value_counts()).items():
    print(f'{label}: {count / df.shape[0]}')

1: 0.9035761589403973
0: 0.09642384105960265


In [5]:
df['prediction'] = df['prediction'].astype('int8')

In [6]:
df[df['prediction'] == 0].shape

(5824, 2)

In [7]:
# limit the dataset to fit in memory
limited_df = pd.concat([df[df['prediction'] == 0], df[df['prediction'] == 1][:10000]])
limited_df.shape

(15824, 2)

In [8]:
limited_df.sample(5)

Unnamed: 0,sentence,prediction
102,vaguely elon_musk aligned clickbait rich - guy,1
4870,context calling musk bully,1
3320,dm_matter old may open good ideas good referen...,0
700,tl_; dr : maybe really understand point podcas...,1
5335,musk become fat oligarch,1


In [9]:
x_train, x_test, y_train, y_test = train_test_split(
    limited_df['sentence'], limited_df['prediction'], test_size=0.1, stratify=limited_df['prediction'])

x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.33, stratify=y_train
)

In [10]:
for label, count in dict(y_test.value_counts()).items():
    print(f'{label}: {count / y_test.shape[0]}')

1: 0.6317119393556538
0: 0.36828806064434616


In [11]:
for label, count in dict(y_val.value_counts()).items():
    print(f'{label}: {count / y_val.shape[0]}')

1: 0.6319148936170212
0: 0.3680851063829787


In [12]:
for label, count in dict(y_test.value_counts()).items():
    print(f'{label}: {count / y_test.shape[0]}')

1: 0.6317119393556538
0: 0.36828806064434616


In [13]:
y_train.value_counts(), y_val.value_counts(), y_test.value_counts()

(1    6030
 0    3511
 Name: prediction, dtype: int64,
 1    2970
 0    1730
 Name: prediction, dtype: int64,
 1    1000
 0     583
 Name: prediction, dtype: int64)

In [14]:
MAX_FEATURES = 5000

count_vec = CountVectorizer(max_features=MAX_FEATURES, min_df=5)
x_train = count_vec.fit_transform(x_train).toarray()
x_val = count_vec.transform(x_val).toarray()
x_test = count_vec.transform(x_test).toarray()

In [15]:
import tensorflow as tf

with tf.device('/cpu:0'):
    x_train = tf.convert_to_tensor(x_train, np.int8)
    x_val = tf.convert_to_tensor(x_val, np.int8)
    x_test = tf.convert_to_tensor(x_test, np.int8)

In [16]:
x_train.shape

TensorShape([9541, 5000])

In [17]:
from keras.layers import Dense, Dropout, LSTM
from keras.models import Model, Sequential
from keras.callbacks import EarlyStopping
from keras.metrics import Recall

early_stopping = EarlyStopping(patience=5, restore_best_weights=True)

In [18]:
class_weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weights = dict(zip(np.unique(y_train), class_weights))
class_weights

{0: 1.358729706636286, 1: 0.7911276948590381}

In [19]:
from keras.backend import clear_session

EPOCHS = 25
BATCH_SIZE = 8

def compile_and_fit_model(model: Model, model_name: str):
    clear_session()

    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy', Recall()]  # class imbalance
    )

    model_history = model.fit(
        x_train, y_train,
        validation_data=(x_val, y_val),
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=[early_stopping],
        class_weight=class_weights
    )

    model.save(model_name + '.h5')

    return model_history.history

In [20]:
def display_confusion_matrix(model_prediction):
    model_prediction = np.argmax(model_prediction, axis=1)
    print(confusion_matrix(y_test, model_prediction, normalize='pred'))

In [21]:
def display_classification_report(model_prediction):
    model_prediction = np.argmax(model_prediction, axis=1)
    print(classification_report(y_test, model_prediction, zero_division=0))

In [22]:
def display_model_results(model: Model):
    prediction = model.predict(x_test)

    print('Confussion matrix:')
    display_confusion_matrix(prediction)
    
    print('\nClassification report:')
    display_classification_report(prediction)

    print('\nModel evaluation on test data [loss, accuracy, recall]:')
    print(model.evaluate(x_test, y_test))

In [23]:
dense_model1 = Sequential([
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
])

dense_model1_history = compile_and_fit_model(dense_model1, 'dense_model1')

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25


In [24]:
display_model_results(dense_model1)

Confussion matrix:
[[0.36828806 0.        ]
 [0.63171194 0.        ]]

Classification report:
              precision    recall  f1-score   support

           0       0.37      1.00      0.54       583
           1       0.00      0.00      0.00      1000

    accuracy                           0.37      1583
   macro avg       0.18      0.50      0.27      1583
weighted avg       0.14      0.37      0.20      1583


Model evaluation on test data [loss, accuracy, recall]:
[0.22786106169223785, 0.9109286069869995, 0.9150000214576721]


In [25]:
dense_model2 = Sequential([
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid'),
])

dense_model2_history = compile_and_fit_model(dense_model2, 'dense_model2')

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25


In [26]:
display_model_results(dense_model2)

Confussion matrix:
[[0.36828806 0.        ]
 [0.63171194 0.        ]]

Classification report:
              precision    recall  f1-score   support

           0       0.37      1.00      0.54       583
           1       0.00      0.00      0.00      1000

    accuracy                           0.37      1583
   macro avg       0.18      0.50      0.27      1583
weighted avg       0.14      0.37      0.20      1583


Model evaluation on test data [loss, accuracy, recall]:
[0.2183535099029541, 0.9172457456588745, 0.9380000233650208]


In [27]:
dense_model3 = Sequential([
    Dense(512, activation='relu'),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(1, activation='sigmoid')
])

dense_model3_history = compile_and_fit_model(dense_model3, 'dense_model3')

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25


In [28]:
display_model_results(dense_model3)

Confussion matrix:
[[0.36828806 0.        ]
 [0.63171194 0.        ]]

Classification report:
              precision    recall  f1-score   support

           0       0.37      1.00      0.54       583
           1       0.00      0.00      0.00      1000

    accuracy                           0.37      1583
   macro avg       0.18      0.50      0.27      1583
weighted avg       0.14      0.37      0.20      1583


Model evaluation on test data [loss, accuracy, recall]:
[0.24906308948993683, 0.9096651673316956, 0.9100000262260437]


In [40]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

tokenizer = Tokenizer(num_words=MAX_FEATURES, split=' ')
tokenizer.fit_on_texts(limited_df['sentence'].values)
X = tokenizer.texts_to_sequences(limited_df['sentence'].values)
X = pad_sequences(X, maxlen=MAX_FEATURES, padding='post')

In [41]:
x_train, x_test, y_train, y_test = train_test_split(
    X, limited_df['prediction'], test_size=0.1, stratify=limited_df['prediction'])

x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.33, stratify=y_train)

In [42]:
with tf.device('/cpu:0'):
    x_train = tf.convert_to_tensor(x_train, np.int32)
    x_val = tf.convert_to_tensor(x_val, np.int32)
    x_test = tf.convert_to_tensor(x_test, np.int32)

In [43]:
embedding_dimension = 64
lstm_output = 64

In [45]:
from keras.layers import Embedding

lstm_model1 = Sequential([
    Embedding(MAX_FEATURES, embedding_dimension, input_length=X.shape[1]),
    LSTM(lstm_output, dropout=0.2),
    Dense(1, activation='sigmoid')
])

lstm_model1_history = compile_and_fit_model(lstm_model1, 'lstm_model1')

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25


In [46]:
display_model_results(lstm_model1)

Confussion matrix:
[[0.36828806 0.        ]
 [0.63171194 0.        ]]

Classification report:
              precision    recall  f1-score   support

           0       0.37      1.00      0.54       583
           1       0.00      0.00      0.00      1000

    accuracy                           0.37      1583
   macro avg       0.18      0.50      0.27      1583
weighted avg       0.14      0.37      0.20      1583


Model evaluation on test data [loss, accuracy, recall]:
[0.6810417771339417, 0.6317119598388672, 1.0]


In [47]:
from keras.layers import Bidirectional

lstm_model2 = Sequential([
    Embedding(MAX_FEATURES, embedding_dimension, input_length=X.shape[1]),
    Bidirectional(LSTM(lstm_output, dropout=0.2)),
    Dense(1, activation='sigmoid')
])

lstm_model2_history = compile_and_fit_model(lstm_model2, 'lstm_model2')

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25


In [48]:
display_model_results(lstm_model2)

Confussion matrix:
[[0.36828806 0.        ]
 [0.63171194 0.        ]]

Classification report:
              precision    recall  f1-score   support

           0       0.37      1.00      0.54       583
           1       0.00      0.00      0.00      1000

    accuracy                           0.37      1583
   macro avg       0.18      0.50      0.27      1583
weighted avg       0.14      0.37      0.20      1583


Model evaluation on test data [loss, accuracy, recall]:
[0.1795957386493683, 0.9222994446754456, 0.9380000233650208]


In [49]:
embedding_dimension = 96
lstm_output = 128

In [50]:
lstm_model3 = Sequential([
    Embedding(MAX_FEATURES, embedding_dimension, input_length=X.shape[1]),
    Bidirectional(LSTM(lstm_output, dropout=0.2)),
    Dense(1, activation='sigmoid')
])

lstm_model3_history = compile_and_fit_model(lstm_model3, 'lstm_model3')

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25


In [51]:
display_model_results(lstm_model3)

Confussion matrix:
[[0.36828806 0.        ]
 [0.63171194 0.        ]]

Classification report:
              precision    recall  f1-score   support

           0       0.37      1.00      0.54       583
           1       0.00      0.00      0.00      1000

    accuracy                           0.37      1583
   macro avg       0.18      0.50      0.27      1583
weighted avg       0.14      0.37      0.20      1583


Model evaluation on test data [loss, accuracy, recall]:
[0.17477327585220337, 0.9267213940620422, 0.9440000057220459]


In [52]:
lstm_model4 = Sequential([
    Embedding(MAX_FEATURES, embedding_dimension, input_length=X.shape[1]),
    Bidirectional(LSTM(lstm_output, dropout=0.3)),
    Dense(1, activation='sigmoid')
])

lstm_model4_history = compile_and_fit_model(lstm_model4, 'lstm_model4')

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25


In [53]:
display_model_results(lstm_model4)

Confussion matrix:
[[0.36828806 0.        ]
 [0.63171194 0.        ]]

Classification report:
              precision    recall  f1-score   support

           0       0.37      1.00      0.54       583
           1       0.00      0.00      0.00      1000

    accuracy                           0.37      1583
   macro avg       0.18      0.50      0.27      1583
weighted avg       0.14      0.37      0.20      1583


Model evaluation on test data [loss, accuracy, recall]:
[0.1690511852502823, 0.9273531436920166, 0.9620000123977661]
