## CNN with FastText pre-trained word embeddings
 

### Load and tokenize dataset

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import pandas as pd

# read the preprocessed dataset and pull out the text and label columns
data = pd.read_csv('new_preprocessed_friends_data.csv')
texts, labels = data['text'].values, data['label'].values

# fit the vocabulary on the corpus, then encode every text as word indices
tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# pad all sequences at the end so they share the length of the longest one
max_length = max(map(len, sequences))
X = pad_sequences(sequences, maxlen=max_length, padding='post')

### Load FastText word embeddings
Downloaded from: https://fasttext.cc/docs/en/crawl-vectors.html

In [None]:
import os
import numpy as np
from gensim.models.fasttext import load_facebook_vectors

# path to a cached embedding matrix that may exist from earlier runs
file_path = 'embedding_matrix.npy'

# one row per tokenizer index, plus row 0 which no word in word_index maps to
vocab_size = len(tokenizer.word_index) + 1

# reuse the cached matrix when possible, to save time (takes 10 minutes to create)
embedding_matrix = None
if os.path.exists(file_path):
    cached_matrix = np.load(file_path)
    # A cache built from a different vocabulary would silently map word indices
    # to the wrong vectors. The row count is a cheap consistency check; it cannot
    # detect a different vocabulary of the same size, so delete the file manually
    # whenever the dataset or preprocessing changes.
    if cached_matrix.shape[0] == vocab_size:
        embedding_matrix = cached_matrix
        print('embedding matrix found')
    else:
        print('embedding matrix found but it does not match the current vocabulary, rebuilding')
else:
    print('embedding matrix NOT found')

if embedding_matrix is None:
    model_path = 'cc.sv.300.bin'
    fasttext_model = load_facebook_vectors(model_path)

    # map each word of the dataset vocabulary to its vector in the FastText model;
    # rows without a vector stay all-zero
    embedding_matrix = np.zeros((vocab_size, fasttext_model.vector_size))
    for word, i in tokenizer.word_index.items():
        try:
            # the lookup either returns a vector or raises, it never yields None
            embedding_matrix[i] = fasttext_model[word]
        except KeyError:
            # no vector could be produced for this word: keep the zero row
            continue

    # save the newly created embedding matrix for later runs
    np.save(file_path, embedding_matrix)

### Hyperparameter tuning, grid search

In [None]:
from sklearn.utils.class_weight import compute_class_weight
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.metrics import make_scorer, fbeta_score
from keras.regularizers import l2
from keras.layers import Dropout
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

# compute balanced class weights; `classes` is passed as an ndarray because
# newer scikit-learn releases validate this argument's type and reject a plain list
class_labels = np.array([0, 1])
class_weights = compute_class_weight('balanced', classes=class_labels, y=labels)
# map each class label to its weight (the dict is later passed as `class_weight` to fit)
class_weights = dict(zip(class_labels.tolist(), class_weights))

# F3 score for class label 1, wrapped below as a scikit-learn scorer
def f3_scorer(true_labels, predicted_labels):
    """Return the F-beta score with beta=3, treating label 1 as the positive class."""
    f3_value = fbeta_score(
        y_true=true_labels,
        y_pred=predicted_labels,
        beta=3,
        pos_label=1,
    )
    return f3_value

scorer = make_scorer(score_func=f3_scorer)

# creation of the CNN model with its layers
def create_model(optimizer, kernel_regularizer_val, dropout_rate):
    """Build and compile the CNN classifier on top of the frozen embedding matrix.

    Reads the module-level ``tokenizer``, ``embedding_matrix`` and ``max_length``.

    Args:
        optimizer: optimizer passed straight to ``model.compile``.
        kernel_regularizer_val: L2 factor applied to the Conv1D and hidden Dense kernels.
        dropout_rate: rate used by both Dropout layers.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss, accuracy metric).
    """
    model = Sequential([
        Embedding(input_dim=len(tokenizer.word_index) + 1,
                  # take the vector size from the matrix itself instead of
                  # hard-coding 300, so other embedding sizes work too
                  output_dim=embedding_matrix.shape[1],
                  weights=[embedding_matrix],
                  input_length=max_length,
                  # keep the pre-trained vectors fixed during training
                  trainable=False),
        Conv1D(filters=256, kernel_size=3, activation='relu',
               kernel_regularizer=l2(kernel_regularizer_val)),
        Dropout(dropout_rate),
        GlobalMaxPooling1D(),
        Dense(units=64, activation='relu', kernel_regularizer=l2(kernel_regularizer_val)),
        Dropout(dropout_rate),
        # single sigmoid unit: output is the probability of class 1
        Dense(units=1, activation='sigmoid')
    ])
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model


# hyperparameter combinations to evaluate
param_grid = dict(
    optimizer=['rmsprop', 'adam'],
    kernel_regularizer_val=[0.01, 0.02],
    dropout_rate=[0.5, 0.6],
)

# stratified, shuffled 5-fold split with a fixed seed
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# scikit-learn compatible wrapper around the Keras model builder
model = KerasClassifier(build_fn=create_model, epochs=30, batch_size=32)

# grid search scored with the F3 scorer (bullying class)
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=scorer,
    cv=skf,
    verbose=1,
)

# run the search; the class weights are passed along to fit
grid_search.fit(X, labels, class_weight=class_weights)

# keep the winning combination for the evaluation step and report it
best_params = grid_search.best_params_
print("Best parameters:", best_params)
print("Best F3 score:", grid_search.best_score_)


### Evaluating with cross-validation

In [None]:
from sklearn.metrics import precision_recall_fscore_support, fbeta_score, confusion_matrix
from keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
import seaborn as sns

# perform Stratified 5-Fold cross-validation with the best hyperparameters,
# pooling the true labels and predicted probabilities of every test fold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
all_y_true = []
all_y_pred_prob = []

for fold, (train_idx, test_idx) in enumerate(skf.split(X, labels)):
    print(f"\nFold {fold+1}")
    X_train, X_test = X[train_idx], X[test_idx]
    y_train, y_test = labels[train_idx], labels[test_idx]

    # NOTE(review): patience (30) equals the number of epochs (30), so training
    # can never actually stop early; whether the best weights are still restored
    # at the end of training depends on the Keras version - confirm.
    early_stopping_monitor = EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True)

    # build a fresh model for this fold from the best parameters found by the
    # grid search (a second, hard-coded model built here earlier was never used)
    model = create_model(optimizer=best_params['optimizer'],
                         kernel_regularizer_val=best_params['kernel_regularizer_val'],
                         dropout_rate=best_params['dropout_rate'])

    # train the model
    # NOTE(review): the test fold doubles as validation data for the callback,
    # so the reported scores may be optimistic - consider a separate validation split.
    history = model.fit(X_train, y_train, epochs=30, batch_size=32,
                        validation_data=(X_test, y_test),
                        verbose=2, callbacks=[early_stopping_monitor],
                        class_weight=class_weights)

    # extract loss values for clearer access
    training_loss = history.history['loss']
    validation_loss = history.history['val_loss']

    # plot training and validation loss
    plt.figure()
    plt.plot(training_loss, label='Training Loss')
    plt.plot(validation_loss, label='Validation Loss')
    plt.title(f'Fold {fold+1}: Training and Validation Loss Over Epochs')
    plt.xlabel('Epochs')
    plt.ylabel('Loss')
    plt.legend()
    plt.show()

    # predict probabilities; the model has a single output column, taken here
    # as a flat vector of class-1 probabilities
    y_pred_prob = model.predict(X_test)
    y_pred_prob_positive = y_pred_prob[:, 0]

    all_y_true.extend(y_test)
    all_y_pred_prob.extend(y_pred_prob_positive)

# convert lists to numpy arrays for easier handling
all_y_true = np.array(all_y_true)
all_y_pred_prob = np.array(all_y_pred_prob)

# boolean masks selecting the pooled samples of each true class
y_true_0 = all_y_true == 0
y_true_1 = all_y_true == 1

# overlay one histogram of predicted probabilities per true class
plt.figure()
for hist_mask, hist_label, hist_color in (
    (y_true_0, 'Non-bullying (label 0)', 'blue'),
    (y_true_1, 'Bullying (label 1)', 'red'),
):
    plt.hist(all_y_pred_prob[hist_mask], bins=80, alpha=0.5, label=hist_label, color=hist_color)
plt.title('Combined Distribution of Predicted Probabilities')
plt.xlabel('Probability of Positive Class')
plt.ylabel('Frequency')
plt.legend(loc='upper right')
plt.show()


### Find optimal threshold that maximizes f3-score

In [None]:
# track the threshold with the highest class-1 F3 score and its confusion matrix
best_threshold = None
best_conf_matrix = None
best_f3_score = -1

# evaluate different thresholds on the results pooled over all folds
thresholds = [0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5]
for threshold in thresholds:
    # a sample is predicted as class 1 when its probability reaches the threshold
    y_pred_classes = (all_y_pred_prob >= threshold).astype(int)
    # per-class metrics; index 1 is the bullying class
    precision, recall, f1, _ = precision_recall_fscore_support(all_y_true, y_pred_classes, labels=[0, 1], average=None, zero_division=0)
    f3_1 = fbeta_score(all_y_true, y_pred_classes, beta=3, average='binary', pos_label=1, zero_division=0)
    # fix the label order so the matrix is always 2x2, matching the tick labels
    # of the heatmap drawn from it, even if one class is never predicted
    conf_matrix = confusion_matrix(all_y_true, y_pred_classes, labels=[0, 1])
    print(f"\nThreshold {threshold}:")
    print(f"Average report: Precision {precision[1]:.2f}, Recall {recall[1]:.2f}, F1-score {f1[1]:.2f}, F3-score {f3_1:.3f}")

    # check if this threshold has the best F3 score
    if f3_1 > best_f3_score:
        best_f3_score = f3_1
        best_threshold = threshold
        best_conf_matrix = conf_matrix

# draw the confusion matrix of the best threshold as an annotated heatmap
sns.set_theme(style='whitegrid')
plt.figure(figsize=(8, 6))
predicted_ticks = ['Predicted 0', 'Predicted 1']
actual_ticks = ['Actual 0', 'Actual 1']
sns.heatmap(
    best_conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    xticklabels=predicted_ticks,
    yticklabels=actual_ticks,
)
plt.title(f'Confusion Matrix for Threshold {best_threshold}: with Highest F3-Score ({best_f3_score:.3f})')
plt.xlabel('Predicted Labels')
plt.ylabel('Actual Labels')
plt.show()