In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, Dense
from keras.optimizers import RMSprop
import keras.backend as K

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import average_precision_score, roc_curve, auc

In [None]:
# Colab-only cell: upload the CSV files that the loading cell later reads
# by bare filename from the working directory.
from google.colab import files
uploaded = files.upload()

Saving test_labelled_cleaned_no_punkt.csv to test_labelled_cleaned_no_punkt.csv
Saving test_unlabelled_cleaned_no_punkt.csv to test_unlabelled_cleaned_no_punkt.csv
Saving train_cleaned_no_punkt.csv to train_cleaned_no_punkt.csv


In [None]:
# 1. Load and prepare datasets
# The three cleaned CSVs are read by bare filename from the current working directory.
csv_paths = [
    "train_cleaned_no_punkt.csv",
    "test_labelled_cleaned_no_punkt.csv",
    "test_unlabelled_cleaned_no_punkt.csv",
]
train, test_labelled, test_unlabelled = [pd.read_csv(path) for path in csv_paths]

In [None]:
# 2. Create 'mal' label as binary
# A row gets mal = 1 when the row-wise sum of the six label columns is at least 1,
# and mal = 0 otherwise.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
for labelled_df in (train, test_labelled):
    labelled_df['mal'] = labelled_df[label_cols].sum(axis=1).ge(1).astype(int)

In [None]:
# 3. Drop multi-labels and fill missing text
# Only the binary 'mal' target is kept; the six fine-grained label columns are removed.
train.drop(columns=label_cols, inplace=True)
test_labelled.drop(columns=label_cols, inplace=True)

# Assign the filled column back instead of calling fillna(..., inplace=True) on
# `df['comment_text']`: that chained in-place form operates on an intermediate
# object, emits a FutureWarning, and stops updating the frame in pandas 3.0.
for frame in (train, test_labelled, test_unlabelled):
    frame['comment_text'] = frame['comment_text'].fillna("empty")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['comment_text'].fillna("empty", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test_labelled['comment_text'].fillna("empty", inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

In [None]:
# 4. Stratified split for balanced data
# Both labelled frames are split 71/29 with the 'mal' class ratio preserved.
rs = 42
split_options = dict(test_size=0.29, random_state=rs)
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    train['comment_text'], train['mal'],
    stratify=train['mal'], **split_options,
)
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    test_labelled['comment_text'], test_labelled['mal'],
    stratify=test_labelled['mal'], **split_options,
)

# 5. Combine datasets
# All four parts are stacked back together (train parts first, then test parts),
# so every labelled row ends up in `texts` / `labels` in the same relative order.
texts = np.concatenate([part.values for part in (X_train1, X_train2, X_test1, X_test2)])
labels = np.concatenate([part.values for part in (y_train1, y_train2, y_test1, y_test2)])

In [None]:
# 6. Text Tokenization and Padding
# Hyper-parameters shared by the tokenizer and the CNN.
max_features = 10000   # passed to the tokenizer as num_words and to the model builder
maxlen = 100           # every sequence is padded/truncated to this length
embed_dim = 100
dropout_rate = 0.3
num_filters = 300

# NOTE(review): the tokenizer is fit on every text in `texts`, i.e. before any
# cross-validation split is made.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)
X = pad_sequences(sequences, maxlen=maxlen)
y = labels


In [None]:
# 7. CNN Model Definition
def cnn_keras(max_features, maxlen, dropout_rate, embed_dim, num_filters=300):
    """Build and compile a 1-D convolutional binary classifier.

    Layer stack: Input(maxlen) -> trainable Embedding -> SpatialDropout1D ->
    Conv1D (kernel size 7, ReLU, 'same' padding) -> GlobalMaxPooling1D ->
    Dense(1, sigmoid).

    Args:
        max_features: input dimension of the embedding layer.
        maxlen: length of each input sequence.
        dropout_rate: rate passed to SpatialDropout1D.
        embed_dim: output dimension of the embedding layer.
        num_filters: number of Conv1D filters.

    Returns:
        A Keras ``Model`` compiled with binary cross-entropy loss,
        ``RMSprop(clipnorm=1)`` and the 'acc' metric.
    """
    # Reset the Keras session before building a new model (TensorFlow backend only).
    if K.backend() == 'tensorflow':
        K.clear_session()

    token_ids = Input(shape=(maxlen,))

    embedded = Embedding(max_features, output_dim=embed_dim, trainable=True)(token_ids)
    features = SpatialDropout1D(dropout_rate)(embedded)
    features = Conv1D(num_filters, 7, activation='relu', padding='same')(features)
    features = GlobalMaxPooling1D()(features)

    probability = Dense(1, activation="sigmoid")(features)

    classifier = Model(inputs=token_ids, outputs=probability)
    classifier.compile(
        loss='binary_crossentropy',
        optimizer=RMSprop(clipnorm=1),
        metrics=['acc'],
    )
    return classifier

In [None]:
# 8. 5-Fold Cross Validation
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

all_y_true = []
all_y_pred = []

for fold, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    print(f"\n--- Fold {fold} ---")
    X_train_fold, X_test_fold = X[train_idx], X[test_idx]
    y_train_fold, y_test_fold = y[train_idx], y[test_idx]

    model = cnn_keras(max_features, maxlen, dropout_rate, embed_dim, num_filters)
    model.fit(X_train_fold, y_train_fold, batch_size=32, epochs=3, verbose=1, validation_data=(X_test_fold, y_test_fold))

    y_pred_prob = model.predict(X_test_fold).ravel()
    y_pred = (y_pred_prob > 0.5).astype(int)

    all_y_true.extend(y_test_fold)
    all_y_pred.extend(y_pred)


--- Fold 1 ---
Epoch 1/3
[1m5589/5589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m625s[0m 111ms/step - acc: 0.9347 - loss: 0.1890 - val_acc: 0.9504 - val_loss: 0.1409
Epoch 2/3
[1m5589/5589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m680s[0m 111ms/step - acc: 0.9516 - loss: 0.1372 - val_acc: 0.9522 - val_loss: 0.1365
Epoch 3/3
[1m5589/5589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m614s[0m 109ms/step - acc: 0.9544 - loss: 0.1301 - val_acc: 0.9524 - val_loss: 0.1349
[1m1398/1398[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 30ms/step

--- Fold 2 ---
Epoch 1/3
[1m5589/5589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m619s[0m 110ms/step - acc: 0.9346 - loss: 0.1901 - val_acc: 0.9515 - val_loss: 0.1374
Epoch 2/3
[1m5589/5589[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m609s[0m 109ms/step - acc: 0.9517 - loss: 0.1373 - val_acc: 0.9519 - val_loss: 0.1390
Epoch 3/3
[1m3817/5589[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m2:59[0m 101ms/step - acc: 0.9544

In [None]:
# 9. Final Evaluation: Classification Report and Average Precision
# The report uses the thresholded 0/1 predictions.
print("\n--- Classification Report ---")
print(classification_report(all_y_true, all_y_pred))

# Average Precision
# Ranking metrics need continuous scores: `all_y_pred_prob` must hold one
# predicted probability per entry of `all_y_true`, in the same order.
avg_precision = average_precision_score(all_y_true, all_y_pred_prob)
print(f"\nAverage Precision: {avg_precision:.4f}")

# 10. Plot ROC Curve and Calculate AUC
fpr, tpr, thresholds = roc_curve(all_y_true, all_y_pred_prob)
roc_auc = auc(fpr, tpr)

# Plotting ROC Curve (explicit Axes interface; the dashed diagonal is the
# chance-level reference line).
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(fpr, tpr, color='b', lw=2, label=f'AUC = {roc_auc:.2f}')
ax.plot([0, 1], [0, 1], color='gray', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic (ROC) Curve')
ax.legend(loc='lower right')
plt.show()