In [32]:
import re
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, f1_score, hamming_loss, accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from google.colab import drive
from os.path import join
import nltk
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.model_selection import KFold

nltk.download('stopwords', quiet=True)

True

In [3]:
drive.mount('/content/drive/', force_remount=False)
DATA_PATH = '/content/drive/My Drive/Colab Notebooks/ML4MDE_Project/'


file = join(DATA_PATH,'dataset.csv')
df = pd.read_csv(file, header=0)

Mounted at /content/drive/


<hr />
<h2>Data Preprocessing</h2>

In [4]:
contraction_mapping = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'd've": "I would have",
    "I'll": "I will",
    "I'll've": "I will have",
    "I'm": "I am",
    "I've": "I have",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "this's": "this is",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have",
}

stop_words = set(nltk.corpus.stopwords.words('english'))

def clean_text(text: str):
    to_clean = text.lower()
    to_clean = to_clean.replace('"', '')
    to_clean = ' '.join([contraction_mapping[t] if t in contraction_mapping else t for t in to_clean.split(" ")])
    to_clean = re.sub("[^a-zA-Z]", " ", to_clean)
    words = [word for word in to_clean.split() if word not in stop_words and len(word) > 1]
    return " ".join(words).strip()

In [5]:
df['comment_text'] = df['comment_text'].apply(clean_text)
X = df['comment_text'].values
y = df[df.columns[2:]].values

In [6]:
# TextVectorization
MAX_FEATURES = 10000
MAX_SEQUENCE = 100
vectorizer = TextVectorization(max_tokens=MAX_FEATURES, output_sequence_length=MAX_SEQUENCE, output_mode='int')
vectorizer.adapt(X)
X = np.array(vectorizer(X))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=10)

In [34]:
NUM_EPOCHS = 3
BATCH_SIZE = 32

# K-FOLD CROSS VALIDATION
NUM_FOLDS = 2
KF = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=10)

model = Sequential()

model.add(Embedding(MAX_FEATURES + 1, 32))

model.add(Bidirectional(LSTM(32, activation='tanh')))

model.add(Dense(64, activation='relu'))
model.add(Dense(64, activation='relu'))

model.add(Dense(6, activation='sigmoid'))

model.compile(loss='BinaryCrossentropy', metrics=['accuracy'], optimizer='Adam')

index = 1

for train_index, val_index in KF.split(X_train):
  print(f"Fold {index}")
  X_fold_train, X_fold_val = X_train[train_index], X_train[val_index]
  y_fold_train, y_fold_val = y_train[train_index], y_train[val_index]

  model.fit(X_fold_train, y_fold_train, epochs=NUM_EPOCHS, batch_size=BATCH_SIZE, validation_data=(X_fold_val, y_fold_val))

  scores = model.evaluate(X_fold_val, y_fold_val)
  print(f"Validation Accuracy: {scores[1]*100:.2f}%")
  index += 1

Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation Accuracy: 99.4187%
Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation Accuracy: 99.3968%


In [35]:
model.summary()

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, None, 32)          352       
                                                                 
 bidirectional_7 (Bidirecti  (None, 64)                16640     
 onal)                                                           
                                                                 
 dense_21 (Dense)            (None, 64)                4160      
                                                                 
 dense_22 (Dense)            (None, 64)                4160      
                                                                 
 dense_23 (Dense)            (None, 6)                 390       
                                                                 
Total params: 25702 (100.40 KB)
Trainable params: 25702 (100.40 KB)
Non-trainable params: 0 (0.00 Byte)
________________

In [None]:
test = vectorizer('You are ugly!')
predict = model.predict(np.expand_dims(test, 0))
(predict > 0.5).astype(int)

In [None]:
# Valutazione del modello sul set di test
y_pred = model.predict(X_test)
y_pred = (y_pred > 0.5).astype(int)

labels = ['toxic', 'sever toxic', 'obscene', 'threat', 'insult', 'identity hate']

# 1. Precision, Recall, F1-Score per ogni etichetta (0.5 valore di default comunemente utilizzato per la classificazione binaria)
precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)

for i in range(len(precision)):
    print(f'Label {labels[i]}: Precision = {precision[i]:.4f}, Recall = {recall[i]:.4f}, F1-Score = {f1[i]:.4f}')

In [38]:
# 2. Hamming Loss
# (L'Hamming Loss è una metrica utilizzata per valutare la precisione di un modello di classificazione multi-etichetta.
# Essa misura la frazione di label classificate in modo scorretto rispetto al numero totale di label.
# L'obiettivo è minimizzare questa metrica, quindi un valore più basso di Hamming Loss indica una migliore precisione.)
hamming_loss_value = hamming_loss(y_test, y_pred)
print(f'Hamming Loss: {hamming_loss_value:.4f}')

Hamming Loss: 0.0362


In [39]:
# 3. Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.9003


In [None]:
# 4. Area under the ROC curve (ROC AUC) per ogni etichetta
# Utilizzata per valutare le prestazioni di un modello di classificazione binaria al variare della soglia di decisione.
# La ROC è creata rappresentando il tasso di vera positività (True Positive Rate, TPR)
# rispetto al tasso di falsi positivi (False Positive Rate, FPR) al variare della soglia di decisione.
roc_auc = roc_auc_score(y_test, y_pred)
print(f'ROC AUC: {roc_auc:.4f}')

# Binarizza le etichette
y_test_binarized = label_binarize(y_test, classes=[0, 1, 2, 3, 4, 5])
n_classes = y_test_binarized.shape[1]

# Calcola la curva ROC e l'area sotto la curva (AUC) per ogni etichetta
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test_binarized[:, i], y_pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Disegna la curva ROC per ogni etichetta
plt.figure(figsize=(10, 8))

for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label=f'ROC curve (area = {roc_auc[i]:.2f}) for label {labels[i]}')

plt.plot([0, 1], [0, 1], 'k--', lw=2)  # Linea diagonale tratteggiata
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()

In [25]:
model.save('toxicity.h5')
model.save('toxicity.keras')

  saving_api.save_model(
