1. Download and explore the data
2. Prepare the data for training
2. Build a recurrent neural network
3. Train & evaluate the model

## Import dataset

In [1]:
# read csv data
import pandas as pd  # Import the pandas library
# Assuming the files are in the current working directory after extraction:
train_df = pd.read_csv('/content/train.csv')
test_df = pd.read_csv('/content/test.csv')

train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [2]:
#working with 50000 data
train_df = train_df.iloc[:50000]

In [5]:
test_df.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [6]:
train_df.shape

(50000, 8)

In [7]:
train_df.sample(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
22331,3ae80aa9e5ab6519,I am just starting new section on a dissuion p...,0,0,0,0,0,0
2534,06c4ea8162e19d40,Who exactly was Joseph Henry Jackson? Does any...,0,0,0,0,0,0
25595,43c0bb94876dd16d,opening paragraph \n\ni restructured the openi...,0,0,0,0,0,0
9619,197a30f812d46b80,As you have removed information i have contrib...,0,0,0,0,0,0
40987,6d602816357e8e64,In 75 years....article...????? In 75 years pe...,0,0,0,0,0,0


In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             50000 non-null  object
 1   comment_text   50000 non-null  object
 2   toxic          50000 non-null  int64 
 3   severe_toxic   50000 non-null  int64 
 4   obscene        50000 non-null  int64 
 5   threat         50000 non-null  int64 
 6   insult         50000 non-null  int64 
 7   identity_hate  50000 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 3.1+ MB


In [9]:
train_df.comment_text.values[1]

"D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)"

In [10]:
target_col = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [11]:
# Getting all comments categories 0 and 1 ratio
for col in target_col:
    print(train_df[col].value_counts(normalize=True))

toxic
0    0.90234
1    0.09766
Name: proportion, dtype: float64
severe_toxic
0    0.9897
1    0.0103
Name: proportion, dtype: float64
obscene
0    0.9471
1    0.0529
Name: proportion, dtype: float64
threat
0    0.99664
1    0.00336
Name: proportion, dtype: float64
insult
0    0.95102
1    0.04898
Name: proportion, dtype: float64
identity_hate
0    0.99114
1    0.00886
Name: proportion, dtype: float64


In [12]:
y = train_df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].values

## Prepare Dataset for Training


*   Convert text to TF-IDF vectors
*   Split training & validation set

In [13]:

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional , SimpleRNN , Dropout , BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier

In [18]:
#Clean the text data to remove unnecessary characters, punctuation, and normalize the text.
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download NLTK resources (run once if needed)
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab') # Add this line to download the missing resource

# Function to clean and preprocess text
def preprocess_text(text):
    # Remove non-alphanumeric characters and multiple spaces
    text = re.sub(r'\W+', ' ', text)
    text = re.sub(r'\s+', ' ', text)

    # Convert to lowercase
    text = text.lower()

    # Tokenize words
    tokens = word_tokenize(text)

    # Remove stopwords and lemmatize tokens
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    filtered_tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words and not word.isnumeric()]

    # Join tokens back into text
    processed_text = ' '.join(filtered_tokens)

    return processed_text

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [19]:
# If you want to reset the index to start from 0, use:
train_df = train_df.reset_index(drop=True)
sample_text = train_df['comment_text'][0]
print(sample_text)

Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27


In [20]:
preprocess_text(sample_text)

'explanation edits made username hardcore metallica fan reverted vandalism closure gas voted new york doll fac please remove template talk page since retired'

In [21]:
# Apply preprocessing function to 'comment_text' column
train_df['clean_comment'] = train_df['comment_text'].apply(preprocess_text)

In [22]:
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,clean_comment
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,explanation edits made username hardcore metal...
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,aww match background colour seemingly stuck th...
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,hey man really trying edit war guy constantly ...
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,make real suggestion improvement wondered sect...
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,sir hero chance remember page


In [23]:
# Applying TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features= 1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(train_df['clean_comment'])
print(tfidf_matrix.shape)

(50000, 1000)


In [24]:
from collections import Counter
print('toxic',sorted(Counter(train_df['toxic']).items()))
print('severe_toxic',sorted(Counter(train_df['severe_toxic']).items()))
print('obscene',sorted(Counter(train_df['obscene']).items()))
print('threat',sorted(Counter(train_df['threat']).items()))
print('insult',sorted(Counter(train_df['insult']).items()))
print('identity_hate',sorted(Counter(train_df['identity_hate']).items()))

toxic [(0, 45117), (1, 4883)]
severe_toxic [(0, 49485), (1, 515)]
obscene [(0, 47355), (1, 2645)]
threat [(0, 49832), (1, 168)]
insult [(0, 47551), (1, 2449)]
identity_hate [(0, 49557), (1, 443)]


In [25]:
#split the data
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(tfidf_matrix, y, test_size=0.3, random_state=42)

In [26]:
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)
print("y_train shape:", y_train.shape)
print("y_val shape:", y_val.shape)


X_train shape: (35000, 1000)
X_val shape: (15000, 1000)
y_train shape: (35000, 6)
y_val shape: (15000, 6)


In [27]:
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Assuming `y_train` contains the labels and `tfidf_matrix` is your TF-IDF matrix

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(tfidf_matrix, y, test_size=0.2, random_state=42)

# Initialize the One-vs-Rest Logistic Regression model
model = OneVsRestClassifier(LogisticRegression(max_iter=1000, class_weight='balanced'))

# Train the model
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Evaluate the model
print(classification_report(y_val, y_pred, target_names=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']))


               precision    recall  f1-score   support

        toxic       0.47      0.79      0.59      1042
 severe_toxic       0.18      0.89      0.30       101
      obscene       0.46      0.86      0.60       539
       threat       0.08      0.79      0.15        29
       insult       0.37      0.82      0.51       493
identity_hate       0.11      0.78      0.20        77

    micro avg       0.36      0.82      0.50      2281
    macro avg       0.28      0.82      0.39      2281
 weighted avg       0.42      0.82      0.54      2281
  samples avg       0.05      0.08      0.06      2281



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
from sklearn.metrics import accuracy_score

# Predict on the validation set
y_pred = model.predict(X_val)

# Calculate accuracy for each sample
sample_accuracies = [accuracy_score(y_val[i], y_pred[i]) for i in range(len(y_val))]

# Calculate overall accuracy
overall_accuracy = np.mean(sample_accuracies)

print(f'Overall Accuracy: {overall_accuracy}')


Overall Accuracy: 0.9382333333333333


### Decision Tree

In [29]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import numpy as np

# Initialize the One-vs-Rest Decision Tree model
decision_tree_model = OneVsRestClassifier(DecisionTreeClassifier(class_weight='balanced'))

# Train the model
decision_tree_model.fit(X_train, y_train)

# Predict on the validation set
y_pred_dt = decision_tree_model.predict(X_val)

# Evaluate the model
print("Decision Tree Classification Report:")
print(classification_report(y_val, y_pred_dt, target_names=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']))

# Calculate accuracy for each sample
sample_accuracies_dt = [accuracy_score(y_val[i], y_pred_dt[i]) for i in range(len(y_val))]

# Calculate overall accuracy
overall_accuracy_dt = np.mean(sample_accuracies_dt)
print(f'Decision Tree Overall Accuracy: {overall_accuracy_dt}')


Decision Tree Classification Report:
               precision    recall  f1-score   support

        toxic       0.46      0.66      0.54      1042
 severe_toxic       0.10      0.43      0.16       101
      obscene       0.47      0.75      0.58       539
       threat       0.14      0.52      0.22        29
       insult       0.33      0.61      0.43       493
identity_hate       0.09      0.48      0.15        77

    micro avg       0.35      0.65      0.46      2281
    macro avg       0.26      0.57      0.35      2281
 weighted avg       0.40      0.65      0.49      2281
  samples avg       0.05      0.06      0.05      2281



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Decision Tree Overall Accuracy: 0.9412666666666666


## Random Forest

In [30]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
import numpy as np

# Initialize the One-vs-Rest Random Forest model
random_forest_model = OneVsRestClassifier(RandomForestClassifier(class_weight='balanced'))

# Train the model
random_forest_model.fit(X_train, y_train)

# Predict on the validation set
y_pred_rf = random_forest_model.predict(X_val)

# Evaluate the model
print("Random Forest Classification Report:")
print(classification_report(y_val, y_pred_rf, target_names=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']))

# Calculate accuracy for each sample
sample_accuracies_rf = [accuracy_score(y_val[i], y_pred_rf[i]) for i in range(len(y_val))]

# Calculate overall accuracy
overall_accuracy_rf = np.mean(sample_accuracies_rf)
print(f'Random Forest Overall Accuracy: {overall_accuracy_rf}')


Random Forest Classification Report:
               precision    recall  f1-score   support

        toxic       0.68      0.58      0.62      1042
 severe_toxic       0.08      0.22      0.12       101
      obscene       0.59      0.70      0.64       539
       threat       0.27      0.10      0.15        29
       insult       0.48      0.57      0.52       493
identity_hate       0.08      0.22      0.12        77

    micro avg       0.50      0.57      0.53      2281
    macro avg       0.36      0.40      0.36      2281
 weighted avg       0.56      0.57      0.56      2281
  samples avg       0.05      0.05      0.05      2281



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Random Forest Overall Accuracy: 0.9620833333333332


In [31]:
!pip install imbalanced-learn



In [32]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score
import scipy.sparse as sp

def manual_balance(X, y):
    df_X = pd.DataFrame(X)
    df_y = pd.DataFrame(y, columns=[f'label_{i}' for i in range(y.shape[1])])

    # Track indices for resampling
    balanced_indices = []

    for label in df_y.columns:
        # Find minority and majority class indices
        minority_indices = df_y[df_y[label] == 1].index
        majority_indices = df_y[df_y[label] == 0].index

        # Calculate the number of samples needed to balance
        num_to_duplicate = len(majority_indices) - len(minority_indices)

        if num_to_duplicate > 0:
            # Duplicate minority samples to balance
            duplicated_indices = np.random.choice(minority_indices, num_to_duplicate, replace=True)
            resampled_indices = np.concatenate([minority_indices, duplicated_indices])
        else:
            resampled_indices = minority_indices

        balanced_indices.extend(resampled_indices)

    # Drop duplicates and create balanced dataset
    balanced_indices = list(set(balanced_indices))
    balanced_df_X = df_X.loc[balanced_indices]
    balanced_df_y = df_y.loc[balanced_indices]

    return balanced_df_X.values, balanced_df_y.values

# Convert TF-IDF matrix to dense format (if sparse)
if sp.issparse(tfidf_matrix):
    tfidf_matrix = tfidf_matrix.toarray()

# Apply manual balancing
X_balanced, y_balanced = manual_balance(tfidf_matrix, y)

# Split the balanced data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

# Initialize the One-vs-Rest Logistic Regression model
model = OneVsRestClassifier(LogisticRegression(max_iter=1000, class_weight='balanced'))

# Train the model
model.fit(X_train, y_train)

# Predict on the validation set
y_pred = model.predict(X_val)

# Evaluate the model
print("Logistic Regression Classification Report after Manual Balancing:")
print(classification_report(y_val, y_pred, target_names=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']))

# Calculate accuracy for each sample
sample_accuracies = [accuracy_score(y_val[i], y_pred[i]) for i in range(len(y_val))]

# Calculate overall accuracy
overall_accuracy = np.mean(sample_accuracies)
print(f'Overall Accuracy after Manual Balancing: {overall_accuracy}')


Logistic Regression Classification Report after Manual Balancing:
               precision    recall  f1-score   support

        toxic       0.96      0.82      0.89       985
 severe_toxic       0.33      0.73      0.46       114
      obscene       0.88      0.71      0.79       511
       threat       0.22      0.61      0.33        28
       insult       0.72      0.64      0.68       481
identity_hate       0.37      0.66      0.48        93

    micro avg       0.76      0.74      0.75      2212
    macro avg       0.58      0.69      0.60      2212
 weighted avg       0.82      0.74      0.77      2212
  samples avg       0.67      0.71      0.66      2212



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Overall Accuracy after Manual Balancing: 0.8216941480763014


In [33]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize the One-vs-Rest Decision Tree model
decision_tree_model = OneVsRestClassifier(DecisionTreeClassifier())

# Train the model
decision_tree_model.fit(X_train, y_train)

# Predict on the validation set
y_pred = decision_tree_model.predict(X_val)

# Evaluate the model
print("Decision Tree Classification Report after Manual Balancing:")
print(classification_report(y_val, y_pred, target_names=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']))

# Calculate accuracy for each sample
sample_accuracies = [accuracy_score(y_val[i], y_pred[i]) for i in range(len(y_val))]

# Calculate overall accuracy
overall_accuracy = np.mean(sample_accuracies)
print(f'Overall Accuracy after Manual Balancing: {overall_accuracy}')


Decision Tree Classification Report after Manual Balancing:
               precision    recall  f1-score   support

        toxic       0.95      0.96      0.96       985
 severe_toxic       0.32      0.21      0.26       114
      obscene       0.71      0.73      0.72       511
       threat       0.18      0.14      0.16        28
       insult       0.60      0.62      0.61       481
identity_hate       0.40      0.35      0.38        93

    micro avg       0.77      0.76      0.76      2212
    macro avg       0.53      0.50      0.51      2212
 weighted avg       0.76      0.76      0.76      2212
  samples avg       0.79      0.81      0.75      2212



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Overall Accuracy after Manual Balancing: 0.831393469123828


In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize the One-vs-Rest Random Forest model
random_forest_model = OneVsRestClassifier(RandomForestClassifier(n_estimators=100, n_jobs=-1))

# Train the model
random_forest_model.fit(X_train, y_train)

# Predict on the validation set
y_pred = random_forest_model.predict(X_val)

# Evaluate the model
print("Random Forest Classification Report after Manual Balancing:")
print(classification_report(y_val, y_pred, target_names=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']))

# Calculate accuracy for each sample
sample_accuracies = [accuracy_score(y_val[i], y_pred[i]) for i in range(len(y_val))]

# Calculate overall accuracy
overall_accuracy = np.mean(sample_accuracies)
print(f'Overall Accuracy after Manual Balancing: {overall_accuracy}')


Random Forest Classification Report after Manual Balancing:
               precision    recall  f1-score   support

        toxic       0.96      1.00      0.98       985
 severe_toxic       0.33      0.04      0.06       114
      obscene       0.81      0.75      0.78       511
       threat       0.50      0.11      0.18        28
       insult       0.68      0.65      0.67       481
identity_hate       0.59      0.26      0.36        93

    micro avg       0.85      0.78      0.81      2212
    macro avg       0.64      0.47      0.50      2212
 weighted avg       0.81      0.78      0.78      2212
  samples avg       0.87      0.83      0.81      2212

Overall Accuracy after Manual Balancing: 0.8692208212091821


In [35]:
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Initialize the One-vs-Rest SVM model
svm_model = OneVsRestClassifier(SVC(kernel='linear', probability=True))

# Train the model
svm_model.fit(X_train, y_train)

# Predict on the validation set
y_pred = svm_model.predict(X_val)

# Evaluate the model
print("SVM Classification Report after Manual Balancing:")
print(classification_report(y_val, y_pred, target_names=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']))

# Calculate accuracy for each sample
sample_accuracies = [accuracy_score(y_val[i], y_pred[i]) for i in range(len(y_val))]

# Calculate overall accuracy
overall_accuracy = np.mean(sample_accuracies)
print(f'Overall Accuracy after Manual Balancing: {overall_accuracy}')


SVM Classification Report after Manual Balancing:
               precision    recall  f1-score   support

        toxic       0.96      1.00      0.98       985
 severe_toxic       0.00      0.00      0.00       114
      obscene       0.87      0.71      0.78       511
       threat       0.64      0.25      0.36        28
       insult       0.73      0.62      0.67       481
identity_hate       0.65      0.33      0.44        93

    micro avg       0.88      0.76      0.82      2212
    macro avg       0.64      0.49      0.54      2212
 weighted avg       0.82      0.76      0.78      2212
  samples avg       0.89      0.81      0.82      2212



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Overall Accuracy after Manual Balancing: 0.8769802780472035


In [36]:
!pip install keras-tuner


Collecting keras-tuner
  Downloading keras_tuner-1.4.7-py3-none-any.whl.metadata (5.4 kB)
Collecting kt-legacy (from keras-tuner)
  Downloading kt_legacy-1.0.5-py3-none-any.whl.metadata (221 bytes)
Downloading keras_tuner-1.4.7-py3-none-any.whl (129 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.1/129.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading kt_legacy-1.0.5-py3-none-any.whl (9.6 kB)
Installing collected packages: kt-legacy, keras-tuner
Successfully installed keras-tuner-1.4.7 kt-legacy-1.0.5


In [37]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Define parameters
maxlen = 500  # Maximum length of sequences (adjust based on your dataset)

# Pad sequences to ensure they are of the same length
X_padded = pad_sequences(X_balanced, maxlen=maxlen, padding='post')

# Split the padded data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_padded, y_balanced, test_size=0.2, random_state=42)

# Define the RNN model with increased complexity
model = Sequential()
model.add(Embedding(input_dim=X_train.shape[1], output_dim=256, input_length=maxlen))  # Increased embedding dimension
model.add(SimpleRNN(units=128, return_sequences=True))  # Increased number of units and added return_sequences=True
model.add(Dropout(0.5))  # Added Dropout layer to prevent overfitting
model.add(SimpleRNN(units=64, return_sequences=False))  # Added another RNN layer
model.add(Dropout(0.5))  # Added Dropout layer
model.add(Dense(units=y_train.shape[1], activation='sigmoid'))  # Use sigmoid for multi-label classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define the early stopping callback
early_stopping = EarlyStopping(monitor='val_loss',  # Monitor the validation loss
                               patience=3,  # Number of epochs with no improvement to wait before stopping
                               restore_best_weights=True)  # Restore model weights from the epoch with the best value of the monitored quantity

# Train the model with early stopping
history = model.fit(X_train, y_train, epochs=10,  # Increased maximum number of epochs
                    batch_size=64,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stopping])  # Include the early stopping callback

# Predict on the validation set
y_pred = model.predict(X_val)
y_pred_binary = (y_pred > 0.5).astype(int)  # Convert probabilities to binary

# Evaluate the model
print("RNN Classification Report with Early Stopping:")
print(classification_report(y_val, y_pred_binary, target_names=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']))

# Calculate accuracy for each sample
sample_accuracies = [accuracy_score(y_val[i], y_pred_binary[i]) for i in range(len(y_val))]

# Calculate overall accuracy
overall_accuracy = np.mean(sample_accuracies)
print(f'Overall Accuracy with Early Stopping: {overall_accuracy}')




Epoch 1/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 953ms/step - accuracy: 0.7383 - loss: 0.5362 - val_accuracy: 0.9554 - val_loss: 0.4025
Epoch 2/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 850ms/step - accuracy: 0.9096 - loss: 0.4558 - val_accuracy: 0.9554 - val_loss: 0.4044
Epoch 3/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 892ms/step - accuracy: 0.9249 - loss: 0.4399 - val_accuracy: 0.9554 - val_loss: 0.4000
Epoch 4/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 850ms/step - accuracy: 0.9351 - loss: 0.4268 - val_accuracy: 0.9554 - val_loss: 0.3983
Epoch 5/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m56s[0m 869ms/step - accuracy: 0.9281 - loss: 0.4319 - val_accuracy: 0.9554 - val_loss: 0.4011
Epoch 6/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 827ms/step - accuracy: 0.9370 - loss: 0.4155 - val_accuracy: 0.9554 - val_loss: 0.3975
Epoch 7/10
[1m65/65[

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Overall Accuracy with Early Stopping: 0.7927578402845135


In [38]:
X_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int32)

In [39]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# Define parameters
maxlen = 500  # Maximum length of sequences (adjust based on your dataset)

# Pad sequences to ensure they are of the same length
X_padded = pad_sequences(X_balanced, maxlen=maxlen, padding='post')

# Split the padded data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_padded, y_balanced, test_size=0.2, random_state=42)

# Define the LSTM model with increased complexity
model = Sequential()
model.add(Embedding(input_dim=X_train.shape[1], output_dim=256, input_length=maxlen))  # Increased embedding dimension
model.add(LSTM(units=128, return_sequences=True))  # LSTM layer with 128 units
model.add(Dropout(0.5))  # Added Dropout layer to prevent overfitting
model.add(LSTM(units=64, return_sequences=False))  # Second LSTM layer with 64 units
model.add(Dropout(0.5))  # Added Dropout layer
model.add(Dense(units=y_train.shape[1], activation='sigmoid'))  # Output layer with sigmoid activation for multi-label classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Define the early stopping callback
early_stopping = EarlyStopping(monitor='val_loss',  # Monitor the validation loss
                               patience=3,  # Number of epochs with no improvement to wait before stopping
                               restore_best_weights=True)  # Restore model weights from the epoch with the best value of the monitored quantity

# Train the model with early stopping
history = model.fit(X_train, y_train, epochs=10,  # Increased maximum number of epochs
                    batch_size=64,
                    validation_data=(X_val, y_val),
                    callbacks=[early_stopping])  # Include the early stopping callback

# Predict on the validation set
y_pred = model.predict(X_val)
y_pred_binary = (y_pred > 0.5).astype(int)  # Convert probabilities to binary

# Evaluate the model
print("LSTM Classification Report with Early Stopping:")
print(classification_report(y_val, y_pred_binary, target_names=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']))

# Calculate accuracy for each sample
sample_accuracies = [accuracy_score(y_val[i], y_pred_binary[i]) for i in range(len(y_val))]

# Calculate overall accuracy
overall_accuracy = np.mean(sample_accuracies)
print(f'Overall Accuracy with Early Stopping: {overall_accuracy}')


Epoch 1/10




[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 2s/step - accuracy: 0.8640 - loss: 0.4631 - val_accuracy: 0.9554 - val_loss: 0.3916
Epoch 2/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 2s/step - accuracy: 0.9507 - loss: 0.4091 - val_accuracy: 0.9554 - val_loss: 0.3928
Epoch 3/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 2s/step - accuracy: 0.9485 - loss: 0.4104 - val_accuracy: 0.9554 - val_loss: 0.3940
Epoch 4/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 2s/step - accuracy: 0.9440 - loss: 0.4046 - val_accuracy: 0.9554 - val_loss: 0.3910
Epoch 5/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 2s/step - accuracy: 0.9465 - loss: 0.4038 - val_accuracy: 0.9554 - val_loss: 0.3912
Epoch 6/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 2s/step - accuracy: 0.9439 - loss: 0.3985 - val_accuracy: 0.9554 - val_loss: 0.3908
Epoch 7/10
[1m65/65[0m [32m━━━━━━━━━━━━━━━

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Overall Accuracy with Early Stopping: 0.7927578402845135


In [42]:
import numpy as np
# import gensim # gensim is installed in the previous cell
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split # Import train_test_split

# ------------------------
# 1. Prepare Data
# ------------------------
# Assuming you already have train_df with 'clean_comment' and y (labels)
max_words = 20000
maxlen = 100

# Split the train_df into training and validation sets for both text and labels
X_train_text, X_val_text, y_train, y_val = train_test_split(train_df['clean_comment'], y, test_size=0.2, random_state=42)

# Tokenize and pad sequences for training and validation sets
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(X_train_text)
X_train_seq = tokenizer.texts_to_sequences(X_train_text)
X_val_seq = tokenizer.texts_to_sequences(X_val_text) # Use X_val_text for validation sequences

X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
X_val_pad = pad_sequences(X_val_seq, maxlen=maxlen) # Use X_val_seq for validation padding

# Prepare test data
test_df['clean_comment'] = test_df['comment_text'].apply(preprocess_text) # Preprocess test data
X_test_text = test_df['clean_comment']
X_test_seq = tokenizer.texts_to_sequences(X_test_text)
X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)

# For evaluation purposes, we will use a split of the original training data
# You would typically evaluate on a separate test set with labels if available
# For this example, we will use the validation set (X_val_pad, y_val) for evaluation
X_eval_pad = X_val_pad
y_eval = y_val


# ------------------------
# 2. Load Word2Vec embeddings (Assuming you have the file 'GoogleNews-vectors-negative300.bin')
# ------------------------
# You will need to download the 'GoogleNews-vectors-negative300.bin' file and place it in the correct directory.
# This part of the code assumes gensim is installed and the file is available.
embedding_dim = 300
word_index = tokenizer.word_index

try:
    w2v_path = 'GoogleNews-vectors-negative300.bin'
    w2v_model = gensim.models.KeyedVectors.load_word2vec_format(w2v_path, binary=True)

    # Create embedding matrix
    embedding_matrix = np.zeros((len(word_index) + 1, embedding_dim))
    for word, i in word_index.items():
        if word in w2v_model:
            embedding_matrix[i] = w2v_model[word]

    # ------------------------
    # 3. Build LSTM Model
    # ------------------------
    model = Sequential()
    model.add(Embedding(input_dim=len(word_index)+1,
                        output_dim=embedding_dim,
                        weights=[embedding_matrix],
                        input_length=maxlen,
                        trainable=False))  # freeze embeddings
    model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(y_train.shape[1], activation='sigmoid'))  # multi-label
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    # ------------------------
    # 4. Train Model
    # ------------------------
    early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    history = model.fit(X_train_pad, y_train,
                        validation_data=(X_val_pad, y_val), # Use validation data for monitoring
                        epochs=10,
                        batch_size=64,
                        callbacks=[early_stop],
                        verbose=1)

    # ------------------------
    # 5. Evaluate Accuracy
    # ------------------------
    y_pred = (model.predict(X_eval_pad) > 0.5).astype(int)

    # For multi-label classification, strict accuracy
    acc = accuracy_score(y_eval, y_pred)
    print(f"LSTM with Word2Vec Accuracy: {acc:.4f}")

except FileNotFoundError:
    print(f"Error: Word2Vec file '{w2v_path}' not found.")
    print("Please download the Google News Word2Vec model and place it in the correct directory.")
except Exception as e:
    print(f"An error occurred: {e}")

An error occurred: name 'gensim' is not defined


In [43]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.4 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.6/26.6 MB[0m [31m24.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.0 MB)
[2K   [90m━━━━━━━━━━━