### **Installing Required Libraries**

In [None]:
pip install transformers


In [None]:
pip install torch

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import tensorflow as tf
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from tensorflow.keras.optimizers import Adam
# Fetch the NLTK corpora required by the stop-word list and the WordNet lemmatizer.
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

In [None]:
# Mount Google Drive inside the Colab runtime so files under it can be read by path.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

### **Load data from CSV file**

In [None]:
# Read the training CSV from the mounted drive into a DataFrame.
TRAIN_CSV_PATH = '/content/drive/MyDrive/train.csv'
df = pd.read_csv(TRAIN_CSV_PATH)

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna(how='any')

In [None]:
# Column-wise sum of each label column, i.e. how many comments carry each label.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
toxic_count = df[label_columns].sum()
print(toxic_count)

toxic            15294
severe_toxic      1595
obscene           8449
threat             478
insult            7877
identity_hate     1405
dtype: int64


In [None]:
# Columns holding the individual toxicity labels.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Create a new column called 'is_toxic': 1 when any label column is set, else 0.
df['is_toxic'] = df[label_cols].any(axis=1).astype(int)

# Sample 5000 comments per label. Sampling is done with replacement because
# some labels have fewer than 5000 positive rows. A fixed random_state keeps
# the sample reproducible across kernel restarts.
# The pieces are combined with a single pd.concat: DataFrame.append was removed
# in pandas 2.0, and concatenating once also preserves the column dtypes.
per_label_samples = [
    df[df[label] == 1].sample(n=5000, replace=True, random_state=42)
    for label in label_cols
]
balanced_df = pd.concat(per_label_samples, ignore_index=True)

# Drop the individual label columns; only 'is_toxic' remains as the target.
balanced_df = balanced_df.drop(columns=label_cols)

# NOTE(review): every sampled row has at least one label set, so 'is_toxic' is 1
# for all of balanced_df (no non-toxic rows), and the later cells train on `df`,
# not `balanced_df` — confirm which frame was meant to feed the model.

# Print the first few rows of the balanced dataset
print(balanced_df.head())


In [None]:
# Number of rows in the sampled frame.
len(balanced_df)

### **Data pre-processing**

In [None]:
# Shared resources for text cleaning: a WordNet lemmatizer and the English
# stop-word list, held as a set for fast membership checks.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

In [None]:
def clean_text(text):
    """Normalize a raw comment into a space-separated string of lemmas.

    Steps: keep only runs of ASCII letters (digits, punctuation and all other
    characters act as separators), lowercase each token, drop tokens found in
    the module-level ``stop_words`` set, and lemmatize the rest with the
    module-level ``lemmatizer``.

    Args:
        text: The raw comment. Anything that is not a ``str`` (for example
            ``None`` or a NaN left by pandas for a missing cell) is treated
            as an empty comment.

    Returns:
        The cleaned text; an empty string when nothing survives cleaning.
    """
    # Missing values arrive as None/NaN rather than str; re would raise on them.
    if not isinstance(text, str):
        return ''

    # Maximal runs of ASCII letters are exactly the tokens obtained by replacing
    # every other character with a space and splitting on whitespace.
    tokens = [token.lower() for token in re.findall('[a-zA-Z]+', text)]

    # Remove stop words and lemmatize what is left.
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]

    return ' '.join(lemmas)

In [None]:
# Clean every comment element-wise, replacing the raw text column.
df['comment_text'] = df['comment_text'].map(clean_text)

### **Encode labels as integers**

In [None]:
# Fit the encoder on the target column, then replace the column with its integer codes.
label_encoder = LabelEncoder()
label_encoder.fit(df['is_toxic'])
df['is_toxic'] = label_encoder.transform(df['is_toxic'])

### **Split data into training and testing sets**

In [None]:
# 80/20 split, stratified on the target so both parts keep the same class ratio;
# the fixed random_state makes the split reproducible.
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['is_toxic'],
)

In [None]:
# Remove any rows with missing values from both splits.
train_df, test_df = (split.dropna() for split in (train_df, test_df))

### **Load DistilBERT tokenizer**

In [None]:
# Load the tokenizer that matches the 'distilbert-base-uncased' checkpoint.
TOKENIZER_CHECKPOINT = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

### **Tokenize text and add special tokens**

In [None]:
# Tokenize both splits; truncation and padding are enabled so each split is
# returned as equal-length sequences.
train_texts = train_df['comment_text'].tolist()
test_texts = test_df['comment_text'].tolist()

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

### **Convert data to TensorFlow datasets**

In [None]:
# Pair the tokenizer outputs with the 'is_toxic' targets and batch them.
# Only the training set is shuffled (buffer spans the whole split).
BATCH_SIZE = 4

train_inputs = dict(train_encodings)
train_targets = train_df['is_toxic'].values
train_dataset = tf.data.Dataset.from_tensor_slices((train_inputs, train_targets))
train_dataset = train_dataset.shuffle(len(train_df)).batch(BATCH_SIZE)

test_inputs = dict(test_encodings)
test_targets = test_df['is_toxic'].values
test_dataset = tf.data.Dataset.from_tensor_slices((test_inputs, test_targets))
test_dataset = test_dataset.batch(BATCH_SIZE)

### **Build DistilBERT model**

In [None]:
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

In [None]:
tf.keras.mixed_precision.set_global_policy('mixed_float16')

### **Compile model**

In [None]:
optimizer = Adam(learning_rate=2e-5, epsilon=1e-8)

# The classification head emits two raw logits per comment and the targets are
# integer class ids (0/1), so the matching loss is sparse categorical
# cross-entropy on logits. 'binary_crossentropy' expects a single probability
# per example and does not fit this output shape.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# Spell out the sparse accuracy metric (kept under the name 'accuracy') so it
# compares argmax(logits) with the integer targets.
metrics = [tf.keras.metrics.SparseCategoricalAccuracy(name='accuracy')]

model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

### **Train model**

In [None]:
# Train for a single epoch; the test split is also used as the validation data.
history = model.fit(
    train_dataset,
    validation_data=test_dataset,
    epochs=1,
)

### **Evaluate model**

In [None]:
# Predict on the test batches; the output object carries one row of logits per
# comment, and argmax over the last axis turns them into class ids.
test_predictions = model.predict(test_dataset)
test_predictions = np.argmax(test_predictions.logits, axis=-1)

# Score against 'is_toxic' — the column the model was trained on and the one
# used as the target in test_dataset — not the narrower 'toxic' label.
test_labels = test_df['is_toxic'].values
test_f1 = f1_score(test_labels, test_predictions)

print('Test F1 score:', test_f1)

### **Saving Model**

In [None]:
import pickle

# Serialize the trained model object to 'model.pkl' in the working directory.
# NOTE(review): pickling a Keras/Transformers model may not round-trip reliably;
# the library's own model.save_pretrained(...) is the usual way to persist it —
# confirm that whatever loads 'model.pkl' actually works.
with open('model.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)


### **Testing the Output**

In [None]:
sample_comment = "You fool"

# Apply the same cleaning used on the training text (the notebook defines
# clean_text; there is no function named preprocess).
sample_comment = clean_text(sample_comment)

# The model consumes token ids, not raw strings, so encode the comment with the
# same tokenizer used for training.
sample_encoding = tokenizer([sample_comment], truncation=True, padding=True, return_tensors='tf')

# predict() returns an output object; its .logits has one row per input comment.
# argmax over the last axis gives the predicted class id (1 = toxic).
sample_logits = model.predict(dict(sample_encoding)).logits
pred = np.argmax(sample_logits, axis=-1)

if pred[0] == 1:
    print("The comment is toxic.")
else:
    print("The comment is not toxic.")