**Importing libraries**

In [None]:
import os
import pandas as pd
import tensorflow as tf
import numpy as np

**Mount Google Drive and Read CSV Files**

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# Read the train and test CSVs from the mounted Drive folder in one pass.
df_train, df_test = map(
    pd.read_csv,
    (
        '/content/drive/My Drive/Comment_toxic/jigsaw dataset/train.csv',
        '/content/drive/My Drive/Comment_toxic/jigsaw dataset/test.csv',
    ),
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Concatenate DataFrames and Convert Comment Text to Lowercase**

In [None]:
# `df` is assembled first, so it keeps the original (not lower-cased) text.
df = pd.concat((df_train, df_test))

# Lower-case the comment text of both splits with one paired assignment.
df_train['comment_text'], df_test['comment_text'] = (
    df_train['comment_text'].str.lower(),
    df_test['comment_text'].str.lower(),
)

df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [None]:
# (rows, columns) of the stacked train + test frame built with pd.concat.
df.shape

(312735, 8)

**Function to Remove Special Characters Using Regular Expressions**

In [None]:
import re

def remove_special_characters(text):
    """Strip URLs, punctuation and digits from ``text`` and tidy its whitespace.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The text with every ``http...`` token, every character that is neither
        a word character nor whitespace, and every digit replaced by a space,
        then with whitespace runs collapsed to single spaces and the ends
        trimmed.
    """
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub(r'\d', ' ', text)
    # Whitespace is normalised last: every substitution above inserts spaces,
    # so collapsing any earlier would leave gaps and stray edge blanks behind.
    return re.sub(r'\s+', ' ', text).strip()

# Clean the comment text of both splits with one paired assignment.
df_train['comment_text'], df_test['comment_text'] = (
    df_train['comment_text'].apply(remove_special_characters),
    df_test['comment_text'].apply(remove_special_characters),
)

# Preview the first ten cleaned training comments.
print(df_train['comment_text'].iloc[:10])


0    explanation why the edits made under my userna...
1    d aww he matches this background colour i m se...
2    hey man i m really not trying to edit war it s...
3    more i can t make any real suggestions on impr...
4    you sir are my hero any chance you remember wh...
5    congratulations from me as well use the tools ...
6         cocksucker before you piss around on my work
7    your vandalism to the matt shirvington article...
8    sorry if the word nonsense was offensive to yo...
9    alignment on this subject and which are contra...
Name: comment_text, dtype: object


**Tokenize Text**

In [None]:
import nltk
from nltk import word_tokenize

# word_tokenize relies on the Punkt tokenizer data. Newer NLTK releases load
# the 'punkt_tab' resource while older ones load 'punkt', so fetch both to keep
# this cell working on a fresh kernel with either version.
nltk.download('punkt')
nltk.download('punkt_tab')

# Store each comment's list of word tokens in a new 'word_tokens' column.
df_train['word_tokens'] = df_train['comment_text'].apply(word_tokenize)
df_test['word_tokens'] = df_test['comment_text'].apply(word_tokenize)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


**Import `train_test_split` and Split the Data into Training and Validation Sets**

In [None]:
from sklearn.model_selection import train_test_split
# Split the labelled rows 80/20 into `train` and `valid`; random_state pins the shuffle.
# NOTE(review): the later cells shown here vectorize and label the full
# `df_train` rather than `train`, so the `valid` rows are also used for fitting.
train, valid = train_test_split(df_train, train_size=0.8, random_state=42)

**TF-IDF Vectorization**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize

# Define the TfidfVectorizer with specified parameters
# Configure the TF-IDF vectorizer. The on/off options are real booleans:
# recent scikit-learn releases validate parameter types and reject integers
# where a bool is expected.
vec = TfidfVectorizer(
    ngram_range=(1, 2),       # unigrams and bigrams
    min_df=3,                 # ignore terms found in fewer than 3 documents
    max_df=0.9,               # ignore terms found in more than 90% of documents
    strip_accents='unicode',  # remove accents from characters
    use_idf=True,             # weight terms by inverse document frequency
    smooth_idf=True,          # add one to document frequencies before the IDF
    sublinear_tf=True,        # 1 + log(tf); a no-op while binary=True (tf is 0/1)
    binary=True,              # record term presence instead of term counts
    stop_words='english',     # drop common English stop words
)

# Learn the vocabulary and IDF weights from the labelled frame, then reuse the
# fitted vectorizer for the validation and test comments.
# NOTE(review): `valid` is a subset of `df_train`, so its rows are part of this
# fit and of `x` below; validation scores of models trained on `x` are
# therefore optimistic.
trn_term_doc = vec.fit_transform(df_train['comment_text'])
val_term_doc = vec.transform(valid['comment_text'])
test_term_doc = vec.transform(df_test['comment_text'])

# Short aliases used by the modelling cells.
x = trn_term_doc
val_x = val_term_doc

**Importing Libraries**

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

**Define the Probability Function**

In [None]:
epsilon = 1e-9  # Small positive constant added to both sides of the log-ratio

def probability(y_i, y, features=None):
    """Return the add-one smoothed per-feature mass of the rows labelled ``y_i``.

    Parameters
    ----------
    y_i : int
        Label value whose rows are selected (1 or 0 in this notebook).
    y : array-like
        Label vector with one entry per row of ``features``.
    features : matrix or array, optional
        Row-aligned feature matrix. Defaults to the notebook-level training
        matrix ``x``, so existing two-argument calls behave as before.

    Returns
    -------
    One value per feature column: (column sum over the selected rows + 1)
    divided by (number of selected rows + 1).
    """
    if features is None:
        # Looked up at call time so the function follows the current global `x`.
        features = x
    selected = y == y_i
    occurrences = features[selected].sum(0)
    # The +1 terms keep the ratio finite for features absent from a class.
    return (occurrences + 1) / (selected.sum() + 1)

**Define the Logistic Model Function**

In [None]:
def get_model(y):
    """Fit a logistic regression on log-ratio-scaled features for one label.

    Parameters
    ----------
    y : object exposing ``.values``
        Labels for one class, row-aligned with the notebook-level matrix ``x``.

    Returns
    -------
    tuple
        ``(fitted_model, log_ratio)``, where ``log_ratio`` is the per-feature
        log of ``probability(1, ...)`` over ``probability(0, ...)``.
    """
    labels = y.values

    # Per-feature smoothed mass for the positive and negative rows.
    positive = probability(1, labels) + epsilon
    negative = probability(0, labels) + epsilon
    log_ratio = np.log(positive / negative)

    # Scale every feature column by its log-ratio before fitting.
    scaled_features = x.multiply(log_ratio)

    classifier = LogisticRegression(
        C=1.0,
        penalty='l2',
        solver='liblinear',
        max_iter=100,
        random_state=42,
    )
    fitted = classifier.fit(scaled_features, labels)
    return fitted, log_ratio

**Define Classes and Labels**

In [None]:
# Target columns; the training cell fits one model per entry.
classes = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Every column except the raw text; label columns are looked up by name.
# NOTE(review): `train_labels` covers all of `df_train`, which includes the
# rows that make up `valid`.
train_labels = df_train.drop(columns=['comment_text'])
valid_labels = valid.drop(columns=['comment_text'])

**Train Models and Evaluate**

In [None]:
# Dictionary to store ROC AUC scores for each class
# Fitted (model, log-ratio) pairs and validation ROC AUC scores, keyed by class.
# NOTE(review): `train_labels` and `x` are built from the full `df_train`, of
# which `valid` is a subset, so the AUCs reported here are optimistic.
model = {}
ROC_AUC_Scores = {}
for col in classes:
    print(col)

    # Train the model for the current class.
    model_trained, loga = get_model(train_labels[col])
    model[col] = (model_trained, loga)

    # Score the validation set with the positive-class probability. ROC AUC
    # measures ranking quality, so the hard 0/1 labels from predict() would
    # reduce the curve to a single operating point and understate the score.
    preds = model_trained.predict_proba(val_x.multiply(loga))[:, 1]

    # Calculate the ROC AUC score for the current class and store it.
    ROC_AUC_Scores[col] = roc_auc_score(valid_labels[col], preds)

# Print the ROC AUC score for each class.
for col, roc_auc in ROC_AUC_Scores.items():
    print(f"ROC AUC for class: '{col}': {roc_auc}")

toxic
severe_toxic
obscene
threat
insult
identity_hate
ROC AUC for class: 'toxic': 0.882298818964349
ROC AUC for class: 'severe_toxic': 0.8593699620003561
ROC AUC for class: 'obscene': 0.9121859131542873
ROC AUC for class: 'threat': 0.9661534041186063
ROC AUC for class: 'insult': 0.8711052432334527
ROC AUC for class: 'identity_hate': 0.8808416950158198
