In [1]:
pip install swifter --quiet

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Importing the Libraries
import pandas as pd
import numpy as np
import re, string
import swifter
from nltk.corpus import stopwords
stop_words = set(stopwords.words("english"))
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from scipy.sparse import hstack
import joblib
import warnings
warnings.filterwarnings("ignore")

In [3]:
# Importing the training dataset
train_df = pd.read_csv("../input/toxic-data/train.csv")
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Importing the test data
test_data = pd.read_csv("../input/toxic-data/test.csv")
test_labels = pd.read_csv("../input/toxic-data/test_labels.csv")

In [5]:
# Merging the two datasets above for complete test data
test_df = pd.merge(test_data, test_labels, on="id")
test_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,":If you have a look back at the source, the in...",-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,I don't anonymously edit articles at all.,-1,-1,-1,-1,-1,-1


In [6]:
# Filtering out the samples having actual target labels
new_test_df = test_df[(test_df['toxic']!=-1) & (test_df['severe_toxic']!=-1) & (test_df['obscene']!=-1) & 
             (test_df['threat']!=-1) & (test_df['insult']!=-1) & (test_df['identity_hate']!=-1)]
new_test_df.reset_index(drop=True, inplace=True)
new_test_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0
1,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0
2,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0
3,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0
4,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0


In [7]:
# Creating a function to clean the training dataset
def clean_text(text):
    """This function will take text as input and return a cleaned text 
        by removing html char, punctuations, non-letters, newline and converting it 
        to lower case.
    """
    # Converting to lower case letters
    text = text.lower()
    # Removing the contraction of few words
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "can not ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r"\'scuse", " excuse ", text)
    # Replacing the HTMl characters with " "
    text = re.sub("<.*?>", " ", text)
    # Removing the punctuations
    text = text.translate(str.maketrans(" ", " ", string.punctuation))
    # Removing non-letters
    text = re.sub("[^a-zA-Z]", " ", text)
    # Replacing newline with space
    text = re.sub("\n", " ", text)
    # Split on space and rejoin to remove extra spaces
    text = " ".join(text.split())
    
    return text

In [8]:
def word_lemmatizer(text):
    """This function will help lemmatize words in a text.
    """
    
    lemmatizer = WordNetLemmatizer()
    # Tokenize the sentences to words
    text = word_tokenize(text)
    # Removing the stop words
    text = [lemmatizer.lemmatize(word) for word in text]
    # Joining the cleaned list
    text = " ".join(text)
    
    return text

In [9]:
# Cleaning and preprocessing the train data
train_df["comment_text"] = train_df["comment_text"].swifter.apply(clean_text)
train_df["comment_text"] = train_df["comment_text"].swifter.apply(word_lemmatizer)

# Cleaning and preprocessing the test data
new_test_df["comment_text"] = new_test_df["comment_text"].swifter.apply(clean_text)
new_test_df["comment_text"] = new_test_df["comment_text"].swifter.apply(word_lemmatizer)

Pandas Apply:   0%|          | 0/159571 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/159571 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/63978 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/63978 [00:00<?, ?it/s]

In [10]:
# Performing the train-val split to create training and validation datasets
train, validation = train_test_split(train_df, test_size=0.2, random_state=42)
# print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)
print(train.shape, validation.shape)

(127656, 8) (31915, 8)


In [11]:
# Seperating our input and target variable columns
X_train = train.comment_text
X_val = validation.comment_text
X_test = new_test_df.comment_text

In [12]:
# Storing our target labels list in a variable
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

In [13]:
# Creating a unigram TFIDF vectorizer and transforming all our input features
word_tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 1), sublinear_tf=True, strip_accents="unicode", 
                             analyzer="word",token_pattern=r"\w{1,}", stop_words=stop_words)

word_tfidf.fit(train_df.comment_text)

train_word_tfidf = word_tfidf.transform(X_train)
val_word_tfidf = word_tfidf.transform(X_val)
test_word_tfidf = word_tfidf.transform(X_test)

In [14]:
# Creating a n-gram (2, 6) TFIDF vectorizer and transforming all our input features
char_tfidf = TfidfVectorizer(max_features=30000, ngram_range=(2, 6), sublinear_tf=True, strip_accents="unicode", 
                             analyzer="char", stop_words=stop_words)

char_tfidf.fit(train_df.comment_text)

train_char_tfidf = char_tfidf.transform(X_train)
val_char_tfidf = char_tfidf.transform(X_val)
test_char_tfidf = char_tfidf.transform(X_test)

In [15]:
# Concatenating both unigram and n-gram features for our training input
train_features = hstack([train_word_tfidf, train_char_tfidf])
val_features = hstack([val_word_tfidf, val_char_tfidf])
test_features = hstack([test_word_tfidf, test_char_tfidf])

In [16]:
# Saving the tfidf vectors for future use
joblib.dump(word_tfidf, "word_tfidf_vectorizer.pkl")
joblib.dump(char_tfidf, "char_tfidf_vectorizer.pkl")

['char_tfidf_vectorizer.pkl']

In [17]:
# Creating a logistic regression Model and treating each target as a binary classification problem
lr_model = OneVsRestClassifier(LogisticRegression(solver="saga"))
val_results = {"Accuracy": {}, "F1 Score": {}}
test_results = {"Accuracy": {}, "F1 Score": {}}

for label in labels:
    print(f"... Processing {label}")
    # train the model using X & y
    lr_model.fit(train_features, train[label])
    # Predicting the validation data labels
    val_prediction = lr_model.predict(val_features)
    # Predicting the test data labels
    test_prediction = lr_model.predict(test_features)
    # Saving the model based on target label
    joblib.dump(lr_model, f"logistic_regression_{label}.pkl")
    # Checking and model's accuracy and f1-score
    val_results["Accuracy"][f"{label}"] = accuracy_score(validation[label], val_prediction)
    val_results["F1 Score"][f"{label}"] = f1_score(validation[label], val_prediction, average = "weighted")
    test_results["Accuracy"][f"{label}"] = accuracy_score(new_test_df[label], test_prediction)
    test_results["F1 Score"][f"{label}"] = f1_score(new_test_df[label], test_prediction, average = "weighted")

... Processing toxic
... Processing severe_toxic
... Processing obscene
... Processing threat
... Processing insult
... Processing identity_hate


In [18]:
# Evaluating the model on Validation Data
validation_accuracy = sum(val_results["Accuracy"].values())/len(val_results["Accuracy"])
print(f"Validation Accuracy: {validation_accuracy}")

validation_f1_score = sum(val_results["F1 Score"].values())/len(val_results["F1 Score"])
print(f"Validation F1-Score: {validation_f1_score}")

Validation Accuracy: 0.9828502793879577
Validation F1-Score: 0.9811947440446507


In [19]:
# Evaluating the model on Test Data
test_accuracy = sum(test_results["Accuracy"].values())/len(test_results["Accuracy"])
print(f"Test Accuracy: {test_accuracy}")

test_f1_score = sum(test_results["F1 Score"].values())/len(test_results["F1 Score"])
print(f"Test F1-Score: {test_f1_score}")

Test Accuracy: 0.9752805651942854
Test F1-Score: 0.9747181660736461
