In [None]:
# Importing the Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re, string
import swifter
from nltk.corpus import stopwords
# English stop-word collection from NLTK, held as a set; it is handed to the
# TfidfVectorizer's `stop_words` argument later in the notebook.
stop_words = set(stopwords.words("english"))
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from skmultilearn.adapt import MLkNN
import xgboost as xgb
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides library warnings (e.g. pandas or
# scikit-learn diagnostics) that could flag real problems — consider narrowing it.
warnings.filterwarnings("ignore")
# Use matplotlib's "ggplot" style sheet for all subsequent plots.
plt.style.use("ggplot")

In [None]:
# Load the training CSV into a DataFrame and preview its first rows.
# NOTE(review): the absolute "D:/..." path ties this notebook to one machine —
# consider reading the data directory from a single configurable location.
train_df = pd.read_csv("D:/Projects/toxic-comment-classification/data/raw-data/train.csv")
train_df.head()

In [None]:
# Load the test comments and, from a separate file, their labels.
# The two frames are joined on "id" further down to form the complete test set.
# NOTE(review): absolute "D:/..." paths tie this notebook to one machine —
# consider reading the data directory from a single configurable location.
test_data = pd.read_csv("D:/Projects/toxic-comment-classification/data/raw-data/test.csv")
test_labels = pd.read_csv("D:/Projects/toxic-comment-classification/data/raw-data/test_labels.csv")

In [None]:
# Join the test comments with their labels on the shared "id" column so that
# every row carries both the text and its target values.
test_df = test_data.merge(test_labels, on="id")
test_df.head()

In [None]:
# Keep only the test rows that carry real labels: a row is dropped as soon as
# any of the six label columns holds the placeholder value -1.
#
# The mask is computed over all label columns at once instead of six
# hand-written comparisons, and the index is rebuilt with a non-inplace
# reset_index. That call returns a new frame, so the later
# `new_test_df["comment_text"] = ...` assignment writes to an independent
# DataFrame rather than to a boolean-indexed slice of test_df.
new_test_df = test_df[
    (test_df[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] != -1).all(axis=1)
].reset_index(drop=True)
new_test_df.head()

In [None]:
# Creating a function to clean the training dataset
def clean_text(text):
    """Normalise a raw comment into lower-case, letters-only, single-spaced text.

    Steps, in order:
      1. lower-case the text;
      2. expand a fixed set of English contractions;
      3. replace HTML-like tags (``<...>`` on a single line) with a space;
      4. delete punctuation characters;
      5. replace every remaining non-letter (digits, newlines, ...) with a space;
      6. collapse runs of whitespace and strip the ends.

    Parameters
    ----------
    text : str
        The raw comment. ``None`` and float NaN (missing values) yield ``""``;
        any other non-string value is converted with ``str()`` first.

    Returns
    -------
    str
        The cleaned text (possibly empty).
    """
    # Missing values would otherwise crash on `.lower()`.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()

    # Contraction rules are applied top to bottom, and the order matters:
    #  * "'scuse" must run before the generic "'s" rule, otherwise the "'s"
    #    rule strips its prefix first and "'scuse" can never match;
    #  * "what's" and "can't" must run before the generic "'s" / "n't" rules.
    contraction_rules = (
        (r"what's", "what is "),
        (r"\'scuse", " excuse "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
    )
    for pattern, replacement in contraction_rules:
        text = re.sub(pattern, replacement, text)

    # Replacing the HTML tags with " "
    text = re.sub("<.*?>", " ", text)
    # Removing the punctuation (characters are deleted, not replaced)
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Replacing every non-letter with a space; this also covers newlines,
    # so no separate newline substitution is needed.
    text = re.sub("[^a-zA-Z]", " ", text)
    # Split on whitespace and rejoin to remove extra spaces
    return " ".join(text.split())

In [None]:
def word_lemmatizer(text):
    """Lemmatize every token of ``text`` and return the tokens joined by spaces.

    The text is split with NLTK's ``word_tokenize`` and each token is passed
    through ``WordNetLemmatizer.lemmatize`` with its default arguments.
    No token is dropped here: stop words are NOT removed by this function.
    """
    lemmatize = WordNetLemmatizer().lemmatize
    return " ".join(lemmatize(token) for token in word_tokenize(text))

In [None]:
# Training data: normalise the raw comment text, then lemmatize it
train_df["comment_text"] = (
    train_df["comment_text"]
    .swifter.apply(clean_text)
    .swifter.apply(word_lemmatizer)
)

# Test data: the same two preprocessing steps, in the same order
new_test_df["comment_text"] = (
    new_test_df["comment_text"]
    .swifter.apply(clean_text)
    .swifter.apply(word_lemmatizer)
)

In [None]:
# Separating our input and target variable columns
X = train_df.loc[:, "comment_text"]  # model input: the comment text column
y = train_df.loc[
    :, ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
]  # targets: the six label columns

In [None]:
# Hold out 33% of the rows as a validation set; the fixed random_state makes
# the split repeatable. The shapes of the four pieces are printed as a check.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)
print(*(part.shape for part in (X_train, X_val, y_train, y_val)))

In [None]:
# Gather every cleaned training comment into a plain Python list; this list is
# the corpus the TF-IDF vectorizer is fitted on. Its length is printed as a check.
train_corpus = train_df["comment_text"].tolist()
print(len(train_corpus))

In [None]:
# Build a word-level TF-IDF vectorizer (unigrams only, vocabulary capped at
# 10,000 terms, sublinear TF scaling, unicode accent stripping) and fit it on
# the whole training corpus.
#
# `stop_words` is a set, but scikit-learn's parameter validation accepts only
# "english", a list or None for this argument, so it is passed as a sorted list
# (sorting also keeps the estimator's parameters in a stable order).
#
# NOTE(review): train_corpus is built from all of train_df, which includes the
# rows later used as the validation split, so the vocabulary and IDF weights
# see validation text — confirm this is intended.
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 1), sublinear_tf=True, strip_accents="unicode",
                             analyzer="word", token_pattern=r"\w{1,}", stop_words=sorted(stop_words))

vectorizer.fit(train_corpus)

In [None]:
# Turn the training and validation text into TF-IDF feature matrices using the
# fitted vectorizer.
# NOTE: X_train / X_val are rebound from text to matrices here, so re-running
# this cell would feed the already-vectorized data back into the vectorizer.
X_train, X_val = (vectorizer.transform(split) for split in (X_train, X_val))

In [None]:
# Preview only the first few TF-IDF rows. Calling .toarray() on the whole
# matrix would materialise every row densely (up to 10,000 columns each) just
# to display a truncated repr, which can exhaust memory on a large corpus.
X_train[:5].toarray()

In [None]:
# Binary-relevance wrapper around a logistic regression with C=2 and balanced
# class weights, fitted on the TF-IDF training features.
lr = BinaryRelevance(
    classifier=LogisticRegression(C=2, class_weight="balanced"),
)
lr.fit(X_train, y_train)

In [None]:
# Predict the labels for the validation features with the fitted model.
y_pred = lr.predict(X_val)

In [None]:
# Computing the weighted F1 score of the model on the validation split.
# y_pred was predicted from X_val, so it has to be scored against y_val;
# scoring it against y_train would compare different rows (and a different
# number of samples).
# NOTE(review): the name says "knn", but the predictions scored here come from
# the BinaryRelevance logistic-regression model; the name is left unchanged in
# case other cells read it.
f1_score_knn = f1_score(y_val, y_pred, average="weighted")
f1_score_knn