In [1]:
pip install swifter --quiet



In [80]:
# Importing the Libraries
import re, string
import warnings

import joblib
import numpy as np
import pandas as pd
import swifter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import FeatureUnion, Pipeline

stop_words = set(stopwords.words("english"))
warnings.filterwarnings("ignore")

In [3]:
# Read the training CSV into a DataFrame and preview its first rows
train_csv_path = "../input/toxic-data/train.csv"
train_df = pd.read_csv(train_csv_path)
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# Read the test comments and their labels, which are stored in separate CSV files
test_csv_path = "../input/toxic-data/test.csv"
test_labels_csv_path = "../input/toxic-data/test_labels.csv"

test_data = pd.read_csv(test_csv_path)
test_labels = pd.read_csv(test_labels_csv_path)

In [5]:
# Join the test comments with their labels on the shared "id" column so each
# row carries both the text and its label columns
test_df = test_data.merge(test_labels, on="id")
test_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,-1,-1,-1,-1,-1,-1
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,-1,-1,-1,-1,-1,-1
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",-1,-1,-1,-1,-1,-1
3,00017563c3f7919a,":If you have a look back at the source, the in...",-1,-1,-1,-1,-1,-1
4,00017695ad8997eb,I don't anonymously edit articles at all.,-1,-1,-1,-1,-1,-1


In [6]:
# Keep only the test rows where none of the six label columns is -1
# (the preview of test_df shows -1 in every label column for some rows).
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
has_labels = (test_df[label_columns] != -1).all(axis=1)

# Build a fresh, re-indexed frame instead of resetting the index of a slice in
# place: later cells assign into new_test_df["comment_text"], which must not
# operate on a view/copy of test_df.
new_test_df = test_df.loc[has_labels].reset_index(drop=True)
new_test_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0001ea8717f6de06,Thank you for understanding. I think very high...,0,0,0,0,0,0
1,000247e83dcc1211,:Dear god this site is horrible.,0,0,0,0,0,0
2,0002f87b16116a7f,"""::: Somebody will invariably try to add Relig...",0,0,0,0,0,0
3,0003e1cccfd5a40a,""" \n\n It says it right there that it IS a typ...",0,0,0,0,0,0
4,00059ace3e3e9a53,""" \n\n == Before adding a new product to the l...",0,0,0,0,0,0


In [7]:
# Creating a function to clean the training dataset
def clean_text(text):
    """Normalize a raw comment into lower-case, space-separated, letter-only words.

    Steps, in order: lower-case the text, expand a handful of English
    contractions, blank out HTML-like ``<...>`` tags, delete ASCII punctuation,
    replace every remaining non-letter (digits, newlines, non-ASCII characters)
    with a space, and collapse runs of whitespace.

    Note that punctuation is deleted rather than replaced by a space, so
    words joined only by punctuation (e.g. "a,b") are fused ("ab").

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned text; an empty string if no letters remain.
    """
    # Converting to lower case letters
    text = text.lower()

    # Expanding a few contractions. Order matters: the specific patterns
    # ("what's", "'scuse", "can't") must run before the generic ones ("'s",
    # "n't") that would otherwise consume part of them.
    contractions = (
        (r"what's", "what is "),
        (r"\'scuse", " excuse "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "can not "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
    )
    for pattern, replacement in contractions:
        text = re.sub(pattern, replacement, text)

    # Replacing HTML-like tags with a space
    text = re.sub("<.*?>", " ", text)
    # Deleting ASCII punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))
    # Replacing every non-letter (this includes digits and newlines) with a space
    text = re.sub("[^a-zA-Z]", " ", text)
    # Split on whitespace and rejoin to remove extra spaces
    return " ".join(text.split())

In [8]:
def word_lemmatizer(text):
    """Return ``text`` with every token replaced by its WordNet lemma.

    The text is split into tokens with ``word_tokenize``, each token is passed
    to ``WordNetLemmatizer.lemmatize`` with its default arguments, and the
    results are joined back together with single spaces. No tokens are
    removed; stop words are kept.
    """
    wordnet = WordNetLemmatizer()
    # Split the text into word tokens
    tokens = word_tokenize(text)
    # Lemmatize each token and rebuild a single space-separated string
    return " ".join(map(wordnet.lemmatize, tokens))

In [9]:
# Clean and then lemmatize the comment text, first for the training frame and
# then for the labelled test frame. The "comment_text" column of each frame is
# overwritten with the processed text.
for frame in (train_df, new_test_df):
    cleaned = frame["comment_text"].swifter.apply(clean_text)
    frame["comment_text"] = cleaned.swifter.apply(word_lemmatizer)

Pandas Apply:   0%|          | 0/159571 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/159571 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/63978 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/63978 [00:00<?, ?it/s]

In [26]:
# Hold out 20% of the rows of train_df for validation; the fixed random_state
# makes the split repeatable.
# NOTE(review): the shapes recorded under this cell (106912 / 52659 rows) match
# a roughly 67/33 split rather than 80/20, so the stored output appears stale.
train, validation = train_test_split(
    train_df,
    test_size=0.2,
    random_state=42,
)
print(train.shape, validation.shape)

(106912, 8) (52659, 8)


In [27]:
# Separating the input text column for the training and validation sets
X_train = train["comment_text"]
X_val = validation["comment_text"]

In [17]:
# Names of the six target columns; used to select the targets for fitting and scoring
labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [83]:
# Word-level TF-IDF over unigrams, capped at 10,000 features, with English stop
# words removed. scikit-learn documents `stop_words` as a list (or the string
# "english"), so the module-level set is converted to a sorted list here.
word_tfidf = TfidfVectorizer(max_features=10000, ngram_range=(1, 1), sublinear_tf=True, strip_accents="unicode",
                             analyzer="word", token_pattern=r"\w{1,}", stop_words=sorted(stop_words))
# Character-level TF-IDF over 2- to 6-character n-grams, capped at 50,000
# features. `stop_words` is not passed: it only applies when analyzer="word".
char_tfidf = TfidfVectorizer(max_features=50000, ngram_range=(2, 6), sublinear_tf=True, strip_accents="unicode",
                             analyzer="char")
# Concatenate the character and word feature matrices side by side
tfidf = FeatureUnion([('char', char_tfidf), ('word', word_tfidf)])

In [None]:
# TF-IDF features feed a one-vs-rest wrapper that fits one logistic regression
# per target column. The "saga" solver shuffles the data while fitting, so a
# fixed random_state is set to make repeated fits reproducible.
LogReg_pipeline = Pipeline([
    ('tfidf', tfidf),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='saga', random_state=42), n_jobs=1)),
])

# Fit on the training text against the six label columns
LogReg_pipeline.fit(X_train, train[labels])

In [86]:
# Predicting the label values for our validation dataset with the fitted
# pipeline; y_pred is compared against validation[labels] when scoring.
y_pred = LogReg_pipeline.predict(X_val)

In [87]:
# Score the validation predictions against the true label columns.
y_val_true = validation[labels]

# F1 averaged over the labels, weighted by each label's support
f1_score_lr = f1_score(y_val_true, y_pred, average="weighted")
print(f1_score_lr)

# For multilabel targets accuracy_score is subset accuracy: a row counts as
# correct only when all of its labels are predicted correctly.
accuracy_lr = accuracy_score(y_val_true, y_pred)
print("Accuracy: ", accuracy_lr)

0.7093715383589738
Accuracy:  0.9218367230672819


In [89]:
# Predicting the labels for the unseen test dataset (the labelled rows kept in
# new_test_df, whose comment_text has already been cleaned and lemmatized).
y_test_pred = LogReg_pipeline.predict(new_test_df.comment_text)

In [90]:
# Score the test predictions against the true label columns.
# Note: this reassigns f1_score_lr, which previously held the validation score.
y_test_true = new_test_df[labels]

# F1 averaged over the labels, weighted by each label's support
f1_score_lr = f1_score(y_test_true, y_test_pred, average="weighted")
print(f1_score_lr)

# Subset accuracy: a row counts as correct only when all of its labels match
accuracy = accuracy_score(y_test_true, y_test_pred)
print("Accuracy: ", accuracy)

0.6525803936392597
Accuracy:  0.8970271030666792


In [92]:
# Saving the fitted pipeline (vectorizers + classifier) to disk with joblib.
# joblib is imported in the import cell at the top of the notebook.
joblib.dump(LogReg_pipeline, "01-logistic-regression-92.pkl")

['01-logistic-regression-92.pkl']