### Problem statement

https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/overview/description

In [14]:
import numpy as np
import pandas as pd
from scipy import sparse

In [15]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# TweetTokenizer does not split tokens at apostrophes, which is what we want
from nltk.tokenize import TweetTokenizer

from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator,TransformerMixin

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

import Utils

In [16]:
import os
import pickle

#### Dataset:

https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data

In [17]:
# Load the labelled training comments and preview the first rows.
train = pd.read_csv('Data/train.csv')
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [18]:
# Summarise the training frame: column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [19]:
# Total number of missing cells across every column of the training frame.
train.isna().sum().sum()

0

In [20]:
# Load the unlabelled test comments and preview the first rows.
test = pd.read_csv('Data/test.csv')
test.head()

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.


In [21]:
# Total number of missing cells across every column of the test frame.
test.isna().sum().sum()

0

In [22]:
# Every column other than the identifier and the raw text is a target label.
# The labels are listed alphabetically, not in the CSV's column order.
class_list = sorted(set(train.columns) - {'id', 'comment_text'})
print(class_list)

['identity_hate', 'insult', 'obscene', 'severe_toxic', 'threat', 'toxic']


### Text Cleaning

In [23]:
#https://drive.google.com/file/d/0B1yuv8YaUVlZZ1RzMFJmc1ZsQmM/view
# Apostrophe lookup dict: maps a lower-cased contraction to its expansion.
# Keys must be unique; in a dict literal a repeated key silently keeps only
# the last value. Entry order is preserved from the original table.
APPO = {
    "aren't": "are not",
    "can't": "cannot",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    # "i'd" is ambiguous ("I would" / "I had"); a dict can hold only one
    # expansion, so use "would" to match every other "'d" entry in the table.
    "i'd": "I would",
    "i'll": "I will",
    "i'm": "I am",
    "isn't": "is not",
    "it's": "it is",
    "it'll": "it will",
    "i've": "I have",
    "let's": "let us",
    "mightn't": "might not",
    "mustn't": "must not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "shouldn't": "should not",
    "that's": "that is",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "we'd": "we would",
    "we're": "we are",
    "weren't": "were not",
    "we've": "we have",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where's": "where is",
    "who'd": "who would",
    "who'll": "who will",
    "who're": "who are",
    "who's": "who is",
    "who've": "who have",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
    # Bare suffix entry; the leading space in the value is intentional.
    "'re": " are",
    "wasn't": "was not",
    # Previously " will", which dropped the pronoun.
    "we'll": "we will",
    "tryin'": "trying",
}

In [24]:
# One shared tokenizer and one shared lemmatizer instance for the notebook.
tokenizer, lematizer = TweetTokenizer(), WordNetLemmatizer()

In [25]:
%%time 
#### Uncomment this when running for first time
#### it generates cleaned train set

#train_x = train['comment_text'].apply(lambda x: Utils.text_clean(x,tokenizer,lematizer,APPO))
#train_x.to_csv('Data/train_x.csv')

Wall time: 0 ns


In [27]:
# Reload the cached, cleaned training comments from disk.
# NOTE(review): the file was written with its index, so the loaded frame has
# an extra "Unnamed: 0" column next to `comment_text` (see the preview below).
# Confirm that Utils.build_pipeline selects `comment_text` explicitly.
train_x = pd.read_csv('Data/train_x.csv')

In [28]:
# Preview the first three cleaned training comments.
train_x.head(3)

Unnamed: 0.1,Unnamed: 0,comment_text
0,0,explanationwhy edit make username hardcore met...
1,1,d'aww ! match background colour I am seemingly...
2,2,"hey man , I am really try edit war . it is guy..."


In [29]:
%%time
#### Uncomment this when running for first time
#### it generates cleaned test set
#test_x = test['comment_text'].apply(lambda x: Utils.text_clean(x,tokenizer,lematizer,APPO))
#test_x.to_csv('Data/test_x.csv')

Wall time: 0 ns


In [31]:
# Reload the cached, cleaned test comments from disk.
# NOTE(review): the file was written with its index, so the loaded frame has
# an extra "Unnamed: 0" column next to `comment_text` (see the preview below).
# Confirm that Utils.make_prediction selects `comment_text` explicitly.
test_x = pd.read_csv('Data/test_x.csv')

In [32]:
# Preview the first three cleaned test comments.
test_x.head(3)

Unnamed: 0.1,Unnamed: 0,comment_text
0,0,yo bitch ja rule succesful you will ever whats...
1,1,"= = rfc = = title fine , imo ."
2,2,""" = = source = = * zawe ashton lapland — / """


In [33]:
# Seed the submission frame with the test ids; the model cells below reassign
# `Submission` once per label.
Submission = pd.DataFrame({'id': test['id']})
Submission.head(3)

Unnamed: 0,id
0,00001cee341fdb12
1,0000247867823ef7
2,00013b17ad220c46


## Model Building

### 1. Classifier to predict toxic labels

In [34]:
# Earlier inline experiment, kept for reference only and never executed.
# It references `Lemmatizer`, `x` and `y`, none of which are defined in this notebook.
#pipe = Pipeline(steps = [('lemmatize',Lemmatizer()),('tfidf',TfidfVectorizer(max_features=2500)),('logreg',LogisticRegression(solver='liblinear'))])
        
#cross_val_score(pipe,x,y,cv=3,scoring='roc_auc').mean()

In [29]:
%%time
y=train['toxic']

model = Utils.build_pipeline(train_x, y, cv=5, scoring='roc_auc')

Submission = Utils.make_prediction(model,test_x,Submission,'toxic')

with open('Models/toxic.pkl','wb') as model_pkl:
    pickle.dump(model,model_pkl,protocol=2)

In [32]:
# Show `cv_results_` for the most recently fitted `model` (the `toxic`
# classifier when the cells are run in order).
model.cv_results_

### 2. Classifier to predict severe_toxic labels

In [None]:
%%time 
y=train['severe_toxic']

model = Utils.build_pipeline(train_x, y, cv=5, scoring='roc_auc')

Submission = Utils.make_prediction(model,test_x,Submission,'severe_toxic')

with open('Models/severe_toxic.pkl','wb') as model_pkl:
    pickle.dump(model,model_pkl,protocol=2)

In [None]:
# Show `cv_results_` for the most recently fitted `model` (the `severe_toxic`
# classifier when the cells are run in order).
model.cv_results_

### 3. Classifier to predict obscene labels

In [None]:
%%time 
y=train['obscene']

model = Utils.build_pipeline(train_x, y, cv=5, scoring='roc_auc')

Submission = Utils.make_prediction(model,test_x,Submission,'obscene')

with open('Models/obscene.pkl','wb') as model_pkl:
    pickle.dump(model,model_pkl,protocol=2)

In [None]:
# Show `cv_results_` for the most recently fitted `model` (the `obscene`
# classifier when the cells are run in order).
model.cv_results_

### 4. Classifier to predict threat labels

In [None]:
%%time 
y=train['threat']

model = Utils.build_pipeline(train_x, y, cv=5, scoring='roc_auc')

Submission = Utils.make_prediction(model,test_x,Submission,'threat')

with open('Models/threat.pkl','wb') as model_pkl:
    pickle.dump(model,model_pkl,protocol=2)

In [None]:
# Show `cv_results_` for the most recently fitted `model` (the `threat`
# classifier when the cells are run in order).
model.cv_results_

### 5. Classifier to predict insult labels

In [None]:
%%time 
y=train['insult']

model = Utils.build_pipeline(train_x, y, cv=5, scoring='roc_auc')

Submission = Utils.make_prediction(model,test_x,Submission,'insult')

with open('Models/insult.pkl','wb') as model_pkl:
    pickle.dump(model,model_pkl,protocol=2)

In [None]:
# Show `cv_results_` for the most recently fitted `model` (the `insult`
# classifier when the cells are run in order).
model.cv_results_

### 6. Classifier to predict identity_hate labels

In [None]:
%%time 
y=train['identity_hate']

model = Utils.build_pipeline(train_x, y, cv=5, scoring='roc_auc')

Submission = Utils.make_prediction(model,test_x,Submission,'identity_hate')

with open('Models/identity_hate.pkl','wb') as model_pkl:
    pickle.dump(model,model_pkl,protocol=2)

In [None]:
# Show `cv_results_` for the most recently fitted `model` (the `identity_hate`
# classifier when the cells are run in order).
model.cv_results_

### Submit Test Score

In [None]:
# Write the accumulated submission frame to disk, omitting the DataFrame index.
file = 'Submission/submission.csv'
Submission.to_csv(path_or_buf=file, index=False)