In [1]:
import gensim
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [2]:
# Read the train and test splits from the local data directory.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/test.csv')
)

In [3]:
# Replace missing comments with an empty string so that downstream text
# processing always receives a str.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: the chained in-place form operates on an intermediate
# Series, is deprecated in pandas 2.x, and leaves the frame untouched when
# copy-on-write is enabled.
train['comment_text'] = train['comment_text'].fillna("")
test['comment_text'] = test['comment_text'].fillna("")

In [4]:
# Peek at the first five training rows (5 is the default for head()).
train.head(n=5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


<h3> Baseline Logistic Regression Model Submission </h3>
Using TF-IDF vectors (with max features = 20000) and a simple logistic regression (LR) model

In [17]:
# TF-IDF features for the baseline model.
# `stopwords` comes from nltk.corpus; the English stop-word corpus presumably
# has to be available locally (nltk.download('stopwords')) -- TODO confirm.
eng_stopwords = stopwords.words('english')
vectorizer = TfidfVectorizer(input='content', max_features=20000,
                             analyzer='word', ngram_range=(1,3),
                             stop_words=eng_stopwords, min_df=3, max_df=0.9)

# Learn the vocabulary and IDF weights from the training comments only.
trn_term_doc = vectorizer.fit_transform(train['comment_text'])

# Use transform(), not fit_transform(), on the test comments: refitting would
# build a different vocabulary/IDF, so column j of the test matrix would no
# longer be the feature the models saw as column j during training.
test_term_doc = vectorizer.transform(test['comment_text'])

In [6]:
# Target columns, one binary classifier is trained per entry.
label_cols = [
    'toxic', 'severe_toxic', 'obscene',
    'threat', 'insult', 'identity_hate',
]

# 'none' = 1 - (row-wise maximum over the label columns); for 0/1 labels this
# is 1 exactly when no label is set on the row.
train['none'] = train[label_cols].max(axis=1).rsub(1)

In [7]:
# Fit one independent logistic-regression classifier per label column and keep
# them in a dict keyed by label name.
models = {}
for class_label in label_cols:
    print("Training: " + class_label)
    # fit() returns the estimator itself, so construction and fitting chain.
    class_model = LogisticRegression().fit(trn_term_doc, train[class_label])
    models[class_label] = class_model

Training: toxic
Training: severe_toxic
Training: obscene
Training: threat
Training: insult
Training: identity_hate


In [8]:
# Score the test matrix with each per-label model and write the submission.
for class_label in label_cols:
    print("Testing: " + class_label)
    pred_probs = models[class_label].predict_proba(test_term_doc)
    # Keep the second probability column (index 1). Slicing the array replaces
    # the former map(lambda ...) call, which on Python 3 yields a lazy iterator
    # rather than a sequence of values and cannot be stored as a column.
    # NOTE(review): index 1 is the positive class only if the labels are 0/1.
    test[class_label] = pred_probs[:, 1]

# NOTE: this rebinds `test` without the raw text; any later cell that needs
# test['comment_text'] must reload the data first.
# errors='ignore' lets the cell be re-run after the column is already gone.
test = test.drop('comment_text', axis=1, errors='ignore')
test.to_csv('baseline_submission.csv', index=False)

Testing: toxic
Testing: severe_toxic
Testing: obscene
Testing: threat
Testing: insult
Testing: identity_hate


<h3> Trying Boosting (specifically GBM) </h3>
Realized that building a GBM model will require a lot of time and resources, hence will try some other things before going to GBM

<p>Clean Comment Text and reiterate</p>

In [5]:
import string
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Dotted-quad IPv4-looking token. The dots are escaped so they match only a
# literal '.', and \b keeps the match from starting or ending inside a longer
# digit run.
_IP_ADDRESS_RE = re.compile(r"\b\d{1,3}(?:\.\d{1,3}){3}\b")

# Wiki-style "[[...]" / "[[...]]" markup. The non-greedy body stops at the
# first closing bracket, so text between two separate links is preserved.
_WIKI_LINK_RE = re.compile(r"\[\[.*?\]\]?")

# NOTE(review): `lem` was referenced but never defined in the visible part of
# the notebook. lemmatize(word, "v") matches WordNetLemmatizer's signature, so
# one is created here -- confirm this is the intended lemmatizer. It presumably
# needs the WordNet corpus to be downloaded.
lem = WordNetLemmatizer()


def clean(comment):
    """Normalise one comment into a space-separated string of tokens.

    Steps, in order: lower-case the text, turn newlines into spaces, strip
    IPv4-looking tokens and ``[[...]]`` wiki markup, tokenise with
    ``word_tokenize``, lemmatise each token as a verb, and drop tokens found
    in the notebook-level ``eng_stopwords`` collection.

    Parameters
    ----------
    comment : str
        Raw comment text.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (may be empty).

    Notes
    -----
    Relies on the global ``eng_stopwords`` being defined before the first
    call, and presumably on the NLTK tokenizer data being installed.
    """
    comment = comment.lower()
    # Replace (rather than delete) line breaks so the last word of one line
    # is not fused with the first word of the next.
    comment = comment.replace("\n", " ")

    # remove IP addresses
    comment = _IP_ADDRESS_RE.sub("", comment)
    # remove usernames / wiki links
    comment = _WIKI_LINK_RE.sub("", comment)

    # Split the sentences into words
    words = word_tokenize(comment)

    words = [lem.lemmatize(word, "v") for word in words]
    words = [w for w in words if w not in eng_stopwords]

    return " ".join(words)

<h2>Training Word2Vec own model with Gensim</h2>

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object