In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import re
import string

import pickle 
#import mglearn
import time


from nltk.tokenize import TweetTokenizer # doesn't split at apostrophes
import nltk
from nltk import Text
from nltk.tokenize import regexp_tokenize
from nltk.tokenize import word_tokenize  
from nltk.tokenize import sent_tokenize 
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression 
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier


from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

In [2]:
train = pd.read_csv('train.csv')
holdout = pd.read_csv('test.csv')

In [3]:
test_labels_df = pd.read_csv("test_labels.csv")
holdout = holdout.merge(test_labels_df, on='id')
holdout.drop(holdout[holdout['toxic']==-1].index, inplace=True)

In [4]:
# train['char_total'] = train['comment_text'].map(lambda x: len(x))
# train = train[train['char_total']<2000]
# train.drop(['char_total'], axis=1, inplace=True)

In [5]:
# lower everything
train['comment_text'] = train['comment_text'].map(lambda x: x.lower())

# remove '\\n'
train['comment_text'] = train['comment_text'].map(lambda x: re.sub('\\n',' ',str(x)))
    
# remove any text starting with User... 
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("\[\[User.*",'',str(x)))
    
# remove IP addresses or user IDs
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",'',str(x)))
    
#remove http links in the text
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("(http://.*?\s)|(http://.*)",'',str(x)))

#remove https links in the text
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("(https://.*?\s)|(https://.*)",'',str(x)))

#remove email addresses 
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)",'',str(x)))

#remove WP:__
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("wp:\w*",'',str(x)))

#remove user::__
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("user::\w*",'',str(x)))

#remove all websites
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$",'',str(x)))

#remove all these auto-generated messages
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("preceding unsigned comment added by",'',str(x)))

#remove all punctuation
train['comment_text'] = train['comment_text'].map(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [6]:
# lower everything
holdout['comment_text'] = holdout['comment_text'].map(lambda x: x.lower())

# remove '\\n'
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub('\\n',' ',str(x)))
    
# remove any text starting with User... 
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("\[\[User.*",'',str(x)))
    
# remove IP addresses or user IDs
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",'',str(x)))
    
#remove http links in the text
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("(http://.*?\s)|(http://.*)",'',str(x)))

#remove https links in the text
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("(https://.*?\s)|(https://.*)",'',str(x)))

#remove email addresses 
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)",'',str(x)))

#remove WP:__
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("wp:\w*",'',str(x)))

#remove user::__
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("user::\w*",'',str(x)))

#remove all websites
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$",'',str(x)))

#remove all these auto-generated messages
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("preceding unsigned comment added by",'',str(x)))

#remove all punctuation
holdout['comment_text'] = holdout['comment_text'].map(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [7]:
def remove_all_digits(comment):
    result = ''.join([i for i in comment if not i.isdigit()])
    return result

In [8]:
train['comment_text'] = train['comment_text'].apply(remove_all_digits)
holdout['comment_text'] = holdout['comment_text'].apply(remove_all_digits)

In [9]:
# train_sample = train.sample(50000, random_state=0)
# test_sample = holdout.sample(25000, random_state=0)
X_train = train['comment_text']
y_train = train.iloc[:, 2:]
X_test = holdout['comment_text']
y_test = holdout.iloc[:, 2:]

In [10]:
stopwords_list = stopwords.words('English') + list(string.punctuation) + ["''", '""', '...', '``']
stopwords_list += ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october',
                  'november', 'december']
stopwords_list += ['jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'utc']

In [11]:
# Instantiate the vectorizer
word_vectorizer = TfidfVectorizer(stop_words='english', token_pattern=r'\w{2,}')

# fit and transform on it the training features
word_vectorizer.fit(X_train)
X_train_word_features = word_vectorizer.transform(X_train)

#transform the test features to sparse matrix
test_features = word_vectorizer.transform(X_test)

In [12]:
class_names = ['toxic','severe_toxic','obscene', 'threat', 'insult', 'identity_hate']

losses = []
auc = []

for class_name in class_names:
    #call the labels one column at a time so we can run the classifier on them
    train_target = y_train[class_name]
    test_target = y_test[class_name]
    classifier = LogisticRegression(solver='liblinear')

    cv_loss = np.mean(cross_val_score(classifier, X_train_word_features, train_target, cv=5, scoring='neg_log_loss'))
    losses.append(cv_loss)
    print('CV Log_loss score for class {} is {}'.format(class_name, cv_loss))
    
    cv_score = np.mean(cross_val_score(classifier, X_train_word_features, train_target, cv=5, scoring='accuracy'))
    print('CV Accuracy score for class {} is {}'.format(class_name, cv_score))
    
    classifier.fit(X_train_word_features, train_target)
    y_pred = classifier.predict(test_features)
    y_pred_prob = classifier.predict_proba(test_features)[:, 1]
    auc_score = metrics.roc_auc_score(test_target, y_pred_prob)
    auc.append(auc_score)
    print("CV ROC_AUC score {}\n".format(auc_score))
    
print('Total average CV Log_loss score is {}'.format(np.mean(losses)))
print('Total average CV ROC_AUC score is {}'.format(np.mean(auc)))

CV Log_loss score for class toxic is -0.12582401755143963
CV Accuracy score for class toxic is 0.9540079334117537
CV ROC_AUC score 0.9582069370012736

CV Log_loss score for class severe_toxic is -0.029413328820513895
CV Accuracy score for class severe_toxic is 0.9904493942384776
CV ROC_AUC score 0.9825804509930656

CV Log_loss score for class obscene is -0.06909915233536419
CV Accuracy score for class obscene is 0.976280154686046
CV ROC_AUC score 0.9728234874997391

CV Log_loss score for class threat is -0.012163961316997022
CV Accuracy score for class threat is 0.9971298053476418
CV ROC_AUC score 0.9840944933037836

CV Log_loss score for class insult is -0.0853174051844743
CV Accuracy score for class insult is 0.9690921387889831
CV ROC_AUC score 0.9655307171192983

CV Log_loss score for class identity_hate is -0.0288908321732697
CV Accuracy score for class identity_hate is 0.9917967558729014
CV ROC_AUC score 0.9770736260881024

Total average CV Log_loss score is -0.058451449563676455
