In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import word_tokenize, pos_tag, regexp_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer 
from sklearn.linear_model import LogisticRegression
import spacy
nlp = spacy.load("en_core_web_sm")
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline

In [2]:
train = pd.read_csv('train.csv')
holdout = pd.read_csv('test.csv').fillna(' ')

In [3]:
test_labels_df = pd.read_csv("test_labels.csv")
holdout = holdout.merge(test_labels_df, on='id')
holdout.drop(holdout[holdout['toxic']==-1].index, inplace=True)

In [4]:
# train['char_total'] = train['comment_text'].map(lambda x: len(x))
# train = train[train['char_total']<2000]
# train.drop(['char_total'], axis=1, inplace=True)

In [20]:
re.findall("-Preceding unsigned comment added by", ' '.join(train['comment_text']))

[]

In [5]:
# lower everything
train['comment_text'] = train['comment_text'].map(lambda x: x.lower())

# remove '\\n'
train['comment_text'] = train['comment_text'].map(lambda x: re.sub('\\n',' ',str(x)))
    
# remove any text starting with User... 
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("\[\[User.*",'',str(x)))
    
# remove IP addresses or user IDs
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",'',str(x)))
    
#remove http links in the text
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("(http://.*?\s)|(http://.*)",'',str(x)))

#remove https links in the text
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("(https://.*?\s)|(https://.*)",'',str(x)))

#remove email addresses 
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)",'',str(x)))

#remove WP:__
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("wp:\w*",'',str(x)))

#remove user::__
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("user::\w*",'',str(x)))

#remove all websites
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$",'',str(x)))

#remove all these auto-generated messages
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("preceding unsigned comment added by",'',str(x)))

#remove all punctuation
train['comment_text'] = train['comment_text'].map(lambda x: x.translate(str.maketrans('', '', string.punctuation))

In [6]:
# remove '\\n'
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub('\\n',' ',str(x)))
    
# remove any text starting with User... 
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("\[\[User.*",'',str(x)))
    
# remove IP addresses or user IDs
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",'',str(x)))
    
#remove http links in the text
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("(http://.*?\s)|(http://.*)",'',str(x)))

#remove https links in the text
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("(https://.*?\s)|(http://.*)",'',str(x)))

#remove email addresses 
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)",'',str(x)))


In [7]:
def remove_all_digits(comment):
    result = ''.join([i for i in comment if not i.isdigit()])
    return result

In [8]:
train['comment_text'] = train['comment_text'].apply(remove_all_digits)
holdout['comment_text'] = holdout['comment_text'].apply(remove_all_digits)

In [9]:
def lemmatize(comment):
    lems = [token.lemma_ for token in comment if (token.lemma_ != '-PRON-' and 
                                   token.is_punct==False and token.is_space==False 
                                          and not token.lemma_.startswith("'"))]
    return lems

In [10]:
train_bunch_df = train.sample(50000, random_state=0)
train_bunch = train_bunch_df['comment_text']
train_bunch_nlp = map(nlp, train_bunch)
train_bunch_nlp_lem = map(lemmatize, train_bunch_nlp)
train_bunch_nlp_lem_joined = list(map(' '.join, train_bunch_nlp_lem))

train_bunch_labels = train_bunch_df.iloc[:,2:]

In [11]:
list(train_bunch)[0]

'" I haven\'t paraphrased you at all, Gary.  You complained that preferring recent sources to those ""well over  years old"" is ""recentism"".  I pointed out that it is strongly encouraged by MEDRS, and for good reason. Again, if you take issue with that, then you need to raise it at the appropriate talk page.   "'

In [12]:
train_bunch_nlp_lem_joined[0]

'have not paraphrase at all gary complain that prefer recent source to those well over year old be recentism point out that be strongly encourage by medrs and for good reason again if take issue with that then need to raise at the appropriate talk page'

In [13]:
X_train = train_bunch_nlp_lem_joined
y_train = train_bunch_labels

In [14]:
test_bunch_df = holdout.sample(25000, random_state=0)
test_bunch = test_bunch_df['comment_text']
test_bunch_nlp = map(nlp, test_bunch)
test_bunch_nlp_lem = map(lemmatize, test_bunch_nlp)
test_bunch_nlp_lem_joined = list(map(' '.join, test_bunch_nlp_lem))

test_bunch_labels = test_bunch_df.iloc[:,2:]

In [15]:
X_test = test_bunch_nlp_lem_joined
y_test = test_bunch_labels

In [16]:
stopwords_list = stopwords.words('English') + list(string.punctuation) + ["''", '""', '...', '``']
stopwords_list += ['january', 'february', 'march', 'april', 'may', 'june', 'july', 'august', 'september', 'october',
                  'november', 'december']
stopwords_list += ['jan', 'feb', 'mar', 'apr', 'jun', 'jul', 'aug', 'sep', 'oct', 'nov', 'dec', 'utc']

In [17]:
word_vectorizer = TfidfVectorizer(stop_words=stopwords_list, token_pattern="([a-zA-Z]+(?:'[a-z]+)?)")
# fit and transform on it the training features
word_vectorizer.fit(X_train)
X_train_word_features = word_vectorizer.transform(X_train)

#transform the test features to sparse matrix
test_features = word_vectorizer.transform(X_test)

In [18]:
class_names = ['toxic','severe_toxic','obscene', 'threat', 'insult', 'identity_hate']

losses = []
auc = []

for class_name in class_names:
    #call the labels one column at a time so we can run the classifier on them
    train_target = y_train[class_name]
    test_target = y_test[class_name]
    classifier = LogisticRegression(solver='liblinear')

    cv_loss = np.mean(cross_val_score(classifier, X_train_word_features, train_target, cv=3, scoring='neg_log_loss'))
    losses.append(cv_loss)
    print('CV Log_loss score for class {} is {}'.format(class_name, cv_loss))
    
    cv_score = np.mean(cross_val_score(classifier, X_train_word_features, train_target, cv=3, scoring='accuracy'))
    print('CV Accuracy score for class {} is {}'.format(class_name, cv_score))
    
    classifier.fit(X_train_word_features, train_target)
    y_pred = classifier.predict(test_features)
    y_pred_prob = classifier.predict_proba(test_features)[:, 1]
    auc_score = metrics.roc_auc_score(test_target, y_pred_prob)
    auc.append(auc_score)
    print("CV ROC_AUC score {}\n".format(auc_score))
    
print('Total average CV Log_loss score is {}'.format(np.mean(losses)))
print('Total average CV ROC_AUC score is {}'.format(np.mean(auc)))

CV Log_loss score for class toxic is -0.14428029957498625
CV Accuracy score for class toxic is 0.9467799999574232
CV ROC_AUC score 0.9554741303022611

CV Log_loss score for class severe_toxic is -0.031905787691422834
CV Accuracy score for class severe_toxic is 0.9899400019919918
CV ROC_AUC score 0.9841883966424407

CV Log_loss score for class obscene is -0.08091604622176785
CV Accuracy score for class obscene is 0.9722799855775355
CV ROC_AUC score 0.9722639683096521

CV Log_loss score for class threat is -0.014471298970617717
CV Accuracy score for class threat is 0.9971799999977439
CV ROC_AUC score 0.9807425050975185

CV Log_loss score for class insult is -0.09485052641216762
CV Accuracy score for class insult is 0.9657399999725915
CV ROC_AUC score 0.9650012870916409

CV Log_loss score for class identity_hate is -0.03264027705141962
CV Accuracy score for class identity_hate is 0.9913599963930159
CV ROC_AUC score 0.9759565214101674

Total average CV Log_loss score is -0.0665107059870636