In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
import random
from nltk import word_tokenize, regexp_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [2]:
train = pd.read_csv('train.csv')
holdout = pd.read_csv('test.csv')

In [3]:
train['char_total'] = train['comment_text'].map(lambda x: len(x))
train = train[train['char_total']<4000]
train.drop(['char_total'], axis=1, inplace=True)

In [4]:
test_labels_df = pd.read_csv("test_labels.csv")
holdout = holdout.merge(test_labels_df, on='id')
holdout.drop(holdout[holdout['toxic']==-1].index, inplace=True)

In [5]:
# lower everything
train['comment_text'] = train['comment_text'].map(lambda x: x.lower())

# remove '\\n'
train['comment_text'] = train['comment_text'].map(lambda x: re.sub('\\n',' ',str(x)))
    
# remove any text starting with User... 
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("\[\[user.*",'',str(x)))
    
# remove IP addresses or user IDs
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",'',str(x)))
    
#remove http links in the text
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("(http://.*?\s)|(http://.*)",'',str(x)))

#remove https links in the text
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("(https://.*?\s)|(https://.*)",'',str(x)))

#remove email addresses 
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)",'',str(x)))

#remove WP:__
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("wp:\w*",'',str(x)))

#remove user::__
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("user::\w*",'',str(x)))

#remove all websites
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$",'',str(x)))

#remove all these auto-generated messages
train['comment_text'] = train['comment_text'].map(lambda x: re.sub("preceding unsigned comment added by",'',str(x)))

#remove all punctuation
train['comment_text'] = train['comment_text'].map(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [6]:
# lower everything
holdout['comment_text'] = holdout['comment_text'].map(lambda x: x.lower())

# remove '\\n'
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub('\\n',' ',str(x)))
    
# remove any text starting with User... 
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("\[\[user.*",'',str(x)))
    
# remove IP addresses or user IDs
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",'',str(x)))
    
#remove http links in the text
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("(http://.*?\s)|(http://.*)",'',str(x)))

#remove https links in the text
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("(https://.*?\s)|(https://.*)",'',str(x)))

#remove email addresses 
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)",'',str(x)))

#remove WP:__
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("wp:\w*",'',str(x)))

#remove user::__
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("user::\w*",'',str(x)))

#remove all websites
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("^(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?$",'',str(x)))

#remove all these auto-generated messages
holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub("preceding unsigned comment added by",'',str(x)))

#remove all punctuation
holdout['comment_text'] = holdout['comment_text'].map(lambda x: x.translate(str.maketrans('', '', string.punctuation)))

In [7]:
def remove_all_digits(comment):
    result = ''.join([i for i in comment if not i.isdigit()])
    return result

train['comment_text'] = train['comment_text'].apply(remove_all_digits)
holdout['comment_text'] = holdout['comment_text'].apply(remove_all_digits)

In [12]:
list_of_token_lists = [regexp_tokenize(x,pattern=r'\w{2,}') for x in train['comment_text']]

In [13]:

# flat = [item for items in list_of_token_lists for item in items]

# flat = [token for token in flat if len(token)<50]

In [14]:
# lens = [len(token) for token in flat]

In [15]:
# max(lens)

In [16]:
# porter = PorterStemmer()
# all_stems = []
# for token in flat:
#     s

In [17]:
porter = PorterStemmer()
list_of_stems_lists = []
for tokens in list_of_token_lists:
    stems = [porter.stem(token) for token in tokens]
    list_of_stems_lists.append(stems)

In [13]:
# # porter = PorterStemmer()
# snowball = SnowballStemmer('english')
# list_of_stems_lists = [[snowball.stem(token) for token in token_list] for token_list in list_of_token_lists]

In [18]:
stems_for_tfidf = list(map(' '.join, list_of_stems_lists))

In [19]:
X_train = stems_for_tfidf
y_train = train.iloc[:,2:]

In [20]:
list_of_token_lists_test = [regexp_tokenize(x,pattern=r'\w{2,}') for x in holdout['comment_text']]
list_of_stems_lists_test = [[porter.stem(token) for token in token_list] for token_list in list_of_token_lists_test]
stems_for_tfidf_test = list(map(' '.join, list_of_stems_lists_test))

In [21]:
X_test = stems_for_tfidf_test
y_test = holdout.iloc[:,2:]

In [27]:
# Instantiate the vectorizer
word_vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word',
                                  use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

# fit and transform on it the training features
word_vectorizer.fit(X_train)
X_train_word_features = word_vectorizer.transform(X_train)

#transform the test features to sparse matrix
test_features = word_vectorizer.transform(X_test)

In [28]:
class_names = ['toxic','severe_toxic','obscene', 'threat', 'insult', 'identity_hate']

losses = []
auc = []

for class_name in class_names:
    #call the labels one column at a time so we can run the classifier on them
    train_target = y_train[class_name]
    test_target = y_test[class_name]
    classifier = LogisticRegression(solver='liblinear')

    cv_loss = np.mean(cross_val_score(classifier, X_train_word_features, train_target, cv=5, scoring='neg_log_loss'))
    losses.append(cv_loss)
    print('CV Log_loss score for class {} is {}'.format(class_name, cv_loss))
    
    cv_score = np.mean(cross_val_score(classifier, X_train_word_features, train_target, cv=5, scoring='accuracy'))
    print('CV Accuracy score for class {} is {}'.format(class_name, cv_score))
    
    classifier.fit(X_train_word_features, train_target)
    y_pred = classifier.predict(test_features)
    y_pred_prob = classifier.predict_proba(test_features)[:, 1]
    auc_score = metrics.roc_auc_score(test_target, y_pred_prob)
    auc.append(auc_score)
    print("CV ROC_AUC score {}\n".format(auc_score))
    
print('Total average CV Log_loss score is {}'.format(np.mean(losses)))
print('Total average CV ROC_AUC score is {}'.format(np.mean(auc)))

CV Log_loss score for class toxic is -0.11775666308745669
CV Accuracy score for class toxic is 0.9569986753295906
CV ROC_AUC score 0.9610830361170792

CV Log_loss score for class severe_toxic is -0.02733017904575455
CV Accuracy score for class severe_toxic is 0.9907777707689396
CV ROC_AUC score 0.9829600787518241

CV Log_loss score for class obscene is -0.06334644081588212
CV Accuracy score for class obscene is 0.9779852393868669
CV ROC_AUC score 0.9752797641384098

CV Log_loss score for class threat is -0.011814697516991676
CV Accuracy score for class threat is 0.9971551125969847
CV ROC_AUC score 0.9869410904048856

CV Log_loss score for class insult is -0.08218120562130721
CV Accuracy score for class insult is 0.9694568851321517
CV ROC_AUC score 0.9668109552083073

CV Log_loss score for class identity_hate is -0.02720609078843697
CV Accuracy score for class identity_hate is 0.9919699741373872
CV ROC_AUC score 0.979651148334995

Total average CV Log_loss score is -0.054939212812638194