# Import

In [1]:
import pandas as pd
import numpy as np
import string
import re
import random
import nltk
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import word_tokenize, pos_tag, regexp_tokenize, TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer 
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

# Load Data

In [2]:
# Read the training set and the holdout (test) comments from the working directory.
train = pd.read_csv(filepath_or_buffer='train.csv')
holdout = pd.read_csv(filepath_or_buffer='test.csv')

In [3]:
# Attach the label columns from test_labels.csv to the holdout comments, joining on 'id'.
test_labels_df = pd.read_csv("test_labels.csv")
holdout = pd.merge(holdout, test_labels_df, on='id')
# Discard every row whose 'toxic' label is -1
# (presumably a "not scored" placeholder -- TODO confirm against the data description).
holdout.drop(index=holdout.loc[holdout['toxic'].eq(-1)].index, inplace=True)

# Remove Spam (Optional)

In [4]:
# train['char_total'] = train['comment_text'].map(lambda x: len(x))
# train = train[train['char_total']<4000]
# train.drop(['char_total'], axis=1, inplace=True)

# Process Raw Text Function

In [5]:
# def preprocess(comment):
#     # lower everything
#     comment = comment.lower()
#     #get rid of new line symbols
#     comment = re.sub('\\n',' ',comment)
#     #expand out all hyphens
#     comment = re.sub("-", ' ', comment)
#     #remove user:: fragments
#     comment = re.sub("user::\w*",' ',comment)
#     #remove anything with user
#     comment = re.sub("\[\[user.*",' ',comment)
#     #remove IP addresses
#     comment = re.sub("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",' ',comment)
#     #remove http links
#     comment = re.sub("(http://.*?\s)|(http://.*)",' ',comment)
#     #remove https links
#     comment = re.sub("(https://.*?\s)|(https://.*)",' ',comment)
#     #remove email addresses
#     comment = re.sub("(^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$)",' ',comment)
#     #remove wp: fragments
#     comment = re.sub("wp:\w*",' ',comment)
#     #remove these auto generated strings
#     comment = re.sub("preceding unsigned comment added by",' ',comment)
#     #remove all punctuation besides '
# #     string_w_o_comma = re.sub("'", '', string.punctuation)
# #     comment = comment.translate(str.maketrans('','', string_w_o_comma))
#     return comment

In [6]:
# train['comment_text'] = train['comment_text'].apply(preprocess)
# holdout['comment_text'] = holdout['comment_text'].apply(preprocess)

# Expand Contractions

In [7]:
# Lower-case English contractions mapped to their expanded forms.
# Each key appears exactly once (the original literal repeated "i'd", "i'll" and
# "i'm"; duplicate keys in a dict literal are silently overwritten).
# Insertion order is preserved, which matters if the entries are applied one
# after another with str.replace (e.g. "won't" is listed before "won't've").
contraction_dict = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "can not",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "so've": "so have",
    "that'd": "that would",
    "that's": "that is",
    "there's": "there is",
    "here's": "here is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "y'all": "you all",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have",
}

In [8]:
# for replacee, replacer in contraction_dict.items():
#     train['comment_text'] = train['comment_text'].map(lambda x: x.replace(replacee, replacer))
#     holdout['comment_text'] = holdout['comment_text'].map(lambda x: x.replace(replacee, replacer))

In [9]:
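# NOTE: the word-boundary anchors must be raw strings (r'\b'); a plain '\b' is a
# backspace character, so the pattern would only match literal backspaces.
# for replacee, replacer in contraction_dict.items():
#     train['comment_text'] = train['comment_text'].map(lambda x: re.sub(r'\b'+re.escape(replacee)+r'\b',replacer,str(x)))
# #     holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub(r'\b'+re.escape(replacee)+r'\b',replacer,str(x)))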


# Map Out Mis-spellings, etc.

In [10]:
# normalization_dict = {'admins':'admin'}

In [11]:
# for replacee, replacer in normalization_dict.items():
#     train['comment_text'] = train['comment_text'].map(lambda x: x.replace(replacee, replacer))
#     holdout['comment_text'] = holdout['comment_text'].map(lambda x: x.replace(replacee, replacer))

In [12]:
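# NOTE: the word-boundary anchors must be raw strings (r'\b'); a plain '\b' is a
# backspace character, so the pattern would only match literal backspaces.
# for replacee, replacer in normalization_dict.items():
#     train['comment_text'] = train['comment_text'].map(lambda x: re.sub(r'\b'+re.escape(replacee)+r'\b',replacer,str(x)))
# #     holdout['comment_text'] = holdout['comment_text'].map(lambda x: re.sub(r'\b'+re.escape(replacee)+r'\b',replacer,str(x)))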


# Finish Preprocessing Function

In [13]:
def remove_digits_punctuation(comment, remove_punctuation=False):
    """Strip digit characters, and optionally ASCII punctuation, from a comment.

    Parameters
    ----------
    comment : str
        Text to clean.
    remove_punctuation : bool, default False
        When True, every character in ``string.punctuation`` is removed as
        well. The default leaves punctuation untouched, which is how this
        function behaved before the flag was added.

    Returns
    -------
    str
        ``comment`` without any character for which ``str.isdigit`` is true
        (this also covers non-ASCII digits such as superscripts).
    """
    comment = ''.join(char for char in comment if not char.isdigit())
    if remove_punctuation:
        comment = comment.translate(str.maketrans('', '', string.punctuation))
    return comment

In [14]:
# train['comment_text'] = train['comment_text'].apply(remove_digits_punctuation)
# holdout['comment_text'] = holdout['comment_text'].apply(remove_digits_punctuation)

# Tokenize

In [15]:
# list_of_token_lists = [regexp_tokenize(x,pattern="([a-zA-Z]+(?:'[a-z]+)?)") for x in train['comment_text']]
# # list_of_token_lists = [regexp_tokenize(x,pattern=r'\w{1,}') for x in train['comment_text']]
# # test_list_of_token_lists = [regexp_tokenize(x,pattern=r'\w{1,}') for x in holdout['comment_text']]
# test_list_of_token_lists = [regexp_tokenize(x,pattern="([a-zA-Z]+(?:'[a-z]+)?)") for x in holdout['comment_text']]


In [16]:
# tokens = [word_tokenize(x) for x in train['comment_text']]
# tokens_test = [word_tokenize(x) for x in holdout['comment_text']]

In [17]:
# Tokenize every comment with NLTK's TweetTokenizer, configured with
# preserve_case=False and reduce_len=True.
tknzr = TweetTokenizer(preserve_case=False, reduce_len=True)
tokens = list(map(tknzr.tokenize, train['comment_text']))
tokens_test = list(map(tknzr.tokenize, holdout['comment_text']))

# Remove Stop Words

In [18]:
# Display NLTK's built-in English stop-word list for reference.
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [19]:
# NLTK's English stop words plus apostrophe-less contractions, contraction
# fragments and Wikipedia-related terms. The extra entries are appended in the
# same order (repeats included) so the resulting list is unchanged.
stops = stopwords.words('english') + [
    'youre', 'dont', 'wont', 'ure', 'hes', 'uve', 'ive', 'ull', 'youll', 'youre', 'its', 'thats', 'shes',
    'whats', 'dont', 'isnt', 'doesnt', 'utc', 'wouldnt', 'hasnt', 'shouldnt', 'havent', 've', 'nt', 'll', 're',
    'wikipedia', 'admin', 'admins', 'wiki', 'contrib', 'contribs',
]

In [20]:
# token_lists_stopwords_removed = [[token for token in tokens if token not in stops and 
#                                   len(token)>1 and len(token)<20] for tokens in list_of_token_lists]
# test_tokens_stopwords_removed = [[token for token in tokens if token not in stops and 
#                                   len(token)>1 and len(token)<20] for tokens in test_list_of_token_lists]
# # for tokens in list_of_token_lists:
# #     no_stops = [token for token in tokens if token not in stopwords.words('english')]
# #     token_lists_stopwords_removed.append(no_stops)

# Stem Tokens

In [21]:
# NOTE: despite its name, `snow` holds a Porter stemmer, not a Snowball stemmer.
# The name is kept so any other cell that refers to it keeps working.
snow = PorterStemmer()


def stem_token_lists(token_lists, stemmer):
    """Stem every token of every document.

    Parameters
    ----------
    token_lists : iterable of list of str
        One list of tokens per document.
    stemmer : object with a ``stem(token)`` method
        Stemmer applied to each token.

    Returns
    -------
    list of list of str
        Same shape as ``token_lists`` with each token replaced by its stem.
    """
    # Each distinct token is stemmed only once; repeated tokens are looked up
    # instead of being pushed through the stemmer again.
    stem_cache = {}
    stemmed_docs = []
    for doc_tokens in token_lists:
        doc_stems = []
        for token in doc_tokens:
            if token not in stem_cache:
                stem_cache[token] = stemmer.stem(token)
            doc_stems.append(stem_cache[token])
        stemmed_docs.append(doc_stems)
    return stemmed_docs


# Distinct loop names inside the helper replace the original
# `for tokens in tokens`, which shadowed the outer `tokens` list.
list_of_stems_lists = stem_token_lists(tokens, snow)
test_list_of_stems_lists = stem_token_lists(tokens_test, snow)

In [22]:
# Re-join each document's stems into a single space-separated string,
# the input format the TF-IDF vectorizer is given later on.
stems_for_tfidf = [' '.join(doc_stems) for doc_stems in list_of_stems_lists]
test_stems_for_tfidf = [' '.join(doc_stems) for doc_stems in test_list_of_stems_lists]

In [23]:
# snowball = SnowballStemmer('english')
# list_of_stems_lists = [[snowball.stem(token) for token in tokens] for tokens in token_lists_stopwords_removed]
# test_list_of_stems_lists = [[snowball.stem(token) for token in tokens] for tokens in test_tokens_stopwords_removed]

In [24]:
# stems_for_tfidf = list(map(' '.join, list_of_stems_lists))
# test_stems_for_tfidf = list(map(' '.join, test_list_of_stems_lists))

# Prepare X and Y

In [25]:
# Features are the stemmed comment strings; targets are every column from the
# third one onward (positional slice -- assumes the first two columns of both
# frames are not labels; TODO confirm against the CSV headers).
X_train, X_test = stems_for_tfidf, test_stems_for_tfidf
y_train, y_test = train.iloc[:, 2:], holdout.iloc[:, 2:]

In [26]:
# Instantiate the vectorizer
word_vectorizer = TfidfVectorizer()

# fit and transform on it the training features
word_vectorizer.fit(X_train)
X_train_word_features = word_vectorizer.transform(X_train)

#transform the test features to sparse matrix
test_features = word_vectorizer.transform(X_test)

In [30]:
class_names = ['toxic','severe_toxic','obscene', 'threat', 'insult', 'identity_hate']

# One ROC AUC per label, measured on the holdout set. No cross-validation
# happens in this cell, so the messages say "holdout" rather than "CV".
auc = []

for class_name in class_names:
    # Pull the labels one column at a time so a binary classifier can be fit per label.
    train_target = y_train[class_name]
    test_target = y_test[class_name]

    classifier = LogisticRegression(solver='liblinear', class_weight='balanced')
    classifier.fit(X_train_word_features, train_target)

    # Column 1 of predict_proba corresponds to classifier.classes_[1]
    # (the positive label, assuming the targets are coded 0/1).
    y_pred_prob = classifier.predict_proba(test_features)[:, 1]
    auc_score = metrics.roc_auc_score(test_target, y_pred_prob)
    auc.append(auc_score)
    print("Holdout ROC_AUC score for {}: {}\n".format(class_name, auc_score))

print('Total average holdout ROC_AUC score is {}'.format(np.mean(auc)))

CV ROC_AUC score 0.961631518958301

CV ROC_AUC score 0.9847925724634965

CV ROC_AUC score 0.9763803112877611

CV ROC_AUC score 0.9880453401256366

CV ROC_AUC score 0.9688498160485424

CV ROC_AUC score 0.9813306541987692

Total average CV ROC_AUC score is 0.9768383688470846


In [31]:
from sklearn.multioutput import MultiOutputClassifier

In [32]:
# TF-IDF features feeding a MultiOutputClassifier that wraps logistic regression.
# sublinear_tf is a boolean flag: pass True rather than 1, because recent
# scikit-learn releases reject integers for boolean parameters.
pipe_lr = make_pipeline(
    TfidfVectorizer(sublinear_tf=True, max_features=50000),
    MultiOutputClassifier(LogisticRegression(solver='sag', C=2))
)

In [33]:
# Average ROC AUC over the 3 cross-validation folds of the training data.
cv_score = cross_val_score(
    estimator=pipe_lr, X=X_train, y=y_train, cv=3, scoring='roc_auc'
).mean()
cv_score

0.9821849132956636

# Pipeline

In [31]:
# Rebind `stops` to the individual ASCII punctuation characters. This replaces
# whatever `stops` held before; no later cell visible here reads it.
stops = [symbol for symbol in string.punctuation]

In [36]:
# TF-IDF features feeding a OneVsRestClassifier that wraps logistic regression.
# sublinear_tf is a boolean flag: pass True rather than 1, because recent
# scikit-learn releases reject integers for boolean parameters.
pipe_lr = make_pipeline(
    TfidfVectorizer(sublinear_tf=True, max_features=50000),
    OneVsRestClassifier(LogisticRegression(solver='sag', C=2))
)

In [37]:
# Average ROC AUC over the 3 cross-validation folds of the training data.
cv_score = cross_val_score(
    estimator=pipe_lr, X=X_train, y=y_train, cv=3, scoring='roc_auc'
).mean()
cv_score

0.9823539194380512

In [38]:
# Fit the pipeline on the full training set, then score the holdout comments
# (fit returns the fitted pipeline itself, so the calls can be chained).
test_preds = pipe_lr.fit(X_train, y_train).predict_proba(X_test)

In [39]:
# ROC AUC of the pipeline's predicted probabilities against the holdout labels.
metrics.roc_auc_score(y_true=y_test, y_score=test_preds)

0.9779966786867834

In [None]:
# Step-name prefixes used to address pipeline parameters as
# '<step name>__<parameter name>'.
tfidfstring = 'tfidfvectorizer'
onevrstring = 'onevsrestclassifier'

# Grid searched over the TF-IDF step; the commented-out entries are further
# options that are currently switched off.
param_grid_lr = dict([
    (tfidfstring + '__max_features', [35000, 50000]),
    (tfidfstring + '__strip_accents', ['ascii', 'unicode', None]),
#     (tfidfstring + '__sublinear_tf', [True, False]),
#     (onevrstring + '__estimator__C', [2]),
#     (onevrstring + '__estimator__solver', ['sag', 'liblinear']),
])

In [None]:
# Search the parameter grid with 3-fold cross-validation, scored by ROC AUC.
grid_lr = GridSearchCV(
    estimator=pipe_lr,
    param_grid=param_grid_lr,
    cv=3,
    scoring='roc_auc',
)
grid_results_lr = grid_lr.fit(X_train, y_train)

In [None]:
# Parameter combination that achieved the best cross-validated score in the grid search.
grid_results_lr.best_params_

In [None]:
# Mean cross-validated ROC AUC of the best parameter combination.
grid_results_lr.best_score_