In [9]:
import pandas as pd
import numpy as np
import string
import re
import random
import nltk
import pickle
from nltk.corpus import stopwords
from nltk import FreqDist
from nltk import word_tokenize, pos_tag, regexp_tokenize, TweetTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from nltk.stem import WordNetLemmatizer 
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

In [2]:
# Load the training set and the held-out set from CSVs in the working directory.
train, holdout = map(pd.read_csv, ('train.csv', 'test.csv'))

In [3]:
# Attach the label columns to the held-out comments by joining on `id`.
test_labels_df = pd.read_csv("test_labels.csv")
holdout = holdout.merge(test_labels_df, on='id')
# Discard rows whose `toxic` label is -1.
# NOTE(review): presumably -1 marks rows without a usable label -- confirm
# against the dataset description.
holdout = holdout.drop(holdout.index[holdout['toxic'] == -1])

In [4]:
# One shared tokenizer for both splits so they are tokenized identically.
# preserve_case=False and reduce_len=True are normalisation options of NLTK's
# TweetTokenizer (case folding and shortening of repeated-character runs).
tknzr = TweetTokenizer(preserve_case=False, reduce_len=True)
tokens = list(map(tknzr.tokenize, train['comment_text']))
tokens_test = list(map(tknzr.tokenize, holdout['comment_text']))

In [5]:
# Keep only tokens shorter than 50 characters in each comment.
# Despite the variable names, no stopword filtering is applied here; the only
# criterion is token length.
tokens_stopwords_removed = [
    [tok for tok in comment_tokens if len(tok) < 50]
    for comment_tokens in tokens
]
tokens_stopwords_removed_test = [
    [tok for tok in comment_tokens if len(tok) < 50]
    for comment_tokens in tokens_test
]

In [6]:
# Stem every remaining token. The stemmer is bound to the name `snow`, but it
# is a PorterStemmer (not a Snowball stemmer).
snow = PorterStemmer()
list_of_stems_lists = [
    list(map(snow.stem, comment_tokens))
    for comment_tokens in tokens_stopwords_removed
]
test_list_of_stems_lists = [
    list(map(snow.stem, comment_tokens))
    for comment_tokens in tokens_stopwords_removed_test
]

In [7]:
# Re-assemble each comment's stems into one space-separated string, which is
# the input form handed to the TF-IDF vectorizer later on.
stems_for_tfidf = [' '.join(stems) for stems in list_of_stems_lists]
test_stems_for_tfidf = [' '.join(stems) for stems in test_list_of_stems_lists]

In [10]:
# Cache the stemmed training comments on disk so the tokenize/stem steps do
# not have to be repeated. The context manager guarantees the file handle is
# closed even if pickle.dump raises part-way through.
with open("porter_stemmed_tokens.pickle", "wb") as save_porter_tokens:
    pickle.dump(stems_for_tfidf, save_porter_tokens)

In [11]:
# Cache the stemmed held-out comments on disk. The context manager guarantees
# the file handle is closed even if pickle.dump raises part-way through.
with open("porter_stemmed_tokens_test.pickle", "wb") as save_porter_tokens_test:
    pickle.dump(test_stems_for_tfidf, save_porter_tokens_test)

In [12]:
# Features are the stemmed comment strings; targets are every column from
# position 2 onward in the corresponding frame.
# NOTE(review): assumes the first two columns of both frames are non-label
# columns and the remaining columns line up between train and holdout -- confirm.
X_train, y_train = stems_for_tfidf, train.iloc[:, 2:]
X_test, y_test = test_stems_for_tfidf, holdout.iloc[:, 2:]

In [13]:
from xgboost import XGBClassifier

In [19]:
# TF-IDF features feeding an XGBoost classifier that is wrapped in
# OneVsRestClassifier, i.e. one binary booster is fitted per target column.
pipe_xg = make_pipeline(
    TfidfVectorizer(
        max_features=50000,
        sublinear_tf=True,
        strip_accents='ascii',
    ),
    OneVsRestClassifier(
        XGBClassifier(
            max_depth=4,
            objective='binary:logistic',
        )
    ),
)

In [20]:
# Mean ROC AUC of the pipeline over 3 cross-validation folds of the training data.
cv_score = cross_val_score(
    pipe_xg,
    X_train,
    y_train,
    scoring='roc_auc',
    cv=3,
).mean()
cv_score

KeyboardInterrupt: 

In [15]:
# Step-name prefixes used to address pipeline components in the grid; they
# follow make_pipeline's convention of lower-cased class names.
tfidfstring = 'tfidfvectorizer'
onevrstring = 'onevsrestclassifier'
# Hyper-parameter grid for the XGBoost estimator inside OneVsRestClassifier
# (reached through the `estimator__` prefix).
# NOTE(review): pipe_xg is built with max_depth=4; depths of 50/75 are far
# deeper and will make the search very slow -- confirm these values are intended.
param_grid_xg = {
#     tfidfstring+'__max_features': [25000,50000,100000],
    onevrstring+'__estimator__max_depth': [50,75],
    # `min_samples_split` is a scikit-learn tree parameter that XGBClassifier
    # does not define, so searching over it only repeated identical fits.
    # `min_child_weight` is XGBoost's own control on the minimum size of a
    # child node, so it is used instead.
    onevrstring+'__estimator__min_child_weight':[6,8]
             }

In [16]:
# Exhaustive search over param_grid_xg, scoring each candidate by ROC AUC on
# 3 cross-validation folds. fit() returns the fitted search object itself, so
# grid_results_xg and grid_xg refer to the same object.
grid_xg = GridSearchCV(
    estimator=pipe_xg,
    param_grid=param_grid_xg,
    scoring='roc_auc',
    cv=3,
)
grid_results_xg = grid_xg.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
# Display the parameter combination selected by the grid search. Requires the
# grid-search fit cell to have run to completion (it was interrupted in the
# saved run, so this cell has no recorded output).
grid_results_xg.best_params_

In [None]:
# Display the cross-validated score (scoring='roc_auc') of the selected
# parameter combination. Requires the grid-search fit cell to have run to
# completion first.
grid_results_xg.best_score_