In [2]:
# https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline

import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import re, string
import time
from scipy.sparse import hstack, vstack

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

In [3]:
def tokenize(s): return re_tok.sub(r' \1 ', s).split()

In [4]:
# read data
train = pd.read_csv('~/data/toxic/data/train_preprocessed_clean.csv') #train.csv')#
test = pd.read_csv('~/data/toxic/data/test_preprocessed_clean.csv') #test.csv')#
subm = pd.read_csv('~/data/toxic/data/sample_submission.csv')

In [5]:
train.head(2)

Unnamed: 0,comment_text,id,identity_hate,insult,obscene,set,severe_toxic,threat,toxic,toxicity,comment_text_polarity
0,explanation why the edits made under my userna...,0000997932d777bf,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0,explanation why the edits made under my userna...
1,d aww he matches this background colour i m se...,000103f0d9cfb60f,0.0,0.0,0.0,train,0.0,0.0,0.0,0.0,d aww he matches this background colour i m se...


In [6]:
id_train = train['id'].copy()
id_test = test['id'].copy()

# add empty label for None
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
train['none'] = 1-train[label_cols].max(axis=1)
# fill missing values
COMMENT = 'comment_text'
train[COMMENT].fillna("unknown", inplace=True)
test[COMMENT].fillna("unknown", inplace=True)

In [7]:
# Tf-idf

# prepare tokenizer
re_tok = re.compile(f'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

# create sparse matrices
n = train.shape[0]
# vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize,
#                       min_df=3, max_df=0.9, strip_accents='unicode', 
#                       use_idf=1, smooth_idf=1, sublinear_tf=1)
vec = TfidfVectorizer(ngram_range=(1,2), tokenizer=tokenize, 
                      min_df=3, max_df=0.9, strip_accents='unicode', 
                      use_idf=0, smooth_idf=0, sublinear_tf=0)
#vec.fit(pd.concat([train[COMMENT], test[COMMENT]]))
trn_term_doc = vec.fit_transform(train[COMMENT])
test_term_doc = vec.transform(test[COMMENT])

In [8]:
len(vec.vocabulary_)

392354

In [75]:
import gc
gc.collect()

0

In [9]:
trn_term_doc.shape, test_term_doc.shape

((159571, 392354), (153164, 392354))

In [10]:
ytrain = np.zeros((trn_term_doc.shape[0],1)) + 1
ytest = np.zeros((test_term_doc.shape[0],1))
ydat = np.vstack((ytrain, ytest))

xdat = vstack([trn_term_doc, test_term_doc], format='csr')

In [11]:
xdat.shape, ydat.shape

((312735, 392354), (312735, 1))

In [31]:
xdat.shape, ydat.shape

((312735, 392354), (312735, 1))

In [17]:
nfolds = 10
xseed = 1001
cval = 4

# stratified split
skf = StratifiedKFold(n_splits= nfolds, random_state= xseed)
score_vec = np.zeros((nfolds,1))

index_list = {}
for (f, (train_index, test_index)) in enumerate(skf.split(xdat, ydat[:,0])):
    # split 
    x0, x1 = xdat[train_index], xdat[test_index]
    y0, y1 = ydat[train_index,0], ydat[test_index,0]    

    clf = LogisticRegression()
    clf.fit(x0,y0)
    prv = clf.predict_proba(x1)[:,1]
    roc_auc = roc_auc_score(y1,prv)
    print(roc_auc)
    score_vec[f,:] = roc_auc
    index_list[f] = (train_index, test_index, roc_auc)

In [25]:
sorted_res = sorted(index_list.items(), key=lambda x: x[1][2])

In [27]:
sorted_res[0]

(3,
 (array([     0,      1,      2, ..., 312732, 312733, 312734]),
  array([ 47872,  47873,  47874, ..., 220836, 220837, 220838]),
  0.6749522404398427))

In [28]:
type(sorted_res[0][1][0])

numpy.ndarray

In [29]:
np.save('train_index', sorted_res[0][1][0])

In [30]:
np.save('test_index', sorted_res[0][1][1])

In [None]:
# if using original train (unprocessed), this is what you get:
# [ 0.89780942]
# [ 0.89616252]
# [ 0.89593279]
# [ 0.89684945]
# [ 0.89413953]

# if using preprocessed, word ngram = 1,1
# [ 0.47143622]
# [ 0.46891485]
# [ 0.46713837]
# [ 0.47014438]
# [ 0.46990587]

# if using preprocessed, word ngram = 1,2
# [ 0.68330226]
# [ 0.67646079]
# [ 0.68069628]
# [ 0.6821993]
# [ 0.67954349]