In [44]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
from scipy.sparse import csr_matrix, hstack


In [45]:
# Directory that holds the pre-cleaned train/test CSV files.
PATH = '../data/'

# The emoji variants ('cleaned_train_emoji.csv' / 'cleaned_test_emoji.csv')
# can be substituted below; this run reads the plain cleaned files.
train, test = (
    pd.read_csv(PATH + file_name)
    for file_name in ('cleaned_train.csv', 'cleaned_test.csv')
)

# Two pre-cleaned views of each comment: the "polarity" column and the
# punctuation-retaining column.
train_sentence = train['comment_text_cleaned_polarity']
train_sentence_retain_punctuation = train['comment_text_cleaned_retain_punctuation']

test_sentence = test['comment_text_cleaned_polarity']
test_sentence_retain_punctuation = test['comment_text_cleaned_retain_punctuation']

# Aliases for the training-only corpora.
text, text_retain_punctuation = train_sentence, train_sentence_retain_punctuation

print(train.shape)
print(test.shape)

(159571, 30)
(153164, 24)


In [46]:


# Word-level TF-IDF over 1- to 3-grams of the "polarity" text.
phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode', 
                                    max_features=100000, 
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')
# Character-level TF-IDF over 2- to 5-grams of the punctuation-retaining text.
char_vectorizer = TfidfVectorizer(ngram_range=(2,5), 
                                  strip_accents='unicode', 
                                  max_features=200000, 
                                  analyzer='char', 
                                  sublinear_tf=True)

# fit_transform learns the vocabulary/IDF and vectorises the training corpus in
# one pass; the previous fit() followed by transform() on the same data
# tokenised every training document twice.
print('fitting char')
train_char = char_vectorizer.fit_transform(text_retain_punctuation.values)
print('fitting phrase')
train_phrase = phrase_vectorizer.fit_transform(text.values)

# The test set is only transformed with the vocabulary learned on train.
print('transforming test char')
test_char = char_vectorizer.transform(test_sentence_retain_punctuation.values)
print('transforming test phrase')
test_phrase = phrase_vectorizer.transform(test_sentence.values)

# Column layout of the combined matrices: char features first, then phrase features.
train_tfidf = hstack((train_char, train_phrase), format='csr')
test_tfidf = hstack((test_char, test_phrase), format='csr')

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train_tfidf

fitting char
fitting phrase
transforming train skip gram
transforming train char
transforming train phrase
transforming test char
transforming test phrase


<159571x300000 sparse matrix of type '<class 'numpy.float64'>'
	with 162813546 stored elements in Compressed Sparse Row format>

In [47]:
# Hold-out split: the first 90% of the rows (in file order, no shuffling) are
# used for fitting, the remaining 10% for validation.
# A random alternative would be sklearn.model_selection.train_test_split.
split_index = round(0.9 * len(train))

x_train, x_val = train_tfidf[:split_index], train_tfidf[split_index:]
y_train_df, y_val_df = train.iloc[:split_index], train.iloc[split_index:]

# Test features are used as-is.
x_test = test_tfidf


def pr(y_i, y, train_features):
    """Return the add-one-smoothed per-feature mean for the rows of one class.

    Parameters
    ----------
    y_i : class value to select (called with 0 and 1 in this notebook).
    y : array of labels, one per row of ``train_features``.
    train_features : row-indexable matrix (dense or sparse) of feature values.

    Returns
    -------
    ``(column sums over rows where y == y_i, plus 1) / (number of such rows, plus 1)``.
    """
    class_mask = (y == y_i)
    feature_totals = train_features[class_mask].sum(0)
    return (feature_totals + 1) / (class_mask.sum() + 1)
# Per-label log ratio of the smoothed feature means for positive (1) versus
# negative (0) rows, estimated on the training split only.
r_dict = {
    label: np.log(
        pr(1, y_train_df[label].values, x_train)
        / pr(0, y_train_df[label].values, x_train)
    )
    for label in label_cols
}

In [48]:
import gc

# For every label, scale each feature column by that label's log ratio.
train_set = {label: x_train.multiply(ratio).tocsr() for label, ratio in r_dict.items()}
val_set = {label: x_val.multiply(ratio).tocsr() for label, ratio in r_dict.items()}
test_set = {label: x_test.multiply(ratio).tocsr() for label, ratio in r_dict.items()}

# r_dict, x_train and x_val are deliberately kept alive (no `del` here).
gc.collect()


4535

In [6]:

# Force a garbage-collection pass; as the cell's last expression, the value
# returned by gc.collect() is displayed.
import gc
gc.collect()

0

In [49]:
# Column layout of the search table: the hyper-parameter (C), the feature
# variant flag, the target label, validation-metric slots, and a 'trained'
# marker ('N' until the search loop has evaluated the row).
cols = {'C': [], 'naive-bayes':[], 'label':[], 
        'precision':[], 'tpr':[], 'fpr':[], 
        'ROC':[],'trained':[]}

# Candidate values for the LinearSVC regularisation parameter C.
C = [0.01,0.05,0.1,0.15,0.2,0.23,0.25,0.27,0.28,0.29,0.3,0.32,0.35,0.4,0.5,0.7,0.9,1,1.5,2,3,4,5,6,7,8]
NaiveB = ['Y', 'N']

# Build every (label, C, naive-bayes) combination up front and create the frame
# in one go. Growing a DataFrame one cell at a time through `.loc` enlargement
# reallocates on every new row and leaves the column dtypes to chance.
# Iteration order (label, then C, then flag) fixes the row index of each combination.
# Metric slots start as floats so that later float assignments keep the dtype.
grid_rows = [
    {'C': cc, 'naive-bayes': nb, 'label': col,
     'precision': 0.0, 'tpr': 0.0, 'fpr': 0.0, 'ROC': 0.0,
     'trained': 'N'}
    for col in label_cols
    for cc in C
    for nb in NaiveB
]
df_svc = pd.DataFrame(grid_rows, columns=list(cols))

# NOTE: this overwrites any previously saved search results with a fresh,
# all-untrained table.
df_svc.to_csv(PATH + 'svc_all_gridsearch_nonemoji.csv', index=False)

In [None]:
def get_parameters(df_svc, addition = None):
    """Pick one not-yet-trained row of the search table at random.

    Parameters
    ----------
    df_svc : DataFrame with at least the columns 'trained' and 'C'.
    addition : optional list of ``{'key': column, 'value': wanted}`` dicts;
        a row is a candidate only if it also equals ``wanted`` in every
        listed column.

    Returns
    -------
    (params, next_index) : ``params`` is ``{'C': <value of the chosen row>}``
        and ``next_index`` is the index label of that row in ``df_svc``.

    Raises
    ------
    IndexError
        If no row with ``trained == 'N'`` matches the filters.
    """
    params_list = ['C']
    condition = (df_svc['trained'] == 'N')
    for constraint in (addition or []):
        condition = condition & (df_svc[constraint['key']] == constraint['value'])

    candidates = df_svc.index[condition.values]
    if len(candidates) == 0:
        # Same exception type as indexing an empty array, but with a message
        # that says what actually went wrong.
        raise IndexError('no untrained rows match the requested filters')

    # Draw a position instead of shuffling the index's backing array in place.
    next_index = candidates[np.random.randint(len(candidates))]
    params = {name: df_svc.loc[next_index, name] for name in params_list}
    return params, next_index

In [None]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score, confusion_matrix

################# random search start
%env JOBLIB_TEMP_FOLDER=/tmp
base_params = {}
start = 0
for i in range(1000):
        start += 1
        # create dataset for lightgbm
        params = base_params.copy()  
        params_get, index_get = get_parameters(df_svc, addition=None)
        col = df_svc.loc[index_get, 'label']
        y_val = y_val_df[col].values
        y_train = y_train_df[col].values
        ###### check naive bayes
        nb = df_svc.loc[index_get, 'naive-bayes']
        if nb is 'Y':
            svc_train = x_train
            svc_val = x_val
            print('nb enabled')
        else:
            svc_train = train_set[col]
            svc_val = val_set[col]
            print('nb disabled')
        params.update(params_get)
        print(start)
        print(col)
        print(index_get)
        print(params)
        lsvc = LinearSVC(**params_get)
        model_svc = CalibratedClassifierCV(lsvc) 
        model_svc.fit(svc_train, y_train)
        pred_prob = model_svc.predict_proba(svc_val)[:,1]
        pred = model_svc.predict(svc_val)
        roc = roc_auc_score(y_val,pred_prob)
        tn, fp, fn, tp = confusion_matrix(y_val, pred).ravel()
        precision = tp / (tp + fp)
        tpr = tp / (tp + fn)
        fpr = fp / (fp + tn)

        df_svc.loc[index_get, 'precision'] = precision
        df_svc.loc[index_get, 'tpr'] = tpr
        df_svc.loc[index_get, 'fpr'] = fpr
        df_svc.loc[index_get, 'ROC'] = roc
        df_svc.loc[index_get, 'trained'] = 'Y'
        print(df_svc.iloc[index_get])
        i += 1
        print('==================================================')
        df_svc.to_csv(PATH + 'svc_all_gridsearch_nonemoji.csv', index=False) 
        print('done')
        print('===================================')
    

env: JOBLIB_TEMP_FOLDER=/tmp
nb enabled
1
threat
170
{'C': 0.27000000000000002}
C                     0.27
ROC               0.992659
fpr            0.000565789
label               threat
naive-bayes              Y
precision         0.653846
tpr                   0.34
trained                  Y
Name: 170, dtype: object
done
nb enabled
2
obscene
104
{'C': 0.01}
C                    0.01
ROC               0.98775
fpr            0.00563175
label             obscene
naive-bayes             Y
precision        0.879774
tpr              0.719907
trained                 Y
Name: 104, dtype: object
done
nb enabled
3
threat
164
{'C': 0.20000000000000001}
C                      0.2
ROC               0.992964
fpr            0.000565789
label               threat
naive-bayes              Y
precision         0.653846
tpr                   0.34
trained                  Y
Name: 164, dtype: object
done
nb disabled
4
insult
245
{'C': 1.5}
C                     1.5
ROC              0.977268
fpr           

C                         5
ROC                0.979761
fpr              0.00208768
label          severe_toxic
naive-bayes               Y
precision          0.492308
tpr                0.213333
trained                   Y
Name: 96, dtype: object
done
nb enabled
24
threat
202
{'C': 6.0}
C                        6
ROC               0.987255
fpr            0.000440058
label               threat
naive-bayes              Y
precision         0.708333
tpr                   0.34
trained                  Y
Name: 202, dtype: object
done
nb disabled
25
toxic
3
{'C': 0.050000000000000003}
C                    0.05
ROC                0.9843
fpr            0.00978216
label               toxic
naive-bayes             N
precision        0.889844
tpr              0.738172
trained                 Y
Name: 3, dtype: object
done
nb enabled
26
threat
166
{'C': 0.23000000000000001}
C                     0.23
ROC               0.992843
fpr            0.000502923
label               threat
naive-bayes       

C                      0.1
ROC               0.993317
fpr            0.000565789
label               threat
naive-bayes              Y
precision             0.64
tpr                   0.32
trained                  Y
Name: 160, dtype: object
done
nb disabled
46
identity_hate
271
{'C': 0.23000000000000001}
C                       0.23
ROC                 0.985301
fpr               0.00132802
label          identity_hate
naive-bayes                N
precision           0.676923
tpr                 0.305556
trained                    Y
Name: 271, dtype: object
done
nb disabled
47
toxic
31
{'C': 0.69999999999999996}
C                     0.7
ROC              0.982896
fpr            0.00943527
label               toxic
naive-bayes             N
precision        0.891978
tpr              0.727803
trained                 Y
Name: 31, dtype: object
done
nb enabled
48
toxic
40
{'C': 3.0}
C                       3
ROC              0.978773
fpr            0.00846399
label               toxic
naive-

In [29]:
# Write the current search table to disk (same file the search loop writes to).
df_svc.to_csv(PATH + 'svc_all_gridsearch_nonemoji.csv', index=False)

In [31]:
# Search-table rows for the 'toxic' label, highest validation ROC first.
toxic = df_svc.loc[df_svc['label'] == 'toxic']
toxic.sort_values(by=['ROC'], ascending=False)

Unnamed: 0,C,ROC,fpr,label,naive-bayes,precision,tpr,trained
5,0.1,0.984602,0.009713,toxic,N,0.890881,0.740765,Y
7,0.15,0.984587,0.009643,toxic,N,0.891406,0.739469,Y
9,0.2,0.984452,0.009574,toxic,N,0.892356,0.741413,Y
11,0.23,0.98435,0.009782,toxic,N,0.890101,0.740117,Y
13,0.25,0.984285,0.009643,toxic,N,0.891321,0.73882,Y
15,0.27,0.984215,0.009643,toxic,N,0.891321,0.73882,Y
17,0.28,0.984183,0.009643,toxic,N,0.891236,0.738172,Y
19,0.29,0.984146,0.009643,toxic,N,0.891066,0.736876,Y
3,0.05,0.984143,0.009505,toxic,N,0.892041,0.733636,Y
21,0.3,0.984108,0.009643,toxic,N,0.890895,0.73558,Y


In [37]:


# For each label keep the C of the row with the highest validation ROC.
# NOTE(review): rows of both 'naive-bayes' variants compete here, although the
# result is later used for the raw-feature model - confirm this is intended.
params = {
    col: df_svc[df_svc['label'] == col]
    .sort_values(['ROC'], ascending = False)
    .iloc[0]['C']
    for col in label_cols
}
# Manual override for 'threat'.
params['threat'] = 0.01
    

In [38]:
# Display the selected C value per label.
params

{'identity_hate': 0.01,
 'insult': 0.050000000000000003,
 'obscene': 0.050000000000000003,
 'severe_toxic': 0.01,
 'threat': 0.01,
 'toxic': 0.10000000000000001}

In [39]:
#### based on ROC no naive bayes
# Recorded note for this configuration: 0.9814 with
# {'identity_hate': 0.01,
#  'insult': 0.050000000000000003,
#  'obscene': 0.050000000000000003,
#  'severe_toxic': 0.01,
#  'threat': 0.01,
#  'toxic': 0.10000000000000001}

# One calibrated LinearSVC per label, fitted on the full raw TF-IDF training
# matrix; the positive-class probability on the test set becomes that label's column.
pred_prob = test[['id']].copy()
for col in label_cols:
    lsvc = LinearSVC(C=params[col])
    model_svc = CalibratedClassifierCV(lsvc)
    model_svc.fit(train_tfidf, train[col].values)
    test_scores = model_svc.predict_proba(test_tfidf)
    pred_prob[col] = test_scores[:, 1]

pred_prob.to_csv(PATH + 'SVC_no_bayes_BEST_ROC.csv', index=False)

In [40]:
# Best C per label, restricted to the rows flagged naive-bayes == 'Y'.
nb_rows = df_svc[df_svc['naive-bayes'] == 'Y']
params_nb = {}
for col in label_cols:
    ranked = nb_rows[nb_rows['label'] == col].sort_values(['ROC'], ascending = False)
    params_nb[col] = ranked.iloc[0]['C']


In [41]:
# Display the selected C value per label for the naive-bayes variant.
params_nb

{'identity_hate': 0.050000000000000003,
 'insult': 0.050000000000000003,
 'obscene': 0.10000000000000001,
 'severe_toxic': 0.050000000000000003,
 'threat': 0.10000000000000001,
 'toxic': 0.14999999999999999}

In [43]:
# One calibrated LinearSVC per label, fitted on the full training matrix after
# scaling every feature column by that label's log ratio.
# NOTE(review): r_dict was estimated on the 90% training split only, while the
# model here is fitted on all training rows - confirm this is intended.
pred_prob = test[['id']].copy()
for col in label_cols:
    print(col)
    ratio = r_dict[col]
    lsvc = LinearSVC(C=params_nb[col])
    model_svc = CalibratedClassifierCV(lsvc)
    model_svc.fit(train_tfidf.multiply(ratio).tocsr(), train[col].values)
    test_scaled = test_tfidf.multiply(ratio).tocsr()
    pred_prob[col] = model_svc.predict_proba(test_scaled)[:, 1]

pred_prob.to_csv(PATH + 'SVC_bayes_BEST_ROC.csv', index=False)

toxic
severe_toxic
obscene
threat
insult
identity_hate
