In [44]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
from scipy.sparse import csr_matrix, hstack


In [45]:
PATH = '../data/'

# train = pd.read_csv(PATH + 'cleaned_train_emoji.csv')
# test = pd.read_csv(PATH + 'cleaned_test_emoji.csv')

train = pd.read_csv(PATH + 'cleaned_train.csv')
test = pd.read_csv(PATH + 'cleaned_test.csv')


train_sentence = train['comment_text_cleaned_polarity']
test_sentence = test['comment_text_cleaned_polarity']


train_sentence_retain_punctuation = train['comment_text_cleaned_retain_punctuation']
test_sentence_retain_punctuation = test['comment_text_cleaned_retain_punctuation']

text = train_sentence

text_retain_punctuation = train_sentence_retain_punctuation


print(train.shape)
print(test.shape)

(159571, 30)
(153164, 24)


In [46]:


phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode', 
                                    max_features=100000, 
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')
char_vectorizer = TfidfVectorizer(ngram_range=(2,5), 
                                  strip_accents='unicode', 
                                  max_features=200000, 
                                  analyzer='char', 
                                  sublinear_tf=True)

print('fitting char')
char_vectorizer.fit(text_retain_punctuation.values)
print('fitting phrase')
phrase_vectorizer.fit(text.values)

print('transforming train skip gram')

print('transforming train char')
train_char = char_vectorizer.transform(train_sentence_retain_punctuation.values)
print('transforming train phrase')
train_phrase = phrase_vectorizer.transform(train_sentence.values)


print('transforming test char')
test_char = char_vectorizer.transform(test_sentence_retain_punctuation.values)
print('transforming test phrase')
test_phrase = phrase_vectorizer.transform(test_sentence.values)


train_tfidf = hstack((train_char, train_phrase), format='csr')
test_tfidf = hstack((test_char, test_phrase), format='csr')

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train_tfidf

fitting char
fitting phrase
transforming train skip gram
transforming train char
transforming train phrase
transforming test char
transforming test phrase


<159571x300000 sparse matrix of type '<class 'numpy.float64'>'
	with 162813546 stored elements in Compressed Sparse Row format>

In [47]:
# from sklearn.model_selection import train_test_split
# x_train, x_val, y_train_df, y_val_df = train_test_split(train_tfidf, train, test_size=0.33)
# # Split the dataset



# Split the dataset
split_index = round(len(train) * 0.9) #################################
# shuffled_train = train#.sample(frac=1)
x_train = train_tfidf[:split_index]
y_train_df = train.iloc[:split_index]
#######
x_val = train_tfidf[split_index:]
y_val_df = train.iloc[split_index:]
# Get test data ready
x_test = test_tfidf


# train toxic
def pr(y_i, y, train_features):
    p = train_features[y==y_i].sum(0)
    return (p + 1) / ((y == y_i).sum() + 1)
r_dict = {label: np.log(pr(1, y_train_df[label].values, x_train) / pr(0,  y_train_df[label].values, x_train)) for label in label_cols}

In [48]:
train_set = {label: x_train.multiply(r_dict[label]).tocsr() for label in r_dict }
val_set = {label: x_val.multiply(r_dict[label]).tocsr() for label in r_dict }
test_set = {label: x_test.multiply(r_dict[label]).tocsr() for label in r_dict }

# del r_dict, x_train, x_val
import gc
gc.collect()


4535

In [6]:

import gc
gc.collect()

0

In [49]:
cols = {'C': [], 'naive-bayes':[], 'label':[], 
        'precision':[], 'tpr':[], 'fpr':[], 
        'ROC':[],'trained':[]}
df_svc = pd.DataFrame(cols)
########################################

C = [0.01,0.05,0.1,0.15,0.2,0.23,0.25,0.27,0.28,0.29,0.3,0.32,0.35,0.4,0.5,0.7,0.9,1,1.5,2,3,4,5,6,7,8]
NaiveB = ['Y', 'N']

index = 0
for col in label_cols:
    for cc in C:
        for nb in NaiveB:
            df_svc.loc[index, 'C'] = cc
            df_svc.loc[index, 'naive-bayes'] = nb
            df_svc.loc[index, 'label'] = col
            df_svc.loc[index, 'precision'] = 0
            df_svc.loc[index, 'tpr'] = 0
            df_svc.loc[index, 'fpr'] = 0
            df_svc.loc[index, 'ROC'] = 0
            df_svc.loc[index, 'trained'] = 'N'
            index += 1
                


df_svc.to_csv(PATH + 'svc_all_gridsearch_nonemoji.csv', index=False)                        

In [None]:
def get_parameters(df_svc, addition = None):
    params_list =['C']
    condition = (df_svc['trained'] == 'N')
    if addition is not None:
        for i in range(len(addition)):
            condition = condition &  (df_svc[addition[i]['key']] == addition[i]['value'])
#     df = df_lgb[(df_lgb['trained'] == 'N') & (df_lgb['label'] == col)]
    df = df_svc[condition]
    index = df.index.values
    np.random.shuffle(index)
    next_index = index[0]
    params = dict(df_svc.loc[next_index, params_list])
    ##### data type fix:
    return params, next_index

In [None]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score, confusion_matrix

################# random search start
%env JOBLIB_TEMP_FOLDER=/tmp
base_params = {}
start = 0
for i in range(1000):
        start += 1
        # create dataset for lightgbm
        params = base_params.copy()  
        params_get, index_get = get_parameters(df_svc, addition=None)
        col = df_svc.loc[index_get, 'label']
        y_val = y_val_df[col].values
        y_train = y_train_df[col].values
        ###### check naive bayes
        nb = df_svc.loc[index_get, 'naive-bayes']
        if nb is 'Y':
            svc_train = x_train
            svc_val = x_val
            print('nb enabled')
        else:
            svc_train = train_set[col]
            svc_val = val_set[col]
            print('nb disabled')
        params.update(params_get)
        print(start)
        print(col)
        print(index_get)
        print(params)
        lsvc = LinearSVC(**params_get)
        model_svc = CalibratedClassifierCV(lsvc) 
        model_svc.fit(svc_train, y_train)
        pred_prob = model_svc.predict_proba(svc_val)[:,1]
        pred = model_svc.predict(svc_val)
        roc = roc_auc_score(y_val,pred_prob)
        tn, fp, fn, tp = confusion_matrix(y_val, pred).ravel()
        precision = tp / (tp + fp)
        tpr = tp / (tp + fn)
        fpr = fp / (fp + tn)

        df_svc.loc[index_get, 'precision'] = precision
        df_svc.loc[index_get, 'tpr'] = tpr
        df_svc.loc[index_get, 'fpr'] = fpr
        df_svc.loc[index_get, 'ROC'] = roc
        df_svc.loc[index_get, 'trained'] = 'Y'
        print(df_svc.iloc[index_get])
        i += 1
        print('==================================================')
        df_svc.to_csv(PATH + 'svc_all_gridsearch_nonemoji.csv', index=False) 
        print('done')
        print('===================================')
    

env: JOBLIB_TEMP_FOLDER=/tmp
nb enabled
1
threat
170
{'C': 0.27000000000000002}
C                     0.27
ROC               0.992659
fpr            0.000565789
label               threat
naive-bayes              Y
precision         0.653846
tpr                   0.34
trained                  Y
Name: 170, dtype: object
done
nb enabled
2
obscene
104
{'C': 0.01}
C                    0.01
ROC               0.98775
fpr            0.00563175
label             obscene
naive-bayes             Y
precision        0.879774
tpr              0.719907
trained                 Y
Name: 104, dtype: object
done
nb enabled
3
threat
164
{'C': 0.20000000000000001}
C                      0.2
ROC               0.992964
fpr            0.000565789
label               threat
naive-bayes              Y
precision         0.653846
tpr                   0.34
trained                  Y
Name: 164, dtype: object
done
nb disabled
4
insult
245
{'C': 1.5}
C                     1.5
ROC              0.977268
fpr           

C                         5
ROC                0.979761
fpr              0.00208768
label          severe_toxic
naive-bayes               Y
precision          0.492308
tpr                0.213333
trained                   Y
Name: 96, dtype: object
done
nb enabled
24
threat
202
{'C': 6.0}
C                        6
ROC               0.987255
fpr            0.000440058
label               threat
naive-bayes              Y
precision         0.708333
tpr                   0.34
trained                  Y
Name: 202, dtype: object
done
nb disabled
25
toxic
3
{'C': 0.050000000000000003}
C                    0.05
ROC                0.9843
fpr            0.00978216
label               toxic
naive-bayes             N
precision        0.889844
tpr              0.738172
trained                 Y
Name: 3, dtype: object
done
nb enabled
26
threat
166
{'C': 0.23000000000000001}
C                     0.23
ROC               0.992843
fpr            0.000502923
label               threat
naive-bayes       

C                      0.1
ROC               0.993317
fpr            0.000565789
label               threat
naive-bayes              Y
precision             0.64
tpr                   0.32
trained                  Y
Name: 160, dtype: object
done
nb disabled
46
identity_hate
271
{'C': 0.23000000000000001}
C                       0.23
ROC                 0.985301
fpr               0.00132802
label          identity_hate
naive-bayes                N
precision           0.676923
tpr                 0.305556
trained                    Y
Name: 271, dtype: object
done
nb disabled
47
toxic
31
{'C': 0.69999999999999996}
C                     0.7
ROC              0.982896
fpr            0.00943527
label               toxic
naive-bayes             N
precision        0.891978
tpr              0.727803
trained                 Y
Name: 31, dtype: object
done
nb enabled
48
toxic
40
{'C': 3.0}
C                       3
ROC              0.978773
fpr            0.00846399
label               toxic
naive-

C                       0.01
ROC                 0.987885
fpr               0.00132802
label          identity_hate
naive-bayes                N
precision           0.691176
tpr                 0.326389
trained                    Y
Name: 261, dtype: object
done
nb enabled
68
obscene
108
{'C': 0.10000000000000001}
C                     0.1
ROC              0.992092
fpr            0.00609554
label             obscene
naive-bayes             Y
precision        0.876344
tpr               0.75463
trained                 Y
Name: 108, dtype: object
done
nb disabled
69
threat
203
{'C': 6.0}
C                        6
ROC               0.980974
fpr            0.000377192
label               threat
naive-bayes              N
precision          0.73913
tpr                   0.34
trained                  Y
Name: 203, dtype: object
done
nb enabled
70
severe_toxic
62
{'C': 0.23000000000000001}
C                      0.23
ROC                0.987372
fpr              0.00265705
label          severe_t

C                   0.01
ROC             0.981391
fpr            0.0100597
label              toxic
naive-bayes            N
precision        0.88437
tpr              0.71873
trained                Y
Name: 1, dtype: object
done
nb disabled
90
severe_toxic
93
{'C': 3.0}
C                         3
ROC                0.985143
fpr              0.00164484
label          severe_toxic
naive-bayes               N
precision              0.48
tpr                    0.16
trained                   Y
Name: 93, dtype: object
done
nb enabled
91
identity_hate
308
{'C': 7.0}
C                          7
ROC                 0.978737
fpr               0.00107506
label          identity_hate
naive-bayes                Y
precision           0.685185
tpr                 0.256944
trained                    Y
Name: 308, dtype: object
done
nb disabled
92
threat
201
{'C': 5.0}
C                        5
ROC               0.981855
fpr            0.000377192
label               threat
naive-bayes              N


C                       7
ROC               0.98983
fpr            0.00569801
label             obscene
naive-bayes             N
precision        0.876967
tpr              0.709491
trained                 Y
Name: 153, dtype: object
done
nb disabled
112
severe_toxic
63
{'C': 0.23000000000000001}
C                      0.23
ROC                0.987418
fpr              0.00189789
label          severe_toxic
naive-bayes               N
precision          0.577465
tpr                0.273333
trained                   Y
Name: 63, dtype: object
done
nb disabled
113
identity_hate
287
{'C': 0.40000000000000002}
C                        0.4
ROC                 0.984352
fpr               0.00126478
label          identity_hate
naive-bayes                N
precision            0.68254
tpr                 0.298611
trained                    Y
Name: 287, dtype: object
done
nb enabled
114
threat
178
{'C': 0.32000000000000001}
C                     0.32
ROC               0.992416
fpr            0.000

C                     0.3
ROC              0.984172
fpr            0.00929652
label               toxic
naive-bayes             N
precision        0.894155
tpr              0.733636
trained                 Y
Name: 21, dtype: object
done
nb disabled
134
obscene
149
{'C': 5.0}
C                       5
ROC              0.989955
fpr            0.00569801
label             obscene
naive-bayes             N
precision        0.877318
tpr              0.711806
trained                 Y
Name: 149, dtype: object
done
nb enabled
135
threat
184
{'C': 0.5}
C                      0.5
ROC               0.991488
fpr            0.000565789
label               threat
naive-bayes              Y
precision         0.678571
tpr                   0.38
trained                  Y
Name: 184, dtype: object
done
nb enabled
136
obscene
136
{'C': 0.90000000000000002}
C                     0.9
ROC              0.989852
fpr            0.00583052
label             obscene
naive-bayes             Y
precision        0.

C                    0.28
ROC              0.984246
fpr            0.00929652
label               toxic
naive-bayes             N
precision        0.894322
tpr              0.734932
trained                 Y
Name: 17, dtype: object
done
nb enabled
156
severe_toxic
76
{'C': 0.34999999999999998}
C                      0.35
ROC                0.986191
fpr              0.00259379
label          severe_toxic
naive-bayes               Y
precision          0.517647
tpr                0.293333
trained                   Y
Name: 76, dtype: object
done
nb disabled
157
toxic
45
{'C': 5.0}
C                       5
ROC              0.979944
fpr            0.00915776
label               toxic
naive-bayes             N
precision        0.891892
tpr              0.705768
trained                 Y
Name: 45, dtype: object
done
nb enabled
158
insult
236
{'C': 0.5}
C                     0.5
ROC              0.979508
fpr            0.00766182
label              insult
naive-bayes             Y
precision   

C                    0.35
ROC              0.983978
fpr            0.00950465
label               toxic
naive-bayes             N
precision        0.891956
tpr              0.732988
trained                 Y
Name: 25, dtype: object
done
nb enabled
178
obscene
150
{'C': 6.0}
C                       6
ROC              0.987609
fpr            0.00530047
label             obscene
naive-bayes             Y
precision        0.885877
tpr               0.71875
trained                 Y
Name: 150, dtype: object
done
nb disabled
179
severe_toxic
85
{'C': 0.90000000000000002}
C                       0.9
ROC                0.985773
fpr               0.0017081
label          severe_toxic
naive-bayes               N
precision          0.534483
tpr                0.206667
trained                   Y
Name: 85, dtype: object
done
nb enabled
180
identity_hate
298
{'C': 2.0}
C                          2
ROC                 0.980834
fpr               0.00120154
label          identity_hate
naive-bayes    

C                       4
ROC              0.990048
fpr            0.00576426
label             obscene
naive-bayes             N
precision        0.876245
tpr              0.712963
trained                 Y
Name: 147, dtype: object
done
nb disabled
200
insult
243
{'C': 1.0}
C                       1
ROC              0.978225
fpr            0.00700132
label              insult
naive-bayes             N
precision        0.813708
tpr              0.566707
trained                 Y
Name: 243, dtype: object
done
nb disabled
201
threat
181
{'C': 0.34999999999999998}
C                     0.35
ROC               0.989498
fpr            0.000440058
label               threat
naive-bayes              N
precision         0.730769
tpr                   0.38
trained                  Y
Name: 181, dtype: object
done
nb disabled
202
identity_hate
275
{'C': 0.27000000000000002}
C                       0.27
ROC                 0.985015
fpr               0.00126478
label          identity_hate
naive-bay

In [53]:
df_svc.to_csv(PATH + 'svc_all_gridsearch_nonemoji.csv', index=False)

In [56]:
for col in label_cols:
    print(df_svc[df_svc['label'] == col].sort_values(['ROC'], ascending = False).iloc[0])
    print('----------------------')

C                     0.1
ROC              0.984741
fpr            0.00978216
label               toxic
naive-bayes             N
precision        0.890272
tpr              0.741413
trained                 Y
Name: 5, dtype: object
----------------------
C                      0.01
ROC                0.989741
fpr              0.00246726
label          severe_toxic
naive-bayes               N
precision               0.5
tpr                    0.26
trained                   Y
Name: 53, dtype: object
----------------------
C                    0.05
ROC              0.993175
fpr            0.00536673
label             obscene
naive-bayes             N
precision        0.887344
tpr              0.738426
trained                 Y
Name: 107, dtype: object
----------------------
C                     0.01
ROC               0.993429
fpr            0.000502923
label               threat
naive-bayes              N
precision         0.652174
tpr                    0.3
trained                  Y
Nam

In [54]:
toxic = df_svc[df_svc['label'] == 'toxic']
toxic.sort_values(['ROC'], ascending = False)

Unnamed: 0,C,ROC,fpr,label,naive-bayes,precision,tpr,trained
5,0.1,0.984741,0.009782,toxic,N,0.890272,0.741413,Y
7,0.15,0.984702,0.009782,toxic,N,0.890272,0.741413,Y
9,0.2,0.984547,0.009574,toxic,N,0.892774,0.744653,Y
11,0.23,0.984433,0.009505,toxic,N,0.893136,0.742061,Y
13,0.25,0.98436,0.009366,toxic,N,0.894283,0.740117,Y
3,0.05,0.9843,0.009782,toxic,N,0.889844,0.738172,Y
15,0.27,0.984286,0.009366,toxic,N,0.893701,0.73558,Y
17,0.28,0.984246,0.009297,toxic,N,0.894322,0.734932,Y
19,0.29,0.984205,0.009297,toxic,N,0.894155,0.733636,Y
21,0.3,0.984172,0.009297,toxic,N,0.894155,0.733636,Y


In [58]:
###### naive bayes is N

params = {}
for col in label_cols:
    dff = df_svc[df_svc['naive-bayes'] == 'N']
    dff = dff[dff['label'] == col]
    params[col] = dff.sort_values(['ROC'], ascending = False).iloc[0]['C']
    

In [64]:
params

{'identity_hate': 0.01,
 'insult': 0.050000000000000003,
 'obscene': 0.050000000000000003,
 'severe_toxic': 0.01,
 'threat': 0.01,
 'toxic': 0.10000000000000001}

In [60]:
#### based on ROC no naive bayes
#0.9814
# {'identity_hate': 0.01,
#  'insult': 0.050000000000000003,
#  'obscene': 0.050000000000000003,
#  'severe_toxic': 0.01,
#  'threat': 0.01,
#  'toxic': 0.10000000000000001}

pred_prob = pd.DataFrame()
pred_prob['id'] = test['id']
for col in label_cols:
    lsvc = LinearSVC(C=params[col])
    model_svc = CalibratedClassifierCV(lsvc)
    model_svc.fit(train_tfidf, train[col].values)
    pred_prob[col] = model_svc.predict_proba(test_tfidf)[:,1]
pred_prob.to_csv(PATH + 'SVC_no_bayes_BEST_ROC_noemoji.csv', index=False)

In [61]:
params_nb = {}
for col in label_cols:
    dff = df_svc[df_svc['naive-bayes'] == 'Y']
    dff = dff[dff['label'] == col]
    params_nb[col] = dff.sort_values(['ROC'], ascending = False).iloc[0]['C']


In [65]:
params_nb

{'identity_hate': 0.10000000000000001,
 'insult': 0.050000000000000003,
 'obscene': 0.10000000000000001,
 'severe_toxic': 0.050000000000000003,
 'threat': 0.10000000000000001,
 'toxic': 0.14999999999999999}

In [63]:
pred_prob = pd.DataFrame()
pred_prob['id'] = test['id']
for col in label_cols:
    print(col)
    lsvc = LinearSVC(C=params_nb[col])
    model_svc = CalibratedClassifierCV(lsvc)
    model_svc.fit(train_tfidf.multiply(r_dict[col]).tocsr(), train[col].values)
    pred_prob[col] = model_svc.predict_proba(test_tfidf.multiply(r_dict[col]).tocsr())[:,1]
pred_prob.to_csv(PATH + 'SVC_bayes_BEST_ROC_noemoji.csv', index=False)

toxic
severe_toxic
obscene
threat
insult
identity_hate


In [52]:
df_svc

Unnamed: 0,C,ROC,fpr,label,naive-bayes,precision,tpr,trained
0,0.01,0.975405,0.011239,toxic,Y,0.868720,0.694750,Y
1,0.01,0.981391,0.010060,toxic,N,0.884370,0.718730,Y
2,0.05,0.982279,0.010268,toxic,Y,0.885004,0.738172,Y
3,0.05,0.984300,0.009782,toxic,N,0.889844,0.738172,Y
4,0.10,0.983090,0.009990,toxic,Y,0.887850,0.738820,Y
5,0.10,0.984741,0.009782,toxic,N,0.890272,0.741413,Y
6,0.15,0.983134,0.009574,toxic,Y,0.891934,0.738172,Y
7,0.15,0.984702,0.009782,toxic,N,0.890272,0.741413,Y
8,0.20,0.983015,0.009435,toxic,Y,0.892998,0.735580,Y
9,0.20,0.984547,0.009574,toxic,N,0.892774,0.744653,Y
