In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
from scipy.sparse import csr_matrix, hstack


In [2]:
PATH = '../data/'

# train = pd.read_csv(PATH + 'cleaned_train_emoji.csv')
# test = pd.read_csv(PATH + 'cleaned_test_emoji.csv')

train = pd.read_csv(PATH + 'cleaned_train.csv')
test = pd.read_csv(PATH + 'cleaned_test.csv')


train_sentence = train['comment_text_cleaned_polarity']
test_sentence = test['comment_text_cleaned_polarity']


train_sentence_retain_punctuation = train['comment_text_cleaned_retain_punctuation']
test_sentence_retain_punctuation = test['comment_text_cleaned_retain_punctuation']

text = train_sentence

text_retain_punctuation = train_sentence_retain_punctuation


print(train.shape)
print(test.shape)

(159571, 30)
(153164, 24)


In [3]:


phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode', 
                                    max_features=100000, 
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')
char_vectorizer = TfidfVectorizer(ngram_range=(2,5), 
                                  strip_accents='unicode', 
                                  max_features=200000, 
                                  analyzer='char', 
                                  sublinear_tf=True)

print('fitting char')
char_vectorizer.fit(text_retain_punctuation.values)
print('fitting phrase')
phrase_vectorizer.fit(text.values)

print('transforming train skip gram')

print('transforming train char')
train_char = char_vectorizer.transform(train_sentence_retain_punctuation.values)
print('transforming train phrase')
train_phrase = phrase_vectorizer.transform(train_sentence.values)


print('transforming test char')
test_char = char_vectorizer.transform(test_sentence_retain_punctuation.values)
print('transforming test phrase')
test_phrase = phrase_vectorizer.transform(test_sentence.values)


train_tfidf = hstack((train_char, train_phrase), format='csr')
test_tfidf = hstack((test_char, test_phrase), format='csr')

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train_tfidf

fitting char
fitting phrase
transforming train skip gram
transforming train char
transforming train phrase
transforming test char
transforming test phrase


<159571x300000 sparse matrix of type '<class 'numpy.float64'>'
	with 162813546 stored elements in Compressed Sparse Row format>

In [4]:
# from sklearn.model_selection import train_test_split
# x_train, x_val, y_train_df, y_val_df = train_test_split(train_tfidf, train, test_size=0.33)
# # Split the dataset



# Split the dataset
split_index = round(len(train) * 0.9) #################################
# shuffled_train = train#.sample(frac=1)
x_train = train_tfidf[:split_index]
y_train_df = train.iloc[:split_index]
#######
x_val = train_tfidf[split_index:]
y_val_df = train.iloc[split_index:]
# Get test data ready
x_test = test_tfidf


# train toxic
def pr(y_i, y, train_features):
    p = train_features[y==y_i].sum(0)
    return (p + 1) / ((y == y_i).sum() + 1)
r_dict = {label: np.log(pr(1, y_train_df[label].values, x_train) / pr(0,  y_train_df[label].values, x_train)) for label in label_cols}

In [5]:
train_set = {label: x_train.multiply(r_dict[label]).tocsr() for label in r_dict }
val_set = {label: x_val.multiply(r_dict[label]).tocsr() for label in r_dict }
test_set = {label: x_test.multiply(r_dict[label]).tocsr() for label in r_dict }

# del r_dict, x_train, x_val
import gc
gc.collect()


278

In [6]:

import gc
gc.collect()

0

In [None]:
cols = {'C': [], 'naive-bayes':[], 'label':[], 
        'precision':[], 'tpr':[], 'fpr':[], 
        'ROC':[],'trained':[]}
df_svc = pd.DataFrame(cols)
########################################

C = [0.01,0.02, 0.05,0.1,0.15,0.2,0.23,0.25,0.27,0.28,0.29,0.3,0.32,0.35,0.4,0.5,0.7,0.9,1,1.5,2,3,4,5,6,7,8]
NaiveB = ['Y', 'N']

index = 0
for col in label_cols:
    for cc in C:
        for nb in NaiveB:
            df_svc.loc[index, 'C'] = cc
            df_svc.loc[index, 'naive-bayes'] = nb
            df_svc.loc[index, 'label'] = col
            df_svc.loc[index, 'precision'] = 0
            df_svc.loc[index, 'tpr'] = 0
            df_svc.loc[index, 'fpr'] = 0
            df_svc.loc[index, 'ROC'] = 0
            df_svc.loc[index, 'trained'] = 'N'
            index += 1
                


df_svc.to_csv(PATH + 'svc_all_gridsearch_nonemoji.csv', index=False)    

In [13]:
cols = {'C': [], 'naive-bayes':[], 'label':[], 
        'precision':[], 'tpr':[], 'fpr':[], 
        'ROC':[],'trained':[], 'fit_intercept': [], 'class_weight':[]}
df_svc = pd.DataFrame(cols)
########################################

C = [0.001,0.005,0.01,0.02,0.03, 0.05,0.1,0.15,0.2,0.23,0.25,0.27,0.28,0.29,0.3,0.32,0.35,0.4,0.5,0.7,0.9,1]
NaiveB = ['Y', 'N']
fit_intercept = ['False', 'True']
class_weight = ['None', 'balanced']

index = 0
for col in label_cols:
    for cc in C:
        for nb in NaiveB:
            for fi in fit_intercept:
                for cw in class_weight:
                    df_svc.loc[index, 'C'] = cc
                    df_svc.loc[index, 'naive-bayes'] = nb
                    df_svc.loc[index, 'label'] = col
                    df_svc.loc[index, 'precision'] = 0
                    df_svc.loc[index, 'tpr'] = 0
                    df_svc.loc[index, 'fpr'] = 0
                    df_svc.loc[index, 'ROC'] = 0
                    df_svc.loc[index, 'trained'] = 'N'
                    df_svc.loc[index, 'fit_intercept'] = fi
                    df_svc.loc[index, 'class_weight'] = cw
                    index += 1
                


df_svc.to_csv(PATH + 'svc_all_gridsearch_nonemoji_0306.csv', index=False)                        

In [None]:
def get_parameters(df_svc, addition = None):
    params_list =['C']
    condition = (df_svc['trained'] == 'N')
    if addition is not None:
        for i in range(len(addition)):
            condition = condition &  (df_svc[addition[i]['key']] == addition[i]['value'])
#     df = df_lgb[(df_lgb['trained'] == 'N') & (df_lgb['label'] == col)]
    df = df_svc[condition]
    index = df.index.values
    np.random.shuffle(index)
    next_index = index[0]
    params = dict(df_svc.loc[next_index, params_list])
    ##### data type fix:
    return params, next_index

In [14]:
def get_parameters(df_svc, addition = None):
    params_list =['C', 'fit_intercept', 'class_weight']
    condition = (df_svc['trained'] == 'N')
    if addition is not None:
        for i in range(len(addition)):
            condition = condition &  (df_svc[addition[i]['key']] == addition[i]['value'])
#     df = df_lgb[(df_lgb['trained'] == 'N') & (df_lgb['label'] == col)]
    df = df_svc[condition]
    index = df.index.values
    np.random.shuffle(index)
    next_index = index[0]
    params = dict(df_svc.loc[next_index, params_list])
    ########
    params['fit_intercept'] = bool( params['fit_intercept'])
    if params['class_weight'] is 'balanced':
        params['class_weight'] = 'balanced'
    else:
        params['class_weight'] = None
    ##### data type fix:
    return params, next_index

In [None]:
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import roc_auc_score, confusion_matrix

################# random search start
%env JOBLIB_TEMP_FOLDER=/tmp
base_params = {}
start = 0
for i in range(1000):
        start += 1
        # create dataset for lightgbm
        params = base_params.copy()  
        params_get, index_get = get_parameters(df_svc, addition=None)
        col = df_svc.loc[index_get, 'label']
        y_val = y_val_df[col].values
        y_train = y_train_df[col].values
        ###### check naive bayes
        nb = df_svc.loc[index_get, 'naive-bayes']
        if nb is 'Y':
            svc_train = x_train
            svc_val = x_val
            print('nb enabled')
        else:
            svc_train = train_set[col]
            svc_val = val_set[col]
            print('nb disabled')
        params.update(params_get)
        print(start)
        print(col)
        print(index_get)
        print(params)
        lsvc = LinearSVC(**params_get)
        model_svc = CalibratedClassifierCV(lsvc) 
        model_svc.fit(svc_train, y_train)
        pred_prob = model_svc.predict_proba(svc_val)[:,1]
        pred = model_svc.predict(svc_val)
        roc = roc_auc_score(y_val,pred_prob)
        tn, fp, fn, tp = confusion_matrix(y_val, pred).ravel()
        precision = tp / (tp + fp)
        tpr = tp / (tp + fn)
        fpr = fp / (fp + tn)

        df_svc.loc[index_get, 'precision'] = precision
        df_svc.loc[index_get, 'tpr'] = tpr
        df_svc.loc[index_get, 'fpr'] = fpr
        df_svc.loc[index_get, 'ROC'] = roc
        df_svc.loc[index_get, 'trained'] = 'Y'
        print(df_svc.iloc[index_get])
        i += 1
        print('==================================================')
        df_svc.to_csv(PATH + 'svc_all_gridsearch_nonemoji_0306.csv', index=False) 
        print('done')
        print('===================================')
    

env: JOBLIB_TEMP_FOLDER=/tmp
nb enabled
1
obscene
464
{'C': 0.29999999999999999, 'fit_intercept': True, 'class_weight': None}
C                       0.3
ROC                 0.99145
class_weight           None
fit_intercept         False
fpr              0.00602929
label               obscene
naive-bayes               Y
precision          0.877852
tpr                0.756944
trained                   Y
Name: 464, dtype: object
done
nb disabled
2
insult
740
{'C': 0.029999999999999999, 'fit_intercept': True, 'class_weight': None}
C                      0.03
ROC                0.984801
class_weight           None
fit_intercept         False
fpr              0.00733157
label                insult
naive-bayes               N
precision           0.81592
tpr                0.602203
trained                   Y
Name: 740, dtype: object
done
nb enabled
3
insult
715
{'C': 0.0050000000000000001, 'fit_intercept': True, 'class_weight': 'balanced'}
C                     0.005
ROC                  0.9

C                          1
ROC                 0.987035
class_weight        balanced
fit_intercept          False
fpr              0.000440058
label                 threat
naive-bayes                N
precision           0.730769
tpr                     0.38
trained                    Y
Name: 701, dtype: object
done
nb enabled
18
identity_hate
931
{'C': 0.10000000000000001, 'fit_intercept': True, 'class_weight': 'balanced'}
C                          0.1
ROC                   0.987656
class_weight          balanced
fit_intercept             True
fpr                 0.00164422
label            identity_hate
naive-bayes                  Y
precision             0.638889
tpr                   0.319444
trained                      Y
Name: 931, dtype: object
done
nb enabled
19
severe_toxic
339
{'C': 0.90000000000000002, 'fit_intercept': True, 'class_weight': 'balanced'}
C                         0.9
ROC                  0.984072
class_weight         balanced
fit_intercept            True
f

C                       0.5
ROC                0.981497
class_weight       balanced
fit_intercept          True
fpr              0.00922714
label                 toxic
naive-bayes               Y
precision          0.892828
tpr                0.718082
trained                   Y
Name: 147, dtype: object
done
nb disabled
35
identity_hate
1045
{'C': 0.90000000000000002, 'fit_intercept': True, 'class_weight': 'balanced'}
C                          0.9
ROC                   0.983939
class_weight          balanced
fit_intercept            False
fpr                 0.00151774
label            identity_hate
naive-bayes                  N
precision             0.647059
tpr                   0.305556
trained                      Y
Name: 1045, dtype: object
done
nb disabled
36
identity_hate
949
{'C': 0.20000000000000001, 'fit_intercept': True, 'class_weight': 'balanced'}
C                          0.2
ROC                    0.98626
class_weight          balanced
fit_intercept            False
fp

C                          0.5
ROC                   0.984775
class_weight          balanced
fit_intercept            False
fpr                 0.00158098
label            identity_hate
naive-bayes                  N
precision             0.642857
tpr                     0.3125
trained                      Y
Name: 1029, dtype: object
done
nb disabled
52
insult
804
{'C': 0.28000000000000003, 'fit_intercept': True, 'class_weight': None}
C                      0.28
ROC                0.981622
class_weight           None
fit_intercept         False
fpr              0.00733157
label                insult
naive-bayes               N
precision          0.813758
tpr                0.593635
trained                   Y
Name: 804, dtype: object
done
nb disabled
53
severe_toxic
340
{'C': 0.90000000000000002, 'fit_intercept': True, 'class_weight': None}
C                         0.9
ROC                  0.985773
class_weight             None
fit_intercept           False
fpr                 0.00170

C                        0.01
ROC                  0.989741
class_weight             None
fit_intercept           False
fpr                0.00246726
label            severe_toxic
naive-bayes                 N
precision                 0.5
tpr                      0.26
trained                     Y
Name: 196, dtype: object
done
nb disabled
68
severe_toxic
182
{'C': 0.001, 'fit_intercept': True, 'class_weight': None}
C                       0.001
ROC                  0.987501
class_weight             None
fit_intercept            True
fpr                  0.002404
label            severe_toxic
naive-bayes                 N
precision            0.512821
tpr                  0.266667
trained                     Y
Name: 182, dtype: object
done
nb enabled
69
threat
627
{'C': 0.28000000000000003, 'fit_intercept': True, 'class_weight': 'balanced'}
C                       0.28
ROC                 0.991993
class_weight        balanced
fit_intercept           True
fpr              0.000502923
la

C                        0.23
ROC                  0.987847
class_weight         balanced
fit_intercept           False
fpr                0.00278358
label            severe_toxic
naive-bayes                 Y
precision            0.482353
tpr                  0.273333
trained                     Y
Name: 249, dtype: object
done
nb enabled
85
insult
769
{'C': 0.20000000000000001, 'fit_intercept': True, 'class_weight': 'balanced'}
C                       0.2
ROC                0.982822
class_weight       balanced
fit_intercept         False
fpr              0.00819022
label                insult
naive-bayes               Y
precision          0.801917
tpr                0.614443
trained                   Y
Name: 769, dtype: object
done
nb disabled
86
insult
799
{'C': 0.27000000000000002, 'fit_intercept': True, 'class_weight': 'balanced'}
C                      0.27
ROC                0.982653
class_weight       balanced
fit_intercept          True
fpr              0.00819022
label        

C                     0.01
ROC               0.974189
class_weight      balanced
fit_intercept        False
fpr              0.0123491
label                toxic
naive-bayes              Y
precision         0.857143
tpr               0.692158
trained                  Y
Name: 17, dtype: object
done
nb disabled
102
obscene
478
{'C': 0.32000000000000001, 'fit_intercept': True, 'class_weight': None}
C                      0.32
ROC                0.992287
class_weight           None
fit_intercept          True
fpr              0.00563175
label               obscene
naive-bayes               N
precision          0.882271
tpr                0.737269
trained                   Y
Name: 478, dtype: object
done
nb disabled
103
toxic
22
{'C': 0.01, 'fit_intercept': True, 'class_weight': None}
C                     0.01
ROC               0.981391
class_weight          None
fit_intercept         True
fpr              0.0100597
label                toxic
naive-bayes              N
precision          0

C                       0.2
ROC                0.984429
class_weight       balanced
fit_intercept          True
fpr              0.00992091
label                 toxic
naive-bayes               N
precision          0.888368
tpr                0.737524
trained                   Y
Name: 71, dtype: object
done
nb disabled
119
threat
565
{'C': 0.029999999999999999, 'fit_intercept': True, 'class_weight': 'balanced'}
C                       0.03
ROC                 0.992543
class_weight        balanced
fit_intercept          False
fpr              0.000565789
label                 threat
naive-bayes                N
precision           0.678571
tpr                     0.38
trained                    Y
Name: 565, dtype: object
done
nb disabled
120
identity_hate
1022
{'C': 0.40000000000000002, 'fit_intercept': True, 'class_weight': None}
C                          0.4
ROC                   0.984352
class_weight              None
fit_intercept             True
fpr                 0.00126478
lab

In [53]:
df_svc.to_csv(PATH + 'svc_all_gridsearch_nonemoji_0306.csv', index=False)

In [56]:
for col in label_cols:
    print(df_svc[df_svc['label'] == col].sort_values(['ROC'], ascending = False).iloc[0])
    print('----------------------')

C                     0.1
ROC              0.984741
fpr            0.00978216
label               toxic
naive-bayes             N
precision        0.890272
tpr              0.741413
trained                 Y
Name: 5, dtype: object
----------------------
C                      0.01
ROC                0.989741
fpr              0.00246726
label          severe_toxic
naive-bayes               N
precision               0.5
tpr                    0.26
trained                   Y
Name: 53, dtype: object
----------------------
C                    0.05
ROC              0.993175
fpr            0.00536673
label             obscene
naive-bayes             N
precision        0.887344
tpr              0.738426
trained                 Y
Name: 107, dtype: object
----------------------
C                     0.01
ROC               0.993429
fpr            0.000502923
label               threat
naive-bayes              N
precision         0.652174
tpr                    0.3
trained                  Y
Nam

In [54]:
toxic = df_svc[df_svc['label'] == 'toxic']
toxic.sort_values(['ROC'], ascending = False)

Unnamed: 0,C,ROC,fpr,label,naive-bayes,precision,tpr,trained
5,0.1,0.984741,0.009782,toxic,N,0.890272,0.741413,Y
7,0.15,0.984702,0.009782,toxic,N,0.890272,0.741413,Y
9,0.2,0.984547,0.009574,toxic,N,0.892774,0.744653,Y
11,0.23,0.984433,0.009505,toxic,N,0.893136,0.742061,Y
13,0.25,0.98436,0.009366,toxic,N,0.894283,0.740117,Y
3,0.05,0.9843,0.009782,toxic,N,0.889844,0.738172,Y
15,0.27,0.984286,0.009366,toxic,N,0.893701,0.73558,Y
17,0.28,0.984246,0.009297,toxic,N,0.894322,0.734932,Y
19,0.29,0.984205,0.009297,toxic,N,0.894155,0.733636,Y
21,0.3,0.984172,0.009297,toxic,N,0.894155,0.733636,Y


In [58]:
###### naive bayes is N

params = {}
for col in label_cols:
    dff = df_svc[df_svc['naive-bayes'] == 'N']
    dff = dff[dff['label'] == col]
    params[col] = dff.sort_values(['ROC'], ascending = False).iloc[0]['C']
    

In [None]:
###### naive bayes is N

params = {col:{} for col in label_cols}
for col in label_cols:
    dff = df_svc[df_svc['naive-bayes'] == 'N']
    dff = dff[dff['label'] == col]
    best = dff.sort_values(['ROC'], ascending = False).iloc[0]
    params[col]['C'] = best['C']
    params[col]['fit_intercept'] = best['fit_intercept']
    params[col]['class_weight'] = best['class_weight']
    params[col]['fit_intercept'] = bool( params[col]['fit_intercept'])
    if params[col]['class_weight'] is 'balanced':
        params[col]['class_weight'] = 'balanced'
    else:
        params[col]['class_weight'] = None

In [64]:
params

{'identity_hate': 0.01,
 'insult': 0.050000000000000003,
 'obscene': 0.050000000000000003,
 'severe_toxic': 0.01,
 'threat': 0.01,
 'toxic': 0.10000000000000001}

In [60]:
#### based on ROC no naive bayes
#0.9814
# {'identity_hate': 0.01,
#  'insult': 0.050000000000000003,
#  'obscene': 0.050000000000000003,
#  'severe_toxic': 0.01,
#  'threat': 0.01,
#  'toxic': 0.10000000000000001}

pred_prob = pd.DataFrame()
pred_prob['id'] = test['id']
for col in label_cols:
    lsvc = LinearSVC(C=params[col])
    model_svc = CalibratedClassifierCV(lsvc)
    model_svc.fit(train_tfidf, train[col].values)
    pred_prob[col] = model_svc.predict_proba(test_tfidf)[:,1]
pred_prob.to_csv(PATH + 'SVC_no_bayes_BEST_ROC_noemoji.csv', index=False)

In [61]:
params_nb = {}
for col in label_cols:
    dff = df_svc[df_svc['naive-bayes'] == 'Y']
    dff = dff[dff['label'] == col]
    params_nb[col] = dff.sort_values(['ROC'], ascending = False).iloc[0]['C']


In [65]:
params_nb

{'identity_hate': 0.10000000000000001,
 'insult': 0.050000000000000003,
 'obscene': 0.10000000000000001,
 'severe_toxic': 0.050000000000000003,
 'threat': 0.10000000000000001,
 'toxic': 0.14999999999999999}

In [63]:
pred_prob = pd.DataFrame()
pred_prob['id'] = test['id']
for col in label_cols:
    print(col)
    lsvc = LinearSVC(C=params_nb[col])
    model_svc = CalibratedClassifierCV(lsvc)
    model_svc.fit(train_tfidf.multiply(r_dict[col]).tocsr(), train[col].values)
    pred_prob[col] = model_svc.predict_proba(test_tfidf.multiply(r_dict[col]).tocsr())[:,1]
pred_prob.to_csv(PATH + 'SVC_bayes_BEST_ROC_noemoji.csv', index=False)

toxic
severe_toxic
obscene
threat
insult
identity_hate


In [52]:
df_svc

Unnamed: 0,C,ROC,fpr,label,naive-bayes,precision,tpr,trained
0,0.01,0.975405,0.011239,toxic,Y,0.868720,0.694750,Y
1,0.01,0.981391,0.010060,toxic,N,0.884370,0.718730,Y
2,0.05,0.982279,0.010268,toxic,Y,0.885004,0.738172,Y
3,0.05,0.984300,0.009782,toxic,N,0.889844,0.738172,Y
4,0.10,0.983090,0.009990,toxic,Y,0.887850,0.738820,Y
5,0.10,0.984741,0.009782,toxic,N,0.890272,0.741413,Y
6,0.15,0.983134,0.009574,toxic,Y,0.891934,0.738172,Y
7,0.15,0.984702,0.009782,toxic,N,0.890272,0.741413,Y
8,0.20,0.983015,0.009435,toxic,Y,0.892998,0.735580,Y
9,0.20,0.984547,0.009574,toxic,N,0.892774,0.744653,Y


In [69]:
df_svc = pd.read_csv(PATH + 'svc_all_gridsearch_nonemoji.csv')

In [78]:

label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
dd = df_svc[df_svc['label'] == 'threat']

In [79]:
dd.sort_values(['ROC'], ascending = False)

Unnamed: 0,C,ROC,fpr,label,naive-bayes,precision,tpr,trained
157,0.01,0.993429,0.000503,threat,N,0.652174,0.3,Y
160,0.1,0.993317,0.000566,threat,Y,0.64,0.32,Y
162,0.15,0.993199,0.000566,threat,Y,0.653846,0.34,Y
164,0.2,0.992964,0.000566,threat,Y,0.653846,0.34,Y
158,0.05,0.99294,0.000629,threat,Y,0.615385,0.32,Y
166,0.23,0.992843,0.000503,threat,Y,0.68,0.34,Y
168,0.25,0.992733,0.000503,threat,Y,0.68,0.34,Y
170,0.27,0.992659,0.000566,threat,Y,0.653846,0.34,Y
172,0.28,0.992598,0.000566,threat,Y,0.666667,0.36,Y
174,0.29,0.992552,0.000566,threat,Y,0.666667,0.36,Y


In [76]:
count = {}
for col in label_cols:
    count[col] = train[col].values.sum()

In [77]:
count

{'identity_hate': 1405,
 'insult': 7877,
 'obscene': 8449,
 'severe_toxic': 1595,
 'threat': 478,
 'toxic': 15294}

In [5]:
df_svc_one = pd.read_csv(PATH + 'OneVsOne_svm_parameters.csv')

In [8]:
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
ddd = df_svc_one[df_svc_one['label'] == 'toxic']
ddd.sort_values(['ROC'], ascending = False)

Unnamed: 0,C,ROC,fpr,label,precision,tpr
9,0.29,0.984275,0.052657,toxic,0.651994,0.921581
11,0.32,0.984274,0.052588,toxic,0.652134,0.920933
10,0.3,0.984273,0.052796,toxic,0.651397,0.921581
8,0.28,0.984271,0.052865,toxic,0.651259,0.922229
12,0.35,0.984271,0.052518,toxic,0.652593,0.921581
7,0.27,0.984266,0.052865,toxic,0.651259,0.922229
6,0.25,0.984257,0.052657,toxic,0.651675,0.920285
13,0.4,0.984257,0.052518,toxic,0.652593,0.921581
5,0.23,0.98423,0.052935,toxic,0.65016,0.918989
14,0.5,0.984207,0.053004,toxic,0.650663,0.922229


In [82]:
ddd

Unnamed: 0,C,ROC,fpr,label,precision,tpr
