In [1]:
import pandas as pd 
import numpy as np 
import re 
from nltk.corpus import stopwords 
from nltk.tokenize import TweetTokenizer 
from nltk.stem.wordnet import WordNetLemmatizer 
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer 
from sklearn.linear_model import LogisticRegression 
from sklearn.svm import SVC 
from sklearn.svm import LinearSVC 
from sklearn.calibration import CalibratedClassifierCV 
from sklearn.metrics import roc_auc_score 
from scipy.sparse import csr_matrix, hstack 
import lightgbm as lgb

In [2]:
# Directory the notebook reads its input CSVs from and writes its results to.
PATH = '../data/'

# Read both splits with the same call; the generator keeps its loop variable
# out of the notebook namespace.
train, test = (pd.read_csv(PATH + csv_name)
               for csv_name in ('cleaned_train.csv', 'cleaned_test.csv'))

# Two text variants per comment: the "polarity" column and the
# punctuation-preserving column.
train_sentence, test_sentence = (
    train['comment_text_cleaned_polarity'],
    test['comment_text_cleaned_polarity'],
)
train_sentence_retain_punctuation, test_sentence_retain_punctuation = (
    train['comment_text_cleaned_retain_punctuation'],
    test['comment_text_cleaned_retain_punctuation'],
)

# Aliases for the training-only corpora.
text, text_retain_punctuation = train_sentence, train_sentence_retain_punctuation

print(train.shape)
print(test.shape)

(159571, 30)
(153164, 24)


In [None]:


# Word-level TF-IDF: 1- to 3-grams over the polarity-cleaned text.
phrase_vectorizer = TfidfVectorizer(ngram_range=(1,3),
                                    strip_accents='unicode', 
                                    max_features=100000, 
                                    analyzer='word',
                                    sublinear_tf=True,
                                    token_pattern=r'\w{1,}')
# Character-level TF-IDF: 2- to 5-grams over the punctuation-preserving text.
char_vectorizer = TfidfVectorizer(ngram_range=(2,5), 
                                  strip_accents='unicode', 
                                  max_features=200000, 
                                  analyzer='char', 
                                  sublinear_tf=True)

# Both vocabularies are fitted on the training corpus only.
print('fitting char')
char_vectorizer.fit(text_retain_punctuation.values)
print('fitting phrase')
phrase_vectorizer.fit(text.values)

print('transforming train char')
train_char = char_vectorizer.transform(train_sentence_retain_punctuation.values)
print('transforming train phrase')
train_phrase = phrase_vectorizer.transform(train_sentence.values)

print('transforming test char')
test_char = char_vectorizer.transform(test_sentence_retain_punctuation.values)
print('transforming test phrase')
test_phrase = phrase_vectorizer.transform(test_sentence.values)

# Column order of the stacked matrices is [char features | phrase features];
# anything mapping column indices back to terms has to follow the same order.
train_tfidf = hstack((train_char, train_phrase), format='csr')
test_tfidf = hstack((test_char, test_phrase), format='csr')

# The six binary target columns; one model is trained per label.
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

train_tfidf

fitting char
fitting phrase
transforming train skip gram
transforming train char
transforming train phrase
transforming test char
transforming test phrase


<159571x300000 sparse matrix of type '<class 'numpy.float64'>'
	with 162813546 stored elements in Compressed Sparse Row format>

In [None]:
# from sklearn.model_selection import train_test_split
# x_train, x_val, y_train_df, y_val_df = train_test_split(train_tfidf, train, test_size=0.33)
# # Split the dataset



# Hold-out split in file order (no shuffling): the first 90% of the rows are
# used for training, the remaining 10% for validation.
split_index = round(0.9 * len(train))
x_train, x_val = train_tfidf[:split_index], train_tfidf[split_index:]
y_train_df, y_val_df = train.iloc[:split_index], train.iloc[split_index:]

# The test matrix is used as a whole.
x_test = test_tfidf


# train toxic
def pr(y_i, y, train_features):
    """Return the add-one smoothed per-column mean over the rows labelled ``y_i``.

    Parameters
    ----------
    y_i : label value selecting the rows (this notebook calls it with 1 and 0).
    y : array of labels, one entry per row of ``train_features``.
    train_features : row-indexable matrix that accepts a boolean mask and
        supports ``.sum(0)``.

    Returns
    -------
    ``(column sums over the selected rows + 1) / (number of selected rows + 1)``.
    """
    selected_rows = (y == y_i)
    column_totals = train_features[selected_rows].sum(0)
    return (column_totals + 1) / (selected_rows.sum() + 1)
# Per-label log ratio log(pr(1) / pr(0)), computed on the training rows only:
# one weight per TF-IDF column.
r_dict = {
    label: np.log(pr(1, y_label, x_train) / pr(0, y_label, x_train))
    for label, y_label in ((name, y_train_df[name].values) for name in label_cols)
}

In [None]:
# Scale every TF-IDF column by the label's log ratio, giving one feature
# matrix per label for each of the train / validation / test splits.
train_set = {label: x_train.multiply(ratio).tocsr() for label, ratio in r_dict.items()}
val_set = {label: x_val.multiply(ratio).tocsr() for label, ratio in r_dict.items()}
test_set = {label: x_test.multiply(ratio).tocsr() for label, ratio in r_dict.items()}

# Collect whatever temporaries the scaling left behind.
import gc
gc.collect()


278

In [None]:
# Drop the names the per-label matrices were built from, then ask the
# collector to reclaim the memory. Re-running this cell without re-creating
# those names raises NameError.
del r_dict
del x_train
del x_val
import gc
gc.collect()

0

In [None]:
# Baseline LightGBM model for the 'toxic' label only, validated on the
# held-out rows.
lgb_train_set = lgb.Dataset(train_set['toxic'], y_train_df['toxic'].values)
lgb_eval_set = lgb.Dataset(val_set['toxic'], y_val_df['toxic'].values, reference=lgb_train_set)

params = {
    'learning_rate': 0.2,
    'application': 'binary',
    'num_leaves': 31,
    'verbosity': -1,
    'metric': 'auc',
    'data_random_seed': 2,
    'bagging_fraction': 0.8,
    # LightGBM only bags rows when bagging_freq is non-zero; without it the
    # bagging_fraction above has no effect. The other parameter sets in this
    # notebook use 1 as well.
    'bagging_freq': 1,
    'feature_fraction': 0.6,
    'nthread': 4,
    'lambda_l1': 1,
    'lambda_l2': 1
} 

gbm = lgb.train(params,
                lgb_train_set,
                valid_sets=lgb_eval_set,
               verbose_eval=10)



In [None]:
# Validation ROC AUC of the single-label 'toxic' model trained above.
# roc_auc_score comes from the notebook's import cell.
y = y_val_df['toxic']
pred = gbm.predict(val_set['toxic'])
# The value is an area under the ROC curve, not an accuracy.
print('ROC AUC is {}'.format(roc_auc_score(y,pred)))

In [None]:
# Invert each fitted vocabulary (term -> column index) into column index -> term.
inv_char = dict((column, term) for term, column in char_vectorizer.vocabulary_.items())
inv_phrase = dict((column, term) for term, column in phrase_vectorizer.vocabulary_.items())

# Terms listed in column order for each half of the stacked matrix.
features_char = list(map(inv_char.__getitem__, range(train_char.shape[1])))
features_phrase = list(map(inv_phrase.__getitem__, range(train_phrase.shape[1])))

# Same [char | phrase] order in which the matrices were stacked.
features = features_char + features_phrase

In [None]:
# One row per model column: the booster's importance score next to the term
# name at the same position in `features`.
df_importance = pd.Series(gbm.feature_importance()).to_frame('score')
df_importance['feature'] = pd.Series(features)

In [None]:
# Show only the highest-scoring terms instead of rendering the whole frame
# (one row per TF-IDF column).
df_importance.sort_values(by=['score'], ascending=False).head(30)

In [None]:
# cols = {'combination_id': [], 'learning_rate':[], 'num_leaves':[], 
#         'bagging_fraction':[], 'feature_fraction':[], 'lambda_l1':[], 
#         'lambda_l2':[], 'label':[],'roc':[],'trained':[], 'max_depth': [], 'num_iterations': []}
# df_lgb = pd.DataFrame(cols)
# learning_rate = [ 0.2, 0.1, 0.05]
# num_leaves = [11,31, 61,]
# max_depth = [3,4,5]
# bag_frt = [0.9, 0.8,0.7]
# feature_fraction = [1, 0.8,0.6]
# lambda_l1 = [1, 0]
# lambda_l2 = [1, 0]
# num_iterations = [300, 1200]

# com_id = 0
# index = 0
# for col in label_cols:
#     com_id += 1
#     for lr in learning_rate:
#         for nl in num_leaves:
#             for md in max_depth:
#                 for bf in bag_frt:
#                     for ff in feature_fraction:
#                         for l1 in lambda_l1:
#                             for l2 in lambda_l2:
#                                 for ni in num_iterations:
#                                     df_lgb.loc[index, 'num_iterations'] = ni
#                                     df_lgb.loc[index, 'lambda_l2'] = l2
#                                     df_lgb.loc[index, 'lambda_l1'] = l1
#                                     df_lgb.loc[index, 'feature_fraction'] = ff
#                                     df_lgb.loc[index, 'bagging_fraction'] = bf
#                                     df_lgb.loc[index, 'max_depth'] = md
#                                     df_lgb.loc[index, 'num_leaves'] = nl
#                                     df_lgb.loc[index, 'learning_rate'] = lr
#                                     df_lgb.loc[index, 'label'] = col
#                                     df_lgb.loc[index, 'roc'] = 0
#                                     df_lgb.loc[index, 'trained'] = 'N'
#                                     df_lgb.loc[index, 'combination_id'] = com_id
#                                     index += 1
# df_lgb.to_csv(PATH + 'lgb_all_gridsearch.csv', index=False)                        

In [None]:
# Reload the persisted search grid. The cells below read its 'label',
# 'trained' and hyper-parameter columns and write 'trained' / 'roc' back.
df_lgb = pd.read_csv(filepath_or_buffer=PATH + 'lgb_all_gridsearch.csv')

In [None]:
# Preview the grid rather than rendering every combination.
df_lgb.head()

In [25]:
# Settings shared by every random-search run; the sampled grid row is merged
# on top of a copy of this dict.
base_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    # 'metric' must appear only once: a repeated key in a dict literal is
    # silently overwritten by its last occurrence.
    'metric': 'auc',
    'verbosity': -1,
    'bagging_freq': 1,
    'num_threads': 8,
    'early_stopping_round': 20,
}
def get_parameters(df_lgb, col, addition=None):
    """Pick one not-yet-trained hyper-parameter row for ``col`` at random.

    Parameters
    ----------
    df_lgb : pandas.DataFrame
        Search grid. Must contain the columns ``'trained'`` and ``'label'``,
        the eight hyper-parameter columns listed in ``params_list`` below,
        and every column named by ``addition``.
    col : str
        Label whose rows are eligible (``df_lgb['label'] == col``).
    addition : list of dict, optional
        Extra equality filters, each of the form
        ``{'key': <column name>, 'value': <required value>}``.

    Returns
    -------
    (params, next_index)
        ``params`` maps each hyper-parameter name to the chosen row's value,
        with ``num_leaves``, ``max_depth`` and ``num_iterations`` converted
        to ``int``; ``next_index`` is the chosen row's index label in
        ``df_lgb``.

    Raises
    ------
    IndexError
        If no row with ``trained == 'N'`` satisfies the filters.
    """
    params_list = ['bagging_fraction',
                   'feature_fraction',
                   'lambda_l1',
                   'lambda_l2',
                   'learning_rate',
                   'max_depth',
                   'num_iterations',
                   'num_leaves']
    condition = (df_lgb['trained'] == 'N') & (df_lgb['label'] == col)
    for constraint in (addition or []):
        condition = condition & (df_lgb[constraint['key']] == constraint['value'])

    candidates = df_lgb.index[condition.values]
    if len(candidates) == 0:
        raise IndexError(
            "no untrained parameter combination left for label {!r} "
            "with filters {!r}".format(col, addition))

    # Draw a position instead of shuffling the index's backing array in place:
    # same uniform choice, but nothing owned by the frame is mutated.
    next_index = candidates[np.random.randint(len(candidates))]
    params = dict(df_lgb.loc[next_index, params_list])

    # Values read back from the grid may be floats; these three are passed
    # on as integers.
    for name in ('num_leaves', 'max_depth', 'num_iterations'):
        params[name] = int(round(params[name]))
    return params, next_index
    
    
    

In [None]:
################# random search start
%env JOBLIB_TEMP_FOLDER=/tmp
start = 0
for i in range(1000):
    for col in label_cols:
        start += 1
        # create dataset for lightgbm
        params =  base_params.copy()  
        add = [{'key':'num_iterations', 'value': 1200}]
        params_get, index_get = get_parameters(df_lgb, col, addition=add)
        params.update(params_get)
        print(start)
        print(col)
        print(index_get)
        print(params)
        lgb_train_set = lgb.Dataset(train_set[col], y_train_df[col].values)
        lgb_eval_set = lgb.Dataset(val_set[col], y_val_df[col].values, reference=lgb_train_set)


        gbm_model = lgb.train(params,
                            lgb_train_set,
                            valid_sets=lgb_eval_set,
                           verbose_eval=20)
        pred_prob =  gbm_model.predict(val_set[col])
        roc = roc_auc_score(y_val_df[col].values,pred_prob)
        df_lgb.loc[index_get, 'trained'] = 'Y'
        df_lgb.loc[index_get, 'roc'] = roc
        print('ROC score is {}'.format(roc) )
        df_lgb.to_csv(PATH + 'lgb_all_gridsearch.csv', index=False) 
        print('done')
        print('===================================')
    

env: JOBLIB_TEMP_FOLDER=/tmp
1
toxic
1933
{'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'verbosity': -1, 'bagging_freq': 1, 'num_threads': 20, 'early_stopping_round': 20, 'bagging_fraction': 0.69999999999999996, 'feature_fraction': 0.80000000000000004, 'lambda_l1': 0.0, 'lambda_l2': 1.0, 'learning_rate': 0.050000000000000003, 'max_depth': 5, 'num_iterations': 1200, 'num_leaves': 61}




Training until validation scores don't improve for 20 rounds.
[20]	valid_0's auc: 0.860329
[40]	valid_0's auc: 0.918368
[60]	valid_0's auc: 0.93135
[80]	valid_0's auc: 0.940417
[100]	valid_0's auc: 0.947719
[120]	valid_0's auc: 0.952821
[140]	valid_0's auc: 0.956403
[160]	valid_0's auc: 0.960076
[180]	valid_0's auc: 0.962101
[200]	valid_0's auc: 0.964338
[220]	valid_0's auc: 0.965576
[240]	valid_0's auc: 0.966748
[260]	valid_0's auc: 0.968008
[280]	valid_0's auc: 0.968815
[300]	valid_0's auc: 0.969604
[320]	valid_0's auc: 0.970426
[340]	valid_0's auc: 0.971241
[360]	valid_0's auc: 0.971895
[380]	valid_0's auc: 0.972297
[400]	valid_0's auc: 0.972875
[420]	valid_0's auc: 0.97321
[440]	valid_0's auc: 0.973619
[460]	valid_0's auc: 0.974058
[480]	valid_0's auc: 0.974523
[500]	valid_0's auc: 0.974809
[520]	valid_0's auc: 0.975127
[540]	valid_0's auc: 0.975444
[560]	valid_0's auc: 0.975719
[580]	valid_0's auc: 0.976032
[600]	valid_0's auc: 0.976351
[620]	valid_0's auc: 0.976472
[640]	valid_0'

[1020]	valid_0's auc: 0.976752
[1040]	valid_0's auc: 0.976853
[1060]	valid_0's auc: 0.977016
[1080]	valid_0's auc: 0.977131
[1100]	valid_0's auc: 0.977326
[1120]	valid_0's auc: 0.977512
[1140]	valid_0's auc: 0.977613
[1160]	valid_0's auc: 0.977719
[1180]	valid_0's auc: 0.977806
[1200]	valid_0's auc: 0.977912
Did not meet early stopping. Best iteration is:
[1200]	valid_0's auc: 0.977912
ROC score is 0.9779120150433425
8
severe_toxic
3569
{'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'verbosity': -1, 'bagging_freq': 1, 'num_threads': 20, 'early_stopping_round': 20, 'bagging_fraction': 0.80000000000000004, 'feature_fraction': 0.59999999999999998, 'lambda_l1': 1.0, 'lambda_l2': 1.0, 'learning_rate': 0.050000000000000003, 'max_depth': 4, 'num_iterations': 1200, 'num_leaves': 31}
Training until validation scores don't improve for 20 rounds.
[20]	valid_0's auc: 0.9283
[40]	valid_0's auc: 0.951058
[60]	valid_0's auc: 0.952703
[80]	valid_0's auc: 0.978741
[1

Training until validation scores don't improve for 20 rounds.
[20]	valid_0's auc: 0.976096
[40]	valid_0's auc: 0.987275
[60]	valid_0's auc: 0.988964
[80]	valid_0's auc: 0.988742
Early stopping, best iteration is:
[60]	valid_0's auc: 0.988964
ROC score is 0.9889643316376736
16
threat
7557
{'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'verbosity': -1, 'bagging_freq': 1, 'num_threads': 20, 'early_stopping_round': 20, 'bagging_fraction': 0.69999999999999996, 'feature_fraction': 0.59999999999999998, 'lambda_l1': 0.0, 'lambda_l2': 1.0, 'learning_rate': 0.050000000000000003, 'max_depth': 5, 'num_iterations': 1200, 'num_leaves': 31}
Training until validation scores don't improve for 20 rounds.
[20]	valid_0's auc: 0.766803
[40]	valid_0's auc: 0.879855
[60]	valid_0's auc: 0.948859
[80]	valid_0's auc: 0.967052
[100]	valid_0's auc: 0.986752
[120]	valid_0's auc: 0.986509
Early stopping, best iteration is:
[106]	valid_0's auc: 0.987631
ROC score is 0.987630602879

[200]	valid_0's auc: 0.974701
[220]	valid_0's auc: 0.975622
[240]	valid_0's auc: 0.97634
[260]	valid_0's auc: 0.976784
[280]	valid_0's auc: 0.9772
[300]	valid_0's auc: 0.977485
[320]	valid_0's auc: 0.977844
[340]	valid_0's auc: 0.977856
[360]	valid_0's auc: 0.978038
[380]	valid_0's auc: 0.978218
[400]	valid_0's auc: 0.978486
[420]	valid_0's auc: 0.978565
[440]	valid_0's auc: 0.978557
Early stopping, best iteration is:
[430]	valid_0's auc: 0.978633
ROC score is 0.9786327608982825
24
identity_hate
10727
{'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': 'auc', 'verbosity': -1, 'bagging_freq': 1, 'num_threads': 20, 'early_stopping_round': 20, 'bagging_fraction': 0.69999999999999996, 'feature_fraction': 0.59999999999999998, 'lambda_l1': 0.0, 'lambda_l2': 0.0, 'learning_rate': 0.10000000000000001, 'max_depth': 4, 'num_iterations': 1200, 'num_leaves': 31}
Training until validation scores don't improve for 20 rounds.
[20]	valid_0's auc: 0.871662
[40]	valid_0's auc: 0.

KeyboardInterrupt: 

In [None]:
# Cell intentionally left without code: the stray `sss` expression that was
# here only raised NameError.
# NOTE(review): if it served as a manual stop for "Run All", interrupt the
# search cell above instead.

In [None]:
# Write the current state of the search grid back to disk, then confirm.
df_lgb.to_csv(path_or_buf=PATH + 'lgb_all_gridsearch.csv', index=False)
print('done')

In [None]:
##############################
# Final models: one LightGBM booster per label, early-stopped on the held-out
# validation rows, then used to score the test set.
lgb_train_set = {}
lgb_eval_set = {}
gbm_model = {}
y_pred = pd.DataFrame()
y_pred['id'] = test['id']
for col in label_cols:
    # Built inside the loop so every lgb.train call receives a fresh dict.
    params = {
    'learning_rate': 0.2,
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    # 'metric' appears once only: a repeated key in a dict literal is
    # silently overwritten by its last occurrence.
    'metric': 'auc',
    'num_leaves': 31,
    'verbosity': -1,
    'data_random_seed': 2,
    'bagging_fraction': 0.8,
    'feature_fraction': 0.6,
    'bagging_freq': 1,
    'nthread': 12,
    'lambda_l1': 1,
    'lambda_l2': 1,
    'early_stopping_round':10
    } 

    print(col)
    # create dataset for lightgbm
    lgb_train_set[col] = lgb.Dataset(train_set[col], y_train_df[col].values)
    # Early stopping has to watch rows the model was not fitted on, so the
    # eval set comes from the validation split, not the training split.
    lgb_eval_set[col] = lgb.Dataset(val_set[col], y_val_df[col].values, reference=lgb_train_set[col])
    

    gbm_model[col] = lgb.train(params,
                        lgb_train_set[col],
                        valid_sets=lgb_eval_set[col],
                       verbose_eval=20)
    y_pred[col] =  gbm_model[col].predict(test_set[col])

In [None]:
# Save the per-label test predictions (plus the 'id' column) as a CSV.
y_pred.to_csv(path_or_buf=PATH + 'lgbm_nb_tfidf.csv', index=False)