In [1]:
%load_ext autoreload
%autoreload 2
is_debug = 0

import gc
import os
import sys
import gzip
import pickle
from tqdm import tqdm
from glob import glob
import numpy as np
import pandas as pd
from collections import defaultdict
from joblib import Parallel, delayed
from pathlib import Path

# ORIGINAL
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb


def read_pkl(path):
    with open(path, 'rb') as f:
        obj = pickle.load(f)
        print(f"""
#==============================================================================
# PICKLE READ SUCCESS !!! {path}
#==============================================================================
""")
        return obj
    
    
def read_pkl_gzip(path):
    with gzip.open(path, mode='rb') as fp:
        data = fp.read()
    return pickle.loads(data)

In [2]:
#========================================================================
# Raw Data Load
#========================================================================
COL_TEXT = 'comment_text'
COL_TARGET = 'target'
IDENTITY_COLUMNS = [
    'male', 'female', 'homosexual_gay_or_lesbian', 'christian', 'jewish',
    'muslim', 'black', 'white', 'psychiatric_or_mental_illness'
]

train_df = pd.read_csv(f'../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv')
test_df  = pd.read_csv(f'../input/jigsaw-unintended-bias-in-toxicity-classification/test.csv')

Y = (train_df[COL_TARGET].values >= 0.5).astype(int)
Y_identity = (train_df[IDENTITY_COLUMNS].values >= 0.5).astype(int)

  from ipykernel import kernelapp as app


In [3]:
#========================================================================
# BERT Feature Load
#========================================================================
TRAIN_BERT_PATH = '../input/BERT_Embedding_last_layer.csv'
TEST_BERT_PATH  = '../input/BERT_Embedding_last_layer_test.pkl'

train_bert = pd.read_csv(TRAIN_BERT_PATH)
test_bert = read_pkl(TEST_BERT_PATH)
print(train_bert.shape, test_bert.shape)

# column name 変更
use_cols = [f"c{col}" for col in train_bert.columns if len(col)<4]
train_bert.columns = use_cols
test_bert.columns = use_cols

display(test_bert.head())


# PICKLE READ SUCCESS !!! ../input/BERT_Embedding_last_layer_test.pkl

(1804874, 768) (97320, 768)


Unnamed: 0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,...,c758,c759,c760,c761,c762,c763,c764,c765,c766,c767
0,0.593786,0.751176,0.678374,-0.665478,-0.76789,0.757234,0.548852,0.894564,-0.900607,0.338016,...,0.528895,-0.662602,-0.279407,0.857723,0.7756,0.611303,-0.695147,-0.107985,-0.91058,0.468436
1,0.71266,0.79156,0.794963,-0.822454,-0.573316,0.783753,0.654398,0.901379,-0.901977,0.724705,...,0.696298,-0.740129,-0.234,0.912704,0.944725,0.795998,-0.784505,-0.651504,-0.916386,0.805378
2,0.453265,0.519735,0.397969,-0.443882,-0.488378,0.475402,-0.040196,0.764921,-0.723267,0.518561,...,0.288628,-0.423755,-0.300136,0.609321,0.496349,0.626067,-0.590965,-0.321538,-0.793758,0.285966
3,0.625689,0.630979,0.433704,-0.755718,-0.652297,0.660812,0.337131,0.885929,-0.848946,-0.144282,...,0.500111,-0.621848,-0.158864,0.730941,0.888308,0.586708,-0.618545,-0.243724,-0.893374,0.38287
4,0.215665,0.277835,-0.295362,0.356121,-0.731135,0.039645,0.314578,0.487203,-0.461512,0.209076,...,-0.053373,0.211464,0.644585,0.553524,-0.046587,-0.090339,0.002103,0.457281,-0.753947,-0.024807


In [4]:
# subgroup negative weighting
subgroup_bool_train = train_df[IDENTITY_COLUMNS].fillna(0) >= 0.5
toxic_bool_train = train_df['target'].fillna(0) >= 0.5
subgroup_negative_mask = subgroup_bool_train.values.sum(axis=1).astype(bool) & ~toxic_bool_train

In [5]:
class JigsawEvaluator():
    def __init__(self, y_true, y_identity, power=-5, overall_model_weight=0.25):
        self.y = y_true
        self.y_i = y_identity
        self.n_subgroups = self.y_i.shape[1]
        self.power = power
        self.overall_model_weight = overall_model_weight

    @staticmethod
    def _compute_auc(y_true, y_pred):
        try:
            return roc_auc_score(y_true, y_pred)
        except ValueError:
            return np.nan

    def _compute_subgroup_auc(self, i, y_pred):
        mask = self.y_i[:, i] == 1
        return self._compute_auc(self.y[mask], y_pred[mask])

    def _compute_bpsn_auc(self, i, y_pred):
        mask = self.y_i[:, i] + self.y == 1
        return self._compute_auc(self.y[mask], y_pred[mask])

    def _compute_bnsp_auc(self, i, y_pred):
        mask = self.y_i[:, i] + self.y != 1
        return self._compute_auc(self.y[mask], y_pred[mask])

    def compute_bias_metrics_for_model(self, y_pred):
        records = np.zeros((3, self.n_subgroups))
        for i in range(self.n_subgroups):
            records[0, i] = self._compute_subgroup_auc(i, y_pred)
            records[1, i] = self._compute_bpsn_auc(i, y_pred)
            records[2, i] = self._compute_bnsp_auc(i, y_pred)
        return records

    def _calculate_overall_auc(self, y_pred):
        return roc_auc_score(self.y, y_pred)

    def _power_mean(self, array):
        total = sum(np.power(array, self.power))
        return np.power(total / len(array), 1 / self.power)

    def get_final_metric(self, y_pred):
        bias_metrics = self.compute_bias_metrics_for_model(y_pred)
        bias_score = np.average([
            self._power_mean(bias_metrics[0]),
            self._power_mean(bias_metrics[1]),
            self._power_mean(bias_metrics[2])
        ])
        overall_score = self.overall_model_weight * self._calculate_overall_auc(y_pred)
        bias_score = (1 - self.overall_model_weight) * bias_score
        return overall_score + bias_score
    
    # NOTE: add original func
    def get_all_score(self, y_pred):
        bias_metrics = self.compute_bias_metrics_for_model(y_pred)
        power_means = [
            self._power_mean(bias_metrics[0]),
            self._power_mean(bias_metrics[1]),
            self._power_mean(bias_metrics[2])
        ]
        bias_score    = np.average(power_means)
        overall_auc   = self._calculate_overall_auc(y_pred)
        overall_score = self.overall_model_weight * overall_auc
        bias_score    = (1 - self.overall_model_weight) * bias_score
        return {
            'overall_auc': overall_auc,
            'subgroup_auc': power_means[0],
            'bpsn_auc': power_means[1],
            'bnsp_auc': power_means[2],
            'final_metrics': overall_score + bias_score,
        }

In [6]:
def Classifier(
        x_train
        , x_val
        , y_train
        , y_val
        , x_test = []
        , params={}
        , metric = 'auc'
        , early_stopping_rounds=100
        , num_boost_round=10000
        , weight_list=[] ):

    #========================================================================
    # Fitting
    #========================================================================
    if len(weight_list):
        lgb_train = lgb.Dataset(data=x_train, label=y_train, weight=weight_list[0])
        lgb_val = lgb.Dataset(data=x_val, label=y_val, weight=weight_list[1])
    else:
        lgb_train = lgb.Dataset(data=x_train, label=y_train)
        lgb_val = lgb.Dataset(data=x_val, label=y_val)
    #  cat_cols = utils.get_categorical_features(df=x_train)
    cat_cols = []

    estimator = lgb.train(
        params = params
        ,train_set = lgb_train
        ,valid_sets = lgb_val
        ,early_stopping_rounds = early_stopping_rounds
        ,num_boost_round = num_boost_round
        ,categorical_feature = cat_cols
        ,verbose_eval = 100
    )

    #========================================================================
    # Prediction
    #========================================================================
    oof_pred = estimator.predict(x_val)
    if len(x_test):
        test_pred = estimator.predict(x_test)
    else:
        test_pred = []

    score = roc_auc_score(y_val, oof_pred)
    
    return score, oof_pred, test_pred, estimator

In [35]:
# train_meta = read_pkl_gzip('../input/109_toxic_train_meta_feature32.gz')
# test_meta = read_pkl_gzip('../input/109_toxic_test_meta_feature32.gz')
# meta_cols = [f"meta{i}" for i in range(train_meta.shape[1])]
# for i, col in enumerate(meta_cols):
#     train_bert[col] = train_meta[:, i]
#     test_bert[col]  = test_meta[:, i]


In [147]:
pd.Series(weights).value_counts()

 1    1318140
 3     374840
-1     111894
dtype: int64

In [146]:
weights = np.ones((len(train_df), ))
ratio = 2.0
weights += subgroup_negative_mask * ratio

# use_cols = [col for col in train_bert.columns]
use_cols = [col for col in train_bert.columns if col[0]=='c' and len(col)<=4]

train = train_bert

seed = 1208
n_splits = 5
kfold = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed).split(train, Y))

early_stopping_rounds = 30
params = {
    'n_jobs': 64,
    'objective': 'binary',
    'metric':'auc',
    'boosting': 'gbdt',
    'num_leaves': 20,
    'max_depth': -1,
    'learning_rate': 0.1,
    'subsample': 0.9,
    'colsample_bytree': 0.2,
    'lambda_l2': 5.0,
    'random_seed': seed,
    'bagging_seed': seed,
    'feature_fraction_seed': seed,
    'data_random_seed': seed,
}

score_list     = []
test_pred_list = []
final_list     = []
train_pred = np.zeros(len(train))

for fold, (train_idx, valid_idx) in enumerate(kfold):
    
    x_train = train.loc[train_idx][use_cols]
    y_train = Y[train_idx]
    x_valid = train.loc[valid_idx][use_cols]
    y_valid = Y[valid_idx]
    w_train = weights[train_idx]
    w_valid = weights[valid_idx]
    
    print(x_train.shape, x_valid.shape)
    
    score, oof_pred, test_pred, model = Classifier(
        x_train,
        x_valid,
        y_train,
        y_valid,
        test_bert[use_cols],
        params=params,
        early_stopping_rounds=early_stopping_rounds,
        weight_list = [w_train, w_valid]
    )
    score_list.append(score)
    best_iter = model.best_iteration
    test_pred_list.append(test_pred)
    train_pred[valid_idx] = oof_pred
                     
    evaluator = JigsawEvaluator(y_valid, Y_identity[valid_idx, :])
    auc_score = evaluator.get_all_score(oof_pred)
    display(pd.Series(auc_score))
    final_score = auc_score['final_metrics']
    final_list.append(final_score)
    
    sys.exit()
    model.save_model(f"../model/LGB_Fold{fold}_Score{str(final_score)[:6]}_leaves{params['num_leaves']}_lr{params['learning_rate']}", num_iteration=best_iter)
                     
final_score = np.mean(final_list)
print(final_score)

(1443899, 768) (360975, 768)


New categorical_feature is []
  'New categorical_feature is {}'.format(sorted(list(categorical_feature))))


Training until validation scores don't improve for 30 rounds.
Early stopping, best iteration is:
[1]	valid_0's auc: 1


overall_auc      0.5
subgroup_auc     0.5
bpsn_auc         0.5
bnsp_auc         0.5
final_metrics    0.5
dtype: float64

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [40]:
test_id_list = test_df['id'].values
prediction = np.mean(test_pred_list, axis=0)
submission = pd.DataFrame.from_dict({
    'id': test_id_list,
    'prediction': prediction
})
submission.to_csv(f"../output/submit_LGB_CV{str(final_score)[:7]}_leaves{params['num_leaves']}_lr{params['learning_rate']}_weight{ratio}.csv", index=False)

In [39]:
# train_ratio1 = train_pred.copy()
# test_ratio1 = prediction.copy()
train_ratio2 = train_pred.copy()
test_ratio2 = prediction.copy()

In [118]:
# train_bert[df_oof.columns] = df_oof.reset_index(drop=True).loc[:num_train]
# train_bert.head()
test_bert[df_oof.columns] = df_oof.reset_index(drop=True).loc[num_train:]

In [121]:
# Hack
import glob

oof_list = glob.glob('../feature/*OOF*')

df_oof = pd.DataFrame()
for path in oof_list:
    if path.count('insult'):
        continue
    oof = pd.read_csv(path).set_index('id')
    if len(df_oof)==0:
        df_oof = oof.copy()
        continue
    df_oof = df_oof.join(oof)
df_oof.head()

Unnamed: 0_level_0,muslim,male,female,white,christian,black,jewish
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
59856,0.0,0.0,0.0,0.0,0.0,0.0,0.0
239579,0.0,0.0,0.0,0.0,0.0,0.0,0.0
239592,5.6e-05,0.0,0.005743,9e-05,1.5e-05,0.000114,7e-06
239593,0.0,0.000271,0.0,0.0,0.0,0.0,0.0
239607,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
for col in df_oof.columns:
    tmp = df_oof[col].round(3).value_counts()
    display(tmp)

0.000    1863361
0.001      18877
0.002       4048
0.003       1858
0.004       1007
0.005        634
0.006        428
0.008        355
0.007        280
0.009        253
0.999        249
0.995        245
0.998        236
0.997        224
0.993        221
0.996        212
0.994        208
0.991        203
0.992        195
0.010        174
0.990        170
0.011        146
0.012        141
0.988        139
0.989        138
0.987        134
0.986        127
0.983        126
0.013        110
0.984        105
          ...   
0.313          1
0.896          1
0.647          1
0.806          1
0.632          1
0.132          1
0.812          1
0.481          1
0.811          1
0.759          1
0.502          1
0.775          1
0.523          1
0.740          1
0.581          1
0.747          1
0.850          1
0.559          1
0.424          1
0.712          1
0.622          1
0.472          1
0.795          1
0.754          1
0.509          1
0.470          1
0.698          1
0.364         

0.000    1594405
0.001     159259
0.002      24335
0.003      10079
0.004       5780
0.005       3986
0.006       2968
0.007       2335
0.008       1967
0.009       1776
0.010       1598
0.011       1438
0.012       1398
0.013       1242
0.014       1174
0.015       1118
0.016       1085
0.017       1025
0.018        955
0.019        883
0.020        866
0.022        825
0.021        794
0.024        753
0.023        746
0.025        702
0.026        696
0.028        649
0.030        621
0.029        599
          ...   
0.405         10
0.322         10
0.440          9
0.469          9
0.304          9
0.319          9
0.383          9
0.283          9
0.363          9
0.342          9
0.438          9
0.375          9
0.471          9
0.276          9
0.308          9
0.335          8
0.391          8
0.433          8
0.279          8
0.357          8
0.344          8
0.262          8
0.413          7
0.431          7
0.360          7
0.456          6
0.386          6
0.371         

0.000    1698217
0.001     101253
0.002      20532
0.003      10038
0.004       5970
0.005       4302
0.006       3255
0.007       2594
0.008       2017
0.998       1899
0.009       1842
0.999       1710
0.997       1571
0.010       1540
0.996       1342
0.011       1236
0.012       1201
0.995       1156
0.013       1128
0.994        957
0.015        954
0.014        941
0.017        876
0.016        838
0.993        754
0.018        746
0.019        709
0.992        644
0.020        632
0.021        573
          ...   
0.803          2
0.796          2
0.742          1
0.760          1
0.857          1
0.716          1
0.809          1
0.719          1
0.675          1
0.634          1
0.679          1
0.690          1
0.672          1
0.737          1
0.785          1
0.725          1
0.620          1
0.595          1
0.812          1
0.691          1
0.594          1
0.734          1
0.854          1
0.700          1
0.600          1
0.775          1
0.681          1
0.819         

0.000    1859726
0.001      15426
0.002       2154
0.003        750
0.999        471
0.004        423
0.998        397
0.997        317
0.996        282
0.005        279
1.000        267
0.994        262
0.992        257
0.991        251
0.995        236
0.989        232
0.988        230
0.993        228
0.990        227
0.006        222
0.987        214
0.986        209
0.983        203
0.985        202
0.984        191
0.979        171
0.981        171
0.980        161
0.982        161
0.978        149
          ...   
0.764          2
0.516          2
0.432          2
0.722          2
0.638          2
0.363          2
0.414          2
0.393          2
0.387          2
0.439          2
0.400          2
0.500          2
0.404          2
0.534          2
0.707          2
0.220          2
0.662          2
0.351          2
0.444          2
0.358          1
0.630          1
0.700          1
0.329          1
0.616          1
0.507          1
0.735          1
0.631          1
0.475         

0.000    1838904
0.001      12005
0.002       4136
0.003       2597
0.004       1721
0.005       1459
0.006       1157
0.007        956
0.008        904
0.998        829
0.997        714
0.009        714
0.010        681
0.011        584
0.996        540
0.012        522
0.013        511
0.995        484
0.014        484
0.015        474
0.999        427
0.017        391
0.016        389
0.994        357
0.993        355
0.018        341
0.020        320
0.019        319
0.021        317
0.022        309
          ...   
0.485          4
0.511          4
0.464          4
0.629          4
0.554          4
0.527          4
0.699          4
0.698          4
0.596          4
0.774          4
0.502          4
0.631          4
0.616          3
0.662          3
0.661          3
0.696          3
0.580          3
0.657          3
0.749          3
1.000          3
0.810          3
0.533          3
0.571          3
0.878          3
0.759          3
0.469          2
0.547          2
0.603         

0.000    1869014
0.001      12761
0.002       2586
0.003       1259
0.004        714
0.005        469
0.006        386
0.007        277
0.008        230
0.009        194
0.010        168
0.011        146
0.988        140
0.990        139
0.997        138
0.999        137
0.993        135
0.986        133
0.992        131
0.991        130
0.994        129
0.995        129
0.998        121
0.996        121
0.982        118
0.983        116
0.987        115
0.989        114
0.981        110
0.980        109
          ...   
0.125          3
0.616          3
0.632          2
0.657          2
0.681          2
0.138          2
0.097          2
0.168          2
0.551          2
0.777          2
0.692          2
0.604          2
0.099          2
0.639          2
0.457          2
0.594          2
0.157          2
0.574          2
0.474          2
0.545          2
0.117          1
0.705          1
0.568          1
0.699          1
0.648          1
0.787          1
0.691          1
0.761         

0.000    1893183
0.001       3132
0.002       1035
0.003        558
0.004        308
0.005        202
0.992        147
0.991        144
0.006        143
0.990        138
0.007        134
0.989        127
0.988        119
0.994        102
0.008         96
0.993         92
0.009         81
0.987         76
0.986         75
0.983         72
0.995         72
0.012         66
0.011         63
0.985         63
0.996         57
0.010         57
0.984         51
0.981         49
0.982         46
0.014         45
          ...   
0.477          1
0.662          1
0.161          1
0.088          1
0.641          1
0.600          1
0.286          1
0.946          1
0.072          1
0.833          1
0.154          1
0.792          1
0.189          1
0.619          1
0.843          1
0.827          1
0.069          1
0.102          1
0.163          1
0.466          1
0.839          1
0.732          1
0.073          1
0.113          1
0.409          1
0.901          1
0.742          1
0.355         

0.000    1642495
0.001      62368
0.002      21373
0.003      12406
0.004       8361
0.005       6550
0.006       5259
0.007       4429
0.008       3861
0.009       3391
0.010       2923
0.011       2778
0.012       2504
0.013       2371
0.014       2122
0.015       2006
0.016       1832
0.017       1758
0.018       1630
0.019       1549
0.020       1434
0.021       1366
0.022       1231
0.023       1221
0.025       1159
0.024       1152
0.026       1106
0.027       1036
0.998        972
0.029        964
          ...   
0.900         35
0.660         34
0.891         34
0.655         34
0.712         34
0.685         34
0.566         34
0.589         33
0.916         33
0.715         33
0.722         33
0.681         33
0.910         33
0.637         32
0.763         32
0.674         32
0.765         32
0.853         31
0.859         31
0.724         31
0.530         31
0.914         31
0.675         31
0.767         30
0.817         30
0.774         28
0.907         28
0.603         

In [101]:
num_train = len(train_df)
identity = (df_oof!=0).sum(axis=1)
weight = (identity>0) * 1 + 1
train_norm_weight = (weight*-1.0 + 1.0)[:num_train]
train_iden_weight = (weight* 1.0 + 0.0)[:num_train]
# identity.value_counts()
# train_iden = identity[:num_train]
# train_iden_idx = (train_iden==0).values
# test_iden = identity[num_train:]
# test_iden_idx = (test_iden==0).values

In [43]:
evaluator = JigsawEvaluator(Y, Y_identity)
auc_score = evaluator.get_all_score(train_ratio1)
display(pd.Series(auc_score))
auc_score = evaluator.get_all_score(train_ratio2)
display(pd.Series(auc_score))

overall_auc      0.982006
subgroup_auc     0.935437
bpsn_auc         0.956827
bnsp_auc         0.973649
final_metrics    0.961980
dtype: float64

overall_auc      0.981452
subgroup_auc     0.935289
bpsn_auc         0.963924
bnsp_auc         0.968580
final_metrics    0.962311
dtype: float64

In [114]:
blend = (train_ratio1*0.3 + train_ratio2*0.7)
auc_score = evaluator.get_all_score(blend)
display(pd.Series(auc_score))

overall_auc      0.981703
subgroup_auc     0.935559
bpsn_auc         0.961939
bnsp_auc         0.970369
final_metrics    0.962392
dtype: float64

In [110]:
weight_list = np.arange(0.00002, 0.001, 0.00005) 

for w in weight_list:
    identity = (df_oof>=w).sum(axis=1)
    weight = (identity>0) * 1
    train_norm_weight = (weight*-0.4 + 0.7)[:num_train]
    train_iden_weight = (weight* 0.4 + 0.3)[:num_train]
    
    blend = pd.Series(train_ratio2).rank(method='min').values * train_norm_weight + pd.Series(train_ratio1).rank(method='min').values * train_iden_weight
    auc_score = evaluator.get_all_score(blend)
    display(pd.Series(auc_score))

overall_auc      0.981902
subgroup_auc     0.935612
bpsn_auc         0.959399
bnsp_auc         0.972184
final_metrics    0.962274
dtype: float64

overall_auc      0.981891
subgroup_auc     0.935590
bpsn_auc         0.959533
bnsp_auc         0.972085
final_metrics    0.962275
dtype: float64

KeyboardInterrupt: 

In [None]:
def get_tree_importance(estimator, use_cols):
    feim = estimator.feature_importance(importance_type='gain')
    feim = pd.DataFrame([np.array(use_cols), feim]).T
    feim.columns = ['feature', 'importance']
    feim['importance'] = feim['importance'].astype('float32')
    return feim

# feim = get_tree_importance(model, use_cols)
pd.set_option('max_rows', 500)
feim.sort_values(by='importance', ascending=False).reset_index(drop=True)