In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import warnings
warnings.filterwarnings("ignore")

In [2]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
#data path.
PATH = '/kaggle/input/playground-series-s3e4/'
os.listdir(PATH)

/kaggle/input/playground-series-s3e4/sample_submission.csv
/kaggle/input/playground-series-s3e4/train.csv
/kaggle/input/playground-series-s3e4/test.csv


['sample_submission.csv', 'train.csv', 'test.csv']

In [3]:
train = pd.read_csv(PATH + 'train.csv')
test = pd.read_csv(PATH + 'test.csv')

In [4]:
print(train.columns)

Index(['id', 'Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9',
       'V10', 'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19',
       'V20', 'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount',
       'Class'],
      dtype='object')


In [5]:
class Data(object):
    def __init__(self, TRAIN, TEST, ID, TARGET, PREPROCESSING_PARAMS={}):
        if 'drop' in PREPROCESSING_PARAMS:
            TRAIN.drop(columns=PREPROCESSING_PARAMS['drop'], inplace=True)
            TEST.drop(columns=PREPROCESSING_PARAMS['drop'], inplace=True)
            print("Columns " + ' '.join(map(str, PREPROCESSING_PARAMS['drop'])))

        if 'fillna' in PREPROCESSING_PARAMS:
           col = [c for c in TRAIN.columns if c not in [ID, TARGET]]
           imp = PREPROCESSING_PARAMS['fillna']
           for c in col:
               if PREPROCESSING_PARAMS['fillna'] == 'mean':
                   imp = np.mean(TRAIN[c])
               if PREPROCESSING_PARAMS['fillna'] == 'median':
                   imp = np.median(TRAIN[c])

               TRAIN[c].fillna(imp, inplace=True)
               TEST[c].fillna(imp, inplace=True)
           print("fillna complete")

        self.TRAIN = TRAIN
        self.TEST = TEST
        self.ID = ID
        self.TARGET = TARGET

class Model(object):
    def __init__(self, DATA, MODEL='ETR', PARAMS={}, TEST_SIZE = 0.25, RANDOM_STATE=5):
        from sklearn import  model_selection
        col = [c for c in DATA.TRAIN.columns if c not in [DATA.ID,DATA.TARGET]]
        X1, X2, Y1, Y2 =  model_selection.train_test_split(DATA.TRAIN[col], DATA.TRAIN[DATA.TARGET], test_size=TEST_SIZE, random_state=RANDOM_STATE)
        if MODEL in ['ETR']:
            from sklearn import ensemble
            LIB = ensemble.ExtraTreesRegressor(n_jobs=-1, random_state = RANDOM_STATE)
            PARAMS_ = LIB.get_params()
            for p in PARAMS:
                if p in PARAMS_:
                    LIB.set_params({p: PARAMS[p]})
            LIB.fit(DATA.TRAIN[col], DATA.TRAIN[DATA.TARGET])
            DATA.TEST[DATA.TARGET] = LIB.predict(DATA.TEST[col])
            self.PRED = DATA.TEST[[DATA.ID, DATA.TARGET]]
        elif MODEL in ['XGB']:
            import xgboost as xgb
            default_params = {'num_round': 20, 'verbose_eval': 10, 'early_stopping_rounds': 20}

            if PARAMS == {}:
                PARAMS = {'eta': 0.2, 'max_depth': 4, 'objective': 'reg:linear', 'eval_metric': 'rmse', 'seed': RANDOM_STATE, 'silent': True, 'num_round': 20, 'verbose_eval': 10, 'early_stopping_rounds': 20}
            if 'num_round' in PARAMS:
                default_params['num_round'] = PARAMS['num_round'] 
            if 'verbose_eval' in PARAMS:
                default_params['verbose_eval'] = PARAMS['verbose_eval'] 
            if 'early_stopping_rounds' in PARAMS:
                default_params['early_stopping_rounds'] = PARAMS['early_stopping_rounds'] 

            def xgb_rmse(preds, y):
                y = y.get_label()
                score = np.sqrt(metrics.mean_squared_error(y, preds))
                return 'RMSE', score

            watchlist = [(xgb.DMatrix(X1, Y1), 'train'), (xgb.DMatrix(X2, Y2), 'valid')] 
            LIB = xgb.train(PARAMS, xgb.DMatrix(X1, Y1), default_params['num_round'],  watchlist, verbose_eval=default_params['verbose_eval'], early_stopping_rounds=default_params['early_stopping_rounds']) #feval=xgb_rmse, maximize=False
            DATA.TEST[DATA.TARGET] = LIB.predict(xgb.DMatrix(DATA.TEST[col]), ntree_limit=LIB.best_ntree_limit)
            self.PRED = DATA.TEST[[DATA.ID, DATA.TARGET]].copy()

        elif MODEL in ['LGB']:
            import lightgbm as lgb
            default_params = {'verbose_eval': 10}

            if PARAMS == {}:
                PARAMS = {'learning_rate': 0.2, 'max_depth': 7, 'boosting': 'gbdt', 'objective': 'regression', 'metric':'rmse', 'seed': RANDOM_STATE, 'num_iterations': 100, 'early_stopping_round': 20}
            if 'verbose_eval' in PARAMS:
                default_params['verbose_eval'] = PARAMS['verbose_eval']

            def lgb_rmse(preds, y):
                y = np.array(list(y.get_label()))
                score = np.sqrt(metrics.mean_squared_error(y, preds))
                return 'RMSE', score, False
            def F1score(preds, eval_data):
                preds = 1. / (1. + np.exp(-preds))
                preds = [1 if item > 0.5 else 0 for item in preds]
                eval_data = np.array(list(eval_data.get_label()))
                report = classification_report(preds, eval_data, output_dict=True)
                score = 1-report['macro avg']['f1-score'] #report['1']['f1-score']
                return '1-F1score', score, False

            LIB = lgb.train(PARAMS, lgb.Dataset(X1, label=Y1), valid_sets=lgb.Dataset(X2, label=Y2), verbose_eval=default_params['verbose_eval'], feval=F1score) #
            DATA.TEST[DATA.TARGET] = LIB.predict(DATA.TEST[col], num_iteration=LIB.best_iteration)
            self.PRED = DATA.TEST[[DATA.ID, DATA.TARGET]].copy()

        elif MODEL in ['CB']:
            from catboost import CatBoostRegressor
            default_params = {'iterations': 100, 'learning_rate': 0.2, 'depth': 7, 'loss_function': 'RMSE', 'eval_metric':'RMSE', 'od_type': 'Iter', 'od_wait': 20, 'verbose':False}

            if PARAMS == {}:
                PARAMS = {'iterations': 100, 'learning_rate': 0.2, 'depth': 7, 'loss_function': 'RMSE', 'eval_metric':'RMSE', 'od_type': 'Iter', 'od_wait': 20, 'verbose':False}
            for p in default_params:
                if p not in PARAMS:
                    PARAMS[p] = default_params[p]

            LIB = CatBoostRegressor(iterations=PARAMS['iterations'], learning_rate=PARAMS['learning_rate'], depth=PARAMS['depth'], loss_function=PARAMS['loss_function'], eval_metric=PARAMS['eval_metric'], random_seed=RANDOM_STATE, od_type=PARAMS['od_type'], od_wait=PARAMS['od_wait'])
            LIB.fit(X1, Y1, eval_set=(X2, Y2), use_best_model=True, verbose=PARAMS['verbose'])
            DATA.TEST[DATA.TARGET] = LIB.predict(DATA.TEST[col])
            self.PRED = DATA.TEST[[DATA.ID, DATA.TARGET]].copy()

        else:
            DATA.TEST[DATA.TARGET] = np.median(DATA.TRAIN[DATA.TARGET])
            self.PRED = DATA.TEST[[DATA.ID, DATA.TARGET]].copy()

In [6]:
data = Data(train, test, 'id', 'Class')
params = {'learning_rate': 0.003, 'max_depth': 10, 'boosting': 'gbdt', \
          'objective': 'binary', 'metric':'auc', 'seed': 4, \
          'num_iterations': 10000, 'early_stopping_round': 100, \
          'verbose_eval': 200, 'num_leaves': 40, \
          'feature_fraction': 0.8, \
          'bagging_fraction': 0.8, \
          'bagging_freq': 5}
preds = Model(data, 'LGB', params, 0.2, 4).PRED['Class'].clip(0,1)
preds = 1. / (1. + np.exp(-preds))
preds = [1 if item > 0.5 else 0 for item in preds]
test['Class'] = preds

[LightGBM] [Info] Number of positive: 371, number of negative: 174932
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 7650
[LightGBM] [Info] Number of data points in the train set: 175303, number of used features: 30
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.002116 -> initscore=-6.155951
[LightGBM] [Info] Start training from score -6.155951
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[1]	valid_0's auc: 0.606201	valid_0's 1-F1score: 0.997769


In [7]:
print(test['Class'])

0         1
1         1
2         1
3         1
4         1
         ..
146082    1
146083    1
146084    1
146085    1
146086    1
Name: Class, Length: 146087, dtype: int64


In [8]:
# use the sklearn api classification model
col = [c for c in train.columns if c not in ['id', 'Class']]
X1, X2, Y1, Y2 =  train_test_split(train[col], \
                            train['Class'], test_size=0.3, \
                            random_state=5)


LIB = lgb.LGBMClassifier(boosting_type='gbdt', \
                         num_leaves=40, \
                         max_depth=8,\
                         learning_rate=0.03, n_estimators=100, \
                         subsample_for_bin=20000, objective=None, \
                         class_weight=None, min_split_gain=0.0, \
                         min_child_weight=0.001, min_child_samples=20, \
                         subsample=1.0, subsample_freq=0, \
                         colsample_bytree=1.0,\
                         reg_alpha=0.0, reg_lambda=0.0, \
                         random_state=None, \
                         n_jobs=None, importance_type='split')

LIB.fit(X1, Y1, eval_set=(X2, Y2))
test['Class'] = LIB.predict(test[col], type='class')

[1]	valid_0's binary_logloss: 0.014794
[2]	valid_0's binary_logloss: 0.0147098
[3]	valid_0's binary_logloss: 0.0146414
[4]	valid_0's binary_logloss: 0.0145295
[5]	valid_0's binary_logloss: 0.0144656
[6]	valid_0's binary_logloss: 0.014406
[7]	valid_0's binary_logloss: 0.014371
[8]	valid_0's binary_logloss: 0.01434
[9]	valid_0's binary_logloss: 0.0143033
[10]	valid_0's binary_logloss: 0.0142744
[11]	valid_0's binary_logloss: 0.0142379
[12]	valid_0's binary_logloss: 0.0142257
[13]	valid_0's binary_logloss: 0.0141903
[14]	valid_0's binary_logloss: 0.0141685
[15]	valid_0's binary_logloss: 0.0141568
[16]	valid_0's binary_logloss: 0.0141259
[17]	valid_0's binary_logloss: 0.0141264
[18]	valid_0's binary_logloss: 0.0141238
[19]	valid_0's binary_logloss: 0.014085
[20]	valid_0's binary_logloss: 0.0140655
[21]	valid_0's binary_logloss: 0.0140564
[22]	valid_0's binary_logloss: 0.0139949
[23]	valid_0's binary_logloss: 0.0139847
[24]	valid_0's binary_logloss: 0.0139795
[25]	valid_0's binary_logloss: 

In [9]:
y_pred = LIB.predict(X2)
y_test = Y2

print(classification_report(y_test, y_pred))
#cm = confusion_matrix(y_test, y_pred)
#print('Confusion matrix\n\n', cm)
#print('\nTrue Positives(TP) = ', cm[0,0])
#print('\nTrue Negatives(TN) = ', cm[1,1])
#print('\nFalse Positives(FP) = ', cm[0,1])
#print('\nFalse Negatives(FN) = ', cm[1,0])



              precision    recall  f1-score   support

           0       1.00      1.00      1.00     65596
           1       0.21      0.03      0.06       143

    accuracy                           1.00     65739
   macro avg       0.60      0.52      0.53     65739
weighted avg       1.00      1.00      1.00     65739



In [10]:
report = classification_report(y_test, y_pred, output_dict=True)
print(report['macro avg']['f1-score'])
print(report['1']['f1-score'])

0.5293423023651351
0.059880239520958084
