In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

pd.options.display.max_rows = 999
pd.options.display.max_columns = 500
pd.options.display.max_colwidth = 200

import gc

In [2]:
%%time
# load data dict to know column description
# ** CASA - Current + Savings Account
# *** AMB - Average Monthly Balances
data_dict = pd.read_excel('data/data_dict.xlsx')
data_dict.set_index('VARIABLE', inplace=True)

# load test data
df_test = pd.read_csv('data/test.csv', engine='c', sep=',')
print('data shape: {}'.format(df_test.shape))

# load train data
df_train = pd.read_csv('data/train.csv', engine='c', sep=',')
print('data shape: {}'.format(df_train.shape))

# check sample
df_train.head()



data shape: (200000, 332)




data shape: (300000, 333)
Wall time: 10.5 s


In [3]:
# define target, recode to 0/1
df_train.RESPONDERS = df_train.RESPONDERS.map({'N':0, 'Y':1}).astype(np.float32)
# check for balance
df_train.RESPONDERS.value_counts(normalize=True)
# oops, highly unbalanced dataset
proportion = (df_train.RESPONDERS[df_train.RESPONDERS == 0].count() / 
              df_train.RESPONDERS[df_train.RESPONDERS == 1].count()
             )

In [4]:
# Move customer id to index
df_train.set_index('CUSTOMER_ID', inplace=True)
df_test.set_index('CUSTOMER_ID', inplace=True)
# X columns
X_cols = df_train.drop('RESPONDERS', axis=1).columns
# check numerical
num_cols = df_train.select_dtypes(include=[np.number]).columns
print('numerical columns cnt: {}'.format(df_train[num_cols].shape[1]))
# check categorical/ordinal etc.
cat_cols = df_train.select_dtypes(exclude=[np.number]).columns
print('non-numerical columns cnt: {}'.format(df_train[cat_cols].shape[1]))

numerical columns cnt: 240
non-numerical columns cnt: 92


In [5]:
# check for null columns (share, %)
null_stats = (df_train.isnull().sum() / df_train.shape[0] * 100).sort_values(ascending=False)
null_cols = set(null_stats[null_stats == 100].index.tolist())
print(null_cols)
# check their description
data_dict.loc[null_cols, :]

{'PM_FD_MON_02', 'MER_EMI_CLOSED_MON_01', 'PM_FD_MON_04', 'EEG_CLOSED', 'STMT_CON_DAE_ACTIVE_MON_01', 'EEG_TAG', 'STMT_CON_DAE_CLOSED_MON_01'}


Unnamed: 0_level_0,DESCRIPTIONS
VARIABLE,Unnamed: 1_level_1
PM_FD_MON_02,WHETHER CLOSED FIXED DEPOSIT BEFORE MATURITY IN MON_03
MER_EMI_CLOSED_MON_01,THIS LOAN TYPE IS OFFERED TO CREDIT CARD HOLDERS AND LOAN IS CLOSED AS ON MON1
PM_FD_MON_04,WHETHER CLOSED FIXED DEPOSIT BEFORE MATURITY IN MON_04
EEG_CLOSED,CLOSED EMERGING ENTERPRISE GROUP HOLDING WITH BANK IN PAST ONE YEAR
STMT_CON_DAE_ACTIVE_MON_01,THIS LOAN TYPE IS OFFERED TO CREDIT CARD HOLDERS AND LOAN IS ACTIVE AS ON MON1
EEG_TAG,LIVE EMERGING ENTERPRISE GROUP HOLDING WITH BANK
STMT_CON_DAE_CLOSED_MON_01,THIS LOAN TYPE IS OFFERED TO CREDIT CARD HOLDERS AND LOAN IS CLOSED AS ON MON1


In [6]:
# check for constant columns
# numerical
stats_num = df_train[num_cols].fillna(-999.99).apply(pd.Series.nunique, axis=0)
const_num = stats_num[stats_num == 1].index.tolist()
# categorical
stats_cat = df_train[cat_cols].fillna('is_nan').apply(pd.Series.nunique, axis=0)
const_cat = stats_cat[stats_cat == 1].index.tolist()
# join both
const_cols = set(const_num + const_cat)
# check their description
print('constant columns found: {}, numerical: {}, categorical: {}'.format(
    len(const_cols), len(const_num), len(const_cat)))
data_dict.loc[const_cols, :]

constant columns found: 8, numerical: 7, categorical: 1


Unnamed: 0_level_0,DESCRIPTIONS
VARIABLE,Unnamed: 1_level_1
PM_FD_MON_02,WHETHER CLOSED FIXED DEPOSIT BEFORE MATURITY IN MON_03
MER_EMI_CLOSED_MON_01,THIS LOAN TYPE IS OFFERED TO CREDIT CARD HOLDERS AND LOAN IS CLOSED AS ON MON1
PM_FD_MON_04,WHETHER CLOSED FIXED DEPOSIT BEFORE MATURITY IN MON_04
EEG_CLOSED,CLOSED EMERGING ENTERPRISE GROUP HOLDING WITH BANK IN PAST ONE YEAR
STMT_CON_DAE_ACTIVE_MON_01,THIS LOAN TYPE IS OFFERED TO CREDIT CARD HOLDERS AND LOAN IS ACTIVE AS ON MON1
OCCUP_ALL_NEW,OCCUPATION IS SALARIED FOR ALL THE CUSTOMERS IN THE BASE
EEG_TAG,LIVE EMERGING ENTERPRISE GROUP HOLDING WITH BANK
STMT_CON_DAE_CLOSED_MON_01,THIS LOAN TYPE IS OFFERED TO CREDIT CARD HOLDERS AND LOAN IS CLOSED AS ON MON1


In [7]:
# drop useless columns
print(df_train.shape[1] - 1, df_test.shape[1])
# add zipcode, as it has too many values (35k+)
cols_to_drop = null_cols.union(const_cols).union(set(['ZIP_CODE_FINAL']))
df_train.drop(cols_to_drop, axis=1, errors='ignore', inplace=True)
df_test.drop(cols_to_drop, axis=1, errors='ignore', inplace=True)
print(df_train.shape[1] - 1, df_test.shape[1])

# update categorical/numerical column list
cat_cols = [c for c in cat_cols if c not in cols_to_drop]
num_cols = [c for c in num_cols if c not in cols_to_drop]

331 331
322 322


In [8]:
# ML supportives
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, roc_auc_score, f1_score
from sklearn.preprocessing import FunctionTransformer, LabelEncoder, OneHotEncoder
from sklearn.pipeline import FeatureUnion, Pipeline

# ML models
import lightgbm as lgb

In [9]:
%%time
# encode categorical data
from collections import defaultdict
d = defaultdict(LabelEncoder)

# fit and encode train/test
a = pd.concat([df_train[cat_cols], df_test[cat_cols]], axis=0).fillna('').apply(
    lambda x: d[x.name].fit(x))
# transform encodings to train
df_train[cat_cols] = df_train[cat_cols].fillna('').apply(lambda x: d[x.name].transform(x))
# transform encodings to test
df_test[cat_cols] = df_test[cat_cols].fillna('').apply(lambda x: d[x.name].transform(x))

Wall time: 49.8 s


In [10]:
# split on train/val
X_train, X_val, y_train, y_val = train_test_split(
    df_train.drop('RESPONDERS', axis=1).values.astype(np.float32),
    df_train.RESPONDERS.values.astype(np.float32),
    stratify=df_train.RESPONDERS,
    random_state=42,
    test_size=0.3
)
# check shapes
X_train.shape, y_train.shape, X_val.shape, y_val.shape

((210000, 322), (210000,), (90000, 322), (90000,))

In [19]:
%%time
# prepare model
model = lgb.LGBMClassifier(
    num_leaves=512, 
    max_depth=12, 
    learning_rate=0.02, 
    subsample=0.8,
    n_estimators=500,
    reg_lambda=10, 
    colsample_bytree=0.9,
    **{'device': 'gpu',
    'gpu_use_dp': False}
)

# prepare weights
w_train = pd.Series(y_train).map({0:1/proportion, 1: 1.0}).values
w_val = pd.Series(y_val).map({0:1/proportion, 1: 1.0}).values

# fit model
model.fit(X_train, y_train, 
          sample_weight=w_train, 
          eval_set=[(X_val, y_val)],
          eval_sample_weight=[w_val],
          early_stopping_rounds=5,
          verbose=50,
         )

Training until validation scores don't improve for 5 rounds.
[50]	valid_0's binary_logloss: 0.496758
[100]	valid_0's binary_logloss: 0.438774
[150]	valid_0's binary_logloss: 0.414902
[200]	valid_0's binary_logloss: 0.40319
[250]	valid_0's binary_logloss: 0.396488
[300]	valid_0's binary_logloss: 0.393453
Early stopping, best iteration is:
[325]	valid_0's binary_logloss: 0.392646
Wall time: 15.6 s


In [31]:
# check persofmance
print(classification_report(y_val, model.predict(X_val)))

             precision    recall  f1-score   support

        0.0       1.00      0.85      0.92     88616
        1.0       0.08      0.80      0.14      1384

avg / total       0.98      0.85      0.91     90000



In [33]:
# prepare and save submission
df_test['RESPONDERS'] = model.predict_proba(df_test.values)[:, 1]
df_test.reset_index()[['CUSTOMER_ID', 'RESPONDERS']].to_csv(
    'submissions/baseline.csv', encoding='utf-8', index=False)