In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

In [3]:
import warnings, gc
# Suppress every warning for the rest of the session. This keeps cell output
# short but also hides deprecation/convergence warnings from pandas, sklearn
# and lightgbm, so remove it temporarily when debugging unexpected results.
warnings.filterwarnings("ignore")

In [4]:
from lightgbm import plot_importance
from lightgbm import LGBMClassifier

In [5]:
%%time
train = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/train.parquet")
label = pd.read_csv("../input/amex-default-prediction/train_labels.csv")
train = train.merge(label,how='inner',on="customer_ID")

CPU times: user 2min 22s, sys: 36.1 s, total: 2min 58s
Wall time: 3min


In [6]:
# Replace the customer_ID values with compact integer codes.
# `lab` is kept as a notebook global because later cells call it again.
lab = LabelEncoder()
train_ids_encoded = lab.fit_transform(train['customer_ID'])
train['customer_ID'] = train_ids_encoded


In [7]:
%%time
train = train.groupby(['customer_ID']).tail(1).set_index('customer_ID')

CPU times: user 1.9 s, sys: 2.39 s, total: 4.29 s
Wall time: 4.3 s


In [8]:
%%time
# Load the test-set features from the same parquet dataset used for training.
test = pd.read_parquet("../input/amex-data-integer-dtypes-parquet-format/test.parquet")

CPU times: user 20.6 s, sys: 17 s, total: 37.6 s
Wall time: 42.3 s


In [9]:
# Re-fit the shared encoder on the test customers. From here on `lab` holds the
# test-set mapping (the train mapping is overwritten).
test_ids_encoded = lab.fit_transform(test['customer_ID'])
test['customer_ID'] = test_ids_encoded

# Keep the final row of each test customer and index by the encoded id.
test = (
    test
    .drop_duplicates(subset='customer_ID', keep='last')
    .set_index('customer_ID')
)

In [10]:
# Target vector, indexed by the encoded customer_ID.
y = train['target']

# Feature frames: drop the label and the S_2 column, then replace every missing
# value with the sentinel -1.
X = train.drop(columns=['target', 'S_2']).fillna(-1)
test = test.drop(columns=['S_2']).fillna(-1)

In [11]:
# Columns treated as categorical in this notebook.
cat_cols = [
    'B_30', 'B_38', 'D_63', 'D_64', 'D_66',
    'D_68', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
]

# Everything else in X is treated as numeric; X's column order is preserved.
_categorical = set(cat_cols)
num_cols = [name for name in X.columns if name not in _categorical]

all_cols = [cat_cols, num_cols]

In [12]:
def _with_prefix(columns, prefix):
    """Return the entries of `columns` whose name starts with `prefix`, in order."""
    return [name for name in columns if name.startswith(prefix)]


# Numeric columns grouped by the leading letter of the column name.
D_n_cols, S_n_cols, P_n_cols, B_n_cols, R_n_cols = (
    _with_prefix(num_cols, letter) for letter in ("D", "S", "P", "B", "R")
)

# Categorical columns only occur with the D and B prefixes.
D_c_cols, B_c_cols = (
    _with_prefix(cat_cols, letter) for letter in ("D", "B")
)

In [13]:
%%time 
X_num_agg_D = X.groupby("customer_ID")[D_n_cols].agg(['mean','min', 'last'])
X_num_agg_D.columns = ['_'.join(x) for x in X_num_agg_D.columns]

X_num_agg_S = X.groupby("customer_ID")[S_n_cols].agg(['mean','min', 'last'])
X_num_agg_S.columns = ['_'.join(x) for x in X_num_agg_S.columns]

X_num_agg_P = X.groupby("customer_ID")[P_n_cols].agg(['mean','min','max' ,'last'])
X_num_agg_P.columns = ['_'.join(x) for x in X_num_agg_P.columns]

X_num_agg_B = X.groupby("customer_ID")[B_n_cols].agg(['mean','min', 'last'])
X_num_agg_B.columns = ['_'.join(x) for x in X_num_agg_B.columns]

X_num_agg_R = X.groupby("customer_ID")[R_n_cols].agg(['mean','min','last'])
X_num_agg_R.columns = ['_'.join(x) for x in X_num_agg_R.columns]

X_cat_agg_D = X.groupby("customer_ID")[D_c_cols].agg([ 'count','last','first','nunique'])
X_cat_agg_D.columns = ['_'.join(x) for x in X_cat_agg_D.columns]

X_cat_agg_B = X.groupby("customer_ID")[B_c_cols].agg([ 'count','last','nunique'])
X_cat_agg_B.columns = ['_'.join(x) for x in X_cat_agg_B.columns]

X = pd.concat([X_num_agg_D, X_num_agg_S,X_num_agg_P,X_num_agg_B,X_num_agg_R,X_cat_agg_D,X_cat_agg_B], axis=1)
del X_num_agg_D, X_num_agg_S,X_num_agg_P,X_num_agg_B,X_num_agg_R,X_cat_agg_D,X_cat_agg_B
_ = gc.collect()

print('X shape after engineering', X.shape)

X shape after engineering (458913, 576)
CPU times: user 9.04 s, sys: 1.03 s, total: 10.1 s
Wall time: 10.1 s


In [14]:
%%time 
test_num_agg_D = test.groupby("customer_ID")[D_n_cols].agg(['mean','min', 'last'])
test_num_agg_D.columns = ['_'.join(x) for x in test_num_agg_D.columns]

test_num_agg_S = test.groupby("customer_ID")[S_n_cols].agg(['mean','min', 'last'])
test_num_agg_S.columns = ['_'.join(x) for x in test_num_agg_S.columns]

test_num_agg_P = test.groupby("customer_ID")[P_n_cols].agg(['mean','min','max', 'last'])
test_num_agg_P.columns = ['_'.join(x) for x in test_num_agg_P.columns]

test_num_agg_B = test.groupby("customer_ID")[B_n_cols].agg(['mean','min', 'last'])
test_num_agg_B.columns = ['_'.join(x) for x in test_num_agg_B.columns]

test_num_agg_R = test.groupby("customer_ID")[R_n_cols].agg(['mean','min', 'last'])
test_num_agg_R.columns = ['_'.join(x) for x in test_num_agg_R.columns]

test_cat_agg_D = test.groupby("customer_ID")[D_c_cols].agg(['count','first', 'last','nunique'])
test_cat_agg_D.columns = ['_'.join(x) for x in test_cat_agg_D.columns]

test_cat_agg_B = test.groupby("customer_ID")[B_c_cols].agg([ 'count','last','nunique'])
test_cat_agg_B.columns = ['_'.join(x) for x in test_cat_agg_B.columns]

test = pd.concat([test_num_agg_D, test_num_agg_S,test_num_agg_P,test_num_agg_B,test_num_agg_R,test_cat_agg_D,test_cat_agg_B], axis=1)
del test_num_agg_D, test_num_agg_S,test_num_agg_P,test_num_agg_B,test_num_agg_R,test_cat_agg_D,test_cat_agg_B
_ = gc.collect()

print('Test shape after engineering', test.shape)

Test shape after engineering (924621, 576)
CPU times: user 17.7 s, sys: 1.68 s, total: 19.3 s
Wall time: 19.4 s


In [15]:
# Fixed seed so the train/validation split (and therefore the reported scores)
# is identical on every Restart & Run All.
SPLIT_SEED = 42

# train_test_split pairs X and y purely by row position. Align the labels to
# X's customer_ID index explicitly so the pairing does not depend on both
# objects happening to be in the same order (.loc raises if an id is missing).
y_aligned = y.loc[X.index]
assert len(y_aligned) == len(X), "expected exactly one label per customer row"

# 60% train / 40% validation, stratified on the target.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y_aligned, test_size=0.4, stratify=y_aligned, random_state=SPLIT_SEED
)

In [16]:
# Gradient-boosted classifier with 130 trees; all other settings are defaults.
my_model = LGBMClassifier(n_estimators=130)
my_model.fit(X_train, y_train)

# Score on the data the model was fitted on and on the held-out validation
# split. The 'test : ' caption refers to that validation split, not to the
# competition test set.
scores = {
    'train : ': my_model.score(X_train, y_train),
    'test : ': my_model.score(X_valid, y_valid),
}
train_score, test_score = scores.values()

for caption, value in scores.items():
    print(caption, value)

train :  0.9077019179435404
test :  0.9000849830578648


In [17]:
# Feed the test features in exactly the column order the model was trained on.
# The frame is only re-ordered (copied) when the order actually differs.
train_feature_order = list(X_train.columns)
if list(test.columns) == train_feature_order:
    test_features = test
else:
    test_features = test[train_feature_order]

# One probability column per class, rows in the same order as `test`.
pred_test = my_model.predict_proba(test_features)

# Label the columns with the model's class values so the positive class (1) is
# selected by label rather than by assuming it is the second column.
preds = pd.DataFrame(pred_test, columns=my_model.classes_)
pred_final = preds[1].to_numpy()
pred_final

array([0.02349888, 0.00155124, 0.04278203, ..., 0.52551257, 0.41041943,
       0.07891399])

In [18]:
submission = pd.read_csv("../input/amex-default-prediction/sample_submission.csv")

# `pred_final` is ordered like the rows of `test`, whose index holds the
# integer codes produced by `lab` (last fitted on the test customers).
# Decode those codes back to the original customer_ID values and attach each
# prediction by ID instead of relying on the sample file having the same row
# order as `test`.
test_customer_ids = lab.inverse_transform(test.index.to_numpy())
pred_by_customer = pd.Series(pred_final, index=test_customer_ids)
submission['prediction'] = submission['customer_ID'].map(pred_by_customer)

# Every customer in the sample file must have received a prediction.
assert submission['prediction'].notna().all(), \
    "some customer_IDs in sample_submission.csv have no prediction"

submission

Unnamed: 0,customer_ID,prediction
0,00000469ba478561f23a92a868bd366de6f6527a684c9a...,0.023499
1,00001bf2e77ff879fab36aa4fac689b9ba411dae63ae39...,0.001551
2,0000210045da4f81e5f122c6bde5c2a617d03eef67f82c...,0.042782
3,00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976c...,0.366460
4,00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9...,0.911382
...,...,...
924616,ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c...,0.017429
924617,ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3...,0.730796
924618,ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475...,0.525513
924619,ffffddef1fc3643ea179c93245b68dca0f36941cd83977...,0.410419


In [19]:
# Write the submission file; index=False keeps the row index out of the CSV.
submission.to_csv("submission.csv",index=False)