In [3]:
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
!pip install catboost

In [5]:
from catboost import CatBoostClassifier

In [6]:
import warnings, gc
warnings.filterwarnings("ignore")

In [8]:
%%time
# Read the training features (parquet) and the label table (CSV), then keep
# only the rows whose customer_ID appears in both (inner join).
train = pd.read_parquet(path="../input/amex-data-integer-dtypes-parquet-format/train.parquet")
label = pd.read_csv(filepath_or_buffer="../input/amex-default-prediction/train_labels.csv")
train = pd.merge(train, label, on="customer_ID", how='inner')

In [9]:
# Swap the customer_ID values for integer codes. The fitted encoder stays in
# `lab`, so the codes can be mapped back to the original IDs later.
lab = LabelEncoder()
encoded_ids = lab.fit_transform(train['customer_ID'])
train['customer_ID'] = encoded_ids
del encoded_ids  # the column now holds the codes; drop the extra reference


In [10]:
%%time
# Keep only the final row of every customer (in current row order) and move
# customer_ID into the index.
# NOTE(review): after this step each customer has exactly one row, so the
# per-customer aggregations computed later (mean/min/last/count/nunique) all
# see a single value — confirm that dropping the history here is intended.
last_rows = train.groupby(['customer_ID']).tail(1)
train = last_rows.set_index('customer_ID')
del last_rows

In [11]:
%%time
# Read the test features from the same parquet dataset as the training data.
test = pd.read_parquet(path="../input/amex-data-integer-dtypes-parquet-format/test.parquet")

In [12]:
# Same preparation as for the training frame. `fit_transform` re-fits `lab`,
# so from this cell on the encoder describes the *test* customer IDs.
test_codes = lab.fit_transform(test['customer_ID'])
test['customer_ID'] = test_codes
del test_codes

# Keep the final row per customer and index the frame by the encoded ID.
test_last_rows = test.groupby(['customer_ID']).tail(1)
test = test_last_rows.set_index('customer_ID')
del test_last_rows

In [13]:
# Target vector, taken before the column is removed from the features.
y = train["target"]

# Feature frames: S_2 is dropped from both train and test, and every missing
# value is replaced by the sentinel -1.
X = train.drop(columns=["target", "S_2"]).fillna(-1)
test = test.drop(columns=['S_2']).fillna(-1)

In [14]:
# Columns handled as categorical features.
cat_cols = [
    'B_30', 'B_38',
    'D_63', 'D_64', 'D_66', 'D_68',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
]

# Every remaining column of X, in X's column order, is treated as numeric.
num_cols = list(filter(lambda name: name not in cat_cols, X.columns))

# Two-element list holding both groups (nested lists, not a flat concatenation).
all_cols = [cat_cols, num_cols]

In [15]:
def _columns_with_prefix(columns, prefix):
    """Return the names in `columns` that start with `prefix`, in their original order."""
    return [name for name in columns if name.startswith(prefix)]


# Numeric columns, split by the leading letter of the column name.
D_n_cols = _columns_with_prefix(num_cols, "D")
S_n_cols = _columns_with_prefix(num_cols, "S")
P_n_cols = _columns_with_prefix(num_cols, "P")
B_n_cols = _columns_with_prefix(num_cols, "B")
R_n_cols = _columns_with_prefix(num_cols, "R")

# Categorical columns, split the same way (only D_* and B_* exist in cat_cols).
D_c_cols = _columns_with_prefix(cat_cols, "D")
B_c_cols = _columns_with_prefix(cat_cols, "B")

In [16]:
%%time 
def _flat_agg(grouped, columns, stats):
    """Aggregate `columns` of a grouped frame with `stats` and flatten the
    resulting two-level column index into '<column>_<stat>' names."""
    out = grouped[columns].agg(stats)
    out.columns = ['_'.join(pair) for pair in out.columns]
    return out


# Group once by customer_ID (the index name of X) and reuse the grouping for
# every column family. The order of the pieces below fixes the column order of
# the training matrix.
_by_customer = X.groupby("customer_ID")
X = pd.concat(
    [
        _flat_agg(_by_customer, D_n_cols, ['mean', 'min', 'last']),
        _flat_agg(_by_customer, S_n_cols, ['mean', 'min', 'last']),
        _flat_agg(_by_customer, P_n_cols, ['mean', 'min', 'max', 'last']),
        _flat_agg(_by_customer, B_n_cols, ['mean', 'min', 'last']),
        _flat_agg(_by_customer, R_n_cols, ['mean', 'min', 'last']),
        _flat_agg(_by_customer, D_c_cols, ['count', 'last', 'first', 'nunique']),
        _flat_agg(_by_customer, B_c_cols, ['count', 'last', 'nunique']),
    ],
    axis=1,
)

# The grouping still references the pre-aggregation frame; release it.
del _by_customer
_ = gc.collect()

print('X shape after engineering', X.shape)

In [17]:
%%time 
# Per-customer aggregation of the test features, mirroring the training cell.
#
# Each entry is (columns, statistics). The statistics order matters: it fixes
# the order of the generated '<column>_<stat>' columns. The categorical D_*
# entry previously used ['count', 'first', 'last', 'nunique'], which produced a
# different column order than the training matrix
# (['count', 'last', 'first', 'nunique']); it now matches the training order.
_test_agg_plan = [
    (D_n_cols, ['mean', 'min', 'last']),
    (S_n_cols, ['mean', 'min', 'last']),
    (P_n_cols, ['mean', 'min', 'max', 'last']),
    (B_n_cols, ['mean', 'min', 'last']),
    (R_n_cols, ['mean', 'min', 'last']),
    (D_c_cols, ['count', 'last', 'first', 'nunique']),
    (B_c_cols, ['count', 'last', 'nunique']),
]

# Group once by customer_ID (the index name of `test`) and reuse the grouping.
_test_grouped = test.groupby("customer_ID")
_test_parts = []
for _cols, _stats in _test_agg_plan:
    _part = _test_grouped[_cols].agg(_stats)
    # Flatten the (column, stat) MultiIndex into 'column_stat' names.
    _part.columns = ['_'.join(pair) for pair in _part.columns]
    _test_parts.append(_part)

test = pd.concat(_test_parts, axis=1)
del _test_grouped, _test_parts, _part
_ = gc.collect()

# The model is fitted on X, so the test matrix must expose the same features in
# the same order; fail here rather than at (or silently after) prediction time.
if list(test.columns) != list(X.columns):
    raise ValueError("test feature columns do not match the training feature columns")

print('Test shape after engineering', test.shape)

In [18]:
# X comes out of a groupby (ordered by customer_ID) while y keeps the row order
# of the original frame; train_test_split pairs rows by position, so line the
# labels up with X's index explicitly before splitting.
_y_aligned = y.loc[X.index]

# 60/40 stratified split. A fixed random_state makes the split (and therefore
# the reported scores) reproducible across kernel restarts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    _y_aligned,
    test_size=0.4,
    stratify=_y_aligned,
    random_state=42,
)

In [20]:
# Fit a CatBoost classifier with library defaults on the training split.
my_model = CatBoostClassifier()
my_model.fit(X_train, y_train)

# Score the fitted model on both splits. Note that `test_score` and the
# 'test : ' label refer to the held-out validation split (X_valid / y_valid),
# not to the competition test set.
train_score, test_score = (
    my_model.score(features, labels)
    for features, labels in ((X_train, y_train), (X_valid, y_valid))
)
print('train : ', train_score)
print('test : ', test_score)

In [21]:
# Class-probability predictions for the aggregated test features.
pred_test = my_model.predict_proba(test)
preds = pd.DataFrame(pred_test)

# Column 1 of the probability matrix, copied out as a plain NumPy array
# (presumably the probability of target == 1 — confirm against my_model.classes_).
pred_final = preds.loc[:, 1].to_numpy(copy=True)
pred_final

In [22]:
submission = pd.read_csv("../input/amex-default-prediction/sample_submission.csv")

# Attach predictions by customer_ID instead of by row position. `lab` was last
# fitted on the test IDs, so inverse_transform turns the integer index of `test`
# back into the original customer_ID values.
# Assumes the sample file has a 'customer_ID' column — TODO confirm.
_pred_by_customer = pd.Series(pred_final, index=lab.inverse_transform(test.index))
submission['prediction'] = submission['customer_ID'].map(_pred_by_customer)

# A missing value means an ID in the sample file had no matching test customer.
if submission['prediction'].isna().any():
    raise ValueError("some customer_IDs in sample_submission.csv have no prediction")

submission

In [1]:
# Write the submission table to the working directory, without the row index.
submission.to_csv(path_or_buf="submission.csv", index=False)