In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file under the Kaggle input directory, one full path per line
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/amex-data-integer-dtypes-parquet-format/train.parquet
/kaggle/input/amex-data-integer-dtypes-parquet-format/test.parquet
/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv
/kaggle/input/amexfeather/test_data_f32.ftr
/kaggle/input/amexfeather/train_data.ftr
/kaggle/input/amexfeather/train_data_f32.ftr
/kaggle/input/amexfeather/test_data.ftr


# Import Libraries
<a id="import-libraries"></a>

In [2]:
# data preparation
import pandas as pd
import numpy as np

# visualization
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score  
from sklearn.metrics import precision_score                         
from sklearn.metrics import recall_score
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from lightgbm import LGBMClassifier, log_evaluation


# Use amexfeather Dataset
<a id="dataset"></a>

In [3]:
%%time
train_data = pd.read_feather('../input/amexfeather/train_data.ftr')
test_data = pd.read_feather('../input/amexfeather/test_data.ftr')
train=train_data.groupby('customer_ID').tail(1)
train=train.set_index(['customer_ID'])
# consider the most recent transaction for each customer out of the multiple transactions.
test=test_data.groupby('customer_ID').tail(1)
test=test.set_index(['customer_ID'])
del train_data
del test_data

CPU times: user 35.8 s, sys: 25 s, total: 1min
Wall time: 1min 7s


In [4]:
# Per-column dtype and non-null count for the reduced training frame.
# max_cols is raised to 200 so the full per-column listing is printed for this
# wide frame, and show_counts=True forces the non-null counts to be included.
train.info(max_cols=200, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 458913 entries, 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a to fffff1d38b785cef84adeace64f8f83db3a0c31e8d92eaba8b115f71cab04681
Data columns (total 190 columns):
 #    Column  Non-Null Count   Dtype         
---   ------  --------------   -----         
 0    S_2     458913 non-null  datetime64[ns]
 1    P_2     455944 non-null  float16       
 2    D_39    458913 non-null  float16       
 3    B_1     458913 non-null  float16       
 4    B_2     458882 non-null  float16       
 5    R_1     458913 non-null  float16       
 6    S_3     373943 non-null  float16       
 7    D_41    458882 non-null  float16       
 8    B_3     458882 non-null  float16       
 9    D_42    59910 non-null   float16       
 10   D_43    324591 non-null  float16       
 11   D_44    436618 non-null  float16       
 12   B_4     458913 non-null  float16       
 13   D_45    458882 non-null  float16       
 14   B_5     458913 non-null  float

# Check Missing Values
<a id="missing"></a>

In [5]:
# Fraction (0.0-1.0, not a percentage) of missing values per column, in column order
missing_counts = train.isnull().sum()
null_percent = (missing_counts / train.shape[0]).tolist()

## Remove features with at least 50% missing values
<a id="50%-missing"></a>

In [6]:
# Positional indices (not names) of the columns whose missing fraction is at least 0.5
null_list = [position for position, fraction in enumerate(null_percent) if fraction >= 0.5]
del null_percent

In [7]:
# null_list holds positions computed on `train`. Resolve them to column NAMES
# once, on that same frame, and drop by name in both frames: `train` carries an
# extra `target` column, so indexing `test.columns` by those positions would
# only be correct while the two frames happen to share the same column order.
high_null_cols = train.columns[null_list]
train_drop = train.drop(columns=high_null_cols)
test_drop = test.drop(columns=high_null_cols)
del train
del test
del null_list

In [8]:
# Re-inspect dtypes and non-null counts after the sparse columns were dropped
train_drop.info(max_cols=200, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
Index: 458913 entries, 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a to fffff1d38b785cef84adeace64f8f83db3a0c31e8d92eaba8b115f71cab04681
Data columns (total 161 columns):
 #    Column  Non-Null Count   Dtype         
---   ------  --------------   -----         
 0    S_2     458913 non-null  datetime64[ns]
 1    P_2     455944 non-null  float16       
 2    D_39    458913 non-null  float16       
 3    B_1     458913 non-null  float16       
 4    B_2     458882 non-null  float16       
 5    R_1     458913 non-null  float16       
 6    S_3     373943 non-null  float16       
 7    D_41    458882 non-null  float16       
 8    B_3     458882 non-null  float16       
 9    D_43    324591 non-null  float16       
 10   D_44    436618 non-null  float16       
 11   B_4     458913 non-null  float16       
 12   D_45    458882 non-null  float16       
 13   B_5     458913 non-null  float16       
 14   R_2     458913 non-null  float

# Preprocessing Data

In [9]:
# Model inputs: drop the label (training frame only) and the S_2 datetime column,
# which is not used as a feature. `columns=` alone is sufficient to drop columns.
df_drop = train_drop.drop(columns=['target', 'S_2'])
test_final = test_drop.drop(columns=['S_2'])

In [10]:
# Pull out the label column, then release the wide intermediate frames
y = train_drop['target']
del train_drop, test_drop

In [11]:
# Hold out 10% of customers for a quick sanity check of the model.
# random_state pins the shuffle so the split (and the accuracy reported on it)
# is the same on every Restart & Run All; the value matches the classifier's seed.
# stratify=y keeps the class ratio of the binary target identical in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    df_drop,
    y,
    test_size=0.1,
    random_state=1,
    stratify=y,
)

In [12]:
# Hyperparameters collected in one mapping so they can be read and tuned in one place
lgbm_params = dict(
    n_estimators=1200,
    learning_rate=0.03,
    reg_lambda=50,
    min_child_samples=2400,
    num_leaves=95,
    colsample_bytree=0.19,
    max_bins=511,
    random_state=1,
)
clf = LGBMClassifier(**lgbm_params)

In [13]:
# Fit on the training portion of the split. No eval_set or callbacks are passed,
# so nothing is evaluated or logged during training.
clf.fit(X_train,y_train)

LGBMClassifier(colsample_bytree=0.19, learning_rate=0.03, max_bins=511,
               min_child_samples=2400, n_estimators=1200, num_leaves=95,
               random_state=1, reg_lambda=50)

In [14]:
# Hard class predictions on the hold-out split, scored with plain accuracy
y_predict = clf.predict(X_test)
holdout_accuracy = accuracy_score(y_test, y_predict)
print('LGBM Classifier Accuracy: {:.3f}'.format(holdout_accuracy))

LGBM Classifier Accuracy: 0.902


In [15]:
# Class probabilities for the competition test set: one row per customer,
# one column per class.
y_test_predict=clf.predict_proba(test_final)

In [16]:
# Wrap the hold-out class predictions in a one-column frame for quick inspection
a = pd.Series(y_predict, name="prediction").to_frame()

In [17]:
# How many hold-out rows were assigned to each predicted class
a['prediction'].value_counts()

0    34173
1    11719
Name: prediction, dtype: int64

In [18]:
# Display the predicted probabilities for the test set (bare expression -> cell output)
y_test_predict

array([[0.98716558, 0.01283442],
       [0.99834928, 0.00165072],
       [0.95941537, 0.04058463],
       ...,
       [0.46934449, 0.53065551],
       [0.8100769 , 0.1899231 ],
       [0.9577935 , 0.0422065 ]])

In [19]:
# Keep only the last probability column.
# NOTE(review): presumably the positive (default) class - confirm against clf.classes_.
y_predict_final = y_test_predict[:, -1]

# Pair each customer ID (the index of test_final) with its predicted probability
submission = pd.DataFrame(
    {
        "customer_ID": test_final.index,
        "prediction": y_predict_final,
    }
)

# Write the file without the positional index
submission.to_csv('submission.csv', index=False)