In [1]:
import numpy as np
import pandas as pd
from statistics import mean 

import seaborn as sns
import scikitplot as skplt
import matplotlib.pyplot as plt


from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV, GridSearchCV

In [2]:
# Read the raw card-transaction table from disk and preview its first rows.
df = pd.read_csv("../data/card_transactions.csv")
df.head()

Unnamed: 0,customer_id,balance_account,purchases_amount,paid_advance,full_purchases,balance_frequency_update,installments,purchases_frequency,full_purchases_frequency,purchases_installments_frequency,cash_advance_frequency,nr_cash_advances,nr_purchases,credit_limit,fixed_rate_period,payments,min_payments,full_payment_prc
0,tGEg8fT2eCgVxdwS,9.298409,107.6,0.0,0.0,0.909091,107.6,0.909091,0.0,0.818182,0.0,0,10,1000.0,11,89.868853,89.711672,0.428571
1,stEDu20sSKOXomjg,52.666422,150.3,0.0,0.0,1.0,150.3,1.0,0.0,0.833333,0.0,0,6,1000.0,6,54.722514,82.95518,0.0
2,CEB2blrN3b88GsRe,1360.795674,0.0,462.861443,0.0,1.0,0.0,0.0,0.0,0.0,0.272727,6,0,1500.0,11,303.031979,443.781316,0.0
3,0N0eBl8ySaZThxKE,2021.641161,363.18,0.0,363.18,1.0,0.0,1.0,1.0,0.0,0.0,0,13,2000.0,12,704.24077,795.859808,0.0
4,6NLauoHUwtwOv4Wy,22.322374,95.0,1162.045833,95.0,0.363636,0.0,0.083333,0.083333,0.0,0.166667,3,1,2500.0,12,4094.261427,40.142262,0.0


In [3]:
# One-shot sanity summary: frame dimensions, number of distinct customer ids,
# per-column dtypes and per-column missing-value counts.
# dropna=False keeps a missing id counted as its own value, exactly like
# len(Series.unique()) does.
n_unique_customers = df["customer_id"].nunique(dropna=False)
print(df.shape, n_unique_customers, df.dtypes, df.isna().sum())

(8500, 18) 8500 customer_id                          object
balance_account                     float64
purchases_amount                    float64
paid_advance                        float64
full_purchases                      float64
balance_frequency_update            float64
installments                        float64
purchases_frequency                 float64
full_purchases_frequency            float64
purchases_installments_frequency    float64
cash_advance_frequency              float64
nr_cash_advances                      int64
nr_purchases                          int64
credit_limit                        float64
fixed_rate_period                     int64
payments                            float64
min_payments                        float64
full_payment_prc                    float64
dtype: object customer_id                           0
balance_account                       0
purchases_amount                      0
paid_advance                          0
full_purchases    

Some overall stats: 8,500 customers with 18 columns each (`customer_id` plus 17 numeric features). There are no duplicate customers: the number of unique `customer_id` values equals the number of rows in the dataframe. `min_payments` has a fair number of NAs, so we will deal with them by imputing the data later on.

In [4]:
# Pairwise correlations (pandas default method) of every column except the
# customer_id string, shown as a colour-graded table with two decimals.
# Styler.set_precision was deprecated in pandas 1.3 and removed in 2.0, so the
# rounding is done through Styler.format with an explicit format string, which
# works on both older and current pandas versions.
# Alternative view of the same matrix: sns.heatmap(<correlation frame>, annot=True)
(
    df.drop(columns=["customer_id"])
    .corr()
    .style.background_gradient(cmap="coolwarm")
    .format("{:.2f}")
)

Unnamed: 0,balance_account,purchases_amount,paid_advance,full_purchases,balance_frequency_update,installments,purchases_frequency,full_purchases_frequency,purchases_installments_frequency,cash_advance_frequency,nr_cash_advances,nr_purchases,credit_limit,fixed_rate_period,payments,min_payments,full_payment_prc
balance_account,1.0,0.18,0.49,0.17,0.32,0.13,-0.08,0.07,-0.06,0.45,0.38,0.16,0.54,0.07,0.32,0.4,-0.32
purchases_amount,0.18,1.0,-0.05,0.92,0.13,0.68,0.39,0.49,0.31,-0.12,-0.07,0.69,0.36,0.09,0.61,0.09,0.18
paid_advance,0.49,-0.05,1.0,-0.03,0.1,-0.06,-0.21,-0.09,-0.18,0.63,0.66,-0.07,0.31,-0.07,0.45,0.14,-0.15
full_purchases,0.17,0.92,-0.03,1.0,0.1,0.34,0.26,0.52,0.13,-0.08,-0.05,0.54,0.32,0.06,0.58,0.05,0.13
balance_frequency_update,0.32,0.13,0.1,0.1,1.0,0.12,0.23,0.2,0.18,0.19,0.14,0.19,0.1,0.12,0.06,0.13,-0.09
installments,0.13,0.68,-0.06,0.34,0.12,1.0,0.44,0.22,0.51,-0.13,-0.07,0.63,0.26,0.09,0.39,0.13,0.18
purchases_frequency,-0.08,0.39,-0.21,0.26,0.23,0.44,1.0,0.5,0.86,-0.31,-0.2,0.57,0.12,0.06,0.1,0.0,0.3
full_purchases_frequency,0.07,0.49,-0.09,0.52,0.2,0.22,0.5,1.0,0.14,-0.11,-0.07,0.54,0.29,0.08,0.24,-0.03,0.16
purchases_installments_frequency,-0.06,0.31,-0.18,0.13,0.18,0.51,0.86,0.14,1.0,-0.26,-0.17,0.53,0.06,0.07,0.09,0.03,0.25
cash_advance_frequency,0.45,-0.12,0.63,-0.08,0.19,-0.13,-0.31,-0.11,-0.26,1.0,0.8,-0.13,0.14,-0.13,0.18,0.1,-0.25


In [None]:
# Grid of pairwise relationship plots for the raw frame.
sns.pairplot(data=df)

In [None]:
# Standardise every column except the customer_id string, then draw one box
# per standardised column so their spreads can be compared on a common scale.
feature_frame = df.drop(columns=["customer_id"])

sc = StandardScaler()
standardised_values = sc.fit_transform(feature_frame)
df_scaled = pd.DataFrame(standardised_values, columns=feature_frame.columns)

plt.figure(figsize=(16, 14))
sns.boxplot(data=df_scaled)
# Rotate and right-align the column names on the x axis.
plt.xticks(rotation=65, horizontalalignment='right')

# KNN Imputation to remove NAs

In [7]:
import pandas as pd
from sklearn.impute import KNNImputer


In [6]:
### Numerical columns
# select_dtypes picks the numeric columns once; their labels are reused below
# to rebuild the frame and to find the remaining (non-numeric) columns.
num_feat = df.select_dtypes(include="number")
num_col_name = num_feat.columns

### KNN Imputer
# Each missing value is filled from the 5 nearest rows, weighted by inverse
# distance.
# NOTE(review): neighbours are searched on the raw, unscaled columns, so
# large-magnitude columns may dominate the distance - consider scaling before
# imputing; left unchanged here to keep the imputed values as they were.
imputer = KNNImputer(n_neighbors=5, weights="distance")
imputed = imputer.fit_transform(num_feat)

### Recreate Dataframe
# fit_transform returns a bare array, so df's own index is attached explicitly;
# otherwise the column-wise concat below would align on a fresh RangeIndex and
# misalign rows whenever df's index is not the default one.
df_no_na = pd.DataFrame(imputed, columns=num_col_name, index=df.index)
# Non-numeric columns (customer_id) are selected by name rather than by
# position, then the imputed numeric columns are appended.
df_result = pd.concat([df.drop(columns=num_col_name), df_no_na], axis=1, sort=False)

# Total number of missing cells left after imputation.
df_result.isna().sum().sum()

0