### Provided by

นายธนชาติ เสถียรจารุการ 63340500021 <br>

นายพชพล เพชรรัตน์ 63340500036

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.utils import resample
from sklearn.preprocessing import QuantileTransformer
import math

# Seed reused by the randomised steps of the notebook.
RandomState = 1

# Load the raw data, then drop the two `Naive_Bayes_Classifier_*` columns and
# the `CLIENTNUM` column, which are not used in the rest of the notebook.
df = pd.read_csv('credit_card_churn.csv')
df = df.drop(
    columns=[
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_1',
        'Naive_Bayes_Classifier_Attrition_Flag_Card_Category_Contacts_Count_12_mon_Dependent_count_Education_Level_Months_Inactive_12_mon_2',
        'CLIENTNUM',
    ]
)

# Shorter, lower-case column names used throughout the notebook.
df = df.rename(
    columns={
        'Attrition_Flag': 'y',
        'Customer_Age': 'age',
        'Gender': 'gender',
        'Dependent_count': 'dependency',
        'Education_Level': 'education',
        'Marital_Status': 'marital',
        'Income_Category': 'income',
        'Card_Category': 'card',
        'Months_on_book': 'book_period',
        'Total_Relationship_Count': 'total_product',
        'Months_Inactive_12_mon': 'month_inactive',
        'Contacts_Count_12_mon': 'contact_num',
        'Credit_Limit': 'credit_limit',
        'Total_Revolving_Bal': 'revolving_balance',
        'Avg_Open_To_Buy': 'open2buy',
        'Total_Amt_Chng_Q4_Q1': 'transaction_change',
        'Total_Trans_Amt': 'transaction_amount',
        'Total_Trans_Ct': 'transaction_count',
        'Total_Ct_Chng_Q4_Q1': 'transaction_count_change',
        'Avg_Utilization_Ratio': 'utilization_ratio',
    }
)

# Encode the target as integers: 0 = 'Attrited Customer', 1 = 'Existing Customer'.
# An explicit mapping is used instead of `Series.replace` so the result does
# not depend on pandas silently downcasting an object column to int, which is
# deprecated behaviour. Any label other than the two above becomes NaN.
df['y'] = df['y'].map({'Attrited Customer': 0, 'Existing Customer': 1})

### Data exploration

In [None]:
# Print a summary of the frame (columns, non-null counts, dtypes).
df.info()

##### Target
The dataset contains 10,127 samples in total, divided into
- Attrited: 1,627 samples
<br>

- Existing: 8,500 samples

**the dataset is imbalanced**

In [None]:
# Tabulate the target classes, plot them, and report the share of class 1.
count = df['y'].value_counts()
print(count)

plt.figure(figsize=(4, 3))
sns.histplot(data=df, x='y')
plt.show()

major_share = count[1] / (count[0] + count[1])
print(f"major target: {major_share * 100:0.2f}%")

##### resample data

In [None]:
# Balance the two classes by upsampling the smaller one (y == 0), with
# replacement, until it has as many rows as the larger one (y == 1).
major_target = df.loc[df['y'] == 1]
minor_target = df.loc[df['y'] == 0]
upsampling_df = resample(
    minor_target,
    n_samples=major_target.shape[0],
    replace=True,
    random_state=RandomState,
)
df = pd.concat([major_target, upsampling_df], ignore_index=True)

# Shuffle the rows. The shuffle is seeded like the resampling step so that
# re-running the notebook yields the same row order.
df = df.sample(frac=1, ignore_index=True, random_state=RandomState)

# Re-check the class balance after resampling.
count = df['y'].value_counts()
print(count)
plt.figure(figsize=(4, 3))
sns.histplot(data=df, x='y')
plt.show()

print(f"major target: {(count[1]/(count[0] + count[1])) * 100:0.2f}%")

##### Explore numerical data

The features that appear to contain outliers are listed below:
- age
- transaction_change
- transaction_count_change

In [None]:
def seperateDataType(df, max_categories=7):
    """Split the columns of *df* into numerical and categorical column names.

    A column counts as numerical when it has a numeric dtype and more than
    ``max_categories`` distinct values (NaN counts as a value). Every other
    column is treated as categorical.

    Both lists follow the column order of *df*, so plots driven by them come
    out in the same order on every run.

    Args:
        df: Frame whose columns are classified.
        max_categories: Largest number of distinct values a numeric column
            may have while still being treated as categorical.

    Returns:
        A ``(num_data, cat_data)`` tuple of column-name lists.
    """
    num_data = [
        col
        for col in df.columns
        if pd.api.types.is_numeric_dtype(df[col])
        and df[col].nunique(dropna=False) > max_categories
    ]
    numerical = set(num_data)
    # Filter the original column order rather than using a set difference,
    # whose iteration order is not stable between interpreter runs.
    cat_data = [col for col in df.columns if col not in numerical]
    return num_data, cat_data

num_data, cat_data = seperateDataType(df)

# One histogram with a KDE overlay per numerical feature, coloured by target.
plt.figure(figsize=(15, 27))
for plot_num, feature in enumerate(num_data, start=1):
    plt.subplot(7, 3, plot_num)
    sns.histplot(data=df, x=feature, hue='y', kde=True)
plt.show()


- Use the IQR rule to remove outliers

In [None]:
def removeNumericalOutlier(df: pd.DataFrame, feature_list: list, whisker: float = 1.5) -> pd.DataFrame:
    """Return a copy of *df* without the rows that are IQR outliers.

    For each feature the fences ``q1 - whisker * IQR`` and
    ``q3 + whisker * IQR`` are computed from the unfiltered *df*. A row is
    kept only if its value lies within the fences for every listed feature.
    The fences are inclusive, so a feature with zero IQR (for example a
    constant column) does not wipe out the whole frame. Rows whose value is
    NaN in a listed feature are dropped.

    Args:
        df: Input frame; it is not modified.
        feature_list: Names of the numerical columns to filter on.
        whisker: Multiple of the IQR used to place the fences.

    Returns:
        The filtered frame. The original index labels are preserved.
    """
    new_df = df.copy()
    for feature in feature_list:
        # Quantiles come from the original frame so the fences of one feature
        # do not depend on rows already removed for another feature.
        q1 = df[feature].quantile(0.25)
        q3 = df[feature].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - whisker * iqr
        upper_bound = q3 + whisker * iqr
        new_df = new_df[new_df[feature].between(lower_bound, upper_bound)]
    return new_df

# Drop IQR outliers for the three features flagged above, then re-plot them.
outlier_list = ['age', 'transaction_change', 'transaction_count_change']
df = removeNumericalOutlier(df, outlier_list)

plt.figure(figsize=(9, 4))
for plot_num, feature in enumerate(outlier_list, start=1):
    plt.subplot(1, 3, plot_num)
    sns.histplot(data=df, x=feature, hue='y')
plt.show()

- Use a quantile transformer to correct skewed data and standardize it

In [None]:
def correctSkewed(df: pd.DataFrame, skewed_data_list: list, random_state: int = 1) -> pd.DataFrame:
    """Return a copy of *df* with the listed columns quantile-transformed.

    The listed columns are passed together through a ``QuantileTransformer``
    configured with ``output_distribution='normal'``. All other columns are
    returned unchanged.

    The input frame is not modified. This also avoids writing into a frame
    that is itself a filtered slice of another frame.

    Args:
        df: Input frame.
        skewed_data_list: Names of the columns to transform.
        random_state: Seed forwarded to ``QuantileTransformer``.

    Returns:
        A new frame with the transformed columns.
    """
    transformed = df.copy()
    if len(skewed_data_list) == 0:
        # Nothing to transform; skip fitting on an empty feature matrix.
        return transformed

    quantile_transformer = QuantileTransformer(output_distribution='normal', random_state=random_state)
    transformed[skewed_data_list] = quantile_transformer.fit_transform(df[skewed_data_list].values)
    return transformed

# Transform every numerical feature, then re-plot the distributions.
df = correctSkewed(df, num_data)

plt.figure(figsize=(15, 27))
for plot_num, feature in enumerate(num_data, start=1):
    plt.subplot(7, 3, plot_num)
    sns.histplot(data=df, x=feature, hue='y', kde=True)
plt.show()

- plot heatmap of numerical data correlation

In [None]:
# Numerical features plus the target. `assign` returns a new frame, so the
# target column is not written into a slice of `df` (which would trigger
# pandas' SettingWithCopyWarning).
num_df = df[num_data].assign(y=df['y'])

# Pairwise correlations, rounded to two decimals for the annotations.
plt.figure(figsize=(10, 7))
sns.heatmap(num_df.corr().round(2), annot=True)
plt.show()

- visualize high correlation to target

In [None]:
# Box plot of each selected numerical feature, grouped by target class.
high_corr_data_list = [
    'transaction_count',
    'transaction_count_change',
    'utilization_ratio',
    'transaction_amount',
    'revolving_balance',
]
plt.figure(figsize=(15, 27))
for plot_num, feature in enumerate(high_corr_data_list, start=1):
    plt.subplot(7, 3, plot_num)
    sns.boxplot(data=num_df, x='y', y=feature)
plt.show()

- visualize relation between numerical data

In [None]:
# Pairwise plot grid over the columns of num_df, coloured by the target 'y'.
sns.pairplot(data=num_df, hue='y')
plt.show()

analyze: 

##### Explore categorical data

In [None]:
# One histogram per categorical feature, coloured by target class.
plt.figure(figsize=(15, 27))
for plot_num, feature in enumerate(cat_data, start=1):
    plt.subplot(7, 3, plot_num)
    sns.histplot(data=df, x=feature, hue='y')
plt.show()

- group card category 

In [None]:
# Collapse the Silver, Gold and Platinum card categories into one 'not_blue' label.
non_blue_cards = ['Silver', 'Gold', 'Platinum']
df['card'] = df['card'].replace(non_blue_cards, 'not_blue')

plt.figure(figsize=(3, 4))
sns.histplot(data=df, x='card', hue='y')
plt.show()

- plot heatmap of categorical data correlation

In [None]:
# Dummy-encode the categorical features (target excluded), add the target
# back, and plot the correlation matrix rounded to two decimals.
cat_df = pd.get_dummies(df[cat_data].drop(columns=['y']))
cat_df['y'] = df['y']

plt.figure(figsize=(32, 18))
sns.heatmap(cat_df.corr().round(2), annot=True)
plt.show()

- visualize high correlation to target

In [None]:
# Count plot of each selected categorical feature, split by target class.
high_corr_data_list = ['contact_num', 'month_inactive', 'total_product']
plt.figure(figsize=(15, 27))
for plot_num, feature in enumerate(high_corr_data_list, start=1):
    plt.subplot(7, 3, plot_num)
    sns.countplot(data=cat_df, x=feature, hue='y')
plt.show()

- visualize relation between categorical data

In [None]:
# Count plot for every ordered pair of categorical features: the row selects
# the x-axis feature and the column selects the hue feature.
# The grid is sized from len(cat_data) instead of a fixed 10 x 10, so the
# cell still runs when the number of categorical columns changes.
n_cat = len(cat_data)
plt.figure(figsize=(60, 60))
for row, x_feature in enumerate(cat_data):
    for col, hue_feature in enumerate(cat_data):
        plt.subplot(n_cat, n_cat, row * n_cat + col + 1)
        sns.countplot(data=df, x=x_feature, hue=hue_feature)
plt.show()

##### visualize relation between categorical and numerical data

In [None]:
# Box plot of every numerical feature (x-axis) against every categorical
# feature (y-axis), split by target class. Each row of the grid is one
# categorical feature and each column is one numerical feature.
# The grid is sized from the two lists instead of a fixed 10 x 10.
n_cat = len(cat_data)
n_num = len(num_data)
plt.figure(figsize=(60, 60))
for row, cat_feature in enumerate(cat_data):
    for col, num_feature in enumerate(num_data):
        plt.subplot(n_cat, n_num, row * n_num + col + 1)
        # Some categorical features are stored as numbers, which makes the
        # orientation ambiguous to seaborn; force horizontal boxes so the
        # categorical feature is always the grouping axis.
        sns.boxplot(data=df, x=num_feature, y=cat_feature, hue='y', orient='h')
plt.show()