# Feature Selection

### Importing Dataset

In [89]:
import pandas as pd
import os
import plotly.express as px 
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.preprocessing import MinMaxScaler

### Importing and Initialising Dataset

In [90]:
#Initialising Dataset
os.chdir(r"/Users/apple/Documents/Credit_Card_Churn_Model_Local/credit-card-churn/Input")
data = pd.read_csv("/Users/apple/Documents/Credit_Card_Churn_Model_Local/credit-card-churn/Input/credit_card_churn_cleaned.csv") 

### Heatmap

In [91]:
corr_matrix = data.corr(numeric_only=True)
fig = px.imshow(
    corr_matrix,
    text_auto=".2f",
    color_continuous_scale='RdBu',
    title="Correlation Heatmap (Interactive)",
    aspect="auto"
)
fig.update_layout(
    width=800,
    height=800,
    xaxis_title="Features",
    yaxis_title="Features",
    xaxis=dict(tickangle=-45, tickmode='array', tickvals=list(range(len(corr_matrix.columns))), ticktext=corr_matrix.columns),
    yaxis=dict(tickmode='array', tickvals=list(range(len(corr_matrix.index))), ticktext=corr_matrix.index)
)
fig.show()

In [92]:
import numpy as np

# BEGIN: Masking the upper triangle
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

# Create a masked correlation matrix
masked_corr_matrix = corr_matrix.mask(mask)

# Plotting the heatmap with the masked correlation matrix
fig = px.imshow(
    masked_corr_matrix,
    text_auto=".2f",
    color_continuous_scale='RdBu',
    title="Correlation Heatmap (Interactive) - Lower Triangle",
    aspect="auto"
)
fig.update_layout(
    width=800,
    height=800,
    xaxis_title="Features",
    yaxis_title="Features",
    xaxis=dict(tickangle=-45, tickmode='array', tickvals=list(range(len(masked_corr_matrix.columns))), ticktext=masked_corr_matrix.columns),
    yaxis=dict(tickmode='array', tickvals=list(range(len(masked_corr_matrix.index))), ticktext=masked_corr_matrix.index)
)
fig.show()
# END: Masking the upper triangle

In [93]:
# Extracting correlation values with respect to Attrition_Flag
attrition_corr = corr_matrix['Attrition_Flag'].sort_values(ascending=False)
print(attrition_corr)

Attrition_Flag                          1.000000
Total_Trans_Amt_Binned_Low              0.250708
Total_Revolving_Bal_Binned_Low          0.208037
Contacts_Count_12_mon                   0.204491
Months_Inactive_12_mon                  0.152449
Total_Trans_Amt_Binned_High             0.055048
Gender                                  0.037272
Avg_Open_To_Buy_Binned_Low              0.035031
Single                                  0.019037
Dependent_count                         0.018991
Age_40-50                               0.017057
Months_on_book                          0.013687
Age_50-60                               0.011321
Unknown                                 0.008904
Education_Level                         0.008796
Card_Category                           0.002354
Avg_Open_To_Buy_Binned_Very High        0.000911
Divorced                                0.000850
Total_Amt_Chng_Q4_Q1                   -0.000285
Age_60+                                -0.009699
Income_Category     

### Dropping Features whose correlation value is approx. 0 w.r.t. Attrition Flag

In [94]:
data

Unnamed: 0,CLIENTNUM,Attrition_Flag,Gender,Dependent_count,Education_Level,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,...,Total_Trans_Amt_Binned_High,Total_Trans_Amt_Binned_Very High,Avg_Open_To_Buy_Binned_Low,Avg_Open_To_Buy_Binned_Medium,Avg_Open_To_Buy_Binned_High,Avg_Open_To_Buy_Binned_Very High,Total_Revolving_Bal_Binned_Low,Total_Revolving_Bal_Binned_Medium,Total_Revolving_Bal_Binned_High,Total_Revolving_Bal_Binned_Very High
0,768805383,0,0,3,1,3,1,39,5,1,...,False,False,False,True,False,False,False,True,False,False
1,818770008,0,1,5,3,1,1,44,6,1,...,False,False,False,True,False,False,False,True,False,False
2,713982108,0,0,3,3,4,1,36,4,1,...,False,False,True,False,False,False,False,False,False,False
3,769911858,0,1,4,1,1,1,34,3,4,...,False,False,True,False,False,False,False,False,True,False
4,709106358,0,0,3,0,3,1,21,5,1,...,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,772366833,0,0,2,3,2,1,40,3,2,...,False,True,True,False,False,False,False,False,True,False
10123,710638233,1,0,2,-1,2,1,25,4,2,...,True,False,True,False,False,False,False,False,True,False
10124,716506083,1,1,1,1,1,1,36,5,3,...,False,True,False,True,False,False,False,False,False,False
10125,717406983,1,0,2,3,2,1,36,4,3,...,True,False,False,True,False,False,False,False,False,False


In [95]:
data = data.drop(columns=['Avg_Open_To_Buy_Binned_Very High', 'Divorced', 'Total_Amt_Chng_Q4_Q1' ])

In [96]:
data

Unnamed: 0,CLIENTNUM,Attrition_Flag,Gender,Dependent_count,Education_Level,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,...,Total_Trans_Amt_Binned_Medium,Total_Trans_Amt_Binned_High,Total_Trans_Amt_Binned_Very High,Avg_Open_To_Buy_Binned_Low,Avg_Open_To_Buy_Binned_Medium,Avg_Open_To_Buy_Binned_High,Total_Revolving_Bal_Binned_Low,Total_Revolving_Bal_Binned_Medium,Total_Revolving_Bal_Binned_High,Total_Revolving_Bal_Binned_Very High
0,768805383,0,0,3,1,3,1,39,5,1,...,True,False,False,False,True,False,False,True,False,False
1,818770008,0,1,5,3,1,1,44,6,1,...,True,False,False,False,True,False,False,True,False,False
2,713982108,0,0,3,3,4,1,36,4,1,...,True,False,False,True,False,False,False,False,False,False
3,769911858,0,1,4,1,1,1,34,3,4,...,True,False,False,True,False,False,False,False,True,False
4,709106358,0,0,3,0,3,1,21,5,1,...,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,772366833,0,0,2,3,2,1,40,3,2,...,False,False,True,True,False,False,False,False,True,False
10123,710638233,1,0,2,-1,2,1,25,4,2,...,False,True,False,True,False,False,False,False,True,False
10124,716506083,1,1,1,1,1,1,36,5,3,...,False,False,True,False,True,False,False,False,False,False
10125,717406983,1,0,2,3,2,1,36,4,3,...,False,True,False,False,True,False,False,False,False,False


### Dropping CLIENTNUM as it is a unique ID given to each Customer

In [97]:
data

Unnamed: 0,CLIENTNUM,Attrition_Flag,Gender,Dependent_count,Education_Level,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,...,Total_Trans_Amt_Binned_Medium,Total_Trans_Amt_Binned_High,Total_Trans_Amt_Binned_Very High,Avg_Open_To_Buy_Binned_Low,Avg_Open_To_Buy_Binned_Medium,Avg_Open_To_Buy_Binned_High,Total_Revolving_Bal_Binned_Low,Total_Revolving_Bal_Binned_Medium,Total_Revolving_Bal_Binned_High,Total_Revolving_Bal_Binned_Very High
0,768805383,0,0,3,1,3,1,39,5,1,...,True,False,False,False,True,False,False,True,False,False
1,818770008,0,1,5,3,1,1,44,6,1,...,True,False,False,False,True,False,False,True,False,False
2,713982108,0,0,3,3,4,1,36,4,1,...,True,False,False,True,False,False,False,False,False,False
3,769911858,0,1,4,1,1,1,34,3,4,...,True,False,False,True,False,False,False,False,True,False
4,709106358,0,0,3,0,3,1,21,5,1,...,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,772366833,0,0,2,3,2,1,40,3,2,...,False,False,True,True,False,False,False,False,True,False
10123,710638233,1,0,2,-1,2,1,25,4,2,...,False,True,False,True,False,False,False,False,True,False
10124,716506083,1,1,1,1,1,1,36,5,3,...,False,False,True,False,True,False,False,False,False,False
10125,717406983,1,0,2,3,2,1,36,4,3,...,False,True,False,False,True,False,False,False,False,False


In [98]:
data = data.drop(columns='CLIENTNUM')

In [99]:
data

Unnamed: 0,Attrition_Flag,Gender,Dependent_count,Education_Level,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,...,Total_Trans_Amt_Binned_Medium,Total_Trans_Amt_Binned_High,Total_Trans_Amt_Binned_Very High,Avg_Open_To_Buy_Binned_Low,Avg_Open_To_Buy_Binned_Medium,Avg_Open_To_Buy_Binned_High,Total_Revolving_Bal_Binned_Low,Total_Revolving_Bal_Binned_Medium,Total_Revolving_Bal_Binned_High,Total_Revolving_Bal_Binned_Very High
0,0,0,3,1,3,1,39,5,1,3,...,True,False,False,False,True,False,False,True,False,False
1,0,1,5,3,1,1,44,6,1,2,...,True,False,False,False,True,False,False,True,False,False
2,0,0,3,3,4,1,36,4,1,0,...,True,False,False,True,False,False,False,False,False,False
3,0,1,4,1,1,1,34,3,4,1,...,True,False,False,True,False,False,False,False,True,False
4,0,0,3,0,3,1,21,5,1,0,...,False,False,False,True,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10122,0,0,2,3,2,1,40,3,2,3,...,False,False,True,True,False,False,False,False,True,False
10123,1,0,2,-1,2,1,25,4,2,3,...,False,True,False,True,False,False,False,False,True,False
10124,1,1,1,1,1,1,36,5,3,4,...,False,False,True,False,True,False,False,False,False,False
10125,1,0,2,3,2,1,36,4,3,3,...,False,True,False,False,True,False,False,False,False,False


## Saving the modified dataset to a new CSV file in the input folder

In [100]:
output_file_cleaned = 'credit_card_churn_cleaned_feature_selected.csv'
input_directory = r"/Users/apple/Documents/Credit_Card_Churn_Model_Local/credit-card-churn/Input"
data.to_csv(os.path.join(input_directory, output_file_cleaned), index=False)

### Checking and Handling Outliners using Box-Map

In [101]:
cols = ['Credit_Limit', 'Avg_Utilization_Ratio', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1']
fig = make_subplots(rows=2, cols=2, start_cell="bottom-left", subplot_titles=cols)

fig.add_trace(go.Box(y=data[cols[0]], name=cols[0], boxpoints='outliers'),
              row=2, col=1)

fig.add_trace(go.Box(y=data[cols[1]], name=cols[1], boxpoints='outliers'),
              row=2, col=2)

fig.add_trace(go.Box(y=data[cols[2]], name=cols[2], boxpoints='outliers'),
              row=1, col=1)

fig.add_trace(go.Box(y=data[cols[3]], name=cols[3], boxpoints='outliers'),
              row=1, col=2)

# Update layout
fig.update_layout(height=600, width=900, title_text="Box Plots of Selected Features")
fig.show()


In [102]:
data.describe()

Unnamed: 0,Attrition_Flag,Gender,Dependent_count,Education_Level,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
count,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0,10127.0
mean,0.16066,0.529081,2.346203,1.601955,1.975906,1.083638,35.928409,3.81258,2.341167,2.455317,8631.953698,64.858695,0.759941,0.712222
std,0.367235,0.499178,1.298908,1.700417,1.652395,0.333784,7.986416,1.554408,1.010622,1.106225,9088.77665,23.47257,0.219207,0.238086
min,0.0,0.0,0.0,-1.0,-1.0,1.0,13.0,1.0,0.0,0.0,1438.3,10.0,0.0,0.0
25%,0.0,0.0,1.0,0.0,1.0,1.0,31.0,3.0,2.0,2.0,2555.0,45.0,0.631,0.582
50%,0.0,1.0,2.0,2.0,2.0,1.0,36.0,4.0,2.0,2.0,4549.0,67.0,0.736,0.702
75%,0.0,1.0,3.0,3.0,3.0,1.0,40.0,5.0,3.0,3.0,11067.5,81.0,0.859,0.818
max,1.0,1.0,5.0,5.0,5.0,4.0,56.0,6.0,6.0,6.0,34516.0,139.0,3.397,3.714


In [103]:
def drop_outliers_iqr(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    # Filter and return new DataFrame
    filtered_df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    return filtered_df


In [104]:
cols_to_drop_outliners = ['Credit_Limit', 'Avg_Utilization_Ratio', 'Total_Trans_Ct', 'Total_Ct_Chng_Q4_Q1']
# data = drop_outliers_iqr(data, 'Credit_Limit')  
# data = drop_outliers_iqr(data, 'Avg_Utilization_Ratio')
# data = drop_outliers_iqr(data, 'Total_Trans_Ct')
# data = drop_outliers_iqr(data, 'Total_Ct_Chng_Q4_Q1')
data = drop_outliers_iqr(data, cols_to_drop_outliners[0])   

In [105]:
data.describe()

Unnamed: 0,Attrition_Flag,Gender,Dependent_count,Education_Level,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
count,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0
mean,0.160669,0.574647,2.320354,1.601006,1.799081,1.033468,35.922017,3.847206,2.344526,2.449852,6165.29335,64.393634,0.759966,0.71281
std,0.367246,0.494423,1.301963,1.700132,1.542369,0.20435,8.105905,1.540298,1.015346,1.10471,5229.540708,23.010827,0.220474,0.238621
min,0.0,0.0,0.0,-1.0,-1.0,1.0,13.0,1.0,0.0,0.0,1438.3,10.0,0.0,0.0
25%,0.0,0.0,1.0,0.0,1.0,1.0,31.0,3.0,2.0,2.0,2444.5,45.0,0.63,0.581
50%,0.0,1.0,2.0,2.0,2.0,1.0,36.0,4.0,2.0,2.0,3908.0,67.0,0.736,0.702
75%,0.0,1.0,3.0,3.0,3.0,1.0,40.0,5.0,3.0,3.0,8376.5,80.0,0.859,0.821
max,1.0,1.0,5.0,5.0,5.0,4.0,56.0,6.0,6.0,6.0,23760.0,138.0,3.397,3.714


### Scaling

In [106]:
data.describe()

Unnamed: 0,Attrition_Flag,Gender,Dependent_count,Education_Level,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
count,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0
mean,0.160669,0.574647,2.320354,1.601006,1.799081,1.033468,35.922017,3.847206,2.344526,2.449852,6165.29335,64.393634,0.759966,0.71281
std,0.367246,0.494423,1.301963,1.700132,1.542369,0.20435,8.105905,1.540298,1.015346,1.10471,5229.540708,23.010827,0.220474,0.238621
min,0.0,0.0,0.0,-1.0,-1.0,1.0,13.0,1.0,0.0,0.0,1438.3,10.0,0.0,0.0
25%,0.0,0.0,1.0,0.0,1.0,1.0,31.0,3.0,2.0,2.0,2444.5,45.0,0.63,0.581
50%,0.0,1.0,2.0,2.0,2.0,1.0,36.0,4.0,2.0,2.0,3908.0,67.0,0.736,0.702
75%,0.0,1.0,3.0,3.0,3.0,1.0,40.0,5.0,3.0,3.0,8376.5,80.0,0.859,0.821
max,1.0,1.0,5.0,5.0,5.0,4.0,56.0,6.0,6.0,6.0,23760.0,138.0,3.397,3.714


In [107]:
def apply_Min_Max_scaling(data, columns):
    scaler = MinMaxScaler()
    data[columns] = scaler.fit_transform(data[columns])
    return data


In [108]:
cols_to_scale = [	'Dependent_count',	'Education_Level',	'Income_Category',	'Card_Category',	'Months_on_book',	'Total_Relationship_Count',	'Months_Inactive_12_mon',	'Contacts_Count_12_mon',	'Credit_Limit',	'Total_Trans_Ct',	'Total_Ct_Chng_Q4_Q1',	'Avg_Utilization_Ratio']
data = apply_Min_Max_scaling(data, cols_to_scale)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [109]:
data.describe()

Unnamed: 0,Attrition_Flag,Gender,Dependent_count,Education_Level,Income_Category,Card_Category,Months_on_book,Total_Relationship_Count,Months_Inactive_12_mon,Contacts_Count_12_mon,Credit_Limit,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio
count,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0,9143.0
mean,0.160669,0.574647,0.464071,0.433501,0.466514,0.011156,0.53307,0.569441,0.390754,0.408309,0.211767,0.42495,0.223717,0.191925
std,0.367246,0.494423,0.260393,0.283355,0.257061,0.068117,0.188509,0.30806,0.169224,0.184118,0.234281,0.179772,0.064903,0.064249
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.2,0.166667,0.333333,0.0,0.418605,0.4,0.333333,0.333333,0.045077,0.273438,0.185458,0.156435
50%,0.0,1.0,0.4,0.5,0.5,0.0,0.534884,0.6,0.333333,0.333333,0.110641,0.445312,0.216662,0.189015
75%,0.0,1.0,0.6,0.666667,0.666667,0.0,0.627907,0.8,0.5,0.5,0.310828,0.546875,0.25287,0.221055
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


### Saving Scaled and Outliners handles data to new CSV File

In [110]:
# Saving the modified dataset to a new CSV file in the input folder
output_file_cleaned = 'credit_card_churn_cleaned_feature_selected_scaled.csv'
input_directory = r"/Users/apple/Documents/Credit_Card_Churn_Model_Local/credit-card-churn/Input"
data.to_csv(os.path.join(input_directory, output_file_cleaned), index=False)