In [1]:
#Import libraries :
import pandas as pd
import numpy as np 

import matplotlib.pyplot as plt 
%matplotlib inline
import plotly.express as px
import plotly.graph_objects as go
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

from sklearn.preprocessing import StandardScaler, normalize

from sklearn.cluster import KMeans
from sklearn.mixture import GaussianMixture

from sklearn import metrics
from sklearn.metrics import silhouette_score

from sklearn.decomposition import PCA 
from sklearn.manifold import TSNE

import warnings
warnings.filterwarnings("ignore")

In [7]:
#Dataset link: https://drive.google.com/file/d/12v7GXfXbeVOzza-56ZWD7nfQFwr5UudT/view?usp=sharing
#Load data:
#!pip install google.colab
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

root_dir = "/content/gdrive/My Drive/"
base_dir = root_dir

#Data Load and Exploration
df = pd.read_csv("/content/gdrive/My Drive//CC GENERAL.csv")
#df.head()
df.drop("CUST_ID", axis=1, inplace=True)
#df.shape
df.info()
df.dtypes
df.describe()
df[['BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES','ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE','PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY','PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY','CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'CREDIT_LIMIT', 'PAYMENTS','MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT', 'TENURE']].nunique()
#df[['CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'TENURE']].nunique()

ModuleNotFoundError: No module named 'colab'

In [None]:
#Data Visualization -- Check the Variable Distribution

from scipy.stats import norm 

graph_by_variables = ['BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES',
       'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE',
       'PURCHASES_FREQUENCY', 'ONEOFF_PURCHASES_FREQUENCY',
       'PURCHASES_INSTALLMENTS_FREQUENCY', 'CASH_ADVANCE_FREQUENCY',
       'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'CREDIT_LIMIT', 'PAYMENTS',
       'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT', 'TENURE']
plt.figure(figsize=(15,18))

for i in range(0,17):
    plt.subplot(6,3,i+1)
    sns.distplot(df[graph_by_variables[i]].dropna(),fit=norm)
    plt.title(graph_by_variables[i])

plt.tight_layout()
    

In [None]:
#Data Visualization -- Correlation of Variables
plt.figure(figsize=(9,7))
sns.heatmap(df.corr(),cmap='coolwarm')

plt.title('Correlation Matrix')

In [None]:
##Data Visualization -- Frequency Variables Distribution
ax = df[['BALANCE_FREQUENCY', 'PURCHASES_FREQUENCY',
         'ONEOFF_PURCHASES_FREQUENCY', 'PURCHASES_INSTALLMENTS_FREQUENCY',
         'CASH_ADVANCE_FREQUENCY','PRC_FULL_PAYMENT']].plot.kde(figsize=(12,9), bw_method=3) 

In [None]:
#Data Visualization -- Check OUtliers
plt.figure(figsize=(20,35))

for i in range(0,17):
    plt.subplot(6, 3, i+1)
    plt.boxplot(df[graph_by_variables[i]].dropna())
    #plt.boxplot(df[graph_by_variables[i]].dropna(), showfliers=False)
    plt.title(graph_by_variables[i])
    

In [None]:
#Data Processing -- Check NULL Values and filling of the row data using mean:
df.isnull().sum().sort_values(ascending=False).head()
df.MINIMUM_PAYMENTS  = df.MINIMUM_PAYMENTS.fillna(df.MINIMUM_PAYMENTS.mean()) 
df.CREDIT_LIMIT = df.CREDIT_LIMIT.fillna(df.CREDIT_LIMIT.mean()) 

#Make sure no more NULL values:
df.isnull().sum().sort_values(ascending=False).head(6)

In [None]:
df.columns= ['BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES',
       'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE',
       'PURCHASES_FREQUENCY', 'ONEOFF_PURCH_FREQ',
       'PURCH_INST_FREQ', 'CASH_ADVANCE_FREQ',
       'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'CREDIT_LIMIT', 'PAYMENTS',
       'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT', 'TENURE']
scaler = StandardScaler()
df_std = scaler.fit_transform(df)

In [None]:
# Using KMeans to check the custumer number per cluster first set _cluster = 10:
n_clusters = 10
clustering = KMeans(n_clusters=n_clusters,
                    random_state=0
                   )
cluster_labels = clustering.fit_predict(df_std)

# plot cluster sizesz
plt.hist(cluster_labels, bins=range(n_clusters+1))
plt.title ('Customers per Cluster')
plt.xlabel('Cluster')
plt.ylabel('Customers')
plt.show()

In [None]:
#Also using Elbow method to see what should be the cluster number:
EM = []
cluster_list = range(1, 11)
for i in cluster_list :
    kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 40)
    kmeans.fit(df_std)
    EM.append(kmeans.inertia_)
plt.plot(cluster_list, EM)
plt.title('Elbow Method')
plt.xlabel('Clusters')
plt.ylabel('EM')
plt.show()

In [None]:
#Finally using the Silhoustte scores method to see what should be the clsuter number:
silhouette_scores = [] 

for n_cluster in range(4, 11):
    silhouette_scores.append( 
        silhouette_score(df_std, KMeans(n_clusters = n_cluster).fit_predict(df_std))) 
    
# Plotting a bar graph to compare the results 
k = [4, 5, 6,7,8,9,10] 
plt.bar(k, silhouette_scores) 
plt.xlabel('Number of clusters', fontsize = 10) 
plt.ylabel('Silhouette Score', fontsize = 10) 
plt.show()

In [None]:
#Seems n_clusters in the range of 4 to 8 can provide a good customer segmentation based on the project requirements.
kmeans4 = df_std.copy()
kmeans4_ = KMeans(n_clusters=4, random_state=40).fit(kmeans4)
kmeans8 = df_std.copy() 
kmeans8_  = KMeans(n_clusters=8, random_state=40).fit(kmeans8) 

In [None]:
# Next let's using Gaussian mixture models for clustering. 
siliuette_list_GMM = []

for cluster in range(4,10,2):
    for covariance_type in ['full', 'tied', 'diag', 'spherical']:
        gmm  = GaussianMixture(n_components = cluster,covariance_type = covariance_type, random_state = 40).fit_predict(df_std)
        sil_score = metrics.silhouette_score(df_std, gmm, metric='euclidean')
        siliuette_list_GMM.append((cluster, sil_score, covariance_type, len(set(gmm)) ) )
        
df_gmm = pd.DataFrame(siliuette_list_GMM, columns=['cluster', 'sil_score','covariance_type', 'number_of_clusters'])
df_gmm.sort_values('sil_score', ascending=False).tail()

In [None]:
kmeans_      = KMeans(n_clusters=8, random_state=40).fit_predict(df_std)
gmm_         = GaussianMixture(n_components=10, covariance_type='spherical', random_state=40).fit_predict(df_std)
kmeansSilhouette_Score        = metrics.silhouette_score(df_std, kmeans_, metric='euclidean')
GMM_Silhouette_Score          = metrics.silhouette_score(df_std, gmm_, metric='euclidean')
Clustering_Silhouette_Scores  = [ ['KMeans',kmeansSilhouette_Score ], ['GMM', GMM_Silhouette_Score]]
Clustering_Silhouette_Scores  = pd.DataFrame(Clustering_Silhouette_Scores, columns=['Clustering Method', 'Silhouette Score']) 
Clustering_Silhouette_Scores.sort_values(by='Silhouette Score', ascending= False)

In [None]:
# KMeans cluster method with n_clusters=8 will be used to check if PCA and t-SNE dimensionality reduction help to better clustering:

kmeans_ = KMeans(n_clusters=8, random_state=40).fit(df_std)

df['cluster'] = kmeans_.labels_

# Normalizing the Data 
df_nor = normalize(df_std) 

# View the new feature data's shape 
df_nor.shape 

In [None]:
#Three dimensionality-reduction methods PCA will be used to see how it affect the Kmeans models. 
# From the heat map and the correlations, n_components=3 will be used:

pca = PCA(n_components=3).fit(df_nor)
df_pca3 = pca.fit_transform(df_nor)
print("original shape:   ", df_nor.shape)
print("transformed shape:", df_pca3.shape)
df_pca3 = pd.DataFrame(df_pca3) 
df_pca3.columns = ['A1', 'A2', 'A3']
df_pca3.head()
print(pca.explained_variance_)
fig = px.scatter_3d(df_pca3, x=df_pca3['A1'], y= df_pca3['A2'], z=df_pca3['A3'], color=df['cluster']) 
fig.show()

In [None]:
#Now let's using two dimension reducdion via t-SNE:
tsne = TSNE(n_components=2, verbose=1, perplexity=40, n_iter=300)
df_tsne2 = tsne.fit_transform(df_nor)

In [None]:
# Store results of T-SNE in a data frame
result =  pd.DataFrame(df_tsne2, columns=['TSNE%i' % i for i in range(2)])
plt.scatter(result['TSNE0'], result['TSNE1'],
            c  = KMeans(n_clusters=8, random_state=40).fit_predict(df_tsne2), cmap =None) 
plt.show()

In [None]:
#KMeans on the non reduced dataset:

kmeans8 = KMeans(n_clusters=8, random_state=40).fit_predict(df_std)
kmeansSilhouette_Score = metrics.silhouette_score(df_std, kmeans8, metric='euclidean')


In [None]:
# Will PCA help to get better KMeans clustering?
kmeans8_PCA  = KMeans(n_clusters=8, random_state=40).fit_predict(df_pca3)
kmeansSilhouette_Score_PCA = metrics.silhouette_score(df_pca3, kmeans8_PCA, metric='euclidean')


In [None]:
# Now let's see KMeans on the t-SNE reduced dataset:
kmeans8_tsne  = KMeans(n_clusters=8, random_state=40).fit_predict(df_tsne2)
kmeansSilhouette_Score_tsne = metrics.silhouette_score(df_pca3, kmeans8_tsne, metric='euclidean')

In [None]:
#Let's compare the results:
Clustering_Silhouette_Scores  = [ ['KMeans',kmeansSilhouette_Score ], ['KMeans_PCA', kmeansSilhouette_Score_PCA], ['KMeans_tSNE',kmeansSilhouette_Score_tsne]]
Clustering_Silhouette_Scores  = pd.DataFrame(Clustering_Silhouette_Scores, columns=['Clustering Method', 'Silhouette Score']) 
Clustering_Silhouette_Scores.sort_values(by='Silhouette Score', ascending= False)


In [None]:
#Seems KMeans and PCA combo yields the best modeling results:
kmeans8_PCA = KMeans(n_clusters=8, random_state=40).fit(df_pca3)
df['cluster'] = kmeans8_PCA.labels_

#final_df = df.sort_values(by=['cluster'], ascending=False)
#print(final_df)
col_list= ['BALANCE', 'BALANCE_FREQUENCY', 'PURCHASES',
       'ONEOFF_PURCHASES', 'INSTALLMENTS_PURCHASES', 'CASH_ADVANCE',
       'PURCHASES_FREQUENCY', 'ONEOFF_PURCH_FREQ',
       'PURCH_INST_FREQ', 'CASH_ADVANCE_FREQ',
       'CASH_ADVANCE_TRX', 'PURCHASES_TRX', 'CREDIT_LIMIT', 'PAYMENTS',
       'MINIMUM_PAYMENTS', 'PRC_FULL_PAYMENT', 'TENURE']


for column in col_list:
    plt.figure(figsize=(15,3))
    for i in range(0,8):
        plt.subplot(1,8,i+1)
        cluster = df[df['cluster']==i]
        cluster[column].hist()
        plt.title('{} \n{}'.format(column, i))
        
    plt.tight_layout()
    plt.show()


In [None]:
#Chose the best features to see the modeling results:
best_columns = ["BALANCE", "PURCHASES", "CASH_ADVANCE","CREDIT_LIMIT", "PAYMENTS", "MINIMUM_PAYMENTS", "TENURE"]

best_columns.append("cluster")
plt.figure(figsize=(25,25))
sns.pairplot( df[best_columns], hue="cluster")

Conclusion:
After executing different methods, I chose the Kmeans Model as data does not include different small groups but very similar in each other. Therefore, I did not prefer to use density-based algorithms. I can use this kind of algorithms to research for extreme customers in a credit card fraud. This study aims customer segmentation by using customer behaviors.

Checked number of clusters when using KMeans models and the 8 clusters model was chosed based on both Elbow method and the Silhouette Scores. 

Below are the customer segmentation results:

Cluster 0: This customer group indicates a small group of customers who are small spenders with the lowest minimum payment.

Cluster 1: These customers purchase frequently with the highest installment purchase frequency percentage contrast of a lower cash advance percentage. This group is using their credit cards for a small number of purchases.

Cluster 2: This segment points out new customers with a lower credit limit and average balance level.

Cluster 3: This cluster targets a group of customers who have a high balance and cash advances. This group also has a low purchase frequency. We can assume that this customer segment uses their credit cards as a loan.

Cluster 4: A small group of customers with the highest credit limit and the highest minimum payments. We can assume that these customers tend to increase credit limits to follow up on their spending habits.

Cluster 5: This segment has the lowest INSTALLMENTS_PURCHASES and PRC_FULL_PAYMENT percentages.

Cluster 6: It is a similar customer segment with cluster 4 but with a lower Minimum Payment percentage.