In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from os import path

# Location of the pre-processed transactions file, relative to the notebook.
data_dir, data_file = 'data/bank-transactions', 'bank-transactions-treated.csv'
data_path = path.join(data_dir, data_file)

# Load the dataset and print its column / dtype / non-null summary.
df = pd.read_csv(filepath_or_buffer=data_path)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 251810 entries, 0 to 251809
Data columns (total 5 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   CustGender               251810 non-null  float64
 1   CustAccountBalance       251810 non-null  float64
 2   TransactionAmount (INR)  251810 non-null  float64
 3   CustomerAge              251810 non-null  int64  
 4   TransactionTimestamp     251810 non-null  float64
dtypes: float64(4), int64(1)
memory usage: 9.6 MB


De acordo com as validações realizadas, os melhores valores de K para o K-Means são 2 e 3; por isso, vamos utilizá-los. Além desses, este notebook também executa o K-Means com K = 4 e o HDBSCAN.

## REALIZANDO O AGRUPAMENTO

#### MinMaxScaler

In [3]:
from sklearn.preprocessing import MinMaxScaler

# Columns used as clustering features, in the order they appear in the loaded
# file. They are named explicitly (instead of scaling every column of `df`)
# so this cell stays idempotent: cluster-label columns are added to `df`
# further down, and re-running this cell afterwards would otherwise feed
# those labels back into the scaler.
feature_columns = [
    'CustGender',
    'CustAccountBalance',
    'TransactionAmount (INR)',
    'CustomerAge',
    'TransactionTimestamp',
]

# Min-max scale the feature matrix with the scaler's default settings; the
# result is a NumPy array reused by every clustering cell below.
data_scaled = MinMaxScaler().fit_transform(df[feature_columns].to_numpy())

#### HDBSCAN

In [4]:
from hdbscan import HDBSCAN

# Fit HDBSCAN on the scaled features. `min_cluster_size=600` is the tuning
# knob chosen for this dataset (see the hdbscan docs for its exact meaning).
hdbscan = HDBSCAN(alpha=1., min_cluster_size=600)

# The trailing semicolon hides the estimator repr. Do not write `_ = ...`
# here: in IPython `_` holds the previous cell output, and rebinding it by
# hand stops that variable from being updated automatically.
hdbscan.fit(data_scaled);

In [5]:
# Show the distinct HDBSCAN labels and how many rows received each one.
np.unique(hdbscan.labels_, return_counts=True)

(array([-1,  0,  1,  2,  3]), array([ 25341,  77692,   4956,   1016, 142805]))

#### K-Means (K = 2)

In [6]:
from sklearn.cluster import KMeans

# K-Means with two clusters: 100 k-means++ initialisations, fixed seed so the
# assignment is reproducible across runs.
km2 = KMeans(
    n_clusters=2,
    init='k-means++',
    n_init=100,
    random_state=1917,
)
km2.fit(data_scaled)

# Per-row cluster index produced by the fit above.
preds_k2 = km2.labels_

In [8]:
# Show the cluster indices for K = 2 and how many rows fall in each.
np.unique(preds_k2, return_counts=True)

(array([0, 1], dtype=int32), array([174118,  77692]))

#### K-Means (K = 3)

In [9]:
from sklearn.cluster import KMeans

# K-Means with three clusters, same initialisation settings and seed as the
# other K-Means runs in this notebook.
km3 = KMeans(
    n_clusters=3,
    init='k-means++',
    n_init=100,
    random_state=1917,
)
km3.fit(data_scaled)

# Per-row cluster index produced by the fit above.
preds_k3 = km3.labels_

In [10]:
# Show the cluster indices for K = 3 and how many rows fall in each.
np.unique(preds_k3, return_counts=True)

(array([0, 1, 2], dtype=int32), array([77692, 83853, 90265]))

#### K-Means (K = 4)

In [11]:
from sklearn.cluster import KMeans

# K-Means with four clusters, same initialisation settings and seed as the
# other K-Means runs in this notebook.
km4 = KMeans(
    n_clusters=4,
    init='k-means++',
    n_init=100,
    random_state=1917,
)
km4.fit(data_scaled)

# Per-row cluster index produced by the fit above.
preds_k4 = km4.labels_

In [12]:
# Show the cluster indices for K = 4 and how many rows fall in each.
np.unique(preds_k4, return_counts=True)

(array([0, 1, 2, 3], dtype=int32), array([83853, 40462, 90265, 37230]))

### SALVANDO OS AGRUPAMENTOS DE VOLTA NO CONJUNTO

Para que possamos interpretar os agrupamentos posteriormente, salvá-los-emos como novas colunas do DataFrame.

In [14]:
# Preview the first rows of the frame (head() returns five rows by default).
df.head()

Unnamed: 0,CustGender,CustAccountBalance,TransactionAmount (INR),CustomerAge,TransactionTimestamp
0,1.0,17874.44,459.0,26,1470159000.0
1,1.0,866503.21,2060.0,49,1470159000.0
2,1.0,973.46,566.0,30,1470170000.0
3,0.0,95075.54,148.0,40,1470168000.0
4,0.0,4279.22,289.11,38,1470177000.0


In [20]:
# Map each new column name to the label array produced by the matching
# clustering run; insertion order fixes the column order in `df`.
cluster_labels = {
    'KMeans2': preds_k2,
    'KMeans3': preds_k3,
    'KMeans4': preds_k4,
    'HDBSCAN': hdbscan.labels_,
}

# Attach every label array to the frame as its own column.
for column_name, labels in cluster_labels.items():
    df[column_name] = labels

In [21]:
# Preview the first rows again to check the cluster-label columns
# (head() returns five rows by default).
df.head()

Unnamed: 0,CustGender,CustAccountBalance,TransactionAmount (INR),CustomerAge,TransactionTimestamp,KMeans2,KMeans3,HDBSCAN,KMeans4
0,1.0,17874.44,459.0,26,1470159000.0,1,0,0,1
1,1.0,866503.21,2060.0,49,1470159000.0,1,0,0,1
2,1.0,973.46,566.0,30,1470170000.0,1,0,0,1
3,0.0,95075.54,148.0,40,1470168000.0,0,2,3,2
4,0.0,4279.22,289.11,38,1470177000.0,0,2,3,2


In [25]:
# Write the frame, now carrying the cluster-label columns, next to the input
# file; the row index is left out of the CSV.
data_file_new = 'bank-transactions-groups.csv'
data_path_new = path.join(data_dir, data_file_new)
df.to_csv(path_or_buf=data_path_new, index=False)