In [5]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# Read the bank transaction workbook into a DataFrame, then print its schema
# (column names, non-null counts and dtypes).
data_path = '../data/bank.xlsx'
transactions = pd.read_excel(data_path)
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 116201 entries, 0 to 116200
Data columns (total 9 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   Account No           116201 non-null  object        
 1   DATE                 116201 non-null  datetime64[ns]
 2   TRANSACTION DETAILS  113702 non-null  object        
 3   CHQ.NO.              905 non-null     float64       
 4   VALUE DATE           116201 non-null  datetime64[ns]
 5   WITHDRAWAL AMT       53549 non-null   float64       
 6   DEPOSIT AMT          62652 non-null   float64       
 7   BALANCE AMT          116201 non-null  float64       
 8   .                    116201 non-null  object        
dtypes: datetime64[ns](2), float64(4), object(3)
memory usage: 8.0+ MB


In [6]:
# Summary statistics (count, mean, min, quartiles, max, std) of the datetime
# and numeric columns; the last expression is rendered as the cell output.
summary_stats = transactions.describe()
summary_stats

Unnamed: 0,DATE,CHQ.NO.,VALUE DATE,WITHDRAWAL AMT,DEPOSIT AMT,BALANCE AMT
count,116201,905.0,116201,53549.0,62652.0,116201.0
mean,2017-05-20 00:08:40.477448448,791614.503867,2017-05-20 00:04:43.288439808,4489190.0,3806586.0,-1404852000.0
min,2015-01-01 00:00:00,1.0,2015-01-01 00:00:00,0.01,0.01,-2045201000.0
25%,2016-05-30 00:00:00,704231.0,2016-05-30 00:00:00,3000.0,99000.0,-1690383000.0
50%,2017-06-05 00:00:00,873812.0,2017-06-05 00:00:00,47083.0,426500.0,-1661395000.0
75%,2018-05-26 00:00:00,874167.0,2018-05-26 00:00:00,5000000.0,4746411.0,-1236888000.0
max,2019-03-05 00:00:00,874525.0,2019-03-05 00:00:00,459447500.0,544800000.0,8500000.0
std,,151205.93291,,10848500.0,8683093.0,534820200.0


In [7]:
# Count the missing values in every column; the last expression is rendered
# as the cell output.
missing_mask = transactions.isnull()
missing_per_column = missing_mask.sum()
missing_per_column

Account No                  0
DATE                        0
TRANSACTION DETAILS      2499
CHQ.NO.                115296
VALUE DATE                  0
WITHDRAWAL AMT          62652
DEPOSIT AMT             53549
BALANCE AMT                 0
.                           0
dtype: int64

In [0]:

# Standardize the data
# StandardScaler and DBSCAN only accept finite numeric input, but the raw frame
# also holds text columns ('Account No', 'TRANSACTION DETAILS', '.'), datetime
# columns and NaNs. Clustering therefore runs on an explicit list of amount
# columns instead of the whole frame. Listing the columns also keeps this cell
# re-runnable: the 'cluster' column written further down can never leak back
# into the feature matrix.
feature_columns = ['WITHDRAWAL AMT', 'DEPOSIT AMT', 'BALANCE AMT']

# The non-null counts of WITHDRAWAL AMT and DEPOSIT AMT add up to the row
# count, which suggests every row carries one or the other; a missing amount
# is therefore treated as 0 (no movement in that direction).
features = transactions[feature_columns].fillna(0.0)

scaler = StandardScaler()
transactions_scaled = scaler.fit_transform(features)

# DBSCAN clustering: small grid over the neighbourhood radius (eps) and the
# density threshold (min_samples).
eps_values = [0.1, 0.5, 1.0]
min_samples_values = [5, 10, 15]

best_score = -1
best_params = None
best_labels = None

# Keep the parameter pair that produces the most clusters; ties keep the
# first pair tried.
# NOTE(review): cluster count is a weak quality criterion (it favours
# fragmented solutions) -- consider a proper clustering metric.
for eps in eps_values:
    for min_samples in min_samples_values:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(transactions_scaled)
        # Number of clusters, ignoring noise (noise points are labelled -1)
        score = len(set(labels)) - (1 if -1 in labels else 0)
        if score > best_score:
            best_score = score
            best_params = (eps, min_samples)
            best_labels = labels

# Extracting the best parameters and clustering results
best_eps, best_min_samples = best_params
print("Best parameters: eps={}, min_samples={}".format(best_eps, best_min_samples))
print("Number of clusters found:", best_score)

# Assigning cluster labels to the original data
transactions['cluster'] = best_labels

# Accessing clustered transactions (a short preview per cluster, in label order)
for cluster_id in sorted(transactions['cluster'].unique()):
    cluster_transactions = transactions[transactions['cluster'] == cluster_id]
    print(f"Cluster {cluster_id}:")
    print(cluster_transactions.head())

# Analyzing anomalies: DBSCAN noise points (label -1) are treated as anomalies
anomaly_mask = best_labels == -1
anomalies = transactions[anomaly_mask]

# Analyzing characteristics of anomalies
anomalies_description = anomalies.describe()
print("Characteristics of anomalies:")
print(anomalies_description)

# Visualizing clusters and outliers
plt.figure(figsize=(10, 6))

# Plotting clustered transactions. Colour by best_labels (the selected run),
# not by `labels`, which only holds the last grid-search iteration.
plt.scatter(transactions['BALANCE AMT'], transactions['DATE'], c=best_labels, cmap='viridis', alpha=0.5)
plt.colorbar(label='Cluster')
plt.title('DBSCAN Clustering of Transactions')
plt.xlabel('BALANCE AMT')
plt.ylabel('DATE')
plt.grid(True)

# Highlighting anomalies on top of the clustered points
plt.scatter(anomalies['BALANCE AMT'], anomalies['DATE'], color='red', label='Anomalies')
plt.legend()

plt.show()