In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN

# Load transaction data from the chosen file
transactions = pd.read_excel('../data/bank.xlsx')

In [2]:

# Standardize the data
scaler = StandardScaler()
transactions_scaled = scaler.fit_transform(transactions)

# DBSCAN clustering
eps_values = [0.1, 0.5, 1.0]  
min_samples_values = [5, 10, 15]  

best_score = -1
best_params = None
best_labels = None

# Number of clusters, ignoring noise
for eps in eps_values:
    for min_samples in min_samples_values:
        dbscan = DBSCAN(eps=eps, min_samples=min_samples)
        labels = dbscan.fit_predict(transactions_scaled)
        score = len(set(labels)) - (1 if -1 in labels else 0)  
        if score > best_score:
            best_score = score
            best_params = (eps, min_samples)
            best_labels = labels

# Extracting the best parameters and clustering results
best_eps, best_min_samples = best_params
print("Best parameters: eps={}, min_samples={}".format(best_eps, best_min_samples))
print("Number of clusters found:", best_score)

# Assigning cluster labels to the original data
transactions['cluster'] = best_labels

# Accessing clustered transactions
for cluster_id in transactions['cluster'].unique():
    cluster_transactions = transactions[transactions['cluster'] == cluster_id]
    print(f"Cluster {cluster_id}:")
    print(cluster_transactions.head())

# Analyzing anomalies
anomaly_mask = best_labels == -1  
anomalies = transactions[anomaly_mask]

# Analyzing characteristics of anomalies
anomalies_description = anomalies.describe()
print("Characteristics of anomalies:")
print(anomalies_description)

# Visualizing clusters and outliers
plt.figure(figsize=(10, 6))

# Plotting clustered transactions
plt.scatter(transactions['BALANCE AMT'], transactions['DATE'], c=labels, cmap='viridis', alpha=0.5)
plt.colorbar(label='Cluster')
plt.title('DBSCAN Clustering of Transactions')
plt.xlabel('BALANCE AMT')
plt.ylabel('DATE')
plt.grid(True)

# Highlighting anomalies
plt.scatter(anomalies['BALANCE AMT'], anomalies['DATE'], color='red', label='Anomalies')
plt.legend()

plt.show()

ValueError: could not convert string to float: "409000611074'"