# Anomaly Detection in Credit Card Transactions with K-Means

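This notebook applies **K-Means clustering** to detect anomalous transactions: each transaction's distance to its assigned cluster center is computed, and the 5% of transactions farthest from their centers are flagged as anomalies. The optimal number of clusters is selected using the **silhouette score**.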

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# Seed NumPy's global random number generator so repeated runs match
SEED = 42
np.random.seed(SEED)

## Load Data

In [None]:
# Read the transaction records straight from the hosted CSV file
url = 'https://raw.githubusercontent.com/trkrkn/aiforfinance/main/transaction_data.csv'
df_original = pd.read_csv(filepath_or_buffer=url)

## Preprocess Data

In [None]:
# Work on an independent deep copy so the loaded frame is never modified
df = df_original.copy(deep=True)

In [None]:
# Derive the day of week (0 = Monday, 6 = Sunday) and drop the raw 'date' column.
# The frame is rebuilt from df_original (assign/drop return new frames, so
# df_original itself is untouched) instead of mutating df in place. This keeps
# the cell idempotent: re-running it no longer raises a KeyError on the
# already-dropped 'date' column.
df = (
    df_original
    .assign(day_of_week=lambda frame: pd.to_datetime(frame['date']).dt.dayofweek)
    .drop(columns=['date'])
)

# Columns fed to the model, split by how they are preprocessed
categorical_cols = ['sender', 'receiver']
numeric_cols = ['latitude', 'longitude', 'amount', 'day_of_week']

# Separate features
features = df[categorical_cols + numeric_cols]

# One-hot encode sender and receiver (dense output so it can be stacked below)
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
cat_encoded = encoder.fit_transform(features[categorical_cols])
cat_feature_names = encoder.get_feature_names_out(categorical_cols)

# Scale numeric features to zero mean and unit variance
scaler = StandardScaler()
num_scaled = scaler.fit_transform(features[numeric_cols].values)

# Combine all features column-wise: scaled numerics first, then one-hot columns.
# np.hstack uses the NumPy import from the top of the notebook, so no extra
# mid-notebook import is needed.
X_combined = np.hstack([num_scaled, cat_encoded])

## Optimize Number of Clusters

In [None]:
# Fit K-Means for every candidate k in 2..9 and record its silhouette score.
# random_state is 42, the same seed the final model in the next section uses,
# so the score reported for the chosen k describes the clustering that is
# actually applied (previously the search used a different seed, 0).
silhouette_scores = {}
for k in range(2, 10):
    kmeans_tmp = KMeans(n_clusters=k, random_state=42)
    labels = kmeans_tmp.fit_predict(X_combined)
    score = silhouette_score(X_combined, labels)
    silhouette_scores[k] = score

# Choose the k with the highest score; on ties max() keeps the first
# (smallest) k because the dict preserves insertion order.
optimal_k = max(silhouette_scores, key=silhouette_scores.get)
print(f"Optimal number of clusters: {optimal_k} (Silhouette Score = {silhouette_scores[optimal_k]:.2f})")


In [None]:
# Line plot of the silhouette score against each number of clusters tried
fig, ax = plt.subplots(figsize=(8, 5))
candidate_ks = list(silhouette_scores)
ax.plot(candidate_ks, [silhouette_scores[n] for n in candidate_ks], marker='o')
ax.set_title("Silhouette Scores for Different k")
ax.set_xlabel("Number of Clusters (k)")
ax.set_ylabel("Silhouette Score")
ax.grid(True)
plt.show()


## Apply K-Means and Detect Anomalies

In [None]:
# Fit the final model with the selected number of clusters
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
kmeans.fit(X_combined)
clusters = kmeans.labels_
centroids = kmeans.cluster_centers_

# Euclidean distance from every row to the centroid of its assigned cluster
assigned_centroids = centroids[clusters]
distances = np.linalg.norm(X_combined - assigned_centroids, axis=1)

# Flag rows whose distance exceeds the 0.95 quantile of all distances
threshold = np.quantile(distances, 0.95)
df['Cluster'] = clusters
df['Distance_to_Center'] = distances
df['Anomaly'] = distances > threshold

In [None]:
# Preview the first ten transactions with their cluster, distance and anomaly flag
preview_columns = ['sender', 'receiver', 'amount', 'Cluster', 'Distance_to_Center', 'Anomaly']
df.loc[:, preview_columns].head(10)

In [None]:
# Number of flagged transactions. Summing the boolean 'Anomaly' column counts
# its True values directly, avoiding the `== True` comparison and the
# intermediate filtered DataFrame; int() keeps the result a plain Python int.
int(df['Anomaly'].sum())