In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def prepare_data(df):
    # Drop unnecessary columns
    df = df.drop(["transaction_id", "date"], axis=1)

    # Encode categorical features
    categorical_cols = ["card_type", "merchant", "gender"]
    for col in categorical_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

    # Scale numerical features
    numerical_cols = ["amount", "age", "income"]
    scaler = StandardScaler()
    df[numerical_cols] = scaler.fit_transform(df[numerical_cols])
    
    return df


def fraud_detection_model(df):
    print("\n--- Fraud Detection Model (with Hyperparameter Tuning) ---")
    X = df.drop("is_fraud", axis=1)
    y = df["is_fraud"]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=42, stratify=y
    )

    # Define Random Forest with hyperparameter tuning
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'bootstrap': [True, False]
    }

    rf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(
        estimator=rf,
        param_grid=param_grid,
        cv=3,
        n_jobs=-1,
        verbose=1,
        scoring='f1'
    )

    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    print(f"\nBest Hyperparameters: {grid_search.best_params_}")

    y_pred = best_model.predict(X_test)

    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))

    # Feature Importance
    feature_importances = pd.Series(best_model.feature_importances_, index=X.columns).sort_values(ascending=False)
    print("\nFeature Importances:")
    print(feature_importances)


def customer_segmentation_model(df):
    print("\n--- Customer Segmentation Model ---")
    # Aggregate transactions per customer
    customer_df = df.groupby("customer_id").agg({
        "amount": "sum",
        "age": "first",
        "income": "first",
        "is_fraud": "sum"  # Number of fraudulent transactions per customer
    }).reset_index()

    X_segmentation = customer_df[["amount", "age", "income"]]

    # Scale features for clustering
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_segmentation)

    # Determine optimal K using the Elbow Method
    wcss = []
    K_range = range(1, 11)
    for k in K_range:
        kmeans = KMeans(n_clusters=k, init='k-means++', random_state=42, n_init=10)
        kmeans.fit(X_scaled)
        wcss.append(kmeans.inertia_)

    # Save Elbow Plot
    plt.figure(figsize=(10, 6))
    plt.plot(K_range, wcss, marker='o', color='b')
    plt.title("Elbow Method for Optimal K")
    plt.xlabel("Number of Clusters (K)")
    plt.ylabel("WCSS")
    plt.grid(True)
    plt.savefig("C:/Users/ajroy/OneDrive/Desktop/CREDIT/docs/elbow_method.png")
    plt.close()

    # Find optimal k automatically using "elbow detection"
    deltas = np.diff(wcss)
    double_deltas = np.diff(deltas)
    optimal_k = np.argmin(double_deltas) + 2  # +2 because of diff index shift
    print(f"\nOptimal number of clusters detected: {optimal_k}")

    # Apply KMeans
    kmeans = KMeans(n_clusters=optimal_k, init='k-means++', random_state=42, n_init=10)
    customer_df["cluster"] = kmeans.fit_predict(X_scaled)

    print("\nCustomer Segments (first 5 rows with cluster):")
    print(customer_df.head())

    # Save cluster visualization
    plt.figure(figsize=(10, 8))
    sns.scatterplot(x=customer_df["amount"], y=customer_df["income"], hue=customer_df["cluster"], palette="viridis")
    plt.title("Customer Segments")
    plt.savefig("C:/Users/ajroy/OneDrive/Desktop/CREDIT/docs/customer_segments.png")
    plt.close()

    print("\n--- Model Interpretation ---")
    print("â€¢ Fraud Detection Model: Random Forest tuned using GridSearchCV for optimal performance.")
    print("â€¢ Customer Segmentation Model: Automatically selected optimal K and visualized clusters.")


# Main Execution
file_path = "C:/Users/ajroy/OneDrive/Desktop/CREDIT/DATA/credit_card_transactions.csv"
df = pd.read_csv(file_path)

processed_df = prepare_data(df.copy())
fraud_detection_model(processed_df.copy())
customer_segmentation_model(processed_df.copy())

print("\nâœ… Machine learning models trained and evaluated successfully.")
print("ðŸ“Š Visualizations saved to: CREDIT/docs/")



--- Fraud Detection Model (with Hyperparameter Tuning) ---
Fitting 3 folds for each of 216 candidates, totalling 648 fits

Best Hyperparameters: {'bootstrap': False, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2940
           1       1.00      0.70      0.82        60

    accuracy                           0.99      3000
   macro avg       1.00      0.85      0.91      3000
weighted avg       0.99      0.99      0.99      3000


Confusion Matrix:
[[2940    0]
 [  18   42]]

Feature Importances:
amount         0.855176
customer_id    0.043075
income         0.041392
age            0.027125
merchant       0.020081
card_type      0.009674
gender         0.003476
dtype: float64

--- Customer Segmentation Model ---

Optimal number of clusters detected: 7

Customer Segments (first 5 rows with cluster):
   customer_id    a