# Credit Card Transactions Analysis Project

This notebook analyzes credit card transaction data from India. It covers:
1. Data Cleaning and Preprocessing
2. Clustering Analysis (K-means, hierarchical agglomerative clustering / CAH, DBSCAN, GMM)
3. Amount Prediction using Ensemble Methods
4. Model Selection and Evaluation

In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import joblib

# Set display options
pd.set_option('display.max_columns', None)
plt.style.use('seaborn-v0_8')
%matplotlib inline

In [None]:
# Read the dataset
df = pd.read_csv('projet1.csv')

# Display basic information
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would append a stray "None" line.
df.info()

print("\nFirst few rows:")
print(df.head())

print("\nBasic statistics:")
print(df.describe())

print("\nMissing values:")
print(df.isnull().sum())

In [None]:
# Data Cleaning and Preprocessing
def handle_missing_values(df):
    """Return a copy of ``df`` with missing values imputed column by column.

    Numeric columns are filled with their median and object (text) columns
    with their most frequent value. The input frame is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that may contain missing values.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same columns and index as ``df``. A column whose
        values are all missing has no median or mode and is left unchanged.
    """
    df_clean = df.copy()

    # Fill numerical missing values with median.
    # 'number' covers every numeric width (int32, float32, ...), not only
    # int64/float64.
    numerical_columns = df_clean.select_dtypes(include='number').columns
    for col in numerical_columns:
        # Assign the result back: `df_clean[col].fillna(..., inplace=True)`
        # operates on a temporary selection and is not guaranteed to update
        # the frame (it is a no-op under pandas Copy-on-Write).
        df_clean[col] = df_clean[col].fillna(df_clean[col].median())

    # Fill categorical missing values with mode
    categorical_columns = df_clean.select_dtypes(include=['object']).columns
    for col in categorical_columns:
        modes = df_clean[col].mode()
        # mode() is empty when the whole column is missing; nothing to fill with.
        if not modes.empty:
            df_clean[col] = df_clean[col].fillna(modes.iloc[0])

    return df_clean

# Impute missing values before any encoding or scaling takes place
df_clean = handle_missing_values(df)

# Integer-encode every object column; each fitted encoder is kept under its
# column name so the same mapping can be reapplied later.
categorical_columns = df_clean.select_dtypes(include=['object']).columns
label_encoders = {
    col: LabelEncoder().fit(df_clean[col]) for col in categorical_columns
}
df_encoded = df_clean.copy()
for col, encoder in label_encoders.items():
    df_encoded[col] = encoder.transform(df_clean[col])

# Standardise all columns of the encoded frame (encoded categoricals included)
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_encoded)
df_scaled = pd.DataFrame(scaled_values, columns=df_encoded.columns)

# Persist the fitted preprocessors.
# NOTE(review): the scaler is fitted on every column of df_encoded, including
# 'Amount', while the regression cell trains on the unscaled df_encoded --
# confirm how the saved scaler is meant to be combined with the saved model.
joblib.dump(scaler, 'scaler.joblib')
joblib.dump(label_encoders, 'label_encoders.joblib')

In [None]:
# Clustering Analysis
def evaluate_clustering(X, labels, algorithm_name):
    """Print the silhouette and Calinski-Harabasz scores of one clustering.

    Points labelled ``-1`` (the noise marker used by DBSCAN) are left out of
    the scores because noise is not a cluster. Both scores need at least two
    clusters and fewer clusters than points; scikit-learn raises ``ValueError``
    otherwise. In that situation a short explanation is printed instead, so a
    degenerate result (for example DBSCAN flagging everything as noise) does
    not abort the notebook.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data the clustering was computed on.
    labels : array-like of shape (n_samples,)
        Cluster label of each row of ``X``.
    algorithm_name : str
        Name shown in the printed header.

    Returns
    -------
    None
    """
    labels = np.asarray(labels)
    is_clustered = labels != -1
    n_noise = int((~is_clustered).sum())
    n_points = int(is_clustered.sum())
    n_found = np.unique(labels[is_clustered]).size

    print(f"\n{algorithm_name} Metrics:")
    if n_noise:
        print(f"Noise points excluded: {n_noise}")

    # Valid range for both metrics: 2 <= n_clusters <= n_points - 1.
    if n_found < 2 or n_found >= n_points:
        print(f"Scores undefined: {n_found} cluster(s) over {n_points} clustered point(s)")
        return

    # Imported only once the labelling is known to be scoreable.
    from sklearn.metrics import silhouette_score, calinski_harabasz_score

    if n_noise:
        # Drop the noise rows; inputs without noise are passed through untouched.
        X = np.asarray(X)[is_clustered]
        labels = labels[is_clustered]

    silhouette = silhouette_score(X, labels)
    calinski = calinski_harabasz_score(X, labels)

    print(f"Silhouette Score: {silhouette:.3f}")
    print(f"Calinski-Harabasz Score: {calinski:.3f}")

# Cluster count shared by the algorithms that take one (DBSCAN does not)
n_clusters = 5


def _fit_and_score(estimator, algorithm_name):
    """Cluster df_scaled with `estimator`, print its metrics, return the labels."""
    assigned = estimator.fit_predict(df_scaled)
    evaluate_clustering(df_scaled, assigned, algorithm_name)
    return assigned


# K-means
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
kmeans_labels = _fit_and_score(kmeans, 'K-means')

# Hierarchical (agglomerative) clustering
hierarchical = AgglomerativeClustering(n_clusters=n_clusters)
hierarchical_labels = _fit_and_score(hierarchical, 'Hierarchical')

# DBSCAN
dbscan = DBSCAN(eps=0.5, min_samples=5)
dbscan_labels = _fit_and_score(dbscan, 'DBSCAN')

# Gaussian mixture model
gmm = GaussianMixture(n_components=n_clusters, random_state=42)
gmm_labels = _fit_and_score(gmm, 'GMM')

In [None]:
# Split the encoded frame into the regression target and its predictors
y = df_encoded['Amount']
X = df_encoded.drop(columns=['Amount'])

# Hold out 20% of the rows for testing; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Train and evaluate models
def evaluate_model(model, X_train, X_test, y_train, y_test, model_name):
    """Fit ``model`` on the training split and print its test RMSE and R2.

    Parameters
    ----------
    model : object exposing ``fit(X, y)`` and ``predict(X)``
        Model to train; it is fitted in place.
    X_train, y_train
        Training features and target.
    X_test, y_test
        Held-out features and target used for the printed scores.
    model_name : str
        Name shown in the printed header.

    Returns
    -------
    The same ``model`` object, after fitting.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    # RMSE is the square root of the mean squared error on the test split
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    r2 = r2_score(y_test, predictions)

    print(
        f"\n{model_name} Performance:",
        f"RMSE: {rmse:.2f}",
        f"R2 Score: {r2:.3f}",
        sep="\n",
    )

    return model

# Random Forest
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model = evaluate_model(rf_model, X_train, X_test, y_train, y_test, 'Random Forest')

# XGBoost
xgb_model = xgb.XGBRegressor(random_state=42)
xgb_model = evaluate_model(xgb_model, X_train, X_test, y_train, y_test, 'XGBoost')

# AdaBoost
ada_model = AdaBoostRegressor(random_state=42)
ada_model = evaluate_model(ada_model, X_train, X_test, y_train, y_test, 'AdaBoost')

# Save the model that actually scores best on the held-out split instead of
# assuming it is XGBoost. evaluate_model() returns only the fitted model, so
# the R2 values are recomputed here from the same test data.
# NOTE(review): choosing on the test split makes the winner's reported score
# optimistic; use a separate validation split if an unbiased estimate matters.
fitted_models = {
    'Random Forest': rf_model,
    'XGBoost': xgb_model,
    'AdaBoost': ada_model,
}
test_r2 = {
    name: r2_score(y_test, model.predict(X_test))
    for name, model in fitted_models.items()
}
best_model_name = max(test_r2, key=test_r2.get)
print(f"\nBest model: {best_model_name} (R2 Score: {test_r2[best_model_name]:.3f})")
joblib.dump(fitted_models[best_model_name], 'credit_card_model.joblib')