In [None]:
"""
OBJECTIVE:
Build a machine learning model to detect fraudulent credit card transactions
using highly imbalanced data and evaluate it using Precision, Recall, F1,
ROC-AUC, and PR-AUC. Tune decision threshold to maximize recall.
"""

# ============================================================================
# STEP 1: Import Libraries
# ============================================================================
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    roc_curve,
    precision_recall_curve,
    auc
)

import warnings
# Blanket filter: every warning category is suppressed from this point on, so
# warnings raised by any later cell are never displayed.
# NOTE(review): consider narrowing this to specific categories so that
# diagnostics from the modelling cells stay visible.
warnings.filterwarnings("ignore")

In [None]:
# ============================================================================
# STEP 2: Load Data
# ============================================================================
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file exists. Point DATA_PATH at your local copy.
DATA_PATH = r"D:\CODSOFT INTERNSHIP\Task_2\Dataset\fraud_train.csv"
df = pd.read_csv(DATA_PATH)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# must not be wrapped in print() (that emits a stray "None" line).
df.info()
print("\nDATASET SHAPE:", df.shape)

In [None]:
# ============================================================================
# STEP 3: Data Cleaning
# ============================================================================
# Remove columns that are not used as model inputs.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps the
# cell re-runnable: a second execution no longer raises KeyError because the
# columns are already gone.
df = df.drop(columns=['cc_num', 'Unnamed: 0', 'trans_num'], errors='ignore')

In [None]:
# ============================================================================
# STEP 4: Check Class Imbalance
# ============================================================================
# Row count per target value (label 1 is the fraud class).
class_counts = df['is_fraud'].value_counts()
fraud_share = class_counts[1] / class_counts.sum()

print("\nClass Distribution:")
print(class_counts)
print("\nFraud Percentage:")
print(fraud_share * 100)

# Bar chart of the two classes to make the imbalance visible at a glance.
ax = sns.countplot(data=df, x='is_fraud')
ax.set_title("Class Distribution")
plt.show()


In [None]:
# ============================================================================
# STEP 5: Feature Engineering
# ============================================================================
# Parse the two date-like columns once and derive every feature from them.
txn_time = pd.to_datetime(df['trans_date_trans_time'])
birth_date = pd.to_datetime(df['dob'])

# New columns are appended in this order; the raw datetime columns are then
# removed because only the derived features are kept.
# customer_age is whole 365-day years elapsed at transaction time, so leap
# days are ignored (an approximation of calendar age).
df = df.assign(
    transaction_hour=txn_time.dt.hour,
    transaction_day=txn_time.dt.day,
    transaction_month=txn_time.dt.month,
    customer_age=(txn_time - birth_date).dt.days // 365,
).drop(columns=['trans_date_trans_time', 'dob'])

In [None]:
# ============================================================================
# STEP 6: Split Features and Target
# ============================================================================
# y is the fraud label; X is every remaining column.
y = df['is_fraud']
X = df.drop(columns=['is_fraud'])

In [None]:
# ============================================================================
# STEP 7: Train-Test Split
# ============================================================================
# 80/20 hold-out split. Stratifying on y keeps the fraud ratio the same in
# both parts; the fixed seed makes the split reproducible.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [None]:
# ============================================================================
# STEP 8: Memory Fix - Reduce High Cardinality
# ============================================================================
print("\nFixing memory issues...")

# Columns with more distinct values than this are capped to their most
# frequent values; everything else is folded into a single 'Other' bucket.
MAX_CATEGORIES = 100

# Find categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns

# Reduce categories in each column
for col in categorical_cols:
    n_unique = X_train[col].nunique()
    if n_unique <= MAX_CATEGORIES:
        continue

    # The kept categories are chosen from the training split only and then
    # applied unchanged to the test split, so no test information leaks in.
    top_categories = X_train[col].value_counts().head(MAX_CATEGORIES).index

    # Vectorised replacement: keep a value where it is one of the top
    # categories, otherwise substitute 'Other'. This gives the same result as
    # a per-row Python lambda without calling Python once per row.
    X_train[col] = X_train[col].where(X_train[col].isin(top_categories), 'Other')
    X_test[col] = X_test[col].where(X_test[col].isin(top_categories), 'Other')
    print(f"Reduced {col}: {n_unique} -> {X_train[col].nunique()} categories")

In [None]:
# ============================================================================
# STEP 9: Preprocessing Pipeline
# ============================================================================
# Select numeric columns by kind rather than by exact width. Listing only
# 'int64'/'float64' misses other integer/float widths: on recent pandas
# versions the `.dt.hour/.day/.month` accessors return int32, so the engineered
# time features would match neither selector and be silently discarded
# (ColumnTransformer drops unlisted columns by default).
numeric_features = X_train.select_dtypes(include='number').columns
categorical_features = X_train.select_dtypes(include=['object']).columns

# Surface anything that still is not routed to a transformer instead of
# letting it vanish without notice.
unrouted_features = X_train.columns.difference(
    numeric_features.union(categorical_features)
)
if len(unrouted_features) > 0:
    print("Columns not used by the preprocessor:", list(unrouted_features))

# Scale numeric inputs; one-hot encode categorical inputs. Categories unseen
# during fit are encoded as all-zeros rather than raising, and the encoded
# block is kept sparse.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=True),
         categorical_features)
    ]
)

In [None]:

# ============================================================================
# STEP 10: Logistic Regression Model
# ============================================================================
banner = "=" * 60
print("\n" + banner)
print("TRAINING LOGISTIC REGRESSION")
print(banner)

# class_weight='balanced' re-weights the classes to counter the imbalance.
lr_model = LogisticRegression(
    solver='saga',
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1
)
lr_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', lr_model)
])

# Fit on the training split, then score the held-out split.
lr_pipeline.fit(X_train, y_train)

y_pred_lr = lr_pipeline.predict(X_test)               # hard labels
y_prob_lr = lr_pipeline.predict_proba(X_test)[:, 1]   # P(fraud)

# Headline metrics; ROC AUC is computed from the probabilities, the others
# from the hard labels.
print("\nLogistic Regression Results:")
for label, metric, scores in (
    ("Accuracy :", accuracy_score, y_pred_lr),
    ("Precision:", precision_score, y_pred_lr),
    ("Recall   :", recall_score, y_pred_lr),
    ("F1 Score :", f1_score, y_pred_lr),
    ("ROC AUC  :", roc_auc_score, y_prob_lr),
):
    print(label, metric(y_test, scores))

# Confusion matrix
cm_lr = confusion_matrix(y_test, y_pred_lr)
fig, ax = plt.subplots()
sns.heatmap(cm_lr, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set(
    xlabel="Predicted",
    ylabel="Actual",
    title="Logistic Regression Confusion Matrix",
)
plt.show()

# ROC curve, with the diagonal as the chance-level reference.
fpr_lr, tpr_lr, _ = roc_curve(y_test, y_prob_lr)
fig, ax = plt.subplots()
ax.plot(fpr_lr, tpr_lr, label="Logistic Regression")
ax.plot([0, 1], [0, 1], linestyle='--')
ax.set(
    xlabel="False Positive Rate",
    ylabel="True Positive Rate",
    title="ROC Curve - Logistic Regression",
)
ax.legend()
plt.show()

# Precision-recall curve; the area is integrated over recall with the
# trapezoidal rule.
precision_lr, recall_lr, _ = precision_recall_curve(y_test, y_prob_lr)
pr_auc_lr = auc(recall_lr, precision_lr)

fig, ax = plt.subplots()
ax.plot(recall_lr, precision_lr, label="Logistic Regression")
ax.set(
    xlabel="Recall",
    ylabel="Precision",
    title=f"Precision-Recall Curve (PR-AUC = {pr_auc_lr:.4f})",
)
ax.legend()
ax.grid()
plt.show()

print("PR AUC:", pr_auc_lr)


In [None]:
# ============================================================================
# STEP 11: Random Forest Model
# ============================================================================
print("\n" + "="*60)
print("TRAINING RANDOM FOREST")
print("="*60)

# Same preprocessing as the logistic regression pipeline, followed by a
# class-weighted random forest.
rf_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('model', RandomForestClassifier(
        n_estimators=100,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
])

# Train
rf_pipeline.fit(X_train, y_train)

# Predict: hard labels and P(fraud)
y_pred_rf = rf_pipeline.predict(X_test)
y_prob_rf = rf_pipeline.predict_proba(X_test)[:, 1]

# Evaluate
print("\nRandom Forest Results:")
print("Accuracy :", accuracy_score(y_test, y_pred_rf))
print("Precision:", precision_score(y_test, y_pred_rf))
print("Recall   :", recall_score(y_test, y_pred_rf))
print("F1 Score :", f1_score(y_test, y_pred_rf))
print("ROC AUC  :", roc_auc_score(y_test, y_prob_rf))

# Confusion Matrix
cm_rf = confusion_matrix(y_test, y_pred_rf)
sns.heatmap(cm_rf, annot=True, fmt='d', cmap='Greens')
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Random Forest Confusion Matrix")
plt.show()

# ROC Curve -- added so this model is evaluated with the same diagnostics as
# the logistic regression.
fpr_rf, tpr_rf, _ = roc_curve(y_test, y_prob_rf)
plt.plot(fpr_rf, tpr_rf, label="Random Forest")
plt.plot([0, 1], [0, 1], linestyle='--')
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve - Random Forest")
plt.legend()
plt.show()

# Precision-Recall Curve and PR-AUC (trapezoidal area over recall), the metric
# named in the notebook objective but previously reported for only one model.
precision_rf, recall_rf, _ = precision_recall_curve(y_test, y_prob_rf)
pr_auc_rf = auc(recall_rf, precision_rf)

plt.plot(recall_rf, precision_rf, label="Random Forest")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title(f"Precision-Recall Curve (PR-AUC = {pr_auc_rf:.4f})")
plt.legend()
plt.grid()
plt.show()

print("PR AUC:", pr_auc_rf)

In [None]:
# ============================================================================
# STEP 12: Threshold Optimization
# ============================================================================
print("\n" + "="*60)
print("THRESHOLD OPTIMIZATION")
print("="*60)


def _sweep_thresholds(y_true, y_prob, thresholds):
    """Score one model at every candidate decision threshold.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_prob : array of positive-class probabilities, aligned with ``y_true``.
    thresholds : iterable of cut-offs; a sample is flagged when its
        probability is greater than or equal to the cut-off.

    Returns
    -------
    pandas.DataFrame with columns Threshold, Precision, Recall and F1, one row
    per threshold in input order.
    """
    rows = []
    for cutoff in thresholds:
        flagged = (y_prob >= cutoff).astype(int)
        rows.append([
            cutoff,
            # zero_division=0: when nothing is flagged, precision is undefined;
            # report 0 instead of emitting a warning.
            precision_score(y_true, flagged, zero_division=0),
            recall_score(y_true, flagged),
            f1_score(y_true, flagged, zero_division=0)
        ])
    return pd.DataFrame(rows, columns=["Threshold", "Precision", "Recall", "F1"])


def _pick_max_recall_threshold(threshold_df):
    """Return the threshold with the highest recall.

    Ties on recall are resolved in favour of the higher precision (and then
    the higher threshold), so recall is never traded for a needlessly worse
    precision. A plain ``idxmax`` would take the first, i.e. lowest, threshold.
    """
    ranked = threshold_df.sort_values(
        ["Recall", "Precision", "Threshold"], ascending=False
    )
    return ranked["Threshold"].iloc[0]


def _plot_threshold_sweep(ax, threshold_df, best_threshold, title):
    """Draw precision/recall/F1 against threshold and mark the chosen cut-off."""
    for column, marker in (("Precision", "o"), ("Recall", "s"), ("F1", "^")):
        ax.plot(threshold_df["Threshold"], threshold_df[column],
                marker=marker, label=column)
    ax.axvline(best_threshold, color='red', linestyle='--',
               label=f'Best: {best_threshold:.2f}')
    ax.set_xlabel("Threshold")
    ax.set_ylabel("Score")
    ax.set_title(title)
    ax.legend()
    ax.grid()


# Candidate thresholds 0.05, 0.10, ..., 0.85. Built from integers so every
# value is the closest float to the intended decimal; a float-step arange
# accumulates error (e.g. 0.15000000000000002), which can exclude samples
# whose probability sits exactly on a grid value.
thresholds = np.arange(5, 90, 5) / 100

threshold_df_lr = _sweep_thresholds(y_test, y_prob_lr, thresholds)
threshold_df_rf = _sweep_thresholds(y_test, y_prob_rf, thresholds)

print("\nLogistic Regression - Threshold Results:")
print(threshold_df_lr)

print("\nRandom Forest - Threshold Results:")
print(threshold_df_rf)

# Find best thresholds (maximize recall).
# NOTE(review): recall cannot increase as the threshold rises, so this
# criterion favours the low end of the grid; the thresholds are also chosen on
# the same test split used for the final report, so those numbers are
# optimistic. Consider a precision floor and a separate validation split.
best_threshold_lr = _pick_max_recall_threshold(threshold_df_lr)
best_threshold_rf = _pick_max_recall_threshold(threshold_df_rf)

print(f"\nBest Threshold - Logistic Regression: {best_threshold_lr}")
print(f"Best Threshold - Random Forest: {best_threshold_rf}")

# Plot threshold analysis: one panel per model
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
_plot_threshold_sweep(ax1, threshold_df_lr, best_threshold_lr,
                      "Logistic Regression - Threshold Tuning")
_plot_threshold_sweep(ax2, threshold_df_rf, best_threshold_rf,
                      "Random Forest - Threshold Tuning")

plt.tight_layout()
plt.show()

In [None]:
# ============================================================================
# STEP 13: Apply Optimized Thresholds
# ============================================================================
banner = "=" * 60
print("\n" + banner)
print("RESULTS WITH OPTIMIZED THRESHOLDS")
print(banner)

# Re-label the test set with each model's tuned cut-off.
y_pred_lr_opt = (y_prob_lr >= best_threshold_lr).astype(int)
y_pred_rf_opt = (y_prob_rf >= best_threshold_rf).astype(int)

# Per-class precision / recall / F1 report for each model.
class_names = ['Legitimate', 'Fraud']
for heading, tuned_pred in (
    ("\nLogistic Regression (Optimized):", y_pred_lr_opt),
    ("\nRandom Forest (Optimized):", y_pred_rf_opt),
):
    print(heading)
    print(classification_report(y_test, tuned_pred, target_names=class_names))

# Confusion matrices at the tuned thresholds, one figure per model.
cm_lr_opt = confusion_matrix(y_test, y_pred_lr_opt)
cm_rf_opt = confusion_matrix(y_test, y_pred_rf_opt)

for matrix, palette, caption in (
    (cm_lr_opt, 'Reds', "Logistic Regression - Optimized Threshold"),
    (cm_rf_opt, 'Oranges', "Random Forest - Optimized Threshold"),
):
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(matrix, annot=True, fmt='d', cmap=palette, ax=ax)
    ax.set(xlabel="Predicted", ylabel="Actual", title=caption)
    plt.show()

In [None]:
# ============================================================================
# STEP 14: Final Model Comparison
# ============================================================================
banner = "=" * 60
print("\n" + banner)
print("FINAL MODEL COMPARISON")
print(banner)

# The four variants, in table order, with the hard predictions each produced.
model_labels = [
    'Logistic Regression (Default)',
    'Logistic Regression (Optimized)',
    'Random Forest (Default)',
    'Random Forest (Optimized)'
]
variant_preds = [y_pred_lr, y_pred_lr_opt, y_pred_rf, y_pred_rf_opt]

# ROC-AUC is computed from probabilities, so it is the same for the default
# and optimized variant of a model; compute it once per model.
roc_auc_lr = roc_auc_score(y_test, y_prob_lr)
roc_auc_rf = roc_auc_score(y_test, y_prob_rf)

comparison = pd.DataFrame({
    'Model': model_labels,
    'Threshold': [0.50, best_threshold_lr, 0.50, best_threshold_rf],
    'Precision': [precision_score(y_test, pred) for pred in variant_preds],
    'Recall': [recall_score(y_test, pred) for pred in variant_preds],
    'F1': [f1_score(y_test, pred) for pred in variant_preds],
    'ROC-AUC': [roc_auc_lr, roc_auc_lr, roc_auc_rf, roc_auc_rf]
})

print("\n")
print(comparison.to_string(index=False))

# One bar chart per metric on a 2x2 grid (row-major: Precision, Recall,
# F1 Score, ROC-AUC).
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
bar_positions = range(len(comparison))
metric_panels = [
    ('Precision', "Precision"),
    ('Recall', "Recall"),
    ('F1', "F1 Score"),
    ('ROC-AUC', "ROC-AUC"),
]

for ax, (column, caption) in zip(axes.ravel(), metric_panels):
    ax.bar(bar_positions, comparison[column])
    ax.set_xticks(bar_positions)
    ax.set_xticklabels(comparison['Model'], rotation=45, ha='right')
    ax.set_ylabel(caption)
    ax.set_title(f"{caption} Comparison")

plt.tight_layout()
plt.show()


In [None]:
# ============================================================================
# STEP 15: Final Recommendation
# ============================================================================
# Pick the table row with the highest recall (first row wins on ties).
# idxmax() returns an index *label*, so the row must be fetched with .loc;
# .iloc is positional and only happened to work while the index is the default
# 0..n-1 range.
best_model_idx = comparison['Recall'].idxmax()
best_model = comparison.loc[best_model_idx]

print("\n" + "="*60)
print("RECOMMENDATION")
print("="*60)
print(f"\nBest Model: {best_model['Model']}")
print("\nPerformance:")
print(f"  Threshold: {best_model['Threshold']:.2f}")
print(f"  Precision: {best_model['Precision']:.4f}")
print(f"  Recall:    {best_model['Recall']:.4f}")
print(f"  F1 Score:  {best_model['F1']:.4f}")
print(f"  ROC-AUC:   {best_model['ROC-AUC']:.4f}")

print("\n" + "="*60)
print("ANALYSIS COMPLETE!")
print("="*60)

In [None]:
import joblib

# Output file -> fitted object, written in this order.
# NOTE(review): "fraud_model.pkl" and "logistic_model.pkl" hold the same
# logistic-regression pipeline, and the tuned decision thresholds are not
# saved with the models -- confirm that both are intended.
artifacts = {
    "fraud_model.pkl": lr_pipeline,
    "logistic_model.pkl": lr_pipeline,
    "random_forest_model.pkl": rf_pipeline,
    "preprocessor.pkl": preprocessor,
}
for filename, fitted_object in artifacts.items():
    joblib.dump(fitted_object, filename)
