In [4]:
# Fraud Detection Project - End-to-End Analysis

# 1. Data Cleaning: Missing Values, Outliers, and Multicollinearity
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# Load the dataset
df = pd.read_csv("Fraud.csv")

# Drop irrelevant columns
df = df.drop(columns=["nameOrig", "nameDest", "newbalanceOrig"])

# Check missing values
print("Missing Values:\n", df.isnull().sum())

# Encode categorical variable 'type'
le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])

# Remove zero/negative amount values as outliers
df = df[df['amount'] > 0]

# Define features and target
X = df.drop(columns=["isFraud"])
y = df["isFraud"]

# Check for multicollinearity using VIF
X_const = add_constant(X)
vif_df = pd.DataFrame()
vif_df["Feature"] = X_const.columns
vif_df["VIF"] = [variance_inflation_factor(X_const.values, i) for i in range(X_const.shape[1])]
print("\nVIF Values:\n", vif_df)

# Drop high-VIF features (VIF > 10)
high_vif_features = vif_df[vif_df["VIF"] > 10]["Feature"]
high_vif_features = high_vif_features[high_vif_features != "const"]
X = X.drop(columns=high_vif_features)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

# 2. Model: Logistic Regression
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# 3. Variable Selection Explanation
# We kept only numeric features relevant to transaction behavior
# Removed high-cardinality IDs and multicollinear features

# 4. Model Performance
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 5. Key Predictors of Fraud (Feature Importance)
importance = pd.Series(model.coef_[0], index=X.columns)
print("\nFeature Importance:\n", importance.sort_values(ascending=False))

# 6. Do These Factors Make Sense?
# Yes: High amounts, transaction type (TRANSFER/CASH_OUT), and low sender balance often relate to fraud.

# 7. Infrastructure Update Prevention Measures
# - Enforce transaction limits
# - Real-time anomaly detection
# - Improve account verification (KYC)
# - Regular audits of transaction logs

# 8. Measuring Effectiveness of Security Actions
# - Track fraud rate pre/post updates
# - Monitor model prediction accuracy
# - Reduction in false positives
# - Collect feedback from risk/fraud teams

Missing Values:
 step              0
type              0
amount            0
oldbalanceOrg     0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


  return 1 - self.ssr/self.centered_tss



VIF Values:
           Feature        VIF
0           const   6.714066
1            step   1.008467
2            type   1.177533
3          amount   1.404912
4   oldbalanceOrg   1.143371
5  oldbalanceDest  31.180263
6  newbalanceDest  32.553350
7  isFlaggedFraud        NaN

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    314230
           1       0.74      0.08      0.14       343

    accuracy                           1.00    314573
   macro avg       0.87      0.54      0.57    314573
weighted avg       1.00      1.00      1.00    314573


Confusion Matrix:
 [[314221      9]
 [   317     26]]

Feature Importance:
 type              1.722119e-01
step              4.600142e-02
amount            1.512097e-06
oldbalanceOrg     2.140775e-08
isFlaggedFraud    0.000000e+00
dtype: float64
