In [3]:
# Step 1: Dataset Loading
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv(r"C:\Users\SATULURI VINAY\Downloads\fraud_dataset.csv")
print("Dataset Loaded Successfully! ✅")
print("Dataset Preview:\n", df.head())

# Step 2: Data Preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Drop completely missing column (Check_Number)
df = df.drop(columns=["Check_Number"], errors="ignore")

# Encode categorical columns using Label Encoding
categorical_columns = df.select_dtypes(include=["object"]).columns
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Convert boolean values to integers
df = df.astype({col: "int" for col in df.select_dtypes(include=["bool"]).columns})

# Define features & target
X = df.drop(columns=["Suspicious_Activity_Flag"])
y = df["Suspicious_Activity_Flag"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("\nData Preprocessing Completed! ✅")
print("Processed Data Sample:\n", X_train.head())

# Step 3: Model Comparison
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(eval_metric="logloss")
}

results = []
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    roc_auc = roc_auc_score(y_test, y_pred)
    results.append([name, accuracy, precision, recall, f1, roc_auc])

results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-Score", "ROC-AUC"])
print("\nModel Comparison Completed! ✅")
print(results_df)

# Step 4: Handling Imbalance with SMOTE
from imblearn.over_sampling import SMOTE
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)
print("\nSMOTE Applied! ✅")
print("Resampled Class Distribution:")
print(pd.Series(y_train_resampled).value_counts())

# Step 5: Feature Engineering
from sklearn.feature_selection import SelectFromModel
feature_selector = SelectFromModel(RandomForestClassifier(n_estimators=100, random_state=42))
feature_selector.fit(X_train_resampled, y_train_resampled)
X_train_selected = feature_selector.transform(X_train_resampled)
X_test_selected = feature_selector.transform(X_test)
selected_features = X.columns[feature_selector.get_support()]
print("\nFeature Engineering Completed! ✅")
print("Selected Features:\n", selected_features)

# Step 6: Hyperparameter Tuning
from sklearn.model_selection import GridSearchCV
param_grid_rf = {"n_estimators": [100, 200], "max_depth": [10, 20]}
best_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=3, scoring='f1')
best_rf.fit(X_train_selected, y_train_resampled)
best_rf = best_rf.best_estimator_

param_grid_xgb = {"n_estimators": [100, 200], "max_depth": [3, 6]}
best_xgb = GridSearchCV(XGBClassifier(eval_metric='logloss'), param_grid_xgb, cv=3, scoring='f1')
best_xgb.fit(X_train_selected, y_train_resampled)
best_xgb = best_xgb.best_estimator_

print("\nHyperparameter Tuning Completed! ✅")
print("Best Random Forest Params:", best_rf.get_params())
print("Best XGBoost Params:", best_xgb.get_params())

# Step 7: Training and Evaluation
def evaluate_model(name, y_test, y_pred):
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    print(f"\n{name} Performance:")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1-Score: {f1:.4f}")

y_pred_rf = best_rf.predict(X_test_selected)
y_pred_xgb = best_xgb.predict(X_test_selected)
evaluate_model("Random Forest", y_test, y_pred_rf)
evaluate_model("XGBoost", y_test, y_pred_xgb)

# Step 8: Weighted Voting Classifier
from sklearn.ensemble import VotingClassifier
weighted_voting_clf = VotingClassifier(estimators=[("RF", best_rf), ("XGB", best_xgb)], voting="soft", weights=[0.6, 0.4])
weighted_voting_clf.fit(X_train_selected, y_train_resampled)
print("\nEnsemble Model Training Completed! ✅")

# Step 9: Saving the Model
import joblib
joblib.dump(weighted_voting_clf, "final_fraud_detection_model.pkl")
print("\nFinal Model Saved Successfully! ✅")

# Step 10: Evaluating on Test Data
final_model = joblib.load("final_fraud_detection_model.pkl")
final_pred = final_model.predict(X_test_selected)
evaluate_model("Final Ensemble Model", y_test, final_pred)


Dataset Loaded Successfully! ✅
Dataset Preview:
    Transaction_ID  User_ID  Transaction_Amount Transaction_Type  \
0          305660     4175              341.78       withdrawal   
1          840039     1187             6435.22       withdrawal   
2          558007     2718              477.72       withdrawal   
3          781655     2337             9031.90       withdrawal   
4          948672     1424             5572.46          deposit   

             Transaction_Date Transaction_Location Merchant_Category  \
0  2025-01-27 00:02:26.651124                Paris        restaurant   
1  2024-04-07 00:47:25.977203                Dubai            travel   
2  2024-04-09 12:42:25.668988               Berlin        restaurant   
3  2024-11-12 02:48:26.081949               Berlin       electronics   
4  2024-11-20 07:40:26.813391                Dubai     entertainment   

   Merchant_ID  Account_Balance_Before_Transaction  \
0         6631                             2616.95   
1      

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



Model Comparison Completed! ✅
                 Model  Accuracy  Precision    Recall  F1-Score   ROC-AUC
0  Logistic Regression    0.7000   0.000000  0.000000  0.000000  0.500000
1        Random Forest    0.8478   1.000000  0.492667  0.660116  0.746333
2              XGBoost    0.8424   0.957584  0.496667  0.654083  0.743619

SMOTE Applied! ✅
Resampled Class Distribution:
Suspicious_Activity_Flag
1    28000
0    28000
Name: count, dtype: int64

Feature Engineering Completed! ✅
Selected Features:
 Index(['Transaction_ID', 'User_ID', 'Transaction_Amount', 'Transaction_Date',
       'Merchant_ID', 'Account_Balance_Before_Transaction',
       'Account_Balance_After_Transaction', 'Device_ID', 'IP_Address',
       'Average_Transaction_Amount', 'Transaction_Speed',
       'Transaction_Amount_Deviation', 'Fraudulent_Signature',
       'Transaction_Merchant_Rating', 'IP_Address_Fraud_Risk_Score'],
      dtype='object')

Hyperparameter Tuning Completed! ✅
Best Random Forest Params: {'bootstrap':

In [1]:
import os
print(os.listdir())  # This should show final_fraud_detection_model.pkl


['.ipynb_checkpoints', 'ensemble_model_performance.csv', 'final_fraud_detection_model.pkl', 'final_model_performance.csv', 'final_test_results.csv', 'fraud_detection_dataset.csv', 'model_implementation.ipynb', 'model_performance.csv', 'Untitled.ipynb']


In [2]:
pip install fastapi uvicorn joblib nest-asyncio pyngrok


Note: you may need to restart the kernel to use updated packages.


In [1]:
!ngrok authtoken 2tIMrVaK80DycXI9ONQa8vr7frN_2BEFZguJtGX8bZEfXBn3H

Downloading ngrok ...
Downloading ngrok: 0%
Downloading ngrok: 1%
Downloading ngrok: 2%
Downloading ngrok: 3%
Downloading ngrok: 4%
Downloading ngrok: 5%
Downloading ngrok: 6%
Downloading ngrok: 7%
Downloading ngrok: 8%
Downloading ngrok: 9%
Downloading ngrok: 10%
Downloading ngrok: 11%
Downloading ngrok: 12%
Downloading ngrok: 13%
Downloading ngrok: 14%
Downloading ngrok: 15%
Downloading ngrok: 16%
Downloading ngrok: 17%
Downloading ngrok: 18%
Downloading ngrok: 19%
Downloading ngrok: 20%
Downloading ngrok: 21%
Downloading ngrok: 22%
Downloading ngrok: 23%
Downloading ngrok: 24%
Downloading ngrok: 25%
Downloading ngrok: 26%
Downloading ngrok: 27%
Downloading ngrok: 28%
Downloading ngrok: 29%
Downloading ngrok: 30%
Downloading ngrok: 31%
Downloading ngrok: 32%
Downloading ngrok: 33%
Downloading ngrok: 34%
Downloading ngrok: 35%
Downloading ngrok: 36%
Downloading ngrok: 37%
Downloading ngrok: 38%
Downloading ngrok: 39%
Downloading ngrok: 40%
Downloading ngrok: 41%
Downloading ngrok: 42%

In [None]:
from pyngrok import ngrok

# Set ngrok authentication token
ngrok.set_auth_token("2tIMrVaK80DycXI9ONQa8vr7frN_2BEFZguJtGX8bZEfXBn3H")  # Replace with your token

# Start ngrok and get the public URL
public_url = ngrok.connect(8000).public_url
print(f"🚀 Public URL: {public_url}")


In [None]:
from fastapi import FastAPI, HTTPException
import joblib
import numpy as np
from pydantic import BaseModel

# Initialize FastAPI app
app = FastAPI()

# Load your trained fraud detection model
model = joblib.load("final_fraud_detection_model.pkl")

# Get the expected number of features
EXPECTED_FEATURES = model.n_features_in_

# Define request body model
class TransactionData(BaseModel):
    features: list  # Expecting a list of transaction features

# Define the root endpoint
@app.get("/")
def read_root():
    return {"message": "Fraud Detection API is running!"}

# Define the prediction endpoint
@app.post("/predict")
async def predict(data: TransactionData):
    features = data.features
    
    # Check if the input length matches the expected feature count
    if len(features) != EXPECTED_FEATURES:
        raise HTTPException(
            status_code=400,
            detail=f"Expected {EXPECTED_FEATURES} features, but got {len(features)}."
        )

    features_array = np.array(features).reshape(1, -1)  # Reshape input

    # Predict fraud (1) or not (0)
    prediction = model.predict(features_array)[0]
    probability = model.predict_proba(features_array)[0][1]  # Probability of fraud

    return {
        "fraud_prediction": int(prediction),
        "fraud_probability": round(probability, 4)  # Rounded probability
    }
