Data Loading & Initial Cleaning

In [None]:
import pandas as pd  # data manipulation and analysis

# Load the dataset. 'newbalanceOrig' is read as text first (explicit dtype) so that
# mixed numeric/blank content can be cleaned by hand before converting to float.
data = pd.read_csv("onlinefraud.csv", dtype={'newbalanceOrig': str})

# Normalise 'newbalanceOrig' to float, treating blank entries as a zero balance:
#   * whitespace-only cells are still strings at this point -> rewrite them to '0';
#   * completely empty cells were already parsed as NaN by read_csv's default NA
#     handling, so the regex never sees them -> fill them after the cast.
# Any other non-numeric text still raises in astype(float) rather than being hidden.
data['newbalanceOrig'] = (
    data['newbalanceOrig']
    .replace(r'^\s*$', '0', regex=True)
    .astype(float)
    .fillna(0.0)
)

# Show first few rows
print(data.head())

# Show column names
print(data.columns)

   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0            0.00   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  
Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldba

Data Preprocessing & Feature Engineering

Train-Test Split

In [2]:
import pandas as pd

# Load dataset with proper type handling: both columns are read as text so they
# can be cleaned / validated explicitly below.
data = pd.read_csv("onlinefraud.csv", dtype={
    'newbalanceOrig': str,
    'type': str
})

# Clean numeric columns. Blank entries are treated as a zero balance/amount:
#   * whitespace-only strings (only possible in columns read as text) become '0';
#   * cells that read_csv already parsed as NaN are filled after the float cast,
#     because the regex replacement cannot match them.
# Any other non-numeric text still raises in astype(float).
numeric_cols = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
for col in numeric_cols:
    data[col] = (
        data[col]
        .replace(r'^\s*$', '0', regex=True)
        .astype(float)
        .fillna(0.0)
    )

# Convert transaction types to numerical codes.
type_codes = {
    'CASH_OUT': 1,
    'PAYMENT': 2,
    'CASH_IN': 3,
    'TRANSFER': 4,
    'DEBIT': 5
}
encoded_type = data['type'].map(type_codes)

# .map() yields NaN for labels missing from type_codes (and for missing values).
# Fail here with a message that names the offending labels instead of letting
# astype(int) raise an opaque "cannot convert NaN to integer" error.
unmapped = encoded_type.isna()
if unmapped.any():
    unknown_labels = sorted(data.loc[unmapped, 'type'].dropna().unique())
    raise ValueError(
        f"{int(unmapped.sum())} row(s) have a missing or unrecognised transaction type "
        f"(unrecognised labels: {unknown_labels}); extend type_codes or clean the data."
    )
data['type'] = encoded_type.astype(int)

# Add engineered features
# Amount that left the originating account according to its before/after balances.
data['balance_change'] = data['oldbalanceOrg'] - data['newbalanceOrig']
# 1 when the originating account ends at exactly zero, else 0.
data['zero_balance_after'] = (data['newbalanceOrig'] == 0).astype(int)

print(data.head())
print("Dataset cleaned & ready for training! 🚀")


   step  type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1     2   9839.64  C1231006815       170136.0       160296.36   
1     1     2   1864.28  C1666544295        21249.0            0.00   
2     1     4    181.00  C1305486145          181.0            0.00   
3     1     1    181.00   C840083671          181.0            0.00   
4     1     2  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  \
0  M1979787155             0.0             0.0        0               0   
1  M2044282225             0.0             0.0        0               0   
2   C553264065             0.0             0.0        1               0   
3    C38997010         21182.0             0.0        1               0   
4  M1230701703             0.0             0.0        0               0   

   balance_change  zero_balance_after  
0         9839.64                   0  
1        21249.00                   1  
2 

In [3]:
from sklearn.model_selection import train_test_split

# Model inputs: the raw transaction columns plus the two engineered columns
# ('balance_change' and 'zero_balance_after').
features = [
    'type',
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
    'balance_change',
    'zero_balance_after',
]

# Target is the fraud label; inputs are restricted to the columns listed above.
y = data['isFraud']
X = data[features]

# Hold out 20% of the rows for testing. Stratifying on y keeps the fraud ratio
# the same in both parts; the fixed random_state makes the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
xtrain, xtest, ytrain, ytest = split_parts

print(f"Train shape: {xtrain.shape}")
print(f"Test shape: {xtest.shape}")

Train shape: (5090096, 8)
Test shape: (1272524, 8)


Model Training (Random Forest)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# sklearn.ensemble provides ensemble methods, which combine multiple machine learning
# models to improve prediction accuracy and robustness compared with a single model.
from sklearn.impute import SimpleImputer

# Initialize the model.
#   * class_weight='balanced' weights classes inversely to their frequency, which
#     matters here because fraud rows are a tiny fraction of the data.
#   * random_state fixes the seed so the fitted forest is repeatable.
#   * n_jobs=-1 builds the trees in parallel on all available cores; the training
#     set has millions of rows, so a single-threaded fit is needlessly slow.
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

# Train the model
model.fit(xtrain, ytrain)

# Evaluate the model.
# NOTE: plain accuracy is dominated by the majority (non-fraud) class, so a value
# near 1.00 says little about fraud detection on its own. Read it together with
# the per-class precision/recall reported in the evaluation section.
accuracy = model.score(xtest, ytest)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 1.00


Model Evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix  # model evaluation helpers

# Fraud probability per test row: column 1 of predict_proba is class 1 (fraud).
yprob = model.predict_proba(xtest)[:, 1]

# Hard class labels for the same rows.
ypred = model.predict(xtest)

# Build both summaries first, then print them in order.
report_text = classification_report(ytest, ypred)
confusion = confusion_matrix(ytest, ypred)

# Per-class precision / recall / F1
print(report_text)

# Rows are true classes, columns are predicted classes.
print("Confusion Matrix:")
print(confusion)

# Peek at the first 10 fraud probabilities
print("\nSample Fraud Probabilities:")
print(yprob[:10])

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270881
           1       0.96      0.80      0.87      1643

    accuracy                           1.00   1272524
   macro avg       0.98      0.90      0.94   1272524
weighted avg       1.00      1.00      1.00   1272524

Confusion Matrix:
[[1270824      57]
 [    324    1319]]

Sample Fraud Probabilities:
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


Fraud Prediction Example

In [6]:
import numpy as np
import pandas as pd

# Example transaction (modify values as needed).
# 'type' must use the integer encoding applied during preprocessing:
#   CASH_OUT=1, PAYMENT=2, CASH_IN=3, TRANSFER=4, DEBIT=5
transaction_data = {
    'type': 4,               # TRANSFER
    'amount': 181.00,
    'oldbalanceOrg': 181.00,
    'newbalanceOrig': 0.00,
    'oldbalanceDest': 0.00,
    'newbalanceDest': 0.00
}

# Calculate the additional features that were used in training
transaction_data['balance_change'] = transaction_data['oldbalanceOrg'] - transaction_data['newbalanceOrig']
transaction_data['zero_balance_after'] = 1 if transaction_data['newbalanceOrig'] == 0 else 0

# ALL features, in the EXACT SAME ORDER as during training.
feature_names = [
    'type',
    'amount',
    'oldbalanceOrg',
    'newbalanceOrig',
    'oldbalanceDest',
    'newbalanceDest',
    'balance_change',
    'zero_balance_after'
]

# Build the single-row input by walking feature_names, so the value order and the
# column labels come from one list and cannot drift apart when a feature is added.
input_data = np.array([[transaction_data[name] for name in feature_names]])

input_df = pd.DataFrame(input_data, columns=feature_names)

# Predict fraud or legit
prediction = model.predict(input_df)[0]
probability = model.predict_proba(input_df)[0][1]  # column 1 = probability of class 1 (fraud)

# Print the result
print("Prediction:", "Fraud" if prediction == 1 else "Legit")
print(f"Fraud Probability: {probability * 100:.2f}%")

# Print warning flags if suspicious
if transaction_data['zero_balance_after']:
    print("⚠️ Warning: Account completely emptied")

# Balances are floats, so old - new may not be bit-identical to the stated amount
# even when they agree to the cent. Compare with a sub-cent tolerance instead of
# `!=` to avoid spurious discrepancy warnings caused by rounding.
amount_tolerance = 0.005
if abs(transaction_data['balance_change'] - transaction_data['amount']) > amount_tolerance:
    print("⚠️ Warning: Discrepancy in amount and balance change")

Prediction: Fraud
Fraud Probability: 99.00%


Save the Trained Model

In [7]:
import joblib

# Persist the fitted classifier to disk so it can be reloaded later with joblib.load.
# NOTE: joblib files are pickle-based; only load them from sources you trust.
model_path = "fraud_detection_model.pkl"
joblib.dump(model, model_path)
print("✅ Model saved successfully!")

✅ Model saved successfully!
