Data Loading & Initial Cleaning

In [12]:
import pandas as pd

# Load the dataset. 'newbalanceOrig' is read as text first so that blank
# entries can be cleaned up before the column is converted to numbers.
data = pd.read_csv("onlinefraud.csv", dtype={'newbalanceOrig': str})

# Blank entries arrive in two forms:
#   * whitespace-only strings -> rewritten to '0' by the regex below
#   * truly empty fields      -> parsed as NaN by read_csv (even with dtype=str),
#                                which the regex cannot match
# Convert to float first (NaN is a valid float) and then fill the remaining
# NaN with 0.0, so that every blank entry really ends up as 0.
data['newbalanceOrig'] = (
    data['newbalanceOrig']
    .replace(r'^\s*$', '0', regex=True)
    .astype(float)
    .fillna(0.0)
)

# Show first few rows
print(data.head())

# Show column names
print(data.columns)

   step      type    amount     nameOrig  oldbalanceOrg  newbalanceOrig  \
0     1   PAYMENT   9839.64  C1231006815       170136.0       160296.36   
1     1   PAYMENT   1864.28  C1666544295        21249.0            0.00   
2     1  TRANSFER    181.00  C1305486145          181.0            0.00   
3     1  CASH_OUT    181.00   C840083671          181.0            0.00   
4     1   PAYMENT  11668.14  C2048537720        41554.0        29885.86   

      nameDest  oldbalanceDest  newbalanceDest  isFraud  isFlaggedFraud  
0  M1979787155             0.0             0.0        0               0  
1  M2044282225             0.0             0.0        0               0  
2   C553264065             0.0             0.0        1               0  
3    C38997010         21182.0             0.0        1               0  
4  M1230701703             0.0             0.0        0               0  
Index(['step', 'type', 'amount', 'nameOrig', 'oldbalanceOrg', 'newbalanceOrig',
       'nameDest', 'oldbalanceDest', 'newbalanceDest', 'isFraud',
       'isFlaggedFraud'],
      dtype='object')

Data Preprocessing & Feature Engineering

Train-Test Split

In [None]:
import pandas as pd

# Load dataset; 'newbalanceOrig' and 'type' are read as text so they can be
# cleaned / encoded explicitly below.
data = pd.read_csv("onlinefraud.csv", dtype={
    'newbalanceOrig': str,
    'type': str
})

# Clean numeric columns.
# Whitespace-only strings are rewritten to '0' by the regex; truly empty CSV
# fields are parsed as NaN by read_csv and are not matched by it, so they are
# filled with 0.0 after the float conversion. Without the fillna, NaN would
# leak into the engineered features and the model input.
numeric_cols = ['amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
for col in numeric_cols:
    data[col] = (
        data[col]
        .replace(r'^\s*$', '0', regex=True)
        .astype(float)
        .fillna(0.0)
    )

# Convert transaction types to numerical codes
type_codes = {
    'CASH_OUT': 1,
    'PAYMENT': 2,
    'CASH_IN': 3,
    'TRANSFER': 4,
    'DEBIT': 5
}
encoded_type = data['type'].map(type_codes)

# .map() yields NaN for any label missing from type_codes, and NaN cannot be
# cast to int. Report the offending labels explicitly instead of failing with
# a generic "cannot convert non-finite values" error.
unmapped_types = data.loc[encoded_type.isna(), 'type'].unique()
if len(unmapped_types) > 0:
    raise ValueError(f"Unexpected transaction type(s): {list(unmapped_types)}")
data['type'] = encoded_type.astype(int)

# Add engineered features
# balance_change: amount by which the originator's balance decreased
data['balance_change'] = data['oldbalanceOrg'] - data['newbalanceOrig']
# zero_balance_after: 1 when the originator's balance after the transaction is 0
data['zero_balance_after'] = (data['newbalanceOrig'] == 0).astype(int)

print(data.head())
print("Dataset cleaned & ready for training! 🚀")


   type    amount  oldbalanceOrg  newbalanceOrig  oldbalanceDest  \
0     2   9839.64       170136.0       160296.36             0.0   
1     2   1864.28        21249.0            0.00             0.0   
2     4    181.00          181.0            0.00             0.0   
3     1    181.00          181.0            0.00         21182.0   
4     2  11668.14        41554.0        29885.86             0.0   

   newbalanceDest  isFraud  isFlaggedFraud  
0             0.0        0               0  
1             0.0        0               0  
2             0.0        1               0  
3             0.0        1               0  
4             0.0        0               0  
Dataset cleaned & ready for training! 🚀


In [16]:
from sklearn.model_selection import train_test_split

# Feature set: encoded transaction type, the raw amount/balance columns, and
# the two engineered columns.
features = ['type', 'amount', 'oldbalanceOrg', 'newbalanceOrig',
            'oldbalanceDest', 'newbalanceDest', 'balance_change', 'zero_balance_after']

# Fail fast with an actionable message when `data` does not carry every
# feature (e.g. this cell was run against a frame that never went through the
# preprocessing / feature-engineering cell).
missing_features = [col for col in features if col not in data.columns]
if missing_features:
    raise KeyError(
        f"{missing_features} not found in `data`; run the preprocessing / "
        "feature-engineering cell before this one."
    )

X = data[features]
y = data['isFraud']

# 80/20 split; stratify=y keeps the isFraud class proportions the same in both
# parts, and random_state makes the split reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print(f"Train shape: {xtrain.shape}")
print(f"Test shape: {xtest.shape}")

KeyError: "['balance_change', 'zero_balance_after'] not in index"

Model Training (Random Forest)

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer

# Initialize the model.
# class_weight='balanced' reweights classes inversely to their frequency.
# n_jobs=-1 builds the trees on all available cores; with a fixed random_state
# the fitted forest does not depend on the number of jobs.
model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42,
    n_jobs=-1,
)

# Train the model
model.fit(xtrain, ytrain)

# Evaluate the model.
# NOTE: score() is plain accuracy. When one class is rare, accuracy stays close
# to 1.00 even if most of the rare class is missed, so judge the model by
# per-class precision/recall rather than by this number alone.
accuracy = model.score(xtest, ytest)
print(f"Model Accuracy: {accuracy:.2f}")

ValueError: could not convert string to float: 'CASH_IN'

Model Evaluation

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Hard class labels for every row of the held-out set
ypred = model.predict(xtest)

# Per-class probabilities; column 1 is kept as the fraud (class 1) score
class_probabilities = model.predict_proba(xtest)
yprob = class_probabilities[:, 1]

# Per-class precision / recall / F1 summary
report_text = classification_report(ytest, ypred)
print(report_text)

# Confusion matrix of true labels vs. predicted labels
matrix = confusion_matrix(ytest, ypred)
print("Confusion Matrix:")
print(matrix)

# Fraud scores of the first ten test rows
print("\nSample Fraud Probabilities:")
print(yprob[:10])

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270904
           1       0.97      0.79      0.87      1620

    accuracy                           1.00   1272524
   macro avg       0.98      0.89      0.93   1272524
weighted avg       1.00      1.00      1.00   1272524

Confusion Matrix:
[[1270858      46]
 [    343    1277]]

Sample Fraud Probabilities:
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


Fraud Prediction Example

In [None]:
import numpy as np
import pandas as pd

# Example transaction (modify values as needed).
# Values are listed in the same order as feature_names below.
input_data = np.array([[4, 6215.44, 6215.44, 0.0, 0.0, 0.0]])

# Convert to DataFrame with the raw feature names
feature_names = ['type', 'amount', 'oldbalanceOrg', 'newbalanceOrig', 'oldbalanceDest', 'newbalanceDest']
input_df = pd.DataFrame(input_data, columns=feature_names)

# The training feature list also contains two engineered columns, so they must
# be derived here with the same formulas; otherwise the input has fewer
# columns than the model was fitted on and predict() rejects it.
input_df['balance_change'] = input_df['oldbalanceOrg'] - input_df['newbalanceOrig']
input_df['zero_balance_after'] = (input_df['newbalanceOrig'] == 0).astype(int)

# Select / order the columns exactly as the model saw them during fit.
# feature_names_in_ is only set when the model was fitted on a DataFrame with
# string column names; if it is absent the frame is passed through as built.
fitted_columns = getattr(model, "feature_names_in_", None)
if fitted_columns is not None:
    input_df = input_df[list(fitted_columns)]

# Predict fraud or legit
prediction = model.predict(input_df)[0]
# Column 1 of predict_proba is the probability of class 1 (fraud)
probability = model.predict_proba(input_df)[0][1]

# Print the result
print("Prediction:", "Fraud" if prediction == 1 else "Legit")
print(f"Fraud Probability: {probability * 100:.2f}%")

Prediction: Fraud
Fraud Probability: 99.00%


Save the Trained Model

In [None]:
import joblib

# Serialize the fitted model to a file in the current working directory
model_path = "fraud_detection_model.pkl"
joblib.dump(model, model_path)
print("✅ Model saved successfully!")

✅ Model saved successfully!


: 