<a href="https://colab.research.google.com/github/therakeshjoshi/Blockchain-Integrated-Machine-Learning-System-for-Cryptocurrency-Fraud-Detection/blob/main/training_the_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [60]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_auc_score
import joblib
sns.set(style="whitegrid")
print("Libraries Installed and Imported Successfully.")

Libraries Installed and Imported Successfully.


In [61]:
df = pd.read_csv('/content/first_order_df.csv')

print(df.shape)
df.head()

(254973, 8)


Unnamed: 0.1,Unnamed: 0,TxHash,BlockHeight,TimeStamp,From,To,Value,isError
0,0,0xaca3850ba0080cf47b47f80e46da452f61bcbb5470d3...,5848095,1529873859,0x16f209b5332a1b4fa5bf19497ca40154c5db2f85,0x002f0c8119c16d310342d869ca8bf6ace34d9c39,0.5,0
1,1,0x95681862f9778e49caecf603dd911d6ed57f7799d89d...,5848181,1529875104,0xe7e07e44ee315b5f2d076340b2b7a5cc9a4ee57b,0x002f0c8119c16d310342d869ca8bf6ace34d9c39,0.00102,0
2,2,0x716ae3961b50186a0bbc272cfcc4555662f7fe33550f...,5848716,1529883192,0x002f0c8119c16d310342d869ca8bf6ace34d9c39,0xe892875b87b94c44edf0e91ee9f49d0525fadd83,0.50039,0
3,3,0xf397197b800d6cc055a4db265b5e9df3dd2aa745c813...,5849038,1529887684,0x0681d8db095565fe8a346fa0277bffde9c0edbbf,0x002f0c8119c16d310342d869ca8bf6ace34d9c39,0.8178,0
4,4,0x7f8086011a32f128dba57fe06fc5f4a181d2f5401e5a...,5849437,1529893144,0x002f0c8119c16d310342d869ca8bf6ace34d9c39,0xe892875b87b94c44edf0e91ee9f49d0525fadd83,0.817506,0


In [62]:
drop_cols = ['Index', 'Address', 'TxHash', 'From', 'To', 'Unnamed: 0']


existing_drop_cols = [col for col in drop_cols if col in df.columns]
df_clean = df.drop(columns=existing_drop_cols)

In [63]:
df_clean

Unnamed: 0,BlockHeight,TimeStamp,Value,isError
0,5848095,1529873859,0.500000,0
1,5848181,1529875104,0.001020,0
2,5848716,1529883192,0.500390,0
3,5849038,1529887684,0.817800,0
4,5849437,1529893144,0.817506,0
...,...,...,...,...
254968,6101678,1533603040,0.700000,0
254969,6104069,1533638323,0.770000,0
254970,6104111,1533639008,0.500000,0
254971,6104810,1533649370,0.330000,0


In [64]:
for col in df_clean.select_dtypes(include=['object']).columns:
    print(f"Dropping complex non-numeric column: {col}")
    df_clean = df_clean.drop(columns=[col])

In [65]:
df_clean.fillna(0, inplace=True)

In [66]:
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(threshold=0)
selector.fit(df_clean)

In [67]:
valid_cols = df_clean.columns[selector.get_support()]
df_final = df_clean[valid_cols]

In [68]:
print(f"Data Cleaned. Shape reduced from {df.shape} to {df_final.shape}")

Data Cleaned. Shape reduced from (254973, 8) to (254973, 4)


In [69]:
target_col = 'isError'

In [70]:
if target_col not in df_final.columns:
    raise ValueError(f"Target column '{target_col}' not found! Please check your CSV column names.")

In [71]:
X = df_final.drop(columns=[target_col])
y = df_final[target_col]

In [72]:
fraud_count = y.value_counts()[1]
legit_count = y.value_counts()[0]
print(f"Legit Transactions: {legit_count}")
print(f"Fraud Transactions: {fraud_count}")
print(f"Fraud Percentage: {(fraud_count / len(y)) * 100:.2f}%")

Legit Transactions: 239339
Fraud Transactions: 15634
Fraud Percentage: 6.13%


In [73]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

scale_pos_weight = legit_count / fraud_count

print(f"Training Features Shape: {X_train.shape}")
print(f"Calculated Scale Weight: {scale_pos_weight:.2f}")

Training Features Shape: (203978, 3)
Calculated Scale Weight: 15.31


In [74]:
X_train

Unnamed: 0,BlockHeight,TimeStamp,Value
168004,5331622,1522160807,0.056211
214174,4034321,1500285832,7.800000
212540,6668921,1541717634,6.000000
236102,6672686,1541771106,0.002000
11655,5211292,1520406766,2.000000
...,...,...,...
158824,5866932,1530157615,0.062242
25394,4407884,1508670839,0.000000
144652,4466283,1509484665,0.000000
157273,5727089,1528058883,2.000000


In [75]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score

# 1. Define the Models to Contest
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
    "XGBoost": xgb.XGBClassifier(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        scale_pos_weight=scale_pos_weight, # Using the weight we calculated earlier
        eval_metric='logloss',
        use_label_encoder=False,
        random_state=42
    )
}

# 2. Train and Evaluate Loop
results = []

print(f"{'Model':<20} | {'Accuracy':<10} | {'Precision':<10} | {'Recall':<10} | {'F1-Score':<10} | {'AUC-ROC':<10}")
print("-" * 85)

best_model_name = ""
best_f1 = 0
best_model_obj = None

for name, model in models.items():
    # Train
    model.fit(X_train, y_train)

    # Predict
    y_pred = model.predict(X_test)
    y_prob = model.predict_proba(X_test)[:, 1]

    # Calculate Metrics
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_prob)

    # Print Row
    print(f"{name:<20} | {acc:.4f}     | {prec:.4f}     | {rec:.4f}     | {f1:.4f}     | {auc:.4f}")

    # Track Winner (based on F1-Score which balances Precision & Recall)
    if f1 > best_f1:
        best_f1 = f1
        best_model_name = name
        best_model_obj = model

print("-" * 85)
print(f"The Winner is: {best_model_name} with F1-Score: {best_f1:.4f}")

Model                | Accuracy   | Precision  | Recall     | F1-Score   | AUC-ROC   
-------------------------------------------------------------------------------------
Logistic Regression  | 0.6941     | 0.1308     | 0.7067     | 0.2208     | 0.7335
Random Forest        | 0.9743     | 0.7272     | 0.9303     | 0.8163     | 0.9729


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost              | 0.8429     | 0.2698     | 0.9153     | 0.4167     | 0.9499
-------------------------------------------------------------------------------------
The Winner is: Random Forest with F1-Score: 0.8163


In [76]:
import joblib
from sklearn.ensemble import RandomForestClassifier

rf_model = RandomForestClassifier(
    n_estimators=100,
    class_weight='balanced',
    random_state=42
)

print("Training Random Forest...")
rf_model.fit(X_train, y_train)
print("Training Complete.")

model_filename = 'random_forest_fraud_model.pkl'
joblib.dump(rf_model, model_filename)

print(f" Model successfully saved as: {model_filename}")

try:
    from google.colab import files
    files.download(model_filename)
    print("Download started...")
except ImportError:
    print("Not running on Colab? Check your local directory for the file.")

Training Random Forest...
Training Complete.
 Model successfully saved as: random_forest_fraud_model.pkl


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Download started...
