In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.impute import SimpleImputer
import joblib

# Load dataset
df = pd.read_csv("synthetic_mobile_money_transaction_dataset.csv")

# Data preprocessing
# Rows without a label cannot be used for supervised training.
df.dropna(subset=['isFraud'], inplace=True)

# Replace each categorical column with its integer category code.
# NOTE: cat.codes encodes a missing value as -1, so the median imputer below
# never treats these columns as missing.
for col in ['transactionType', 'initiator', 'recipient']:
    df[col] = df[col].astype('category').cat.codes

# Log-compress the heavy-tailed monetary columns.
for col in ['amount', 'oldBalRecipient', 'newBalRecipient']:
    # log1p is only finite for inputs greater than -1: smaller inputs give NaN
    # plus a RuntimeWarning, and exactly -1 gives -inf, which the imputer
    # rejects. Mask such values to NaN explicitly so they are imputed instead.
    df[col] = np.log1p(df[col].where(df[col] > -1))

features = ["step", "initiator", "recipient", "transactionType", "amount", "oldBalInitiator", "newBalInitiator", "oldBalRecipient", "newBalRecipient"]
target = "isFraud"

X = df[features]
y = df[target]

# Split before fitting any preprocessing so that the imputer medians and the
# scaler statistics are learned from the training rows only. Same random_state
# and sample count as before, so the same rows land in each fold.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

imputer = SimpleImputer(strategy='median')
# Keep the original row index so the feature frames stay aligned with y_train / y_test.
X_train = pd.DataFrame(imputer.fit_transform(X_train), columns=features, index=X_train.index)
X_test = pd.DataFrame(imputer.transform(X_test), columns=features, index=X_test.index)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Step 1: fit the teacher, a 100-tree random forest, on the scaled training set.
teacher_model = RandomForestClassifier(random_state=42, n_estimators=100)
teacher_model.fit(X_train_scaled, y_train)

# Teacher scores: the second column of predict_proba, i.e. the probability of
# the second entry in teacher_model.classes_ (class 1 for 0/1 labels).
teacher_train_preds = teacher_model.predict_proba(X_train_scaled)[:, 1]
teacher_test_preds = teacher_model.predict_proba(X_test_scaled)[:, 1]

# Harden the scores into 0/1 labels; a score of 0.5 or more counts as 1.
teacher_train_labels, teacher_test_labels = (
    (scores >= 0.5).astype(int)
    for scores in (teacher_train_preds, teacher_test_preds)
)

# Step 2: fit the student, a single decision tree, to the teacher's hardened
# training labels rather than to y_train.
student_model = DecisionTreeClassifier(random_state=42).fit(
    X_train_scaled, teacher_train_labels
)

# Evaluate the Student Model
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier against reference labels.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Feature matrix passed to ``model.predict``.
        y_test: Reference labels compared with the predictions.

    Returns:
        Tuple ``(accuracy, precision, recall, f1)``.
    """
    y_pred = model.predict(X_test)
    # Apply one policy to undefined metrics: report 0 rather than crediting a
    # model that never predicts the positive class with a precision of 1.0,
    # and avoid the warning the default setting emits for recall and F1.
    return (
        accuracy_score(y_test, y_pred),
        precision_score(y_test, y_pred, zero_division=0),
        recall_score(y_test, y_pred, zero_division=0),
        f1_score(y_test, y_pred, zero_division=0),
    )

# Initial Evaluation
accuracy, precision, recall, f1 = evaluate_model(student_model, X_test_scaled, y_test)
print("Initial Student Model (Decision Tree) Evaluation:")
print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}")

# Step 3: Depth search for the student model
# The F1 score decides which student is kept. This is a plain grid search over
# max_depth, not reinforcement learning.
# NOTE(review): candidates are ranked on the test set, so the winning scores
# are optimistically biased. Rank on a validation split or with
# cross-validation before quoting them as generalisation performance.
best_f1 = f1
best_student_model = student_model

metric_names = ("accuracy", "precision", "recall", "f1_score")

# The student evaluated above already covers its own depth setting, so its
# scores seed the table instead of retraining an identical tree. A depth of
# None is stored as the string "None"; otherwise pandas turns the column into
# floats and prints NaN.
baseline_depth = student_model.max_depth
results = [{
    "max_depth": "None" if baseline_depth is None else baseline_depth,
    **dict(zip(metric_names, (accuracy, precision, recall, f1))),
}]

# Experiment with bounded max_depth values. Loop-local names keep
# `student_model` and the baseline metrics above intact after the loop.
for max_depth in [5, 10, 20]:
    candidate = DecisionTreeClassifier(max_depth=max_depth, random_state=42)
    candidate.fit(X_train_scaled, teacher_train_labels)
    candidate_scores = evaluate_model(candidate, X_test_scaled, y_test)
    results.append({"max_depth": max_depth, **dict(zip(metric_names, candidate_scores))})
    candidate_f1 = candidate_scores[3]
    # Strict comparison: on a tie the earlier candidate is kept.
    if candidate_f1 > best_f1:
        best_f1 = candidate_f1
        best_student_model = candidate

# Display Results in a Table
results_df = pd.DataFrame(results)
print("\nModel Evaluation Results:")
print(results_df)

# Save the Best Student Model
# NOTE(review): only the tree is persisted. It was trained on imputed and
# standardised features, so inference code must apply the same fitted
# preprocessing before calling predict.
joblib.dump(best_student_model, "best_student_model.joblib")
print("\nBest Student Model saved as 'best_student_model.joblib'")

# Model Selection and Justification
# The narrative is static text, so it is held as data and emitted in a single
# call; the newline separator reproduces one output line per entry.
_selection_report = (
    "\nModel Selection:",
    "The best-performing model was selected based on the F1 score, which balances precision and recall.",
    "The following factors were considered:",
    "- **Accuracy**: Overall correctness of the model.",
    "- **Precision**: Proportion of correctly predicted fraud cases out of all predicted fraud cases.",
    "- **Recall**: Proportion of actual fraud cases correctly identified.",
    "- **F1 Score**: Harmonic mean of precision and recall, providing a balanced measure.",
    "- **Computational Efficiency**: Decision Trees are lightweight and fast, making them suitable for real-time fraud detection.",
    "- **Interpretability**: Decision Trees are easy to interpret, which is crucial for explaining predictions to stakeholders.",
    "\nTrade-offs:",
    "- **Accuracy vs. Interpretability**: While Random Forest (Teacher) has higher accuracy, the Decision Tree (Student) is more interpretable.",
    "- **Precision vs. Recall**: Depending on the problem context, we may prioritize recall (catching all fraud cases) over precision (minimizing false positives).",
    "\nFinal Decision:",
    "The best model was chosen based on its F1 score and computational efficiency. The Decision Tree model strikes a good balance between performance and interpretability, making it suitable for real-world deployment.",
)
print(*_selection_report, sep="\n")

  result = getattr(ufunc, method)(*inputs, **kwargs)


Initial Student Model (Decision Tree) Evaluation:
Accuracy: 0.9603, Precision: 0.9611, Recall: 0.9639, F1 Score: 0.9625

Model Evaluation Results:
   max_depth  accuracy  precision    recall  f1_score
0        NaN  0.960273   0.961059  0.963880  0.962467
1        5.0  0.968722   0.989379  0.951022  0.969821
2       10.0  0.969375   0.990646  0.951029  0.970433
3       20.0  0.970382   0.993800  0.949880  0.971344

Best Student Model saved as 'best_student_model.joblib'

Model Selection:
The best-performing model was selected based on the F1 score, which balances precision and recall.
The following factors were considered:
- **Accuracy**: Overall correctness of the model.
- **Precision**: Proportion of correctly predicted fraud cases out of all predicted fraud cases.
- **Recall**: Proportion of actual fraud cases correctly identified.
- **F1 Score**: Harmonic mean of precision and recall, providing a balanced measure.
- **Computational Efficiency**: Decision Trees are lightweight and fa