In [21]:
import pandas as pd
import json
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample

# --------------------------
# 🔹 Load Historical & Real-Time Data
# --------------------------

historical_df = pd.read_csv("historical_reconciliation.csv")
real_time_df = pd.read_csv("real_time_transaction.csv")

# Standardize column names (fix extra spaces issue)
historical_df.columns = historical_df.columns.str.strip()
real_time_df.columns = real_time_df.columns.str.strip()

# --------------------------
# 🔹 Load & Normalize Feedback Data
# --------------------------

with open("../ui/feedback.json", "r") as f:
    feedback = json.load(f)

feedback_df = pd.DataFrame(feedback)
feedback_features = pd.json_normalize(feedback_df["input"], errors="ignore")
feedback_features["label"] = feedback_df["feedback"].apply(lambda x: 1 if x == "Yes" else 0)

# --------------------------
# 🔹 Ensure All Required Columns Exist & Encode Categoricals
# --------------------------

categorical_features = ["Primary Account", "Secondary Account", "Currency"]
required_features = ["Balance Difference"] + categorical_features

# Apply Label Encoding for categorical columns
label_encoders = {}
for col in categorical_features:
    le = LabelEncoder()
    combined_data = pd.concat([historical_df[col], real_time_df[col], feedback_features[col]], axis=0)
    le.fit(combined_data.astype(str))
    historical_df[col] = le.transform(historical_df[col].astype(str))
    real_time_df[col] = le.transform(real_time_df[col].astype(str))
    feedback_features[col] = le.transform(feedback_features[col].astype(str))
    label_encoders[col] = le

# --------------------------
# 🔹 Correct Anomaly Labeling Rule
# --------------------------

historical_df["label"] = historical_df["Balance Difference"].apply(lambda x: 0 if abs(x) < 1 else (1 if abs(x) > 10 else 0))
real_time_df["label"] = real_time_df["Balance Difference"].apply(lambda x: 0 if abs(x) < 1 else (1 if abs(x) > 10 else 0))

# --------------------------
# 🔹 Merge Data Sources
# --------------------------

full_data = pd.concat([
    historical_df[required_features + ["label"]],
    real_time_df[required_features + ["label"]],
    feedback_features
], ignore_index=True)

# --------------------------
# 🔹 Fix: Balance Data by Slightly Oversampling Anomalies
# --------------------------

normal_df = full_data[full_data["label"] == 0]
anomaly_df = full_data[full_data["label"] == 1]

# Slightly oversample anomalies if needed
if len(anomaly_df) < len(normal_df):
    anomaly_df = resample(anomaly_df, replace=True, n_samples=len(normal_df), random_state=42)

# Merge and shuffle the dataset
full_data = pd.concat([normal_df, anomaly_df])
full_data = full_data.sample(frac=1, random_state=42).reset_index(drop=True)

X_resampled, y_resampled = full_data.drop("label", axis=1), full_data["label"]

# --------------------------
# 🔹 Train the Machine Learning Model with Adjusted Class Weights
# --------------------------

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

model = RandomForestClassifier(
    n_estimators=200,  
    max_depth=10,  
    min_samples_split=5,  
    min_samples_leaf=2,  
    class_weight={0: 3, 1: 2},  # ✅ More weight to normal transactions to fix bias
    random_state=42
)
model.fit(X_train, y_train)

# Evaluate model performance
print("📊 Model Performance:")
print(classification_report(y_test, model.predict(X_test)))

# --------------------------
# 🔹 Save New Model
# --------------------------

joblib.dump(model, "model.pkl")
print("✅ New model trained and saved as model.pkl")


📊 Model Performance:
              precision    recall  f1-score   support

           0       1.00      0.60      0.75         5
           1       0.78      1.00      0.88         7

    accuracy                           0.83        12
   macro avg       0.89      0.80      0.81        12
weighted avg       0.87      0.83      0.82        12

✅ New model trained and saved as model.pkl
