In [None]:
!pip install pyod
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# PyOD models
from pyod.models.iforest import IForest
from pyod.models.ocsvm import OCSVM
from pyod.models.knn import KNN
from pyod.models.hbos import HBOS
from pyod.models.auto_encoder import AutoEncoder

# -------------------------------
# 1. Load and Clean Data
# -------------------------------
# Only the fare column of the first 100,000 rows is read.
data = pd.read_csv("train.csv", usecols=["fare_amount"], nrows=100000)

# Drop rows with a missing fare, then keep strictly positive fares.
# The explicit .copy() makes `data` an independent frame rather than a
# filtered slice, so adding columns to it afterwards does not raise
# pandas' SettingWithCopyWarning.
data = data.dropna()
data = data[data["fare_amount"] > 0].copy()

# -------------------------------
# 2. Create Proxy Ground Truth using Modified Z-score
# -------------------------------
# The modified z-score is built from the median and the median absolute
# deviation (MAD) of the fares instead of the mean / standard deviation.
fares = data["fare_amount"]
median = fares.median()
abs_deviation = (fares - median).abs()

# Fall back to a tiny positive value when the MAD is exactly zero so the
# division below stays defined.
mad = np.median(abs_deviation) or 1e-6

data["mod_z"] = 0.6745 * (fares - median) / mad

# Rows whose |modified z-score| exceeds 3.5 are labelled 1 (anomaly), else 0.
is_outlier = data["mod_z"].abs() > 3.5
data["true_label"] = is_outlier.astype(int)  # 1 = anomaly

# -------------------------------
# 3. Prepare Data for Modeling
# -------------------------------
# Single-feature design matrix (one column: the fare), standardized with
# statistics learned from that same data.
X = data[["fare_amount"]].to_numpy()
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# -------------------------------
# 4. Define Models
# -------------------------------
# Every detector uses contamination=0.01, i.e. it flags roughly the top 1%
# of anomaly scores as outliers.
models = {
    "IsolationForest": IForest(contamination=0.01, random_state=42),
    "OneClassSVM": OCSVM(contamination=0.01),
    "KNN": KNN(contamination=0.01),
    "HBOS": HBOS(contamination=0.01),
    # PyOD detectors take training hyperparameters in the constructor;
    # fit() only accepts (X, y=None), so passing epochs/batch_size to fit()
    # raises TypeError. `epoch_num` is the parameter name used by the
    # PyTorch-based AutoEncoder shipped with PyOD >= 2.0.
    "AutoEncoder": AutoEncoder(
        contamination=0.01,
        epoch_num=20,
        batch_size=32,
        verbose=0,
    ),
}

# -------------------------------
# 5. Fit and Predict
# -------------------------------
# results maps detector name -> classification report (as a dict) computed
# against the proxy labels in data["true_label"].
results = {}
for name, model in models.items():
    # All detectors share the same fit/predict interface.
    model.fit(X_scaled)

    preds = model.predict(X_scaled)  # 0 = normal, 1 = anomaly
    data[f"{name}_label"] = preds

    print(f"\n--- {name} ---")
    print(classification_report(data["true_label"], preds, digits=4))
    results[name] = classification_report(data["true_label"], preds, output_dict=True)

# -------------------------------
# 6. Optional: Confusion Matrix Display
# -------------------------------
# Compare each detector's stored 0/1 labels with the proxy ground truth.
for detector_name in models:
    predicted = data[f"{detector_name}_label"]
    cm = confusion_matrix(data["true_label"], predicted)
    print(f"\nConfusion Matrix - {detector_name}:\n{cm}")

# -------------------------------
# 7. Optional: Visualize Detection
# -------------------------------
# One scatter plot per selected detector: fare against row index, colored by
# that detector's 0/1 label. Each entry is (label column, title, palette).
plot_specs = (
    ("IsolationForest_label", "Anomalies by Isolation Forest", ["blue", "red"]),
    ("KNN_label", "Anomalies by KNN", ["green", "orange"]),
)
for label_column, plot_title, colors in plot_specs:
    plt.figure(figsize=(12, 6))
    sns.scatterplot(
        x=data.index,
        y=data["fare_amount"],
        hue=data[label_column],
        palette=colors,
    )
    plt.title(plot_title)
    plt.show()




--- IsolationForest ---
              precision    recall  f1-score   support

           0     0.9327    1.0000    0.9652     92334
           1     1.0000    0.1292    0.2289      7654

    accuracy                         0.9333     99988
   macro avg     0.9663    0.5646    0.5970     99988
weighted avg     0.9378    0.9333    0.9088     99988

