<a href="https://colab.research.google.com/github/yashveersinghsohi/machine_hack_competitions/blob/od_pipeline/Data_Science_Student_Championship/Baseline_Model/Outlier_Detection_Pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Git Clone

In [2]:
# !git clone -b od_pipeline https://github.com/yashveersinghsohi/machine_hack_competitions.git

# Imports

In [6]:
# !pip install shap
# !pip install pyod

In [8]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_squared_log_error
import shap
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import LearningCurveDisplay, learning_curve, ShuffleSplit
from scipy.stats import ks_2samp
from pyod.models.iforest import IForest
from pyod.models.cblof import CBLOF
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
from sklearn.pipeline import Pipeline

# Data

In [12]:
# Folder (inside the cloned repository) that holds every feature/target split.
root_dir = '/content/machine_hack_competitions/Data_Science_Student_Championship/Features/'

# Read the six CSV files in a fixed order and unpack them into the usual names.
split_names = ('X_train', 'y_train', 'X_val', 'y_val', 'X_test', 'y_test')
X_train, y_train, X_val, y_val, X_test, y_test = (
  pd.read_csv(f'{root_dir}{split_name}.csv') for split_name in split_names
)

# Cell output: the shape of each loaded frame, in the same order as above.
tuple(frame.shape for frame in (X_train, y_train, X_val, y_val, X_test, y_test))

((167738, 20), (167738, 1), (41935, 20), (41935, 1), (89861, 20), (89861, 1))

# Helper Functions

In [10]:
def calculate_metrics(datasets, model):
  """Score a fitted regressor on a single dataset.

  Args:
    datasets: ``(X, y)`` pair. ``X`` is handed to ``model.predict`` and ``y``
      holds the true target values.
    model: Any object exposing ``predict(X)``.

  Returns:
    dict with the keys ``'mae'``, ``'rmse'`` and ``'rmsle'``; every value is a
    float rounded to 4 decimals.
  """
  X, y = datasets

  # Predict once and reuse the result for every metric.
  y_pred = model.predict(X)
  # mean_squared_log_error rejects negative values, so negative predictions
  # are floored at 0 for that metric only.
  y_pred_non_negative = np.clip(y_pred, 0, None)

  mae = mean_absolute_error(y_true=y, y_pred=y_pred)
  mse = mean_squared_error(y_true=y, y_pred=y_pred)
  msle = mean_squared_log_error(y_true=y, y_pred=y_pred_non_negative)

  # Square roots are taken on the unrounded values: rounding MSLE to 4 decimals
  # first would leave the 4th decimal of RMSLE unreliable. np.round is used
  # (rather than the .round method) so plain Python floats are accepted too.
  return {
    'mae': float(np.round(mae, 4)),
    'rmse': float(np.round(np.sqrt(mse), 4)),
    'rmsle': float(np.round(np.sqrt(msle), 4)),
  }

In [23]:
def create_submission(
    model, X,
    submission_path='/content/machine_hack_competitions/Data_Science_Student_Championship/Competition_Files/submission.csv',
    features = None, submission_name = ''
  ):
  """Predict fares for ``X`` and write a submission CSV.

  ``total_fare`` is computed per row as
  ``tip + miscellaneous_fees + model.predict(X[features])`` and floored at 0.
  The values are written, by row position, into the ``total_fare`` column of
  the CSV read from ``submission_path``; the result is saved to
  ``submission_name`` without the index.

  Args:
    model: Object exposing ``predict``.
    X: DataFrame containing the ``tip`` and ``miscellaneous_fees`` columns
      plus every column named in ``features``.
    submission_path: CSV file used as the submission template.
    features: Columns passed to ``model.predict``. ``None`` means every column
      of ``X``.
    submission_name: Output CSV path; must not be empty.

  Raises:
    ValueError: If ``submission_name`` is empty, or if the template and ``X``
      have different numbers of rows.
  """
  # Fail before any prediction work instead of at the final to_csv call.
  if not submission_name:
    raise ValueError('submission_name must be a non-empty output path')

  feature_columns = list(X.columns) if features is None else features
  fare_pred = np.asarray(model.predict(X[feature_columns])).reshape(-1)

  # Work on plain arrays: the template is filled by row position, so the index
  # of X (which may not be 0..n-1) must not take part in the assignment.
  total_fare = X['tip'].to_numpy() + X['miscellaneous_fees'].to_numpy() + fare_pred

  submission_df = pd.read_csv(submission_path)
  if len(submission_df) != len(total_fare):
    raise ValueError(
      f'submission template has {len(submission_df)} rows but X has {len(total_fare)}'
    )

  # Negative totals are replaced by 0.
  submission_df['total_fare'] = np.clip(total_fare, 0, None)
  print(submission_df.head())
  submission_df.to_csv(submission_name, index=False)

# HBOS Outlier Detection

## Small XGB

### Defining Features

In [13]:
# Columns used by the "small" outlier detector and the "small" XGB models.
small_features = [
  'trip_duration',
  'distance_traveled',
  'num_of_passengers',
  'tip',
  'miscellaneous_fees',
  'surge_applied',
]

### Outlier Detection

In [20]:
# Fit the HBOS detector on the training rows only.
small_od = HBOS()
small_od.fit(X_train[small_features])

# Attach the detector's label ('od') and the second column of predict_proba
# ('od_proba') to the train and validation frames, in place.
for split in (X_train, X_val):
  split['od'] = small_od.predict(split[small_features])
for split in (X_train, X_val):
  split['od_proba'] = small_od.predict_proba(split[small_features])[:, 1]

### Model with Outlier Predictions

In [21]:
# Small feature set plus the HBOS label; the single-column target frame is
# flattened to a 1-D array before fitting.
small_xgb = XGBRegressor(random_state=42)
small_xgb.fit(X_train[[*small_features, 'od']], y_train.to_numpy().ravel())

In [22]:
# (features, 1-D target) pairs for the small model that uses the HBOS label.
train_datasets = (X_train[[*small_features, 'od']], y_train.to_numpy().ravel())
val_datasets = (X_val[[*small_features, 'od']], y_val.to_numpy().ravel())

# Cell output: (train metrics, validation metrics).
tuple(
  calculate_metrics(datasets=split_data, model=small_xgb)
  for split_data in (train_datasets, val_datasets)
)

({'mae': 5.2328, 'rmse': 21.9511, 'rmsle': 0.1533},
 {'mae': 6.6846, 'rmse': 40.4919, 'rmsle': 0.19})

In [24]:
# New frame: the test features plus the HBOS label from the small detector.
X_test_submission = X_test.assign(od=small_od.predict(X_test[small_features]))

create_submission(
  model=small_xgb,
  X=X_test_submission,
  features=[*small_features, 'od'],
  submission_name='HBOS_SMALL_XGB.csv',
)

   total_fare
0  110.671272
1   61.511314
2  116.142685
3   90.450958
4  117.048698


### Model with Outlier Probabilities

In [25]:
# Small feature set plus the HBOS outlier probability; the single-column target
# frame is flattened to a 1-D array before fitting.
small_xgb_proba = XGBRegressor(random_state=42)
small_xgb_proba.fit(X_train[[*small_features, 'od_proba']], y_train.to_numpy().ravel())

In [26]:
# (features, 1-D target) pairs for the small model that uses 'od_proba'.
train_datasets = (X_train[[*small_features, 'od_proba']], y_train.to_numpy().ravel())
val_datasets = (X_val[[*small_features, 'od_proba']], y_val.to_numpy().ravel())

# Cell output: (train metrics, validation metrics).
tuple(
  calculate_metrics(datasets=split_data, model=small_xgb_proba)
  for split_data in (train_datasets, val_datasets)
)

({'mae': 5.2079, 'rmse': 21.8594, 'rmsle': 0.1568},
 {'mae': 6.6321, 'rmse': 38.4963, 'rmsle': 0.1855})

In [27]:
# New frame: the test features plus the second predict_proba column from the
# small detector.
X_test_submission = X_test.assign(
  od_proba=small_od.predict_proba(X_test[small_features])[:, 1]
)

create_submission(
  model=small_xgb_proba,
  X=X_test_submission,
  features=[*small_features, 'od_proba'],
  submission_name='HBOS_SMALL_XGB_PROBA.csv',
)

   total_fare
0  111.141998
1   60.990650
2  116.260506
3   90.366325
4  114.572548


## Large XGB

### Defining Features

In [28]:
# Columns used by the "large" outlier detector and the "large" XGB models:
# the six small-model columns followed by four indicator columns.
large_features = [
  'trip_duration',
  'distance_traveled',
  'num_of_passengers',
  'tip',
  'miscellaneous_fees',
  'surge_applied',
  'is_miscellaneous_fees_negative',
  'is_miscellaneous_fees_0',
  'is_tip_0',
  'is_trip_duration_0',
]

### Outlier Detection

In [29]:
# Fit a second HBOS detector on the larger feature set (training rows only).
large_od = HBOS()
large_od.fit(X_train[large_features])

# NOTE: this replaces the 'od' / 'od_proba' columns written by the small
# detector on the same frames.
for split in (X_train, X_val):
  split['od'] = large_od.predict(split[large_features])
for split in (X_train, X_val):
  split['od_proba'] = large_od.predict_proba(split[large_features])[:, 1]

### Model with Outlier Predictions

In [30]:
# Large feature set plus the HBOS label; the single-column target frame is
# flattened to a 1-D array before fitting.
large_xgb = XGBRegressor(random_state=42)
large_xgb.fit(X_train[[*large_features, 'od']], y_train.to_numpy().ravel())

In [31]:
# (features, 1-D target) pairs for the large model that uses the HBOS label.
train_datasets = (X_train[[*large_features, 'od']], y_train.to_numpy().ravel())
val_datasets = (X_val[[*large_features, 'od']], y_val.to_numpy().ravel())

# Cell output: (train metrics, validation metrics).
tuple(
  calculate_metrics(datasets=split_data, model=large_xgb)
  for split_data in (train_datasets, val_datasets)
)

({'mae': 5.2308, 'rmse': 21.8251, 'rmsle': 0.1556},
 {'mae': 6.6845, 'rmse': 40.2073, 'rmsle': 0.1884})

In [32]:
# New frame: the test features plus the HBOS label from the large detector.
X_test_submission = X_test.assign(od=large_od.predict(X_test[large_features]))

create_submission(
  model=large_xgb,
  X=X_test_submission,
  features=[*large_features, 'od'],
  submission_name='HBOS_LARGE_XGB.csv',
)

   total_fare
0  111.798424
1   61.519855
2  117.594780
3   90.918564
4  114.302513


### Model with Outlier Probabilities

In [33]:
# Large feature set plus the HBOS outlier probability; the single-column target
# frame is flattened to a 1-D array before fitting.
large_xgb_proba = XGBRegressor(random_state=42)
large_xgb_proba.fit(X_train[[*large_features, 'od_proba']], y_train.to_numpy().ravel())

In [34]:
# (features, 1-D target) pairs for the large model that uses 'od_proba'.
train_datasets = (X_train[[*large_features, 'od_proba']], y_train.to_numpy().ravel())
val_datasets = (X_val[[*large_features, 'od_proba']], y_val.to_numpy().ravel())

# Cell output: (train metrics, validation metrics).
tuple(
  calculate_metrics(datasets=split_data, model=large_xgb_proba)
  for split_data in (train_datasets, val_datasets)
)

({'mae': 5.1497, 'rmse': 21.1785, 'rmsle': 0.1523},
 {'mae': 6.7004, 'rmse': 39.6068, 'rmsle': 0.1873})

In [35]:
# New frame: the test features plus the second predict_proba column from the
# large detector.
X_test_submission = X_test.assign(
  od_proba=large_od.predict_proba(X_test[large_features])[:, 1]
)

create_submission(
  model=large_xgb_proba,
  X=X_test_submission,
  features=[*large_features, 'od_proba'],
  submission_name='HBOS_LARGE_XGB_PROBA.csv',
)

   total_fare
0  112.308945
1   62.163891
2  118.267426
3   90.902214
4  117.474068
