<a href="https://colab.research.google.com/github/yashveersinghsohi/machine_hack_competitions/blob/feature_engineering/Data_Science_Student_Championship/Features/Feature_Engineering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Git Clone

In [1]:
# !git clone -b feature_engineering https://github.com/yashveersinghsohi/machine_hack_competitions.git

# Imports

In [2]:
# !pip install pyod

In [3]:
import pandas as pd
import numpy as np
from pyod.models.iforest import IForest
from pyod.models.cblof import CBLOF
from pyod.models.hbos import HBOS
from pyod.models.abod import ABOD
import warnings
warnings.filterwarnings('ignore')

# Datasets

In [4]:
# Directory (inside the cloned repo on Colab) holding the raw train/val/test splits.
root_dir = '/content/machine_hack_competitions/Data_Science_Student_Championship/Features/raw/'

# Read the six raw CSVs in a fixed order: features then target for each split.
X_train_raw, y_train_raw, X_val_raw, y_val_raw, X_test_raw, y_test_raw = (
  pd.read_csv(root_dir + file_name)
  for file_name in (
    'X_train_raw.csv', 'y_train_raw.csv',
    'X_val_raw.csv', 'y_val_raw.csv',
    'X_test_raw.csv', 'y_test_raw.csv',
  )
)

In [5]:
# Independent copies of every raw split (features first, then targets).
X_train, X_val, X_test = (raw.copy() for raw in (X_train_raw, X_val_raw, X_test_raw))
y_train, y_val, y_test = (raw.copy() for raw in (y_train_raw, y_val_raw, y_test_raw))

# Feature Engineering

## Helper Functions

In [6]:
def iqr_univariate_od(datasets, col):
  """Add a 0/1 IQR-based outlier flag for one column to every split.

  The quartiles are computed on the training split only, and the resulting
  bounds (q25 - 1.5*IQR, q75 + 1.5*IQR) are reused for validation and test.
  A value is flagged when it lies strictly outside those bounds.

  Args:
    datasets: Sequence ``(X_train, X_val, X_test)`` of DataFrames that all
      contain ``col``. The frames are modified in place.
    col: Name of the column to inspect.

  Returns:
    ``[X_train, X_val, X_test]`` -- the same objects, each with a new integer
    column named ``'is_' + col + '_outlier'``.
  """
  train_frame, val_frame, test_frame = datasets

  # Bounds are derived from the training data only.
  first_quartile, third_quartile = (train_frame[col].quantile(q) for q in (0.25, 0.75))
  whisker = 1.5 * (third_quartile - first_quartile)
  low, high = first_quartile - whisker, third_quartile + whisker

  flag_name = 'is_' + col + '_outlier'
  for frame in (train_frame, val_frame, test_frame):
    values = frame[col]
    # Explicit comparisons (rather than a range check) so NaN is flagged as 0.
    frame[flag_name] = ((values > high) | (values < low)).astype(int)

  return [train_frame, val_frame, test_frame]

def pyod_od(datasets, model):
  """Fit a detector on the training split and predict on all three splits.

  Args:
    datasets: Sequence ``(X_train, X_val, X_test)`` of feature tables.
    model: Object exposing ``fit(X)`` and ``predict(X)`` (the PyOD detector
      interface). It is fitted in place.

  Returns:
    Tuple ``(train_preds, val_preds, test_preds)`` with whatever
    ``model.predict`` returns for each split.
  """
  X_train, X_val, X_test = datasets
  # Fit on the training split only; validation/test are scored, never learned from.
  model.fit(X_train)
  train_preds = model.predict(X_train)
  val_preds = model.predict(X_val)
  test_preds = model.predict(X_test)
  return train_preds, val_preds, test_preds

def run_pyod_od_models(datasets, models, model_names):
  """Add one ``<name>_is_row_outlier`` column per detector to every split.

  Every detector is fitted on the same input features. The features are
  snapshotted before any flag column is written, so a detector never sees the
  flags produced by the detectors that ran before it, and the result does not
  depend on the order of ``models``.

  Args:
    datasets: Sequence ``(X_train, X_val, X_test)`` of DataFrames. The frames
      are modified in place (flag columns are appended).
    models: Iterable of unfitted detectors exposing ``fit`` / ``predict``.
    model_names: Iterable of names, paired positionally with ``models``, used
      as the prefix of each new column.

  Returns:
    Tuple ``(X_train, X_val, X_test)`` -- the same objects that were passed in.
  """
  X_train, X_val, X_test = datasets
  # Freeze the detector inputs: the frames below gain flag columns inside the
  # loop, and those flags must not leak into the next detector's features.
  model_inputs = [X_train.copy(), X_val.copy(), X_test.copy()]
  for model, model_name in zip(models, model_names):
    train_outliers, val_outliers, test_outliers = pyod_od(model_inputs, model)
    flag_col = f'{model_name}_is_row_outlier'
    X_train[flag_col] = train_outliers
    X_val[flag_col] = val_outliers
    X_test[flag_col] = test_outliers
  return X_train, X_val, X_test

## Outlier-Related Features

In [7]:
# Hand the working copies to the helpers rather than the raw frames: the helpers
# add columns in place, and the X_*_raw frames should stay exactly as loaded.
datasets = [X_train, X_val, X_test]

In [8]:
# Each detector is declared next to the name used as its column prefix, then
# split back into the two parallel lists expected by run_pyod_od_models.
od_model_names, od_models = map(list, zip(
  ('iforest', IForest(random_state=42)),
  ('cblof', CBLOF(random_state=42)),
  ('hbos', HBOS()),
  ('abod', ABOD()),
))
datasets = run_pyod_od_models(
  datasets=datasets,
  models=od_models,
  model_names=od_model_names,
)

In [9]:
# Columns that receive a univariate IQR outlier flag.
continuous_features = [
  'trip_duration',
  'distance_traveled',
  'tip',
  'miscellaneous_fees',
]
# Thread the splits through the helper once per column; each pass adds one flag.
for feature in continuous_features:
  datasets = iqr_univariate_od(datasets, feature)

## Edge Cases and Bucketing Features

In [10]:
X_train, X_val, X_test = datasets

# The same indicator columns are added to every split, in the same order.
for frame in (X_train, X_val, X_test):
  fees = frame['miscellaneous_fees']
  passengers = frame['num_of_passengers']

  # Edge cases: negative or exactly-zero values.
  frame['is_miscellaneous_fees_negative'] = np.where(fees < 0, 1, 0)
  frame['is_miscellaneous_fees_0'] = np.where(fees == 0, 1, 0)
  frame['is_tip_0'] = np.where(frame['tip'] == 0, 1, 0)
  frame['is_trip_duration_0'] = np.where(frame['trip_duration'] == 0, 1, 0)

  # Passenger-count buckets: at most 4 passengers, and at most 6 passengers.
  frame['can_fit_in_sedan'] = np.where(passengers <= 4, 1, 0)
  frame['can_fit_in_suv'] = np.where(passengers <= 6, 1, 0)

# Saving Preprocessed Features

In [11]:
# Write the engineered features and the (unchanged) raw targets to the
# current working directory, without the index column.
for frame, file_name in (
  (X_train, 'X_train.csv'),
  (X_val, 'X_val.csv'),
  (X_test, 'X_test.csv'),
  (y_train_raw, 'y_train.csv'),
  (y_val_raw, 'y_val.csv'),
  (y_test_raw, 'y_test.csv'),
):
  frame.to_csv(file_name, index=False)