# ML Pipeline

In [2]:
# Import necessary packages
import pandas as pd
import numpy as np
import mlflow
from datetime import datetime
from scipy import stats

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn import svm
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.pipeline import Pipeline, make_pipeline

from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

from sklearn.compose import ColumnTransformer

import category_encoders as ce

from sklearn.impute import SimpleImputer

In [4]:
# Absolute path to the cleaned dataset (stored under the basic_clean
# component folder of the project).
clean_data_path = 'C:/Users/4YouSee/Desktop/personal_work/rental-prices-ny/components/basic_clean/df_clean.csv'

# low_memory=False makes pandas parse the file in a single pass instead of
# in chunks.
df_clean = pd.read_csv(clean_data_path, low_memory=False)

# Preview the first rows of the loaded frame.
df_clean.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,number_of_reviews_ltm,license
0,718031653455200639,Cozy Work from Home Studio in Upper East Side,2867137,Avi,Manhattan,Upper East Side,40.76939,-73.95498,Entire home/apt,150,30,0,,,80,180,0,
1,53570786,NEW Renovated room and bathroom. (2FL),19303369,Hiroki,Queens,Elmhurst,40.74515,-73.87187,Private room,39,30,1,2022-10-09,0.52,196,0,1,
2,2142092,"Furnished room - W. 181 St. by A, 1",8280182,Alejandro,Manhattan,Washington Heights,40.85098,-73.93664,Private room,300,30,0,,,1,0,0,
3,26916746,Brooklyn Home,193502084,Linda,Brooklyn,Borough Park,40.64045,-74.00404,Private room,40,30,26,2019-03-20,0.49,8,0,0,
4,74333,Alcove Studio w/ outdoor Patio Deck,331328,Amir,Manhattan,East Harlem,40.80834,-73.94075,Entire home/apt,100,30,39,2022-10-04,0.29,2,347,3,


## Data Cleaning and Preprocessing

In [None]:
# Feature groups, named after the preprocessing each group receives.

# categorical features
ordinal_categorical = ['room_type']
non_ordinal_categorical = ['neighbourhood_group']

# numerical features whose missing entries are replaced by 0
zero_imputed = [
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

# Separate the target from the predictors. X still holds every non-target
# column; the ones not listed above are discarded later by the
# ColumnTransformer (remainder='drop').
y = df_clean['price']
X = df_clean.drop(columns=['price'])

# Ordinal encoding: explicit integer code for every room type.
room_type_mapping = {
    'col': 'room_type',
    'mapping': {
        'Shared room': 0,
        'Private room': 1,
        'Entire home/apt': 2,
        'Hotel room': 3,
    },
}
ordinal_categorical_preproc = ce.OrdinalEncoder(
    cols=ordinal_categorical,
    mapping=[room_type_mapping],
)

# Nominal encoding: fill gaps with the most frequent category, then one-hot
# encode while dropping the first level of each feature.
non_ordinal_categorical_preproc = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(drop='first'),
)

# Numerical preprocessing: missing values become 0.
zero_imputer = SimpleImputer(strategy='constant', fill_value=0)

# Route each feature group to its transformer; unlisted columns are dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal_cat', ordinal_categorical_preproc, ordinal_categorical),
        ('non_ordinal_cat', non_ordinal_categorical_preproc, non_ordinal_categorical),
        ('impute_zero', zero_imputer, zero_imputed),
    ],
    remainder='drop',
)

## Training and Model Selection

In [None]:
def run_regressor_models(X, y, cv, scoring):
    '''Cross-validate a fixed set of regressors and return their mean scores.

    The following models are evaluated with their default hyperparameters:
    linear regression, elastic net, decision tree, SVM, gradient boosting and
    random forest. Each model is wrapped in a pipeline made of the
    module-level ``preprocessor``, a ``StandardScaler`` and the regressor,
    and the pipeline is evaluated with ``cross_validate``.

    The only supported metrics are the root mean squared error (requested
    with ``'neg_mean_squared_error'``) and R² (requested with ``'r2'``).

    NOTE: MLflow tracking of the experiments is not wired in yet; the places
    where it belongs are marked with TODO comments.

    :param X: (dataframe or numpy array)
    Dataframe or array with set of independent variables.

    :param y: (series or numpy array)
    Column or vector array with the dependent variable.

    :param cv: (int)
    Determines the cross-validation split strategy. Number of k-folds.

    :param scoring: (str)
    Strategy to evaluate the performance of the cross-validation model in the
    validation set. Must be ``'neg_mean_squared_error'`` or ``'r2'``.

    :return: (dict)
    Maps each model class name to ``{'train': float, 'validation': float}``,
    the metric averaged over the folds. For ``'neg_mean_squared_error'`` the
    values are RMSE (per-fold RMSE, then averaged); for ``'r2'`` they are R².

    :raises ValueError: if ``scoring`` is not one of the supported metrics.
    '''
    supported_scoring = ('neg_mean_squared_error', 'r2')
    # Fail fast: otherwise all six models would be trained and nothing
    # would be reported.
    if scoring not in supported_scoring:
        raise ValueError(
            f"Unsupported scoring {scoring!r}; expected one of "
            f"{supported_scoring}.")

    # 1. Instantiate the models
    models = (
        LinearRegression(),
        ElasticNet(),
        DecisionTreeRegressor(),
        svm.SVR(),
        GradientBoostingRegressor(),
        RandomForestRegressor(),
    )

    # 2. train and evaluate the models
    results = {}
    for model in models:
        pipe = Pipeline(
            steps=[('preprocessor', preprocessor),
                   ('scaling', StandardScaler()),
                   ('regressor', model)])
        scores = cross_validate(pipe, X, y, return_train_score=True,
                                scoring=scoring, cv=cv)

        if scoring == 'neg_mean_squared_error':
            # The scores are negated MSE per fold: flip the sign, take the
            # square root to get RMSE per fold, then average over the folds.
            train_score = np.mean(np.sqrt(-scores['train_score']))
            validation_score = np.mean(np.sqrt(-scores['test_score']))
            # TODO: track the experiment with rmse (MLflow)
        else:
            # scoring == 'r2': average the per-fold R² directly.
            train_score = np.mean(scores['train_score'])
            validation_score = np.mean(scores['test_score'])
            # TODO: track the experiment with r2 (MLflow)

        results[type(model).__name__] = {
            'train': float(train_score),
            'validation': float(validation_score),
        }

    return results