# selecting best model with best hyper parameters

In [1]:
# importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# train test split the data 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
# import regression algorithm
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# import grid search cv for cross validation
from sklearn.model_selection import GridSearchCV

# import preprocessors
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [2]:
# load the dataset
df= sns.load_dataset('tips')

In [3]:
df.head()


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [4]:
# select feature and vareiable
X= df.drop('tip', axis=1)
y= df['tip']

# label encode catagirial variables

le= LabelEncoder()
X['sex']= le.fit_transform(X['sex'])
X['smoker']= le.fit_transform(X['smoker'])
X['day']= le.fit_transform(X['day'])
X['time']= le.fit_transform(X['time'])


# spllit the data into train and test data with 80% train and 20% test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# creatae a dictionary of list of models to evaluate performance 
models = {
    'Linear Regression': LinearRegression(),
    'SVR': SVR(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor()
}

# train and predict each model with evaluation metrics as well making for loop
model_score= []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)
    # make predictions from each model
    y_pred = model.predict(X_test)
    metric = mean_absolute_error(y_test, y_pred)
    model_score.append((name, metric))


    # here also can perform another metrics
    # r2 = r2_score(y_test, y_pred)
    # model_score.append((name, metric, r2))    
# same thing can be performed by r2, mse, mae etc.



# selecting the best model from all above models with evaluation metrics sorting
sorted_models= sorted(model_score, key=lambda x: x[1], reverse=False)
for model in sorted_models:
    print('Mean absolute errror:', model[0], model[1])

Mean absolute errror: SVR 0.5707097371316318
Mean absolute errror: Linear Regression 0.6703807496461157
Mean absolute errror: XGBoost 0.6721697168934103
Mean absolute errror: KNeighborsRegressor 0.7262448979591837
Mean absolute errror: GradientBoostingRegressor 0.7352952798016953
Mean absolute errror: Random Forest 0.7776897959183675
Mean absolute errror: Decision Tree 0.7826530612244899


# Selecting best model with best hyper parameters on dimond dataset

In [6]:
df= sns.load_dataset('diamonds')
df.head(5)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [7]:
# Assuming your DataFrame is named 'df'
column_data_types = df.dtypes
print(column_data_types)


carat       float64
cut        category
color      category
clarity    category
depth       float64
table       float64
price         int64
x           float64
y           float64
z           float64
dtype: object


In [8]:

# encode the cut, color and clarity variables
encoder = LabelEncoder()
df['cut'] = encoder.fit_transform(df['cut'])
df['color'] = encoder.fit_transform(df['color'])
df['clarity'] = encoder.fit_transform(df['clarity'])


# split the data into train and test data with 80% train and 20% test
X_train, X_test, y_train, y_test = train_test_split(df.drop('price', axis=1), df['price'], test_size=0.2, random_state=42)

In [9]:
# creatae a dictionary of list of models to evaluate performance 
models = {
    'Linear Regression': LinearRegression(),
    'SVR': SVR(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'XGBoost': XGBRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor()
}

# train and predict each model with evaluation metrics as well making for loop
model_score= []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(X_train, y_train)
    # make predictions from each model
    y_pred = model.predict(X_test)
    metric = mean_absolute_error(y_test, y_pred)
    model_score.append((name, metric))






# selecting the best model from all above models with evaluation metrics sorting
sorted_models= sorted(model_score, key=lambda x: x[1], reverse=False)
for model in sorted_models:
    print('Mean absolute errror:', model[0], model[1])

Mean absolute errror: Random Forest 267.5912510932194
Mean absolute errror: XGBoost 277.94127706125306
Mean absolute errror: Decision Tree 353.9270949202818
Mean absolute errror: GradientBoostingRegressor 364.84907102481833
Mean absolute errror: KNeighborsRegressor 480.76881720430106
Mean absolute errror: Linear Regression 858.7084697710096
Mean absolute errror: SVR 2747.3256009251854


Hyper parameter

In [11]:
%%time
# Create a dictionaries of list of models to evaluate performance with hyperparameters
models = { 
          'LinearRegression' : (LinearRegression(), {}),
          'SVR' : (SVR(), {'kernel': ['rbf', 'poly', 'sigmoid']}),
          'DecisionTreeRegressor' : (DecisionTreeRegressor(), {'max_depth': [None, 5, 10]}),
          'RandomForestRegressor' : (RandomForestRegressor(), {'n_estimators': [10, 100]}),
          'KNeighborsRegressor' : (KNeighborsRegressor(), {'n_neighbors': np.arange(3, 100, 2)}),
          'GradientBoostingRegressor' : (GradientBoostingRegressor(), {'n_estimators': [10, 100]}),
          'XGBRegressor' : (XGBRegressor(), {'n_estimators': [10, 100]}),          
          }

# train and predict each model with evaluation metrics as well making a for loop to iterate over the models

for name, (model, params) in models.items():
    # create a pipline
    pipeline = GridSearchCV(model, params, cv=5)
    
    # fit the pipeline
    pipeline.fit(X_train, y_train)
    
    # make prediction from each model
    y_pred = pipeline.predict(X_test)
    
      
    # print the performing metric
    print(name, 'MSE: ', mean_squared_error(y_test, y_pred))
    print(name, 'R2: ', r2_score(y_test, y_pred))
    print(name, 'MAE: ', mean_absolute_error(y_test, y_pred))
    print('\n')

LinearRegression MSE:  1825912.9915253515
LinearRegression R2:  0.885139743367963
LinearRegression MAE:  858.7084697710096


