# Selecting the best model with best Hyperparameter

In [1]:
# import libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#train test split the data
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#import regrssion algorithms
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# import grid search cv for cross validation
from sklearn.model_selection import GridSearchCV

# import preprocessor
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline


In [4]:
# load the dataset
df = sns.load_dataset('tips')
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [5]:
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [6]:
# select features and variables
x = df.drop("tip", axis=1)
y = df['tip']

In [8]:
# label encode categorical variables
le = LabelEncoder()
for col in x.columns:
    if x[col].dtype == "object" or x[col].dtype == "category":
        x[col] = le.fit_transform(x[col])

# Also used this method
# le = labelEncoder()
# x['sex'] = le.fit_transform(x['sex'])
# x['smoker'] = le.fit_transform(x['smoker'])
# x['day'] = le.fit_transform(x['day'])
# x['time'] = le.fit_transform(x['time'])

In [9]:
# train test split the data
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [12]:
# create a dictionary of list of models to evaluate the performance
models = {
    'LinearRegression': LinearRegression(),
    'SVR': SVR(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor()
}

#train and predict each mode with evaluation metrics as well makin a fro loop

model_scores = []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(x_train, y_train)

    # make prediction for each model
    y_pred = model.predict(x_test)
    metric = mean_absolute_error(y_test, y_pred)
    model_scores.append((name, metric))

    # Selecting the best model from all above models with evaluation metics sorted
    sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)
    for model in sorted_models:
        print("mean_absolute_error", f"{model[0]} is {model[1]: .2f}")

mean_absolute_error LinearRegression is  0.67
mean_absolute_error SVR is  0.57
mean_absolute_error LinearRegression is  0.67
mean_absolute_error SVR is  0.57
mean_absolute_error LinearRegression is  0.67
mean_absolute_error DecisionTreeRegressor is  1.00
mean_absolute_error SVR is  0.57
mean_absolute_error LinearRegression is  0.67
mean_absolute_error RandomForestRegressor is  0.81
mean_absolute_error DecisionTreeRegressor is  1.00
mean_absolute_error SVR is  0.57
mean_absolute_error LinearRegression is  0.67
mean_absolute_error KNeighborsRegressor is  0.73
mean_absolute_error RandomForestRegressor is  0.81
mean_absolute_error DecisionTreeRegressor is  1.00
mean_absolute_error SVR is  0.57
mean_absolute_error LinearRegression is  0.67
mean_absolute_error KNeighborsRegressor is  0.73
mean_absolute_error GradientBoostingRegressor is  0.73
mean_absolute_error RandomForestRegressor is  0.81
mean_absolute_error DecisionTreeRegressor is  1.00


In [14]:
# create a dictionary of list of models to evaluate the performance
models = {
    'LinearRegression': LinearRegression(),
    'SVR': SVR(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor()
}

# train and predict each mode with evaluation metrics as well makin a fro loop

model_scores = []
for name, model in models.items():
    # fit each model from models on training data
    model.fit(x_train, y_train)

    # make prediction for each model
    y_pred = model.predict(x_test)
    metric = mean_absolute_error(y_test, y_pred)
    model_scores.append((name, metric))

    # Selecting the best model from all above models with evaluation metics sorted
    # sorted_models = sorted(model_scores, key=lambda x: x[1], reverse=False)
    # for model in sorted_models:
    print("mean_absolute_error", model, metric)

mean_absolute_error LinearRegression() 0.6703807496461157
mean_absolute_error SVR() 0.5707097371316318
mean_absolute_error DecisionTreeRegressor() 0.8767346938775512
mean_absolute_error RandomForestRegressor() 0.7805306122448983
mean_absolute_error KNeighborsRegressor() 0.7262448979591837
mean_absolute_error GradientBoostingRegressor() 0.731835111070683


# Assignmnet Load the Diamond dataset from Seaborn

# step 1


In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# regression model
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR 
from sklearn.tree  import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor

#Regression metrics
from sklearn.metrics import mean_absolute_error, mean_squared_error, f1_score

#import preprocessor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# load the dataset from seaborn with thousand sample
df = sns.load_dataset("diamonds").sample(1000, random_state=42)
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
1388,0.24,Ideal,G,VVS1,62.1,56.0,559,3.97,4.0,2.47
50052,0.58,Very Good,F,VVS2,60.0,57.0,2201,5.44,5.42,3.26
41645,0.4,Ideal,E,VVS2,62.1,55.0,1238,4.76,4.74,2.95
42377,0.43,Premium,E,VVS2,60.8,57.0,1304,4.92,4.89,2.98
17244,1.55,Ideal,E,SI2,62.3,55.0,6901,7.44,7.37,4.61


In [3]:
df.columns

Index(['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'price', 'x', 'y',
       'z'],
      dtype='object')

In [5]:
# label encode categorical data
le = LabelEncoder()
for col in df.columns:
    if df[col].dtype == 'category':
        df[col] = le.fit_transform(df[col])

In [6]:
# Split the data into x label and y label
x = df.drop("price", axis=1)
y = df["price"]

In [7]:
# split th data  into train and test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [17]:
models = {
    'LinearRegression': LinearRegression(),
    'SVR': SVR(),
    'DecisionTreeRegressor': DecisionTreeRegressor(),
    'RandomForestRegressor': RandomForestRegressor(),
    'KNeighborsRegressor': KNeighborsRegressor(),
    'GradientBoostingRegressor': GradientBoostingRegressor()
}

model_score = []
for name, model in models.items():
    #fit each model on x_train y_train
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)
    metric = mean_absolute_error(y_test, y_pred)
    model_score.append((name, metric))

    print(f"Model: {name} MSE: {metric}" )


Model: LinearRegression MSE: 929.7862860799696
Model: SVR MSE: 3061.901283517074
Model: DecisionTreeRegressor MSE: 642.39
Model: RandomForestRegressor MSE: 468.7987416666667
Model: KNeighborsRegressor MSE: 922.809
Model: GradientBoostingRegressor MSE: 473.2564376727215
