In [1]:
# i am importing all the models to know which is the best model for my data
import pandas as pd
import numpy as np
# evalution matrices as the output is a continous data we are importing these evalution matrices if categorical we need
#  binary,categorical entropy
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
#models starting
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso,Ridge
from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

In [2]:
# load dataset
df = pd.read_csv(r"C:\Users\Yesuraju\Desktop\MachineLearningWorkSpace\notebook\data\stud.csv")

In [3]:
# first define our target and independent variables
X = df.drop("reading_score",axis=1)
Y = df["reading_score"]

In [4]:
# here divide our columns into categorical and non-categorical
numerical_features = X.select_dtypes(exclude="object").columns
categorical_features = [col for col in X.columns if X[col].dtype == "object"]

In [5]:
categorical_features

['gender',
 'race_ethnicity',
 'parental_level_of_education',
 'lunch',
 'test_preparation_course']

In [6]:
# creating a pipeline for both categorical and standardization
from sklearn.preprocessing import OneHotEncoder,StandardScaler
# this column transformers will create the pipeline for both categorical and standarization
from sklearn.compose import ColumnTransformer
numerical_standard = StandardScaler()
cate_one = OneHotEncoder()
preprocessor = ColumnTransformer(
        [
                ("OneHotEncoder",cate_one,categorical_features),
                ("StandardScaler",numerical_standard,numerical_features)
        ]
)


In [7]:
# applying both preprocessors to the training data set
X = preprocessor.fit_transform(X)

In [8]:
# separate data set into train test split
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)

# creating the evaltion function to find the accuracy of the three models

In [13]:
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error
def eval_func(y_predict,y_true):
        mae = mean_absolute_error(y_true,y_predict)
        r2_square = r2_score(y_true,y_predict)
        return mae,r2_square

In [15]:
# here we are not using any hyperparameter function first we will select model then will hyper parameter tuning on that 
# model
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": RandomForestRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
}

for model_name, model_func in models.items():
        # feeding both train set and test set to the model
        # training the model
        model_func.fit(x_train,y_train)
        y_train_pred = model_func.predict(x_train)
        y_test_pred = model_func.predict(x_test)
        train_mae,train_r2_score = eval_func(y_train,y_train_pred)
        test_mae,test_r2_score = eval_func(y_test,y_test_pred)
        print("Model Name ", model_name)
        print("Training evalution matrices")
        print("train_mae: ",train_mae)
        print("train_r2_score: ",train_r2_score)
        print("\n")
        print("testing evalution matrices")
        print("test_mae:",test_mae)
        print("testing_r2_score:",test_r2_score)
        print("\n")

Model Name  Linear Regression
Training evalution matrices
train_mae:  3.24625
train_r2_score:  0.9167761757679435


testing evalution matrices
test_mae: 3.091875
testing_r2_score: 0.9241673194304232


Model Name  Lasso
Training evalution matrices
train_mae:  3.5112499908580004
train_r2_score:  0.8863885953074425


testing evalution matrices
test_mae: 3.2907847918436492
testing_r2_score: 0.9044007088802761


Model Name  Ridge
Training evalution matrices
train_mae:  3.2307806320523236
train_r2_score:  0.9186198294652381


testing evalution matrices
test_mae: 3.064983081263207
testing_r2_score: 0.9261291830544833


Model Name  K-Neighbors Regressor
Training evalution matrices
train_mae:  3.8175
train_r2_score:  0.8469254393600005


testing evalution matrices
test_mae: 4.662999999999999
testing_r2_score: 0.757100942694698


Model Name  Decision Tree
Training evalution matrices
train_mae:  1.2983099999999999
train_r2_score:  0.9868620617546381


testing evalution matrices
test_mae: 3.454411

In [None]:
# by seeing the above data we can conclude that the ridge regression is the best for our data as it has low varince compared to 
# other models 
# it is having same trining and testing accuracy

In [17]:
# for this ridge model we will do hyperparameter tuning to select best parameters