In [39]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

#preprocessing
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

#Modelling
from sklearn.model_selection import train_test_split

#regression evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

#Regression Models
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.model_selection import RandomizedSearchCV
from catboost import CatBoostRegressor
from xgboost import XGBRegressor





In [4]:
df = pd.read_csv('./data/stud.csv')

In [7]:
y = df['math_score']
x = df.drop('math_score',axis=1)

In [12]:
num_features = x.select_dtypes(exclude='object').columns
cat_features = x.select_dtypes(include='object').columns

In [None]:
#Create pipeline for transformation

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [("OneHotEncoder", oh_transformer, cat_features),
     ("StandardScaler",numeric_transformer,num_features)]
)

In [22]:
x = preprocessor.fit_transform(x)

In [57]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.20, random_state=42)
x_train.shape, x_test.shape

((800, 19), (200, 19))

In [58]:
#regression problem
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    mape = mean_absolute_percentage_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, mse,mape, rmse, r2_square

In [59]:
#Iteratable models
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(),
    "Ridge": Ridge(),
    "K-Neighbors Regressor": KNeighborsRegressor(),
    "Decision Tree": DecisionTreeRegressor(),
    "Random Forest Regressor": RandomForestRegressor(),
    "XGBRegressor": XGBRegressor(), 
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "AdaBoost Regressor": AdaBoostRegressor()
}
model_list = []
r2_list = []

for i in range(len(list(models))):
    model = list(models.values())[i]
    #fit
    model.fit(x_train, y_train)
    
    #pred
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)
    
    #eval train and test
    mean_abs_err, mean_sq_err, mean_abs_pct_err, root_mean_sq_err, r2_squared = evaluate_model(y_train,y_train_pred)
    mean_abs_err_t, mean_sq_err_t, mean_abs_pct_err_t, root_mean_sq_err_t, r2_squared_t = evaluate_model(y_test,y_test_pred)
    
    print(list(models.keys())[i])
    model_list.append(list(models.keys())[i])

    print("- Root Mean Squared Error: {:.4f}".format(root_mean_sq_err))
    print("- Mean Squared Error: {:.4f}".format(mean_sq_err))
    print("- Mean Absolute Error: {:.4f}".format(mean_abs_err))
    print("- Mean Absolute Percentage Error: {:.4f}".format(mean_abs_pct_err))
    print("- R2 Score: {:.4f}".format(r2_squared))

    print('----------------------------------')
    
    print("- Root Mean Squared Error: {:.4f}".format(root_mean_sq_err_t))
    print("- Mean Squared Error: {:.4f}".format(mean_sq_err_t))
    print("- Mean Absolute Error: {:.4f}".format(mean_abs_err_t))
    print("- Mean Absolute Percentage Error: {:.4f}".format(mean_abs_pct_err_t))
    print("- R2 Score: {:.4f}".format(r2_squared_t))
    r2_list.append(r2_squared_t)
    
    print('='*35)
    print('\n')

Linear Regression
- Root Mean Squared Error: 5.3268
- Mean Squared Error: 28.3752
- Mean Absolute Error: 4.2781
- Mean Absolute Percentage Error: 0.0696
- R2 Score: 0.8741
----------------------------------
- Root Mean Squared Error: 5.4108
- Mean Squared Error: 29.2763
- Mean Absolute Error: 4.2279
- Mean Absolute Percentage Error: 86553555338526.7969
- R2 Score: 0.8797


Lasso
- Root Mean Squared Error: 6.5938
- Mean Squared Error: 43.4784
- Mean Absolute Error: 5.2063
- Mean Absolute Percentage Error: 0.0875
- R2 Score: 0.8071
----------------------------------
- Root Mean Squared Error: 6.5197
- Mean Squared Error: 42.5064
- Mean Absolute Error: 5.1579
- Mean Absolute Percentage Error: 353269370898057.3125
- R2 Score: 0.8253


Ridge
- Root Mean Squared Error: 5.3233
- Mean Squared Error: 28.3378
- Mean Absolute Error: 4.2650
- Mean Absolute Percentage Error: 0.0695
- R2 Score: 0.8743
----------------------------------
- Root Mean Squared Error: 5.3904
- Mean Squared Error: 29.0563


In [60]:
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=["R2_Score"],ascending=False)

Unnamed: 0,Model Name,R2_Score
2,Ridge,0.880593
0,Linear Regression,0.879689
5,Random Forest Regressor,0.854138
8,AdaBoost Regressor,0.851792
7,CatBoosting Regressor,0.851632
6,XGBRegressor,0.827797
1,Lasso,0.82532
3,K-Neighbors Regressor,0.783898
4,Decision Tree,0.740875
