In [1]:
!git clone https://github.com/wlw2021/DS-for-Business

%cd DS-for-Business/'proposal datasets'/"students' performance"/

Cloning into 'DS-for-Business'...
remote: Enumerating objects: 57, done.[K
remote: Counting objects: 100% (31/31), done.[K
remote: Compressing objects: 100% (26/26), done.[K
remote: Total 57 (delta 8), reused 15 (delta 3), pack-reused 26[K
Receiving objects: 100% (57/57), 199.34 MiB | 13.17 MiB/s, done.
Resolving deltas: 100% (10/10), done.
Updating files: 100% (22/22), done.
/content/DS-for-Business/proposal datasets/students' performance


**Details of DataSet**

In [2]:
import pandas as pd
import subprocess

math_data = pd.read_csv('student-mat.csv', sep=';')
portuguese_data = pd.read_csv('student-por.csv', sep=';')

print("Columns of math_data:", math_data.columns)
print("Columns of portuguese_data:", portuguese_data.columns)

common_columns = ['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
                  'Mjob', 'Fjob', 'reason', 'guardian']

merged_data = pd.merge(math_data, portuguese_data, how='inner', on=common_columns)

print("Shape of merged dataset:", merged_data.shape)

Columns of math_data: Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')
Columns of portuguese_data: Index(['school', 'sex', 'age', 'address', 'famsize', 'Pstatus', 'Medu', 'Fedu',
       'Mjob', 'Fjob', 'reason', 'guardian', 'traveltime', 'studytime',
       'failures', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery',
       'higher', 'internet', 'romantic', 'famrel', 'freetime', 'goout', 'Dalc',
       'Walc', 'health', 'absences', 'G1', 'G2', 'G3'],
      dtype='object')
Shape of merged dataset: (383, 54)


**Merge DataSet**

In [None]:
import pandas as pd

math_data = pd.read_csv('student-mat.csv', sep=';')

portuguese_data = pd.read_csv('student-por.csv', sep=';')

merged_data = pd.concat([math_data, portuguese_data], ignore_index=True)

print("Shape of merged dataset:", merged_data.shape)

Shape of merged dataset: (1044, 33)


**Linear Regression Script with Cross-Validation: Baseline**

In [3]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

X = math_data.drop('G3', axis=1)
y = math_data['G3']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

model = make_pipeline(preprocessor, LinearRegression())
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
model.fit(X_train, y_train)

ln_avg_rmse = np.sqrt(-cv_scores.mean())
y_pred = model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Average Root Mean Squared Error (RMSE) for Linear Regression with Cross-Validation:", test_rmse,ln_avg_rmse)

Average Root Mean Squared Error (RMSE) for Linear Regression with Cross-Validation: 2.3786318068573045 1.9059454401016278


In [4]:
from sklearn.metrics import r2_score


r2 = r2_score(y_test, y_pred)

    # Print R2 score for the current model

print("R-squared (R2) Score:", r2)


R-squared (R2) Score: 0.7240733367435499


**Adding 'performance' metric**

In [5]:
X['performance'] = X['studytime'] / (X['failures'] +X['freetime'] + X['traveltime'])

**Linear Regression: Engineered features**

In [6]:
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd


X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

categorical_features = X.select_dtypes(include=['object']).columns.tolist()
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numerical_transformer = StandardScaler()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

model = make_pipeline(preprocessor, LinearRegression())
cv_scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
model.fit(X_train, y_train)

ln_avg_rmse = np.sqrt(-cv_scores.mean())
y_pred_engg = model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print("Average Root Mean Squared Error (RMSE) for Linear Regression with Cross-Validation:", test_rmse,ln_avg_rmse)

Average Root Mean Squared Error (RMSE) for Linear Regression with Cross-Validation: 2.3786318068573045 1.9182785732072463


In [7]:
from sklearn.metrics import r2_score


r2 = r2_score(y_test, y_pred_engg)

    # Print R2 score for the current model

print("R-squared (R2) Score:", r2)

R-squared (R2) Score: 0.7240111906750624


**Decision Tree Regression with Cross-Validation**

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import GridSearchCV
math_data = pd.read_csv('student-mat.csv', delimiter=';')
portuguese_data = pd.read_csv('student-por.csv', delimiter=';')



categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

transformed_data = preprocessor.fit_transform(X_train)
transformed_test_data=preprocessor.fit_transform(X_test)

tree_model = DecisionTreeRegressor()


param_grid = {
    'max_depth': [ 10, 20, 30],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'max_features': [ 'sqrt', 'log2'],  # Number of features to consider when looking for the best split
    'random_state': [42]  # Random seed for reproducibility
}
grid_search = GridSearchCV(estimator=tree_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Perform grid search on training data
grid_search.fit(transformed_data, y_train)

print("Best Parameters:", grid_search.best_params_)

# Get the best model
best_tree_model = grid_search.best_estimator_

cv_scores = cross_val_score(best_tree_model, transformed_data, y_train, cv=5, scoring='neg_mean_squared_error')

dt_avg_rmse = (-cv_scores.mean()) ** 0.5
y_pred = best_tree_model.predict(transformed_test_data)
test_dt_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Average Root Mean Squared Error (RMSE) for Decision Tree Regression with Cross-Validation:", dt_avg_rmse,test_dt_rmse)


Best Parameters: {'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 2, 'random_state': 42}
Average Root Mean Squared Error (RMSE) for Decision Tree Regression with Cross-Validation: 3.1893754157830636 2.5490888821289404


In [None]:
from sklearn.metrics import r2_score
y_pred_tree = best_tree_model.predict(transformed_test_data)

r2 = r2_score(y_test, y_pred_tree)

    # Print R2 score for the current model

print("R-squared (R2) Score:", r2)

R-squared (R2) Score: 0.7054444312881136


**Random Forest Regression with Cross-Validation**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_features': ['sqrt', 'log2'],  # Number of features to consider when looking for the best split
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]  # Whether bootstrap samples are used when building trees
}

categorical_features = X.select_dtypes(include=['object']).columns
numerical_features = X.select_dtypes(include=['int64', 'float64']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)])

forest_model = RandomForestRegressor()

grid_search = GridSearchCV(estimator=forest_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Perform grid search on training data
grid_search.fit(transformed_data, y_train)

print("Best Parameters:", grid_search.best_params_)

# Get the best model
best_forest_model = grid_search.best_estimator_

cv_scores = cross_val_score(best_forest_model, transformed_data, y_train, cv=5, scoring='neg_mean_squared_error')

pipeline = make_pipeline(preprocessor, model)


rf_avg_rmse = (-cv_scores.mean()) ** 0.5
y_pred = best_forest_model.predict(transformed_test_data)
test_rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Average Root Mean Squared Error (RMSE) for Random Forest Regression with Cross-Validation:", rf_avg_rmse,test_rf_rmse)

Best Parameters: {'bootstrap': False, 'max_depth': 20, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Average Root Mean Squared Error (RMSE) for Random Forest Regression with Cross-Validation: 2.2840037958667514 2.270570768314445


In [None]:
from sklearn.metrics import r2_score
y_pred_tree = best_forest_model.predict(transformed_test_data)

r2 = r2_score(y_test, y_pred_tree)

    # Print R2 score for the current model

print("R-squared (R2) Score:", r2)

R-squared (R2) Score: 0.7485745072164225


**Support Vector Regression (SVR)**

In [None]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

param_grid = {
    'kernel': ['linear', 'rbf'],  # Kernel type: linear or radial basis function (RBF)
    'C': [0.1, 1, 10, 100],        # Regularization parameter
    'gamma': [0.01, 0.1, 1, 'scale']  # Kernel coefficient for RBF kernel
}

transformed_data=preprocessor.fit_transform(X_train)

svr_model = SVR()

grid_search = GridSearchCV(estimator=svr_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Perform grid search on training data
grid_search.fit(transformed_data, y_train)

print("Best Parameters:", grid_search.best_params_)

# Get the best model
best_svr_model = grid_search.best_estimator_

svr_y_pred = best_svr_model.predict(preprocessor.fit_transform(X_test))
svr_test_rmse = np.sqrt(mean_squared_error(y_test, svr_y_pred))
cv_scores = cross_val_score(best_forest_model, transformed_data, y_train, cv=5, scoring='neg_mean_squared_error')
svr_avg_rmse = (-cv_scores.mean()) ** 0.5





print("Support Vector Regression RMSE:", svr_avg_rmse,svr_test_rmse)

Best Parameters: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
Support Vector Regression RMSE: 2.207146677117696 2.2270662428330024


In [None]:
from sklearn.metrics import r2_score


r2 = r2_score(y_test, svr_y_pred)

    # Print R2 score for the current model

print("R-squared (R2) Score:", r2)

R-squared (R2) Score: 0.7581169170143491


**Gradient Boosting Regression**

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import ElasticNet,Lasso,Ridge

gb_preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

param_grid = {
    'n_estimators': [50, 100, 200],  # Number of boosting stages (trees)
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate (shrinkage)
    'max_depth': [3, 4, 5],  # Maximum depth of each tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required to be at a leaf node
    'max_features': [ 'sqrt', 'log2']  # Number of features to consider at each split
}


gb_model = GradientBoostingRegressor()

transformed_data=gb_preprocessor.fit_transform(X_train)


grid_search = GridSearchCV(estimator=gb_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Perform grid search on training data
grid_search.fit(transformed_data, y_train)

print("Best Parameters:", grid_search.best_params_)

# Get the best model
best_gb_model = grid_search.best_estimator_

best_gb_model.fit(transformed_data, y_train)



cv_scores = cross_val_score(best_gb_model, transformed_data, y_train, cv=5, scoring='neg_mean_squared_error')
gb_avg_rmse = (-cv_scores.mean()) ** 0.5
gb_y_pred = best_gb_model.predict(gb_preprocessor.fit_transform(X_test))
gb_test_rmse = np.sqrt(mean_squared_error(y_test, gb_y_pred))

print("Gradient Boosting Regression RMSE:", gb_avg_rmse, gb_test_rmse)

Best Parameters: {'learning_rate': 0.1, 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 200}
Gradient Boosting Regression RMSE: 1.9119619486498067 2.178072709435087


In [None]:
from sklearn.metrics import r2_score
y_pred_tree = best_tree_model.predict(transformed_test_data)

r2 = r2_score(y_test, gb_y_pred)

    # Print R2 score for the current model

print("R-squared (R2) Score:", r2)

R-squared (R2) Score: 0.7686422925259931


**Ridge Regression:**

In [None]:
from sklearn.preprocessing import OneHotEncoder

param_grid = {
    'alpha': [0.1,100,1,10]
}

encoder = OneHotEncoder(drop='first', sparse=False)
X_encoded = encoder.fit_transform(X_train.select_dtypes(include=['object']))

X_numeric = X_train.select_dtypes(exclude=['object'])

X_processed = np.concatenate((X_numeric, X_encoded), axis=1)

from sklearn.linear_model import Ridge

ridge_model = Ridge()
grid_search = GridSearchCV(estimator=ridge_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Perform grid search on training data
grid_search.fit(X_processed, y_train)

print("Best Parameters:", grid_search.best_params_)

# Get the best model
best_ridge_model = grid_search.best_estimator_
ridge_cv_scores = cross_val_score(best_ridge_model, X_processed, y_train, cv=5, scoring='neg_mean_squared_error')
ridge_avg_rmse = np.sqrt(-ridge_cv_scores.mean())
ridge_y_pred = best_gb_model.predict(gb_preprocessor.fit_transform(X_test))
ridge_test_rmse = np.sqrt(mean_squared_error(y_test, ridge_y_pred))

print("Ridge Regression RMSE:", ridge_avg_rmse,ridge_test_rmse)

Best Parameters: {'alpha': 100}
Ridge Regression RMSE: 1.8228755239974268 2.178072709435087




In [None]:
from sklearn.metrics import r2_score
y_pred_tree = best_tree_model.predict(transformed_test_data)

r2 = r2_score(y_test, ridge_y_pred)

    # Print R2 score for the current model

print("R-squared (R2) Score:", r2)

R-squared (R2) Score: 0.7686422925259931


**Lasso Regression:**

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_columns = X_train.select_dtypes(include=['object']).columns

param_grid = {
    'alpha': [0.1,100,1,10]
}

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns)
    ])

X_train_encoded = preprocessor.fit_transform(X_train)

lasso_model = Lasso()
grid_search = GridSearchCV(estimator=lasso_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Perform grid search on training data
grid_search.fit(X_processed, y_train)

print("Best Parameters:", grid_search.best_params_)

# Get the best model
best_lasso_model = grid_search.best_estimator_
lasso_cv_scores = cross_val_score(best_lasso_model, X_train_encoded, y_train, cv=5, scoring='neg_mean_squared_error')
lasso_avg_rmse = np.sqrt(-lasso_cv_scores.mean())
lasso_y_pred = best_gb_model.predict(gb_preprocessor.fit_transform(X_test))
lasso_test_rmse = np.sqrt(mean_squared_error(y_test, gb_y_pred))

print("Lasso Regression RMSE:", lasso_avg_rmse,lasso_test_rmse)

Best Parameters: {'alpha': 0.1}
Lasso Regression RMSE: 4.592068280842504 2.178072709435087


In [None]:
from sklearn.metrics import r2_score
y_pred_tree = best_tree_model.predict(transformed_test_data)

r2 = r2_score(y_test, lasso_y_pred)

    # Print R2 score for the current model

print("R-squared (R2) Score:", r2)

R-squared (R2) Score: 0.7686422925259931


**ElasticNet Regression:**

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


categorical_columns = X_train.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns)
    ])

X_train_encoded = preprocessor.fit_transform(X_train)

elasticnet_model = ElasticNet()
grid_search = GridSearchCV(estimator=elasticnet_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Perform grid search on training data
grid_search.fit(X_train_encoded, y_train)

print("Best Parameters:", grid_search.best_params_)

# Get the best model
best_elasticnet_model = grid_search.best_estimator_

elasticnet_cv_scores = cross_val_score(best_elasticnet_model, X_train_encoded, y_train, cv=5, scoring='neg_mean_squared_error')
elasticnet_avg_rmse = np.sqrt(-elasticnet_cv_scores.mean())
elastic_y_pred = best_elasticnet_model.predict(preprocessor.fit_transform(X_test))
elastic_test_rmse = np.sqrt(mean_squared_error(y_test, elastic_y_pred))

print("ElasticNet Regression RMSE:", elasticnet_avg_rmse,elastic_test_rmse)

Best Parameters: {'alpha': 100}
ElasticNet Regression RMSE: 4.58964750687397 4.550180652162387


In [None]:
from sklearn.metrics import r2_score


r2 = r2_score(y_test, elastic_y_pred)

    # Print R2 score for the current model

print("R-squared (R2) Score:", r2)

R-squared (R2) Score: -0.009709643515769084


**K-Nearest Neighbors Regression (KNN):**

In [None]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import numpy as np

categorical_columns = X_train.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns)
    ])

transformed_data=preprocessor.fit_transform(X_train)

knn_model = KNeighborsRegressor()

knn_model.fit(transformed_data,y_train)

knn_cv_scores = cross_val_score(knn_model, transformed_data, y_train, cv=5, scoring='neg_mean_squared_error')

knn_avg_rmse = np.sqrt(-knn_cv_scores.mean())

knn_y_pred = knn_model.predict(preprocessor.fit_transform(X_test))
knn_test_rmse = np.sqrt(mean_squared_error(y_test, knn_y_pred))



print("KNeighborsRegressor RMSE:", knn_avg_rmse,knn_test_rmse)

KNeighborsRegressor RMSE: 4.948744028571033 4.808747303915168


In [None]:
from sklearn.metrics import r2_score


r2 = r2_score(y_test, knn_y_pred)

    # Print R2 score for the current model

print("R-squared (R2) Score:", r2)

R-squared (R2) Score: -0.12772481480323838


**LightGBM Regression:**

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
import lightgbm as lgb
import numpy as np

param_grid = {
    'n_estimators': [50, 100, 200],            # Number of boosting stages (trees)
    'learning_rate': [0.01, 0.1, 0.2],          # Learning rate (shrinkage)
    'max_depth': [3, 4, 5, -1],                 # Maximum depth of each tree (-1 means no limit)
    'subsample': [0.8, 1.0],                    # Subsample ratio of the training instances
    'colsample_bytree': [0.8, 1.0],             # Subsample ratio of columns when constructing each tree
    'min_child_samples': [20, 50, 100],         # Minimum number of samples required to create a leaf node
    'reg_alpha': [0.0, 0.1, 0.5],               # L1 regularization term on weights
    'reg_lambda': [0.0, 0.1, 0.5]               # L2 regularization term on weights
}


categorical_columns = X_train.select_dtypes(include=['object']).columns

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(), categorical_columns)
    ])

lgb_model = lgb.LGBMRegressor()

transformed_data=preprocessor.fit_transform(X_train)

grid_search = GridSearchCV(estimator=lgb_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Perform grid search on training data
grid_search.fit(transformed_data, y_train)

print("Best Parameters:", grid_search.best_params_)

# Get the best model
best_lgb_model = grid_search.best_estimator_
lgb_cv_scores = cross_val_score(best_lgb_model, transformed_data, y_train, cv=5, scoring='neg_mean_squared_error')

lgb_avg_rmse = np.sqrt(-lgb_cv_scores.mean())

lgb_y_pred = best_lgb_model.predict(preprocessor.fit_transform(X_test))
lgb_test_rmse = np.sqrt(mean_squared_error(y_test, lgb_y_pred))

print("LightGBM Regression RMSE:", lgb_avg_rmse)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000036 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 18
[LightGBM] [Info] Number of data points in the train set: 253, number of used features: 9
[LightGBM] [Info] Start training from score 10.272727
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000032 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14
[LightGBM] [Info] Number of data points in the train set: 253, number of used features: 7
[LightGBM] [Info] Start training from score 10.225296
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000033 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14
[LightGBM] [Info] Number of data points in the train set: 253, nu

In [None]:
from sklearn.metrics import r2_score
y_pred_tree = best_tree_model.predict(transformed_test_data)

r2 = r2_score(y_test, lgb_y_pred)

    # Print R2 score for the current model

print("R-squared (R2) Score:", r2)

R-squared (R2) Score: -0.01833166249270457


In [None]:
print("LightGBM Regression RMSE:", lgb_test_rmse)

LightGBM Regression RMSE: 4.569566595857988


**CatBoost Regression:**

In [None]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [None]:
from sklearn.preprocessing import OneHotEncoder
from catboost import CatBoostRegressor
from sklearn.model_selection import GridSearchCV
categorical_features = X_train.select_dtypes(include=['object']).columns

#Calculate the 'performance' column based on the formula


encoder = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train[categorical_features])
X_test_encoded = encoder.transform(X_test[categorical_features])

X_train_processed = np.concatenate([X_train_encoded.toarray(), X_train.drop(columns=categorical_features)], axis=1)
X_test_processed = np.concatenate([X_test_encoded.toarray(), X_test.drop(columns=categorical_features)], axis=1)

param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'depth': [4, 6, 8],
    'l2_leaf_reg': [1, 3, 5]
}
# Initialize GridSearchCV with CatBoostRegressor
catboost_model = CatBoostRegressor()
grid_search = GridSearchCV(estimator=catboost_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Perform grid search on training data
grid_search.fit(X_train_processed, y_train)

print("Best Parameters:", grid_search.best_params_)

# Get the best model
best_catboost_model = grid_search.best_estimator_



catboost_cv_scores = cross_val_score(best_catboost_model, X_train_processed, y_train, cv=5, scoring='neg_mean_squared_error')
catboost_avg_rmse = np.sqrt(-catboost_cv_scores.mean())





print("CatBoost Regression RMSE:", catboost_avg_rmse)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
1:	learn: 3.9898320	total: 5.3ms	remaining: 2.64s
2:	learn: 3.7627022	total: 6.34ms	remaining: 2.11s
3:	learn: 3.5600625	total: 7.33ms	remaining: 1.82s
4:	learn: 3.3695889	total: 8.4ms	remaining: 1.67s
5:	learn: 3.1827732	total: 9.38ms	remaining: 1.55s
6:	learn: 3.0483004	total: 10.3ms	remaining: 1.47s
7:	learn: 2.8912730	total: 11.3ms	remaining: 1.4s
8:	learn: 2.7607066	total: 12.2ms	remaining: 1.35s
9:	learn: 2.6289785	total: 13.4ms	remaining: 1.33s
10:	learn: 2.5387898	total: 14.5ms	remaining: 1.3s
11:	learn: 2.4325885	total: 15.6ms	remaining: 1.28s
12:	learn: 2.3381820	total: 16.7ms	remaining: 1.27s
13:	learn: 2.2614842	total: 17.7ms	remaining: 1.25s
14:	learn: 2.1923826	total: 18.6ms	remaining: 1.22s
15:	learn: 2.1273986	total: 19.6ms	remaining: 1.21s
16:	learn: 2.0776416	total: 20.5ms	remaining: 1.19s
17:	learn: 2.0259859	total: 21.5ms	remaining: 1.17s
18:	learn: 1.9595846	total: 22.4ms	remaining: 1.16s
19:	learn: 1

In [None]:
catboost_y_pred = best_catboost_model.predict(X_test_processed)
catboost_test_rmse = np.sqrt(mean_squared_error(y_test, catboost_y_pred))

print("CatBoost Regression RMSE:", catboost_avg_rmse,catboost_test_rmse)

CatBoost Regression RMSE: 1.4395702352597541 2.060767703450335


In [None]:
from sklearn.metrics import r2_score


r2 = r2_score(y_test, catboost_y_pred)

    # Print R2 score for the current model

print("R-squared (R2) Score:", r2)

R-squared (R2) Score: 0.7928917952704377


In [None]:
print("Best Parameters:", grid_search.best_params_)

Best Parameters: {'depth': 4, 'l2_leaf_reg': 3, 'learning_rate': 0.1}


**Huber Regression:**


In [None]:
from sklearn.linear_model import HuberRegressor
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import OneHotEncoder
#from catboost import CatBoostRegressor

categorical_features = X_train.select_dtypes(include=['object']).columns

#Calculate the 'performance' column based on the formula


encoder = OneHotEncoder(handle_unknown='ignore')
X_train_encoded = encoder.fit_transform(X_train[categorical_features])
X_test_encoded = encoder.transform(X_test[categorical_features])

X_train_processed = np.concatenate([X_train_encoded.toarray(), X_train.drop(columns=categorical_features)], axis=1)
X_test_processed = np.concatenate([X_test_encoded.toarray(), X_test.drop(columns=categorical_features)], axis=1)

param_grid = {
    'epsilon': [1.0, 1.5, 2.0],  # Huber loss threshold
    'alpha': [0.0001, 0.001, 0.01],  # Regularization parameter
    'max_iter': [ 1000, 2000]  # Maximum number of iterations
}


huber_model = HuberRegressor()
grid_search = GridSearchCV(estimator=huber_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Perform grid search on training data
grid_search.fit(X_train_processed, y_train)

print("Best Parameters:", grid_search.best_params_)

# Get the best model
best_huber_model = grid_search.best_estimator_

huber_cv_scores = cross_val_score(best_huber_model, X_train_processed, y_train, cv=5, scoring='neg_mean_squared_error')
huber_avg_rmse = np.sqrt(-huber_cv_scores.mean())

huber_y_pred = best_huber_model.predict(X_test_processed)
huber_test_rmse = np.sqrt(mean_squared_error(y_test, huber_y_pred))


print("Huber Regression RMSE:", huber_avg_rmse,huber_test_rmse)

Best Parameters: {'alpha': 0.001, 'epsilon': 2.0, 'max_iter': 1000}
Huber Regression RMSE: 1.9270341958812907 2.181633393365549


In [None]:
from sklearn.metrics import r2_score


r2 = r2_score(y_test, huber_y_pred)

    # Print R2 score for the current model

print("R-squared (R2) Score:", r2)

R-squared (R2) Score: 0.7678852332879023


**XGBoost Regression:**

In [None]:
from sklearn.preprocessing import OneHotEncoder

encoder = OneHotEncoder(drop='first', sparse=False)
X_encoded = encoder.fit_transform(X_train.select_dtypes(include=['object']))

X_numeric = X_train.select_dtypes(exclude=['object'])

X_processed = np.concatenate((X_numeric, X_encoded), axis=1)

X_encoded = encoder.fit_transform(X_test.select_dtypes(include=['object']))

X_numeric = X_test.select_dtypes(exclude=['object'])

X_test_processed = np.concatenate((X_numeric, X_encoded), axis=1)

from xgboost import XGBRegressor

param_grid = {
    'n_estimators': [50, 100, 200],            # Number of boosting rounds (trees)
    'learning_rate': [0.01, 0.1, 0.2],          # Learning rate (shrinkage)
    'max_depth': [3, 4, 5, 6],                  # Maximum depth of each tree
    'subsample': [0.8, 1.0],                    # Subsample ratio of the training instances
    'colsample_bytree': [0.8, 1.0],             # Subsample ratio of columns when constructing each tree
    'min_child_weight': [1, 5, 10],              # Minimum sum of instance weight (hessian) needed in a child
    'gamma': [0, 0.1, 0.5],                     # Minimum loss reduction required to make a further partition
    'reg_alpha': [0.0, 0.1, 0.5],               # L1 regularization term on weights
    'reg_lambda': [0.0, 0.1, 0.5]               # L2 regularization term on weights
}


xgb_model = XGBRegressor()

grid_search = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')

# Perform grid search on training data
grid_search.fit(X_processed, y_train)

print("Best Parameters:", grid_search.best_params_)

# Get the best model
best_xgb_model = grid_search.best_estimator_

xgb_cv_scores = cross_val_score(best_xgb_model, X_processed, y_train, cv=5, scoring='neg_mean_squared_error')
xgb_avg_rmse = np.sqrt(-xgb_cv_scores.mean())

xgboost_y_pred = best_xgb_model.predict(X_test_processed)
xgboost_test_rmse = np.sqrt(mean_squared_error(y_test, xgboost_y_pred))

print("XGBoost Regression RMSE:", xgb_avg_rmse,xgboost_test_rmse)



Best Parameters: {'colsample_bytree': 1.0, 'gamma': 0.1, 'learning_rate': 0.1, 'max_depth': 5, 'min_child_weight': 10, 'n_estimators': 200, 'reg_alpha': 0.5, 'reg_lambda': 0.5, 'subsample': 0.8}
XGBoost Regression RMSE: 1.3413800853931719 1.9907655138122629


In [None]:
from sklearn.metrics import r2_score


r2 = r2_score(y_test, xgboost_y_pred)

    # Print R2 score for the current model

print("R-squared (R2) Score:", r2)

R-squared (R2) Score: 0.8067233262536148


**Models Comparisson**

In [None]:
print("1-Linear Regression RMSE:", ln_avg_rmse)
print("2-Decision Tree Regression RMSE:", dt_avg_rmse)
print("3-Random Forest Regression RMSE:", rf_avg_rmse)
print("4-Gradient Boosting Regression RMSE:", gb_test_rmse)
print("5-Support Vector Regression RMSE:", svr_test_rmse)
print("6-Ridge Regression RMSE:", ridge_avg_rmse)
print("7-Lasso Regression RMSE:", lasso_avg_rmse)
print("8-ElasticNet Regression RMSE:", elasticnet_avg_rmse)
print("9-KNN Regression RMSE:", knn_avg_rmse)
print("10-LightGBM Regression RMSE:", lgb_avg_rmse)
print("11-CatBoost Regression RMSE:", catboost_avg_rmse)
print("12-Huber Regression RMSE:", huber_avg_rmse)
print("13-XGBoost Regression RMSE:", xgb_avg_rmse)

models_rmse = {
    "1-Linear Regression":ln_avg_rmse ,
    "2-Decision Tree Regression": dt_avg_rmse,
    "3-Random Forest Regression": rf_avg_rmse,
    "4-Gradient Boosting Regression": gb_avg_rmse,
    "5-Support Vector Regression": svr_avg_rmse,
    "6-Ridge Regression": ridge_avg_rmse,
    "7-Lasso Regression": lasso_avg_rmse,
    "8-ElasticNet Regression": elasticnet_avg_rmse,
    "9-KNN Regression": knn_avg_rmse,
    "10-LightGBM Regression": lgb_avg_rmse,
    "11-CatBoost Regression": catboost_avg_rmse,
    "12-Huber Regression": huber_avg_rmse,
    "13-XGBoost Regression\n": xgb_avg_rmse
}

best_model = min(models_rmse, key=models_rmse.get)
print("\nThe best model is:", best_model)

1-Linear Regression RMSE: 1.3294862696097922
2-Decision Tree Regression RMSE: 1.564946568548563
3-Random Forest Regression RMSE: 1.5982537385342068
4-Gradient Boosting Regression RMSE: 1.4538705620696541
5-Support Vector Regression RMSE: 1.2425640448485662
6-Ridge Regression RMSE: 1.2940119864889343
7-Lasso Regression RMSE: 2.9127495947846214
8-ElasticNet Regression RMSE: 2.9127495947846214
9-KNN Regression RMSE: 3.1307121356769585
10-LightGBM Regression RMSE: 2.8794896963313796
11-CatBoost Regression RMSE: 1.2357456574880332
12-Huber Regression RMSE: 1.3171328966090976
13-XGBoost Regression RMSE: 1.2011664328847405

The best model is: 13-XGBoost Regression

