### I am considering math_score as the output (y).
### So I will be using the reading and writing scores as the verbal-score features.


### Since I haven't found any relationship between parental education and performance, I will drop the parental education feature.

In [199]:
import pandas as pd
import numpy as np
#models
from sklearn.base import clone
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder,FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline  
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [200]:
# Load the student scores table from the CSV under data/.
df = pd.read_csv('data/stud.csv')

In [201]:
# Preview the first five rows to check which columns were loaded.
df.head(n=5)

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [202]:
# Target: the math score. Features: every other column, minus the ones that
# must not reach the model.
# - 'parental_level_of_education' is dropped on purpose (no relationship with
#   performance was found during exploration).
# - Any 'Unnamed: N' column is the row index that was saved into the CSV. It
#   says nothing about a student, and if left in it would be picked up as a
#   numeric feature and log-scaled with the real scores.
index_artifact_cols = [col for col in df.columns if str(col).startswith('Unnamed')]
X = df.drop(columns=['math_score', 'parental_level_of_education', *index_artifact_cols])
y = df['math_score']

In [203]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [204]:
# Split the feature columns by how they will be preprocessed.
# Numeric columns are detected by dtype family rather than by `dtype != 'O'`,
# so string columns stored with a non-object dtype are not mistaken for numbers.
numerical_col = X.select_dtypes(include='number').columns.tolist()
# 'parental_level_of_education' was the only ordinal feature and it is dropped
# from X, so this list is empty. It still has to be defined: nominal_col below
# references it, and leaving it commented out raises NameError on a fresh kernel.
ordinal_col = []
# Everything that is neither numeric nor ordinal is one-hot encoded.
nominal_col = [feature for feature in X.columns if feature not in numerical_col + ordinal_col]

In [205]:
# Individual preprocessing building blocks, combined in later cells.
Scaling_transformer = StandardScaler()
# Dense output; categories unseen during fit are encoded as all zeros.
OneHot_transformer = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
# log(1 + x) applied element-wise, without input validation.
Log_transformer = FunctionTransformer(np.log1p, validate=False)

# Category order handed to the OrdinalEncoder (presumably lowest to highest
# education level). One inner list per encoded column.
education_order = [[
    'some high school',
    'high school',
    'some college',
    "associate's degree",
    "bachelor's degree",
    "master's degree",
]]
ord_transformer = OrdinalEncoder(categories=education_order)

In [206]:
# Numeric branch: apply the log transform first, then standardise the result.
log_and_scale = Pipeline(steps=[('log', Log_transformer), ('scale', Scaling_transformer)])

In [207]:
# Route each column group to its transformer.
# The ordinal branch ('ord', ord_transformer, ordinal_col) is intentionally
# left out because the parental education feature is not used.
numeric_branch = ('num', log_and_scale, numerical_col)
nominal_branch = ('nom', OneHot_transformer, nominal_col)

Preprocessor = ColumnTransformer(
    transformers=[numeric_branch, nominal_branch],
    remainder='drop',  # the default, spelled out: unlisted columns are discarded
)


In [208]:
# Candidate regressors. random_state is fixed wherever the estimator accepts
# one so that repeated runs give the same scores.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(random_state=42, verbosity=0),
    'CatBoost': CatBoostRegressor(verbose=0, random_state=42)
}

# One pipeline per model. Each pipeline gets its own unfitted copy of the
# preprocessor: sharing a single instance would mean every fit() call refits
# (and overwrites) the transformer state held by all the other pipelines.
pipelines = {
    name: Pipeline([
        ('preprocessor', clone(Preprocessor)),
        ('model', model),
    ])
    for name, model in models.items()
}


In [209]:
# Fit every pipeline on the training split and score it on the held-out split.
results = []

for name, pipe in pipelines.items():
    # Fitting the pipeline fits its preprocessor and its regressor together.
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)

    # Held-out error metrics for this model.
    mse, mae, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )

    results.append({'Model': name, 'MSE': mse, 'MAE': mae, 'R2 Score': r2})

# Rank the models with the highest R2 first.
results_df = pd.DataFrame(results).sort_values('R2 Score', ascending=False)
print(results_df)


               Model        MSE       MAE  R2 Score
4           CatBoost  35.282440  4.459682  0.855007
2      Random Forest  37.371257  4.673249  0.846423
3            XGBoost  44.871921  5.172839  0.815599
0  Linear Regression  45.470732  4.878106  0.813138
1      Decision Tree  61.977500  6.305000  0.745303
