### I am using math_score as the output (Y)
### So I will be using the reading and writing scores together as the verbal score


### Since I haven't found any relation between parental education and performance, I will drop the parental education feature

In [120]:
import pandas as pd
import numpy as np
#models
from sklearn.preprocessing import StandardScaler,OneHotEncoder,OrdinalEncoder,FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error,r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline  
from catboost import CatBoostRegressor
from xgboost import XGBRegressor

In [121]:
# Load the student dataset from the CSV file (path is relative to the notebook).
df = pd.read_csv('data/stud.csv')

In [122]:
# Preview the first rows of the loaded frame to check the columns and values.
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [123]:
# Target: the math score. Features: every remaining column of df.
y = df['math_score']
X = df.drop(columns=['math_score'])

In [124]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [125]:
# Partition the feature columns by the encoding each group receives downstream.
#
# Numeric columns are selected by actual numeric dtype rather than by
# "dtype is not object": the latter would also classify bool, category,
# datetime or dedicated string-dtype columns as numeric and send them to the
# scaler. Column order follows X.columns.
numerical_col = X.select_dtypes(include='number').columns.tolist()

# NOTE(review): the notebook's introduction says the parental education feature
# will be dropped, yet it is kept here as an ordinal feature -- confirm which
# of the two is intended.
ordinal_col = ['parental_level_of_education']

# Every remaining column is treated as an unordered (nominal) category.
nominal_col = [
    feature
    for feature in X.columns
    if feature not in numerical_col and feature not in ordinal_col
]

print('Numerical columns:', numerical_col)
print('Ordinal columns:', ordinal_col)
print('Nominal columns:', nominal_col)

Numerical columns: ['reading_score', 'writing_score']
Ordinal columns: ['parental_level_of_education']
Nominal columns: ['gender', 'race_ethnicity', 'lunch', 'test_preparation_course']


In [126]:
# Scaler applied to the numeric feature columns.
Scaling_transformer = StandardScaler()

# One-hot encoder for the nominal columns: dense output, first level of each
# feature dropped, categories unseen during fit are ignored at transform time.
OneHot_transformer = OneHotEncoder(
    drop='first',
    handle_unknown='ignore',
    sparse_output=False,
)

# Explicit category order for the single ordinal column; the encoder assigns
# integer codes following the position of each level in this list.
education_order = [
    [
        'some high school',
        'high school',
        'some college',
        "associate's degree",
        "bachelor's degree",
        "master's degree",
    ]
]
ord_transformer = OrdinalEncoder(categories=education_order)

In [127]:
# Route each column group to its transformer. The order of the entries fixes
# the order of the output columns; output names are left unprefixed.
Preprocessor = ColumnTransformer(
    verbose_feature_names_out=False,
    transformers=[
        ('num', Scaling_transformer, numerical_col),
        ('ord', ord_transformer, ordinal_col),
        ('nom', OneHot_transformer, nominal_col),
    ],
)


In [129]:
# Fit the preprocessing on the training split only, then reuse the fitted
# preprocessor on the test split.
train_transformed = Preprocessor.fit_transform(X_train)
test_transformed = Preprocessor.transform(X_test)

In [130]:
# Column names of the preprocessor output, in output order.
transformed_feature_names = Preprocessor.get_feature_names_out()

In [131]:
# Wrap the transformed arrays in DataFrames with readable column names.
# The original row labels are carried over so that each frame stays aligned by
# index with y_train / y_test (which keep the labels from the shuffled split);
# without this the frames would get a fresh 0..n-1 index and any later
# label-based join or concat with the targets would mismatch rows.
train_transformed_df = pd.DataFrame(
    train_transformed,
    columns=transformed_feature_names,
    index=X_train.index,
)

test_transformed_df = pd.DataFrame(
    test_transformed,
    columns=transformed_feature_names,
    index=X_test.index,
)

In [132]:
# Candidate regressors keyed by display name. Every seedable estimator uses
# the same seed, and the boosting libraries are configured to stay quiet.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'XGBoost': XGBRegressor(verbosity=0, random_state=42),
    'CatBoost': CatBoostRegressor(random_state=42, verbose=0),
}

In [133]:
# Fit every candidate model on the preprocessed training data and record its
# R2 score on the held-out test split, keyed by model name.
results = {}

for name, model in models.items():
    print(f"--- Training {name} ---")

    # 1. Train the model using the transformed data
    model.fit(train_transformed_df, y_train)

    # 2. Make predictions on the test set
    y_pred = model.predict(test_transformed_df)

    # 3. Evaluate the model with the R2 score (r2_score is already imported
    #    in the notebook's import cell, so it is not re-imported per iteration)
    test_score = r2_score(y_test, y_pred)

    # 4. Store the results
    results[name] = test_score

# Print all final results
print("\n--- Final Model R2 Scores ---")
print(results)

--- Training Linear Regression ---
--- Training Decision Tree ---
--- Training Random Forest ---
--- Training XGBoost ---
--- Training CatBoost ---

--- Final Model R2 Scores ---
{'Linear Regression': 0.8815597679452446, 'Decision Tree': 0.7441834087838171, 'Random Forest': 0.8498842243383807, 'XGBoost': 0.8209482431411743, 'CatBoost': 0.8468233518808057}
