# Importing Libraries:

In [1]:
import os
import warnings

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import *
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
warnings.filterwarnings("ignore")


In [2]:
# Location of the student-performance CSV. Setting the STUD_CSV_PATH
# environment variable overrides the default, so the notebook can be run on a
# machine that does not share the original directory layout; when the variable
# is unset the original path is used unchanged.
DATA_PATH = os.environ.get(
    'STUD_CSV_PATH',
    r'C:\Users\DELL\Documents\python assignment\Project1.ipynb\notebook\notebook\stud.csv',
)
df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [None]:
# Add a total_marks column: the element-wise sum of the three subject scores
df['total_marks'] = (
    df['math_score']
    .add(df['reading_score'])
    .add(df['writing_score'])
)
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score,total_marks
0,female,group B,bachelor's degree,standard,none,72,72,74,218
1,female,group C,some college,standard,completed,69,90,88,247
2,female,group B,master's degree,standard,none,90,95,93,278
3,male,group A,associate's degree,free/reduced,none,47,57,44,148
4,male,group C,some college,standard,none,76,78,75,229


# Preparing X and Y variables:

In [25]:
# Features: every column except the total and the three subject scores
target_column = 'total_marks'
score_columns = ['math_score', 'writing_score', 'reading_score']
x = df.drop(columns=[target_column, *score_columns])

# Target variable: the combined score
y = df[target_column]

In [11]:
# Preview the first rows of the feature frame
x.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course
0,female,group B,bachelor's degree,standard,none
1,female,group C,some college,standard,completed
2,female,group B,master's degree,standard,none
3,male,group A,associate's degree,free/reduced,none
4,male,group C,some college,standard,none


In [12]:
# Preview the first values of the target series
y.head()

0    218
1    247
2    278
3    148
4    229
Name: total_marks, dtype: int64

In [9]:
# Print the distinct values of each categorical column.
# The labels are kept exactly as written (including trailing spaces) so the
# printed text does not change.
category_labels = {
    'gender': 'Categories in gender',
    'race_ethnicity': 'Categories in race_ethnicity',
    'parental_level_of_education': 'Categories in parental_level_of_education ',
    'lunch': 'Categories in lunch',
    'test_preparation_course': 'Categories in test_preparation_course ',
}
for column_name, label in category_labels.items():
    print(label, df[column_name].unique())

Categories in gender ['female' 'male']
Categories in race_ethnicity ['group B' 'group C' 'group A' 'group D' 'group E']
Categories in parental_level_of_education  ["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']
Categories in lunch ['standard' 'free/reduced']
Categories in test_preparation_course  ['none' 'completed']


In [None]:
# Split the feature columns by dtype so each group can be routed to its own
# transformer in the column transformer.
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object']
numeric_feature = x.select_dtypes(include=numeric_dtypes).columns
categorical_feature = x.select_dtypes(include=categorical_dtypes).columns

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# The transformer instances get their own lowercase names so that the imported
# OneHotEncoder and StandardScaler classes are not overwritten by instances
# and remain usable as classes in later cells.
onehot_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
standard_scaler = StandardScaler(with_mean=True, with_std=True)

# Route the categorical columns to the one-hot encoder and the numeric columns
# to the scaler. The step names are kept exactly as before.
preprocessor = ColumnTransformer([
    ('onehotencoder', onehot_encoder, categorical_feature),
    ('StandardScaler', standard_scaler, numeric_feature)
])

In [26]:
# Build the input from the raw frame instead of from `x` itself: the original
# form overwrote `x` with the encoded array, so running the cell a second time
# fed that array back into the preprocessor. Rebuilding from `df` keeps the
# result the same and makes the cell safe to re-run.
# NOTE(review): the preprocessor is fitted on all rows before the train/test
# split in a later cell; consider fitting on the training rows only.
feature_frame = df.drop(columns=['total_marks', 'math_score', 'writing_score', 'reading_score'])
x = preprocessor.fit_transform(feature_frame)  # fitting the preprocessor on the features

In [27]:
# Display the transformed feature matrix
x

array([[1., 0., 0., ..., 1., 0., 1.],
       [1., 0., 0., ..., 1., 1., 0.],
       [1., 0., 0., ..., 1., 0., 1.],
       ...,
       [1., 0., 0., ..., 0., 1., 0.],
       [1., 0., 0., ..., 1., 1., 0.],
       [1., 0., 0., ..., 0., 0., 1.]], shape=(1000, 17))

In [28]:
from sklearn.model_selection import train_test_split
# Split features and target into training and testing sets:
# test_size=0.2 holds out 20% of the rows, random_state fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shape of each piece of the data after splitting
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((800, 17), (200, 17), (800,), (200,))

In [31]:


from sklearn.metrics import r2_score, mean_squared_error

# Candidate regressors, keyed by the name shown in the results table
models = dict([
    ("LinearRegression", LinearRegression()),
    ("RandomForestRegressor", RandomForestRegressor(random_state=42)),
    ("SVR", SVR()),
    ("GradientBoostingRegressor", GradientBoostingRegressor(random_state=42)),
    ("CatBoostRegressor", CatBoostRegressor(verbose=0, random_state=42)),
    ("XGBRegressor", XGBRegressor(verbosity=0, random_state=42)),
    ("DecisionTreeRegressor", DecisionTreeRegressor(random_state=42)),
])

# Fit each candidate on the training split, predict on the test split, and
# record its R2 score and RMSE.
results = []
for name, model in models.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    r2 = r2_score(y_test, y_pred)
    # RMSE is the square root of the mean squared error
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    row = {"Model": name, "R2 Score": r2, "RMSE": rmse}
    results.append(row)



In [32]:
# Collect the per-model metrics into a table and print it, best R2 score first
results_df = pd.DataFrame(results)
ranked_results = results_df.sort_values(by="R2 Score", ascending=False)
print(ranked_results)

                       Model  R2 Score       RMSE
0           LinearRegression  0.162172  40.204740
2                        SVR  0.090561  41.887696
3  GradientBoostingRegressor  0.085765  41.998003
1      RandomForestRegressor -0.025258  44.475033
4          CatBoostRegressor -0.046924  44.942505
5               XGBRegressor -0.070473  45.445164
6      DecisionTreeRegressor -0.075330  45.548139
