In [2]:
import kagglehub
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Lasso, Ridge
import matplotlib.pyplot as plt
import os


In [3]:

# Fetch the latest copy of the Kaggle dataset; the returned location is used
# below as the directory that contains the CSV file.
path = kagglehub.dataset_download("spscientist/students-performance-in-exams")

# Read the exam results into a DataFrame and preview the first rows.
csv_path = os.path.join(path, "StudentsPerformance.csv")
data = pd.read_csv(csv_path)
data.head()

Unnamed: 0,gender,race/ethnicity,parental level of education,lunch,test preparation course,math score,reading score,writing score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
# Feature screening: keep only the numeric columns and rank them by how
# strongly they correlate with the math score.
numeric_data = data.select_dtypes(include=[np.number])
correlation_matrix = numeric_data.corr()

math_correlations = correlation_matrix["math score"]
print(math_correlations.sort_values(ascending=False))
# Reading and writing scores are both strongly correlated with the math score
# (about 0.8 each), so these two columns are used as the predictors.

math score       1.000000
reading score    0.817580
writing score    0.802642
Name: math score, dtype: float64


In [6]:
# Predictors: the two scores most correlated with the target.
feature_columns = ["reading score", "writing score"]
X = data[feature_columns]

# Target: the math score.
y = data["math score"]


In [7]:
# Data preprocessing
# Hold out 20% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so no test information leaks into it.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [10]:
# Model evaluation with Linear Regression (baseline on the standardised features)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Test-split mean squared error and coefficient of determination (R^2).
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Linear Regression - MSE: {mse}, R2: {r2}")

# For a regressor, score() returns R^2 rather than a classification accuracy,
# so it is printed under its real name. The variable keeps its old name in
# case another cell still reads it.
accuracy = model.score(X_test, y_test)
print(f"Linear Regression - R2 (model.score): {accuracy}")

# An R^2 of about 0.68 means the model explains roughly 68% of the variance in
# math scores on the test split. Not bad; let's try Lasso Regression next.

Linear Regression - MSE: 77.24297821278955, R2: 0.6825697127424626
Linear Regression - Accuracy: 0.6825697127424626


In [17]:
# Lasso regression with penalty strength alpha=0.1 on the same train/test split.
model = Lasso(alpha=0.1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Test-split mean squared error and coefficient of determination (R^2).
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Lasso Regression - MSE: {mse}, R2: {r2}")

# For a regressor, score() returns R^2 rather than a classification accuracy,
# so it is printed under its real name. The variable keeps its old name in
# case another cell still reads it.
accuracy = model.score(X_test, y_test)
print(f"Lasso Regression - R2 (model.score): {accuracy}")


Lasso Regression - MSE: 77.28976844057277, R2: 0.6823774281388685
Lasso Regression - Accuracy: 0.6823774281388685


In [None]:
# Ridge regression with penalty strength alpha=1.0 on the same train/test split.
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Test-split mean squared error and coefficient of determination (R^2).
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Ridge Regression - MSE: {mse}, R2: {r2}")

# For a regressor, score() returns R^2 rather than a classification accuracy,
# so it is printed under its real name. The variable keeps its old name in
# case another cell still reads it.
accuracy = model.score(X_test, y_test)
print(f"Ridge Regression - R2 (model.score): {accuracy}")
# Still, every model lands at an R^2 of about 0.68, so the choice of penalty is
# not what limits the fit. Why is this? Maybe the two features are not
# informative enough: we could try another dataset, or we may need more
# features to predict the math score.

Ridge Regression - MSE: 77.2654829746764, R2: 0.6824772293453216
Ridge Regression - Accuracy: 0.6824772293453216


In [19]:
# Let's add in polynomial features
from sklearn.preprocessing import PolynomialFeatures

# Degree-2 expansion of the two score columns (squares and the cross term).
# include_bias=False leaves out the constant column of ones: the estimators
# fit their own intercept, and StandardScaler would only turn that constant
# column into all zeros.
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Same 80/20 split and seed as before, so the test rows match the earlier runs.
# NOTE: this rebinds X_train / X_test / scaler to the polynomial versions, so
# the earlier model cells must not be re-run after this one without first
# re-running the original preprocessing cell.
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size = 0.2, random_state = 42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [20]:
# Ordinary least squares on the standardised polynomial features.
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Test-split mean squared error and coefficient of determination (R^2).
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Polynomial Linear Regression - MSE: {mse}, R2: {r2}")

# For a regressor, score() returns R^2 rather than a classification accuracy,
# so it is printed under its real name. The variable keeps its old name in
# case another cell still reads it.
accuracy = model.score(X_test, y_test)
print(f"Polynomial Linear Regression - R2 (model.score): {accuracy}")


Polynomial Linear Regression - MSE: 75.90053253356909, R2: 0.6880864979240058
Polynomial Linear Regression - Accuracy: 0.6880864979240058


In [21]:
# Ridge regression (alpha=1.0) on the standardised polynomial features.
model = Ridge(alpha=1.0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Test-split mean squared error and coefficient of determination (R^2).
# The label carries a "Polynomial" prefix so this run is not confused with
# the earlier Ridge run on the plain features.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Polynomial Ridge Regression - MSE: {mse}, R2: {r2}")

# For a regressor, score() returns R^2 rather than a classification accuracy,
# so it is printed under its real name. The variable keeps its old name in
# case another cell still reads it.
accuracy = model.score(X_test, y_test)
print(f"Polynomial Ridge Regression - R2 (model.score): {accuracy}")


Ridge Regression - MSE: 76.48855017942581, R2: 0.6856700373659652
Ridge Regression - Accuracy: 0.6856700373659652


In [22]:
# Lasso regression (alpha=0.1) on the standardised polynomial features.
model = Lasso(alpha=0.1)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Test-split mean squared error and coefficient of determination (R^2).
# The label carries a "Polynomial" prefix so this run is not confused with
# the earlier Lasso run on the plain features.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Polynomial Lasso Regression - MSE: {mse}, R2: {r2}")

# For a regressor, score() returns R^2 rather than a classification accuracy,
# so it is printed under its real name. The variable keeps its old name in
# case another cell still reads it.
accuracy = model.score(X_test, y_test)
print(f"Polynomial Lasso Regression - R2 (model.score): {accuracy}")

Lasso Regression - MSE: 77.30140430341237, R2: 0.6823296105201138
Lasso Regression - Accuracy: 0.6823296105201138
