In [17]:
#importing required modules for eda
import numpy as np
import pandas as pd
#importing required modules for modeling
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from catboost import CatBoostRegressor

In [18]:
df = pd.read_csv('../data/student_performance_factors.csv') #csv file path to read
df.shape #show number of rows and columns

(6607, 20)

In [19]:
numeric_features = [feature for feature in df.columns if df[feature].dtype != 'O']
categorical_features = [feature for feature in df.columns if df[feature].dtype == 'O']

print('Numerical Features : {} : {}'.format(len(numeric_features), numeric_features))
print('Categorical Features : {} : {}'.format(len(categorical_features), categorical_features))

Numerical Features : 7 : ['Hours_Studied', 'Attendance', 'Sleep_Hours', 'Previous_Scores', 'Tutoring_Sessions', 'Physical_Activity', 'Exam_Score']
Categorical Features : 13 : ['Parental_Involvement', 'Access_to_Resources', 'Extracurricular_Activities', 'Motivation_Level', 'Internet_Access', 'Family_Income', 'Teacher_Quality', 'School_Type', 'Peer_Influence', 'Learning_Disabilities', 'Parental_Education_Level', 'Distance_from_Home', 'Gender']


In [20]:
#get unique values in categorical columns
for column in categorical_features:
    unique_values = df[column].unique()
    print(f"Unique values in column '{column}': {unique_values}")

Unique values in column 'Parental_Involvement': ['Medium' 'Low' 'High']
Unique values in column 'Access_to_Resources': ['Low' 'High' 'Medium']
Unique values in column 'Extracurricular_Activities': ['No' 'Yes']
Unique values in column 'Motivation_Level': ['Medium' 'High' 'Low']
Unique values in column 'Internet_Access': ['Yes' 'No']
Unique values in column 'Family_Income': ['Low' 'Medium' 'High']
Unique values in column 'Teacher_Quality': ['Medium' 'High' 'Low' nan]
Unique values in column 'School_Type': ['Public' 'Private']
Unique values in column 'Peer_Influence': ['Negative' 'Neutral' 'Positive']
Unique values in column 'Learning_Disabilities': ['No' 'Yes']
Unique values in column 'Parental_Education_Level': ['High School' 'College' 'Postgraduate' nan]
Unique values in column 'Distance_from_Home': ['Near' 'Far' 'Moderate' nan]
Unique values in column 'Gender': ['Male' 'Female']


In [21]:
df.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,3,62,Medium,Low,No,6,67,Medium,Yes,1,Low,Medium,Public,Negative,3,No,High School,Near,Male,55
1,5,65,Low,High,No,7,71,Medium,Yes,0,Low,Medium,Private,Negative,2,No,College,Far,Male,56
2,7,66,High,Low,Yes,8,68,High,Yes,0,Low,Medium,Public,Negative,2,Yes,College,Moderate,Male,57
3,14,67,Low,Low,Yes,7,66,Low,Yes,0,Low,Medium,Public,Neutral,4,No,High School,Far,Female,57
4,9,64,Medium,Low,Yes,10,68,Low,Yes,0,Medium,Medium,Private,Negative,3,No,High School,Near,Male,57


In [22]:
x = df.drop(columns=['Exam_Score'],axis=1) #exam score will be predicted based on rest of the columns.
x.head()

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender
0,3,62,Medium,Low,No,6,67,Medium,Yes,1,Low,Medium,Public,Negative,3,No,High School,Near,Male
1,5,65,Low,High,No,7,71,Medium,Yes,0,Low,Medium,Private,Negative,2,No,College,Far,Male
2,7,66,High,Low,Yes,8,68,High,Yes,0,Low,Medium,Public,Negative,2,Yes,College,Moderate,Male
3,14,67,Low,Low,Yes,7,66,Low,Yes,0,Low,Medium,Public,Neutral,4,No,High School,Far,Female
4,9,64,Medium,Low,Yes,10,68,Low,Yes,0,Medium,Medium,Private,Negative,3,No,High School,Near,Male


In [23]:
y=df['Exam_Score'] #setting value to be predicted to y axis. i.e. exam score.
y.head()

0    55
1    56
2    57
3    57
4    57
Name: Exam_Score, dtype: int64

In [24]:
numeric_features = x.select_dtypes(exclude="object").columns
categorical_features = x.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, categorical_features),
         ("StandardScaler", numeric_transformer, numeric_features),        
    ]
)

In [25]:
X = preprocessor.fit_transform(x)

In [26]:
X.shape

(6607, 43)

In [27]:
#spliting training data and test data https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=None) #using 20% to test and 80% for training.
X_train.shape, X_test.shape

((5285, 43), (1322, 43))


### R-Squared (R²)
R-Squared (R²) is a statistical measure used to determine the proportion of variance in a dependent variable that can be predicted or explained by an independent variable.
In other words, R-Squared shows how well a regression model (independent variable) predicts the outcome of observed data (dependent variable).
R-Squared is also commonly known as the coefficient of determination. It is a goodness of fit model for linear regression analysis.

### MAE (Mean Absolute Error)
The MAE value itself indicates the average absolute error between predicted and actual values. The smaller the MAE, the better the model’s predictions align with the actual data.

### RMSE (Root Mean Square Error)
The root mean square error is defined as the measure of the differences between values that are predicted by a model and values that are actually observed.
Generally, the smaller this value generated by a model, the more accurate that model is in predicting the measured values.

In [28]:
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [29]:
models = {
    "Linear Regression": LinearRegression(),
    "CatBoosting Regressor": CatBoostRegressor(verbose=False),
    "Random Forest Regressor": RandomForestRegressor(),
}
model_list = []
r2_list =[]

for i in range(len(list(models))):
    model = list(models.values())[i]
    model.fit(X_train, y_train) # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    
    # Evaluate Train and Test dataset
    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print('#'*40)
    print(list(models.keys())[i] + " Performance")
    model_list.append(list(models.keys())[i])
    
    print(' '*10 + 'Training Data' + ' '*10)
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('-'*40)

    print(' '*10 + 'Test Data' + ' '*10)
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

print('#'*40)

########################################
Linear Regression Performance
          Training Data          
- Root Mean Squared Error: 2.0709
- Mean Absolute Error: 0.4818
- R2 Score: 0.7209
----------------------------------------
          Test Data          
- Root Mean Squared Error: 2.1024
- Mean Absolute Error: 0.4902
- R2 Score: 0.7261
########################################
CatBoosting Regressor Performance
          Training Data          
- Root Mean Squared Error: 1.0922
- Mean Absolute Error: 0.3967
- R2 Score: 0.9224
----------------------------------------
          Test Data          
- Root Mean Squared Error: 2.2575
- Mean Absolute Error: 0.6779
- R2 Score: 0.6842
########################################
Random Forest Regressor Performance
          Training Data          
- Root Mean Squared Error: 0.8972
- Mean Absolute Error: 0.4359
- R2 Score: 0.9476
----------------------------------------
          Test Data          
- Root Mean Squared Error: 2.3901
- Mean Absolu

In [30]:
#linear regression is the best performing model based on R2 Score
pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2']).sort_values(by=["R2"],ascending=False)


Unnamed: 0,Model Name,R2
0,Linear Regression,0.726077
1,CatBoosting Regressor,0.684168
2,Random Forest Regressor,0.645983


In [31]:
lin_model = LinearRegression(fit_intercept=True)
lin_model = lin_model.fit(X_train, y_train)
y_pred = lin_model.predict(X_test)
score = r2_score(y_test, y_pred)*100
print(" Accuracy of the model is %.2f" %score)

 Accuracy of the model is 72.61


In [32]:
pred_df=pd.DataFrame({'Actual Value':y_test,'Predicted Value':y_pred,'Difference':y_test-y_pred})
pred_df

Unnamed: 0,Actual Value,Predicted Value,Difference
3133,67,67.187500,-0.187500
3895,68,67.890625,0.109375
6573,86,65.968750,20.031250
1500,65,65.398438,-0.398438
6581,88,62.453125,25.546875
...,...,...,...
3619,68,68.375000,-0.375000
1104,64,64.726562,-0.726562
5145,70,69.992188,0.007812
6159,72,71.843750,0.156250
