In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

from sklearn.tree import plot_tree
import matplotlib.pyplot as plt


In [12]:
data = pd.read_csv('Student_Performance.csv')
data.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0


In [13]:
data['Extracurricular Activities'].fillna(data['Extracurricular Activities'].mode()[0], inplace=True)
data['Extracurricular Activities'] = data['Extracurricular Activities'].map({'Yes': 1, 'No': 0})


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['Extracurricular Activities'].fillna(data['Extracurricular Activities'].mode()[0], inplace=True)


In [14]:
data.head()

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,1,9,1,91.0
1,4,82,0,4,2,65.0
2,8,51,1,7,2,45.0
3,5,52,1,5,2,36.0
4,7,75,0,8,5,66.0


In [15]:
X = data[['Hours Studied', 'Previous Scores', 'Extracurricular Activities', 
          'Sleep Hours', 'Sample Question Papers Practiced']]
y = data['Performance Index']


In [16]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


Model Initialization before Hyperparamter Tuning using GridSearchCV

In [17]:
rf_model = RandomForestRegressor(random_state=42, n_estimators=100, max_depth=5)
rf_model.fit(X_train, y_train)


In [18]:
y_pred_rf = rf_model.predict(X_test)


In [19]:
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print(f"Random Forest Mean Squared Error: {mse_rf}")
print(f"Random Forest R^2 Score: {r2_rf}")


Random Forest Mean Squared Error: 8.740231318377763
Random Forest R^2 Score: 0.9764150503055342


Understanding Feature Importance

In [20]:
importance = rf_model.feature_importances_
feature_importance = pd.DataFrame({'Feature': X.columns, 'Importance': importance})
print(feature_importance.sort_values(by='Importance', ascending=False))


                            Feature  Importance
1                   Previous Scores    0.861559
0                     Hours Studied    0.138441
2        Extracurricular Activities    0.000000
3                       Sleep Hours    0.000000
4  Sample Question Papers Practiced    0.000000


Hyperparamter Tuning Using GridSearchCV

In [21]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
grid_search_rf = GridSearchCV(RandomForestRegressor(random_state=42), param_grid=param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search_rf.fit(X_train, y_train)
best_rf_model = grid_search_rf.best_estimator_
print(grid_search_rf.best_params_)


{'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 200}


Initializing new model with new paramters

In [30]:
rf_model_new= RandomForestRegressor(random_state=42, n_estimators=200, max_depth=10,min_samples_leaf=4,min_samples_split=10)
rf_model_new.fit(X_train, y_train)
y_pred_rf_new = rf_model_new.predict(X_test)


In [31]:
mse_rf = mean_squared_error(y_test, y_pred_rf_new)
r2_rf = r2_score(y_test, y_pred_rf_new)
print(f"Random Forest Mean Squared Error: {mse_rf}")
print(f"Random Forest R^2 Score: {r2_rf}")


Random Forest Mean Squared Error: 4.6664782462856165
Random Forest R^2 Score: 0.987407809853093
