## Dataset

#!wget https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv

## Preparing the dataset

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
!head jamb_exam_results.csv

JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,None,1
182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1
202,25,85,2,13.6,Public,Urban,Yes,No,Medium,Low,6,15,Male,Low,Tertiary,1
251,35,85,4,2.6,Public,Urban,No,Yes,Low,Medium,7,16,Female,Medium,Primary,4
129,27,75,3,9.4,Public,Urban,No,Yes,Low,Medium,8,19,Female,Low,Tertiary,3
220,23,85,3,4.6,Public,Rural,No,No,Low,Medium,9,19,Female,Medium,Tertiary,1


In [3]:
df = pd.read_csv('jamb_exam_results.csv')
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [4]:
df.head()

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,student_id,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [5]:
numeric = []
objects = []
dict = {}
for col, val in df.dtypes.items():
    if val == "object":
        objects.append(col)
        dict[col] = df[col].unique()
    else:
        numeric.append(col)

In [6]:
numeric

['jamb_score',
 'study_hours_per_week',
 'attendance_rate',
 'teacher_quality',
 'distance_to_school',
 'student_id',
 'age',
 'assignments_completed']

In [7]:
dict

{'school_type': array(['Public', 'Private'], dtype=object),
 'school_location': array(['Urban', 'Rural'], dtype=object),
 'extra_tutorials': array(['Yes', 'No'], dtype=object),
 'access_to_learning_materials': array(['Yes', 'No'], dtype=object),
 'parent_involvement': array(['High', 'Medium', 'Low'], dtype=object),
 'it_knowledge': array(['Medium', 'High', 'Low'], dtype=object),
 'gender': array(['Male', 'Female'], dtype=object),
 'socioeconomic_status': array(['Low', 'High', 'Medium'], dtype=object),
 'parent_education_level': array(['Tertiary', nan, 'Primary', 'Secondary'], dtype=object)}

Preparation:

- Remove the student_id column.
- Fill missing values with zeros.
- Do train/validation/test split with 60%/20%/20% distribution.
- Use the train_test_split function and set the random_state parameter to 1.
- Use DictVectorizer(sparse=True) to turn the dataframes into matrices.


In [8]:
del df['student_id']

In [9]:
df.fillna(0)
df['parent_education_level'].fillna("NA",axis= 0)
df.isnull().sum(axis = 0)
df['parent_education_level'].head()

0    Tertiary
1         NaN
2    Tertiary
3    Tertiary
4    Tertiary
Name: parent_education_level, dtype: object

In [10]:
from sklearn.model_selection import train_test_split
random_state = 1
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=random_state)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=random_state)

In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text

In [12]:
dv = DictVectorizer(sparse=True)

## Question 1

Let's train a decision tree regressor to predict the jamb_score variable.

- Train a model with max_depth=1.

Which feature is used for splitting the data?

- study_hours_per_week
- attendance_rate
- teacher_quality
- distance_to_school

In [13]:
jamb_score = 'jamb_score'
y_train = df_train[jamb_score]
y_val = df_val[jamb_score]
y_test = df_test[jamb_score]
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
del df_train[jamb_score]
del df_val[jamb_score]
del df_test[jamb_score]

In [14]:
features = ["study_hours_per_week","attendance_rate","teacher_quality","distance_to_school"]
df_train.head()

Unnamed: 0,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,school_type,school_location,extra_tutorials,access_to_learning_materials,parent_involvement,it_knowledge,age,gender,socioeconomic_status,parent_education_level,assignments_completed
0,20,72,3,4.4,Public,Urban,No,Yes,Medium,Low,21,Female,Low,,3
1,11,80,2,3.3,Public,Urban,Yes,Yes,Medium,High,22,Female,Medium,Secondary,1
2,31,82,1,8.3,Public,Urban,Yes,Yes,Low,High,19,Female,High,Tertiary,2
3,29,79,1,15.8,Public,Rural,Yes,Yes,Low,Low,19,Male,Low,Primary,2
4,28,96,2,8.9,Private,Rural,Yes,Yes,Medium,Low,19,Male,High,Secondary,3


In [15]:
df.describe().round()

Unnamed: 0,jamb_score,study_hours_per_week,attendance_rate,teacher_quality,distance_to_school,age,assignments_completed
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,174.0,20.0,84.0,3.0,10.0,18.0,2.0
std,48.0,10.0,9.0,1.0,5.0,2.0,1.0
min,100.0,0.0,50.0,1.0,0.0,15.0,1.0
25%,135.0,13.0,78.0,2.0,7.0,16.0,1.0
50%,170.0,19.0,84.0,2.0,10.0,18.0,1.0
75%,209.0,26.0,91.0,3.0,13.0,20.0,2.0
max,367.0,40.0,100.0,5.0,20.0,22.0,5.0


In [16]:
dt = DecisionTreeClassifier(max_depth=1)

In [17]:
X_train = dv.fit_transform(df_train.fillna(0).to_dict(orient='records'))
dt.fit(X_train, y_train)

In [18]:
print(export_text(dt, feature_names=list(dv.get_feature_names_out())))

|--- study_hours_per_week <= 18.50
|   |--- class: 118
|--- study_hours_per_week >  18.50
|   |--- class: 190



## Question 2

train a random forest model with these parameters:

- n_estimators=10
- random_state=1
- n_jobs=-1 (optional - to make training faster)

What's the RMSE of this model on validation?

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
rf = RandomForestClassifier(n_estimators=10, random_state=1,n_jobs=-1)
rf.fit(X_train, y_train)
X_val = dv.fit_transform(df_val.fillna(0).to_dict(orient='records'))
y_pred = rf.predict_proba(X_val)[:, 1]

In [21]:
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
rmse

182.43461034025313

## Question3
Now let's experiment with the n_estimators parameter

- Try different values of this parameter from 10 to 200 with step 10.
- Set random_state to 1.
- Evaluate the model on the validation dataset.

After which value of n_estimators does RMSE stop improving? Consider 3 decimal places for calculating the answer.

In [22]:
errors = []
estimators = np.arange (10, 201, 10)
for n in estimators:
    rf = RandomForestClassifier(n_estimators=n, random_state=1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict_proba(X_val)[:, 1]
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    errors.append((n , rmse))

In [23]:
t1 = errors[0]
for i in range(1,len(errors)):
    current = errors[i]
    diff = current[1]- t1[1]
    #print(current[0],current[1]- t1[1])
    t1 = current
    if diff > 0:
        stopping_n = i + 1
        break
print(f"\nThe RMSE stops improving significantly after n_estimators = {stopping_n * 10 + 10}")      


The RMSE stops improving significantly after n_estimators = 80


In [24]:
# df_scores = pd.DataFrame(errors, columns=['n_estimators', 'rmse'])
# plt.plot(df_scores.n_estimators, df_scores.rmse)
# plt.show()

##  Question4 
Let's select the best max_depth:

- Try different values of max_depth: [10, 15, 20, 25]
- For each of these values,
    - try different values of n_estimators from 10 till 200 (with step 10)
    - calculate the mean RMSE
- Fix the random seed: random_state=1

What's the best max_depth, using the mean RMSE?



In [25]:
errors = []
for d in [10, 15, 20, 25]:
    for n in range(10, 201, 10):
        rf = RandomForestClassifier(n_estimators=n,
                                    max_depth=d,
                                    random_state=1)
        rf.fit(X_train, y_train)
        y_pred = rf.predict_proba(X_val)[:, 1]
        mse = mean_squared_error(y_val, y_pred)
        rmse = np.sqrt(mse)
        errors.append((d, n, rmse))

In [26]:
errors
min_rmse = min(errors, key=lambda x: x[2])
# min_rmse
print(f"\nThe best max_depth, n_estimators, rmse: {min_rmse}")  


The best max_depth, n_estimators, rmse: (10, 60, 182.4320595031163)


## Question5

In [27]:
rf = RandomForestClassifier(n_estimators=10,max_depth=20, random_state=1)
feature_names = ['study_hours_per_week', 'attendance_rate', 'distance_to_school', 'teacher_quality']
rf.fit(df_train[feature_names], y_train)


In [28]:
feature_importance_df = pd.DataFrame({
    'Feature names': feature_names,
    'Importance': rf.feature_importances_
}).sort_values(by='Importance', ascending=False)

# Display the feature importance
print(f" The most important feature: {feature_importance_df.iloc[0]['Feature names']}")

 The most important feature: distance_to_school


In [29]:
#!pip install xgboost

In [30]:
import xgboost as xgb

In [31]:
features = list(dv.get_feature_names_out())
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

In [32]:
etas = [0.3, 0.1]
for eta in etas:
    xgb_params = {
        'eta': eta, 
        'max_depth': 6,
        'min_child_weight': 1,
        
        'objective': 'reg:squarederror',
        'nthread': 8,
        
        'seed': 1,
        'verbosity': 1,
    }
    model = xgb.train(xgb_params, dtrain, num_boost_round=100)
    rf.fit(X_train, y_train)
    y_pred = rf.predict_proba(X_val)[:, 1]
    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    print(eta,":", rmse)

0.3 : 182.433913205577
0.1 : 182.433913205577
