In [74]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_text
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import root_mean_squared_error
import numpy as np
import xgboost as xgb

In [75]:
df= pd.read_csv('jamb_exam_results.csv')

In [76]:
df.head()

Unnamed: 0,JAMB_Score,Study_Hours_Per_Week,Attendance_Rate,Teacher_Quality,Distance_To_School,School_Type,School_Location,Extra_Tutorials,Access_To_Learning_Materials,Parent_Involvement,IT_Knowledge,Student_ID,Age,Gender,Socioeconomic_Status,Parent_Education_Level,Assignments_Completed
0,192,22,78,4,12.4,Public,Urban,Yes,Yes,High,Medium,1,17,Male,Low,Tertiary,2
1,207,14,88,4,2.7,Public,Rural,No,Yes,High,High,2,15,Male,High,,1
2,182,29,87,2,9.6,Public,Rural,Yes,Yes,High,Medium,3,20,Female,High,Tertiary,2
3,210,29,99,2,2.6,Public,Urban,No,Yes,Medium,High,4,22,Female,Medium,Tertiary,1
4,199,12,98,3,8.8,Public,Urban,No,Yes,Medium,Medium,5,22,Female,Medium,Tertiary,1


In [77]:
#let's make columns lowercase:
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [78]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 17 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   jamb_score                    5000 non-null   int64  
 1   study_hours_per_week          5000 non-null   int64  
 2   attendance_rate               5000 non-null   int64  
 3   teacher_quality               5000 non-null   int64  
 4   distance_to_school            5000 non-null   float64
 5   school_type                   5000 non-null   object 
 6   school_location               5000 non-null   object 
 7   extra_tutorials               5000 non-null   object 
 8   access_to_learning_materials  5000 non-null   object 
 9   parent_involvement            5000 non-null   object 
 10  it_knowledge                  5000 non-null   object 
 11  student_id                    5000 non-null   int64  
 12  age                           5000 non-null   int64  
 13  gen

In [79]:
#Remove the student_id column:
del df['student_id']

In [80]:
#Fill missing values with zeros:
df.isna().sum()

jamb_score                        0
study_hours_per_week              0
attendance_rate                   0
teacher_quality                   0
distance_to_school                0
school_type                       0
school_location                   0
extra_tutorials                   0
access_to_learning_materials      0
parent_involvement                0
it_knowledge                      0
age                               0
gender                            0
socioeconomic_status              0
parent_education_level          891
assignments_completed             0
dtype: int64

In [81]:
df['jamb_score'].value_counts()

jamb_score
118    50
148    49
117    47
116    47
157    47
       ..
346     1
320     1
269     1
279     1
355     1
Name: count, Length: 220, dtype: int64

In [82]:
df['parent_education_level'] = df['parent_education_level'].fillna(0)

In [83]:
#Do train/validation/test split with 60%/20%/20% distribution:
#Use the train_test_split function and set the random_state parameter to 1.
#Use DictVectorizer(sparse=True) to turn the dataframes into matrices.

In [84]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

In [85]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [86]:
y_train = df_train['jamb_score'].values
y_val = df_val['jamb_score'].values
y_test = df_test['jamb_score'].values

In [87]:
del df_train['jamb_score']
del df_val['jamb_score']
del df_test['jamb_score']

In [88]:
train_dict = df_train.fillna(0).to_dict(orient='records')
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dict)

In [89]:
train_dict[0]

{'study_hours_per_week': 20,
 'attendance_rate': 72,
 'teacher_quality': 3,
 'distance_to_school': 4.4,
 'school_type': 'Public',
 'school_location': 'Urban',
 'extra_tutorials': 'No',
 'access_to_learning_materials': 'Yes',
 'parent_involvement': 'Medium',
 'it_knowledge': 'Low',
 'age': 21,
 'gender': 'Female',
 'socioeconomic_status': 'Low',
 'parent_education_level': 0,
 'assignments_completed': 3}

In [90]:
val_dicts = df_val.fillna(0).to_dict(orient='records')
X_val = dv.transform(val_dicts)

### Question 1:

<!-- Let's train a decision tree regressor to predict the jamb_score variable.

Train a model with max_depth=1.
Which feature is used for splitting the data?

study_hours_per_week
attendance_rate
teacher_quality
distance_to_school -->

In [91]:
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

In [92]:
y_pred = dt.predict(X_train)


In [93]:
print(export_text(dt, feature_names=list(dv.get_feature_names_out())))

|--- study_hours_per_week <= 18.50
|   |--- value: [155.24]
|--- study_hours_per_week >  18.50
|   |--- value: [188.59]



### Question 2:
<!-- Train a random forest regressor with these parameters:

n_estimators=10
random_state=1
n_jobs=-1 (optional - to make training faster)
What's the RMSE of this model on the validation data?

22.13
42.13
62.13
82.12 -->

In [94]:
rf_model = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf_model.fit(X_train, y_train)

In [95]:
y_pred_val_rf = rf_model.predict(X_val)

In [96]:
rmse_val_rf = root_mean_squared_error(y_val, y_pred_val_rf)
print(f'Validation RMSE (Random Forest): {round(rmse_val_rf, 2)}')

Validation RMSE (Random Forest): 42.14


### Question 3
<!-- Now let's experiment with the n_estimators parameter

Try different values of this parameter from 10 to 200 with step 10.
Set random_state to 1.
Evaluate the model on the validation dataset.
After which value of n_estimators does RMSE stop improving? Consider 3 decimal places for calculating the answer.

10
25
80
200 -->

In [97]:
n_estimators_values = range(10, 210, 10)  
rmse_results = {}

In [98]:
for n in n_estimators_values:
    rf_model = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf_model.fit(X_train, y_train)
    y_pred_val_rf = rf_model.predict(X_val)
    rmse_val_rf = root_mean_squared_error(y_val, y_pred_val_rf)
    rmse_results[n] = round(rmse_val_rf, 3)
    print(f'n_estimators={n}, Validation RMSE: {rmse_results[n]}')

n_estimators=10, Validation RMSE: 42.137
n_estimators=20, Validation RMSE: 41.461
n_estimators=30, Validation RMSE: 41.106
n_estimators=40, Validation RMSE: 40.917
n_estimators=50, Validation RMSE: 40.852
n_estimators=60, Validation RMSE: 40.784
n_estimators=70, Validation RMSE: 40.677
n_estimators=80, Validation RMSE: 40.539
n_estimators=90, Validation RMSE: 40.504
n_estimators=100, Validation RMSE: 40.517
n_estimators=110, Validation RMSE: 40.593
n_estimators=120, Validation RMSE: 40.625
n_estimators=130, Validation RMSE: 40.651
n_estimators=140, Validation RMSE: 40.595
n_estimators=150, Validation RMSE: 40.597
n_estimators=160, Validation RMSE: 40.604
n_estimators=170, Validation RMSE: 40.628
n_estimators=180, Validation RMSE: 40.641
n_estimators=190, Validation RMSE: 40.631
n_estimators=200, Validation RMSE: 40.601


In [99]:
rmse_values = list(rmse_results.values())
for i in range(1, len(rmse_values)):
    if rmse_values[i] >= rmse_values[i - 1]:  # No significant improvement
        print(f"RMSE stops improving after n_estimators={n_estimators_values[i-1]}")
        break

RMSE stops improving after n_estimators=90


### Question 4
<!-- Let's select the best max_depth:

Try different values of max_depth: [10, 15, 20, 25]
For each of these values,
try different values of n_estimators from 10 till 200 (with step 10)
calculate the mean RMSE
Fix the random seed: random_state=1
What's the best max_depth, using the mean RMSE?

10
15
20
25 -->

In [100]:
max_depth_values = [10, 15, 20, 25]
rmse_results = {}

In [101]:
for max_depth in max_depth_values:
    rf_model = RandomForestRegressor(n_estimators=90, max_depth=max_depth, random_state=1, n_jobs=-1)
    rf_model.fit(X_train, y_train)
    y_pred_val_rf = rf_model.predict(X_val)

    rmse_val_rf = root_mean_squared_error(y_val, y_pred_val_rf)
    rmse_results[max_depth] = round(rmse_val_rf, 3)
    print(f'max_depth={max_depth}, Validation RMSE: {rmse_results[max_depth]}')


max_depth=10, Validation RMSE: 40.174
max_depth=15, Validation RMSE: 40.497
max_depth=20, Validation RMSE: 40.493
max_depth=25, Validation RMSE: 40.513


In [102]:
best_max_depth = min(rmse_results, key=rmse_results.get)
print(f"The best max_depth is {best_max_depth} with an RMSE of {rmse_results[best_max_depth]}")

The best max_depth is 10 with an RMSE of 40.174


### Question 5
<!-- We can extract feature importance information from tree-based models.

At each step of the decision tree learning algorithm, it finds the best split. When doing it, we can calculate "gain" - the reduction in impurity before and after the split. This gain is quite useful in understanding what are the important features for tree-based models.

In Scikit-Learn, tree-based models contain this information in the feature_importances_ field.

For this homework question, we'll find the most important feature:

Train the model with these parameters:
n_estimators=10,
max_depth=20,
random_state=1,
n_jobs=-1 (optional)
Get the feature importance information from this model
What's the most important feature (among these 4)?

study_hours_per_week
attendance_rate
distance_to_school
teacher_quality -->

In [103]:
rf_model = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf_model.fit(X_train, y_train)

In [104]:
df_importances = pd.DataFrame()
df_importances["feature"] = dv.feature_names_
df_importances["importance"] = rf_model.feature_importances_
df_importances.sort_values(by="importance", ascending=False).head()

Unnamed: 0,feature,importance
27,study_hours_per_week,0.248354
4,attendance_rate,0.149729
5,distance_to_school,0.136486
28,teacher_quality,0.082682
2,age,0.069311


### Question 6
<!-- Now let's train an XGBoost model! For this question, we'll tune the eta parameter:

Install XGBoost
Create DMatrix for train and validation
Create a watchlist
Train a model with these parameters for 100 rounds:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}
Now change eta from 0.3 to 0.1.

Which eta leads to the best RMSE score on the validation dataset?

0.3
0.1
Both give equal value -->

In [105]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

In [106]:
watchlist = [(dtrain, 'train'), (dval, 'eval')]

In [107]:
def train_xgboost(eta_value):
    xgb_params = {
        'eta': eta_value,
        'max_depth': 6,
        'min_child_weight': 1,
        'objective': 'reg:squarederror',
        'nthread': 8,
        'seed': 1,
        'verbosity': 1
    }
    

    model = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10, verbose_eval=False)

    y_pred_val = model.predict(dval)
    rmse_val = root_mean_squared_error(y_val, y_pred_val)
    return rmse_val


In [108]:
eta_values = [0.3, 0.1]
rmse_results = {}

In [109]:
for eta in eta_values:
    rmse = train_xgboost(eta)
    rmse_results[eta] = round(rmse, 3)
    print(f"Validation RMSE with eta={eta}: {rmse_results[eta]}")


Validation RMSE with eta=0.3: 41.228
Validation RMSE with eta=0.1: 40.2


In [110]:
best_eta = min(rmse_results, key=rmse_results.get)
print(f"The best eta value is {best_eta} with an RMSE of {rmse_results[best_eta]}")

The best eta value is 0.1 with an RMSE of 40.2
