In [3]:
import pandas as pd

In [4]:
df = pd.read_csv("https://github.com/alexeygrigorev/datasets/raw/refs/heads/master/jamb_exam_results.csv")

In [5]:
df.head()
df.columns = df.columns.str.lower().str.replace(' ', '_')

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer

In [7]:
df_prepare = df.drop('student_id',axis=1).fillna(0)

In [8]:
X = df_prepare.drop('jamb_score', axis=1)
y = df_prepare['jamb_score']

In [9]:
X_train_full, X_test, y_train_full, y_test = train_test_split(X,y, test_size=0.2, random_state=1)

In [10]:
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.25, random_state=1)

In [11]:
dv = DictVectorizer(sparse=True)

In [12]:
X_train_selected = X_train.fillna(0).to_dict(orient='records')
X_val_selected = X_val.fillna(0).to_dict(orient='records')
    
X_train_transformed = dv.fit_transform(X_train_selected)
X_val_transformed = dv.transform(X_val_selected)
    

### Question 1

In [13]:
from sklearn.tree import DecisionTreeClassifier

In [14]:
dt = DecisionTreeClassifier(max_depth=1)

In [15]:
dt.fit(X_train_transformed, y_train)

In [16]:
from sklearn.tree import export_text

In [17]:
print(export_text(dt, feature_names=dv.get_feature_names_out()))

|--- study_hours_per_week <= 18.50
|   |--- class: 118
|--- study_hours_per_week >  18.50
|   |--- class: 190



### Question 2

In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

In [19]:
# Initialize and train the RandomForestRegressor
model = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
model.fit(X_train_transformed, y_train)

# Predict on validation data
y_pred = model.predict(X_val_transformed)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print("A2 RMSE:", rmse)

A2 RMSE: 42.13724207871227


### Question 3

In [20]:
# Assuming X_train, X_val, y_train, y_val are already defined
rmse_values = []

# Loop through different values of n_estimators
for n in range(10, 201, 10):
    # Train the model with the current n_estimators
    model = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    model.fit(X_train_transformed, y_train)
    
    # Predict on validation data
    y_pred = model.predict(X_val_transformed)
    
    # Calculate RMSE
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    rmse_values.append((n, rmse))

# Print results
for n, rmse in rmse_values:
    print(f"n_estimators: {n}, RMSE: {rmse:.3f}")

# Identify the point after which RMSE stops improving
for i in range(1, len(rmse_values)):
    if round(rmse_values[i][1], 3) >= round(rmse_values[i - 1][1], 3):
        print(f"A3 RMSE stops improving significantly after n_estimators = {rmse_values[i-1][0]}")
        break

n_estimators: 10, RMSE: 42.137
n_estimators: 20, RMSE: 41.461
n_estimators: 30, RMSE: 41.106
n_estimators: 40, RMSE: 40.917
n_estimators: 50, RMSE: 40.852
n_estimators: 60, RMSE: 40.784
n_estimators: 70, RMSE: 40.677
n_estimators: 80, RMSE: 40.539
n_estimators: 90, RMSE: 40.504
n_estimators: 100, RMSE: 40.517
n_estimators: 110, RMSE: 40.593
n_estimators: 120, RMSE: 40.625
n_estimators: 130, RMSE: 40.651
n_estimators: 140, RMSE: 40.595
n_estimators: 150, RMSE: 40.597
n_estimators: 160, RMSE: 40.604
n_estimators: 170, RMSE: 40.628
n_estimators: 180, RMSE: 40.641
n_estimators: 190, RMSE: 40.631
n_estimators: 200, RMSE: 40.601
A3 RMSE stops improving significantly after n_estimators = 90


In [21]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import numpy as np

# Define max_depth and n_estimators ranges
max_depth_values = [10, 15, 20, 25]
n_estimators_values = range(10, 201, 10)

# Dictionary to store mean RMSE for each max_depth
mean_rmse_results = {}

# Loop through each max_depth
for max_depth in max_depth_values:
    rmse_list = []
    # Loop through each n_estimators value
    for n_estimators in n_estimators_values:
        # Train the model
        model = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=1, n_jobs=-1)
        model.fit(X_train_transformed, y_train)
        
        # Predict on validation data
        y_pred = model.predict(X_val_transformed)
        
        # Calculate RMSE
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_list.append(rmse)
    
    # Calculate mean RMSE for the current max_depth
    mean_rmse = np.mean(rmse_list)
    mean_rmse_results[max_depth] = mean_rmse
    print(f"max_depth: {max_depth}, Mean RMSE: {mean_rmse:.3f}")

# Find the max_depth with the lowest mean RMSE
best_max_depth = min(mean_rmse_results, key=mean_rmse_results.get)
print(f"Best max_depth based on mean RMSE: {best_max_depth}")


max_depth: 10, Mean RMSE: 40.392
max_depth: 15, Mean RMSE: 40.735
max_depth: 20, Mean RMSE: 40.740
max_depth: 25, Mean RMSE: 40.788
Best max_depth based on mean RMSE: 10


In [24]:
from sklearn.ensemble import RandomForestRegressor
import pandas as pd
import numpy as np

# Assuming you have a list of feature names corresponding to columns in X_train
feature_names = list(dv.get_feature_names_out())

# Initialize and train the RandomForestRegressor
model = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
model.fit(X_train_transformed, y_train)

# Extract feature importances
feature_importances = model.feature_importances_

# Map the importances back to feature names
importance_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
importance_df = importance_df.sort_values(by='importance', ascending=False)

# Display the sorted feature importances
print("Feature importances:\n", importance_df)
print("Most important feature:", importance_df.iloc[0]['feature'])

Feature importances:
                              feature  importance
27              study_hours_per_week    0.248354
4                    attendance_rate    0.149729
5                 distance_to_school    0.136486
28                   teacher_quality    0.082682
2                                age    0.069311
3              assignments_completed    0.031517
24         socioeconomic_status=High    0.025714
17           parent_involvement=High    0.022919
10                 it_knowledge=High    0.017719
15  parent_education_level=Secondary    0.016957
14    parent_education_level=Primary    0.015450
16   parent_education_level=Tertiary    0.014489
6                 extra_tutorials=No    0.013459
18            parent_involvement=Low    0.013358
11                  it_knowledge=Low    0.012404
0    access_to_learning_materials=No    0.012325
19         parent_involvement=Medium    0.011492
25          socioeconomic_status=Low    0.010708
26       socioeconomic_status=Medium    0.01056

In [25]:
import xgboost as xgb
from sklearn.metrics import mean_squared_error
import numpy as np

# Assuming X_train, X_val, y_train, y_val are defined
# Convert data to DMatrix format
dtrain = xgb.DMatrix(X_train_transformed, label=y_train)
dval = xgb.DMatrix(X_val_transformed, label=y_val)

# Create a watchlist to monitor validation RMSE
watchlist = [(dtrain, 'train'), (dval, 'eval')]

# Define initial parameters
xgb_params = {
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1
}

# Train model with eta = 0.3
xgb_params['eta'] = 0.3
model_eta_03 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10, verbose_eval=False)
rmse_eta_03 = model_eta_03.best_score

# Train model with eta = 0.1
xgb_params['eta'] = 0.1
model_eta_01 = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist, early_stopping_rounds=10, verbose_eval=False)
rmse_eta_01 = model_eta_01.best_score

# Compare RMSE scores
print(f"RMSE with eta=0.3: {rmse_eta_03}")
print(f"RMSE with eta=0.1: {rmse_eta_01}")

# Determine the best eta based on RMSE
if rmse_eta_03 < rmse_eta_01:
    print("eta=0.3 gives the best RMSE score.")
elif rmse_eta_01 < rmse_eta_03:
    print("eta=0.1 gives the best RMSE score.")
else:
    print("Both eta=0.3 and eta=0.1 give equal RMSE scores.")

RMSE with eta=0.3: 40.43928983163436
RMSE with eta=0.1: 40.07263360603788
eta=0.1 gives the best RMSE score.
