In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import train_test_split, ParameterGrid

A `HistGradientBoostingRegressor` trained on only a subset of columns (all numerical) with hyperparameter tuning.

We begin by loading in the data and follow with deterministically sampling a train-val-test split (70-10-20).

Subsequently, we find a promising set of parameters using the validation set and proceed to train and evaluate our model.

In [2]:
colleges = pd.read_csv('../../input/colleges/colleges.csv')
colleges.head()

Unnamed: 0,UNITID,school_name,city,state,zip,school_webpage,latitude,longitude,admission_rate,sat_verbal_midrange,...,carnegie_undergraduate,carnegie_size,religious_affiliation,percent_female,agege24,faminc,mean_earnings_6_years,median_earnings_6_years,mean_earnings_10_years,median_earnings_10_years
0,100654,'Alabama A & M University',Normal,AL,35762,www.aamu.edu/,34.7834,-86.5685,0.8989,410.0,...,'Full-time four-year inclusive','Medium 4-year highly residential (3000 to 9999)',?,0.52999997138977,0.07999999821186,40211.22,26100.0,22800.0,35300.0,31400.0
1,100663,'University of Alabama at Birmingham',Birmingham,AL,35294-0110,www.uab.edu,33.5022,-86.8092,0.8673,580.0,...,'Medium full-time four-year selective higher t...,'Large 4-year primarily nonresidential (over 9...,?,0.64999997615814,0.25999999046325,49894.65,37400.0,33200.0,46300.0,40300.0
2,100690,'Amridge University',Montgomery,AL,36117-3553,www.amridgeuniversity.edu,32.3626,-86.17399999999999,?,?,...,'Medium full-time four-year inclusivestudents ...,'Very small 4-year primarily nonresidential (l...,'Churches of Christ',0.50999999046325,0.82999998331069,38712.18,38500.0,32800.0,42100.0,38100.0
3,100706,'University of Alabama in Huntsville',Huntsville,AL,35899,www.uah.edu,34.7228,-86.6384,0.8062,575.0,...,'Medium full-time four-year selective higher t...,'Medium 4-year primarily nonresidential (3000 ...,?,0.55000001192092,0.28999999165534,54155.4,39300.0,36700.0,52700.0,46600.0
4,100724,'Alabama State University',Montgomery,AL,36104-0271,www.alasu.edu/email/index.aspx,32.3643,-86.2957,0.5125,430.0,...,'Full-time four-year inclusive','Medium 4-year primarily residential (3000 to ...,?,0.56999999284744,0.10999999940395,31846.99,21200.0,19300.0,30700.0,27800.0


In [3]:
numeric_columns = [
 'undergrad_size',
 'spend_per_student',
 'admission_rate',
 'mean_earnings_6_years',
 'median_earnings_6_years',
 'sat_math_midrange',
 'sat_verbal_midrange',
 'act_math_midrange',
 'act_writing_midrange',
 'latitude',
 'longitude',
 'completion_rate',
 'tuition_(out_of_state)',
 'tuition_(instate)',
 'percent_white',
 'percent_black',
 'percent_hispanic',
 'percent_asian',
 'median_earnings_10_years',
 'sat_total_average',
 'completion_rate',
 'faculty_salary',
 'percent_female',
 'percent_part_time',
 'agege24',
 'faminc',
 'average_cost_program_year',
 'average_cost_academic_year']

In [4]:
for col in numeric_columns:
    colleges[col] = pd.to_numeric(colleges[col], errors='coerce')

In [5]:
targets = colleges['percent_pell_grant'].values

In [6]:
colleges[numeric_columns].values.shape, targets.shape

((7063, 28), (7063,))

In [7]:
np.random.seed(0) # making the split deterministic, important for reporting results for different models

In [8]:
X_train, X_val_and_test, y_train, y_val_and_test = train_test_split(colleges[numeric_columns].values, targets, test_size=0.3)

In [9]:
X_val, X_test, y_val, y_test = train_test_split(X_val_and_test, y_val_and_test, test_size=0.66)

In [10]:
param_grid = ParameterGrid({
    'max_depth': [4, 5, 6],
    'max_iter': [650, 700, 750],
    'learning_rate': [0.7, 0.075, 0.8],
    'min_samples_leaf': [27, 30, 33], 
    'max_leaf_nodes': [18, 21, 24]}
)

In [11]:
%%time

smallest_error = np.inf
best_params = None
for params in param_grid:
    clf = HistGradientBoostingRegressor(**params, validation_fraction=None)
    clf.fit(X_train, y_train)
    preds = clf.predict(X_val)
    error = mean_squared_error(y_val, preds, squared=True)
    if error < smallest_error:
        smallest_error = error
        best_params = params

CPU times: user 1h 31min 15s, sys: 9.99 s, total: 1h 31min 25s
Wall time: 3min 50s


In [12]:
print("Validation set error:", smallest_error*1000)
print("Best params:", best_params)

Validation set error: 22.97165171196467
Best params: {'learning_rate': 0.075, 'max_depth': 5, 'max_iter': 700, 'max_leaf_nodes': 21, 'min_samples_leaf': 30}


In [16]:
%%time

clf = HistGradientBoostingRegressor(**best_params, validation_fraction=None)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)
test_set_error = mean_squared_error(y_test, preds, squared=True)

CPU times: user 23.4 s, sys: 19.9 ms, total: 23.4 s
Wall time: 1 s


In [17]:
print("Test set error:", test_set_error*1000)

Test set error: 22.171433294089248
