# Assignment 3: AutoML
## Peter Ye

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the original CSV
athletes_df = pd.read_csv('athletes.csv')

## Pre-processing to create different feature views to simulate

In [3]:
# Dropping rows with null values
athletes_df = athletes_df.dropna(subset=['region','age','weight','height','howlong','gender','eat',
                               'train','background','experience','schedule','howlong',
                               'deadlift','candj','snatch','backsq','experience',
                               'background','schedule','howlong'])

In [4]:
# Dropping irrelevant columns
athletes_df = athletes_df.drop(columns=['affiliate','team','name','fran','helen','grace',
                              'filthy50','fgonebad','run400','run5k','pullups','train'])

In [5]:
# Remove outliers
athletes_df = athletes_df[athletes_df['weight'] < 1500]
athletes_df = athletes_df[athletes_df['gender'] != '--']
athletes_df = athletes_df[athletes_df['age'] >= 18]
athletes_df = athletes_df[(athletes_df['height'] < 96) & (athletes_df['height'] > 48)]
athletes_df = athletes_df[(athletes_df['deadlift'] > 0) & 
                ((athletes_df['deadlift'] <= 1105) | 
                ((athletes_df['gender'] == 'Female') & (athletes_df['deadlift'] <= 636)))]
athletes_df = athletes_df[(athletes_df['candj'] > 0) & (athletes_df['candj'] <= 395)]
athletes_df = athletes_df[(athletes_df['snatch'] > 0) & (athletes_df['snatch'] <= 496)]
athletes_df = athletes_df[(athletes_df['backsq'] > 0) & (athletes_df['backsq'] <= 1069)]

In [6]:
# Clean survey data
decline_dict = {'Decline to answer|': np.nan}
athletes_df = athletes_df.replace(decline_dict)
athletes_df = athletes_df.dropna(subset=['background','experience','schedule','howlong','eat'])

In [7]:
# Create 'total_lift' column
athletes_df['total_lift'] = athletes_df['candj'] + athletes_df['snatch'] + athletes_df['deadlift'] + athletes_df['backsq']

In [8]:
athletes_df.head()

Unnamed: 0,athlete_id,region,gender,age,height,weight,candj,snatch,deadlift,backsq,eat,background,experience,schedule,howlong,total_lift
21,21269.0,Southern California,Male,30.0,71.0,200.0,235.0,175.0,385.0,315.0,I eat whatever is convenient|,I played youth or high school level sports|I p...,I began CrossFit by trying it alone (without a...,I do multiple workouts in a day 1x a week|I ty...,1-2 years|,1110.0
22,21685.0,Africa,Male,28.0,70.0,176.0,187.0,134.0,335.0,254.0,I eat 1-3 full cheat meals per week|,I have no athletic background besides CrossFit|,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 1x a week|,2-4 years|,910.0
27,25464.0,North East,Male,35.0,68.0,225.0,285.0,205.0,440.0,405.0,I eat quality foods but don't measure the amount|,I played youth or high school level sports|,I began CrossFit with a coach (e.g. at an affi...,I typically rest 4 or more days per month|,2-4 years|,1335.0
50,43767.0,North Central,Male,36.0,71.0,199.0,267.0,212.0,485.0,390.0,I eat quality foods but don't measure the amount|,I played youth or high school level sports|I p...,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 3+ times a wee...,1-2 years|,1354.0
60,55504.0,North East,Male,36.0,64.0,155.0,245.0,180.0,415.0,385.0,I eat strict Paleo|,I played youth or high school level sports|I p...,I began CrossFit by trying it alone (without a...,I do multiple workouts in a day 2x a week|I st...,4+ years|,1225.0


In [9]:
# Adding new columns based on the specified conditions
athletes_df['experience_start_with_coach'] = athletes_df['experience'].apply(
    lambda x: 1 if 'I began CrossFit with a coach' in x else 0
)

athletes_df['experience_have_certificate'] = athletes_df['experience'].apply(
    lambda x: 1 if 'I have completed the CrossFit Level 1 certificate course' in x else 0
)

athletes_df['eat_on_diet'] = athletes_df['eat'].apply(
    lambda x: 1 if 'I eat strict Paleo' in x else 0
)

In [11]:
# Convert gender to binary number
athletes_df['gender'] = athletes_df['gender'].map({'Male': 1, 'Female': 0})

In [12]:
from sklearn.preprocessing import MinMaxScaler

# Initialize the scaler
scaler = MinMaxScaler()

# Select columns to normalize
athletes_df[['age', 'height', 'weight']] = scaler.fit_transform(athletes_df[['age', 'height', 'weight']])

In [13]:
athletes_df = athletes_df[['age', 'height', 'weight', 'gender', 'experience_start_with_coach', 'experience_have_certificate', 'eat_on_diet', 'total_lift']]

In [16]:
athletes_df.head()

Unnamed: 0,age,height,weight,gender,experience_start_with_coach,experience_have_certificate,eat_on_diet,total_lift
21,0.315789,0.612903,0.415778,1,0,0,0,1110.0
22,0.263158,0.580645,0.364606,1,1,0,0,910.0
27,0.447368,0.516129,0.469083,1,1,0,0,1335.0
50,0.473684,0.612903,0.413646,1,1,1,0,1354.0
60,0.473684,0.387097,0.319829,1,0,1,1,1225.0


## autogluon

In [15]:
from autogluon.tabular import TabularPredictor

In [17]:
# Define feature columns and target variable
feature_columns = [
    'age', 'height', 'weight', 'experience_start_with_coach',
    'experience_have_certificate', 'eat_on_diet', 'gender'
]
target_column = 'total_lift'

# Prepare data for AutoGluon
data = athletes_df[feature_columns + [target_column]]

### 2&3

In [21]:
# Split data into training and testing sets
from sklearn.model_selection import train_test_split
train_data, test_data = train_test_split(
    data, test_size=0.2, random_state=42
)

# Run AutoML with all features
print("Running AutoGluon AutoML with all features...")
predictor = TabularPredictor(
    label=target_column, eval_metric='r2', verbosity=0
).fit(
    train_data=train_data,
    time_limit=360,  # Total time in seconds
    presets='best_quality',
    ag_args_fit={'num_gpus': 1}
)

Running AutoGluon AutoML with all features...


  stacked_overfitting = self._sub_fit_memory_save_wrapper(
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree L

In [22]:
# Evaluate the AutoML model on the test set
performance = predictor.evaluate(test_data)
print("\nTest Performance (All Features):")
print(performance)


Test Performance (All Features):
{'r2': 0.685087073657951, 'root_mean_squared_error': np.float64(-156.5078415640652), 'mean_squared_error': np.float64(-24494.70447104253), 'mean_absolute_error': np.float64(-121.21944970304561), 'pearsonr': 0.8278431818012704, 'median_absolute_error': np.float64(-99.674560546875)}


### 4

In [23]:
# Get feature importances
print("\nCalculating feature importances...")
feature_importance = predictor.feature_importance(test_data, subsample_size=500, silent=True)
print("\nFeature Importances:")
print(feature_importance)


Calculating feature importances...

Feature Importances:
                             importance    stddev   p_value  n  p99_high  \
gender                         0.539779  0.126475  0.000337  5  0.800193   
weight                         0.355942  0.053110  0.000058  5  0.465296   
experience_have_certificate    0.063289  0.006050  0.000010  5  0.075745   
age                            0.062452  0.009767  0.000070  5  0.082563   
height                         0.041544  0.004845  0.000022  5  0.051521   
experience_start_with_coach    0.006413  0.003563  0.007901  5  0.013750   
eat_on_diet                    0.000586  0.000624  0.051834  5  0.001871   

                              p99_low  
gender                       0.279365  
weight                       0.246588  
experience_have_certificate  0.050832  
age                          0.042340  
height                       0.031567  
experience_start_with_coach -0.000923  
eat_on_diet                 -0.000699  


In [24]:
# Get the top 5 features
top_5_features = feature_importance.index[:5].tolist()
print(f"\nTop 5 Features: {top_5_features}")


Top 5 Features: ['gender', 'weight', 'experience_have_certificate', 'age', 'height']


### 5

In [25]:
# Prepare data using only the top 3 features
top_3_features = feature_importance.index[:3].tolist()
print(f"Using Top 3 Features: {top_3_features}")

# Prepare datasets with top 3 features
train_data_top3 = train_data[top_3_features + [target_column]]
test_data_top3 = test_data[top_3_features + [target_column]]

Using Top 3 Features: ['gender', 'weight', 'experience_have_certificate']


In [27]:
# Run AutoML with top 3 features
print("\nRunning AutoGluon AutoML with top 3 features...")
predictor_top3 = TabularPredictor(
    label=target_column, eval_metric='r2', verbosity=0
).fit(
    train_data=train_data_top3,
    time_limit=360,
    presets='best_quality',
     ag_args_fit={'num_gpus': 1}
)


Running AutoGluon AutoML with top 3 features...


  stacked_overfitting = self._sub_fit_memory_save_wrapper(
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] GPU Tree L

In [28]:
# Evaluate the AutoML model with top 3 features on the test set
performance_top3 = predictor_top3.evaluate(test_data_top3)
print("\nTest Performance (Top 3 Features):")
print(performance_top3)


Test Performance (Top 3 Features):
{'r2': 0.6279093570625114, 'root_mean_squared_error': np.float64(-170.12386180698851), 'mean_squared_error': np.float64(-28942.12835612332), 'mean_absolute_error': np.float64(-132.35145413539973), 'pearsonr': 0.7925964433294088, 'median_absolute_error': np.float64(-109.057861328125)}


In [29]:
# Extracting model leaderboard
leaderboard = predictor.leaderboard(test_data, silent=True)
leaderboard_top3 = predictor_top3.leaderboard(test_data_top3, silent=True)

# Top 3 models per validation score (All Features)
top3_models_score = leaderboard.sort_values(
    by='score_val', ascending=False
).head(3)
print("\nTop 3 Models per Validation Score (All Features):")
print(top3_models_score[['model', 'score_val']])


Top 3 Models per Validation Score (All Features):
                   model  score_val
0    WeightedEnsemble_L2   0.691112
1  NeuralNetTorch_BAG_L1   0.689125
2        LightGBM_BAG_L1   0.688327


In [30]:
# Top 3 models per validation score (Top 3 Features)
top3_models_score_top3 = leaderboard_top3.sort_values(
    by='score_val', ascending=False
).head(3)
print("\nTop 3 Models per Validation Score (Top 3 Features):")
print(top3_models_score_top3[['model', 'score_val']])


Top 3 Models per Validation Score (Top 3 Features):
                 model  score_val
0  WeightedEnsemble_L2   0.627594
1    LightGBMXT_BAG_L1   0.627152
2      LightGBM_BAG_L1   0.626493


### 6

In [31]:
# Top 3 models per speed (All Features)
top3_models_speed = leaderboard.sort_values(
    by='pred_time_val'
).head(3)
print("\nTop 3 Models per Speed (All Features):")
print(top3_models_speed[['model', 'pred_time_val']])


Top 3 Models per Speed (All Features):
                   model  pred_time_val
2        LightGBM_BAG_L1       0.021980
7  KNeighborsUnif_BAG_L1       0.051594
3      LightGBMXT_BAG_L1       0.069449


In [32]:
# Top 3 models per speed (Top 3 Features)
top3_models_speed_top3 = leaderboard_top3.sort_values(
    by='pred_time_val'
).head(3)
print("\nTop 3 Models per Speed (Top 3 Features):")
print(top3_models_speed_top3[['model', 'pred_time_val']])


Top 3 Models per Speed (Top 3 Features):
                  model  pred_time_val
2       LightGBM_BAG_L1       0.016299
5  LightGBMLarge_BAG_L1       0.023060
1     LightGBMXT_BAG_L1       0.044685


### 7

* The best model (WeightedEnsemble_L2) which has the best 0.691112 validation score outperforms previous models which only have 0.62 as best score
* The best model (LightGBM_BAG_L1) which has the best 0.691112 prediction time can't tell a difference between previous models. The validation speed doesn't really matter

### 8

The autogluon is open-source and low-code platform. From the experiment, we don't have to explicitly write down codes defining the model pipeline, category, and hyper-parameters. We just need to tell the platform what are targeted variable and explanatory variables, compared to traditional ML package like sklearn

## h2o

In [33]:
import h2o
from h2o.automl import H2OAutoML

### 2 & 3

In [34]:
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.25" 2024-10-15; OpenJDK Runtime Environment (build 11.0.25+9-post-Ubuntu-1ubuntu122.04); OpenJDK 64-Bit Server VM (build 11.0.25+9-post-Ubuntu-1ubuntu122.04, mixed mode, sharing)
  Starting server from /home/yzysnake/miniconda3/envs/Mlop/lib/python3.9/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmpj36w6dam
  JVM stdout: /tmp/tmpj36w6dam/h2o_yzysnake_started_from_python.out
  JVM stderr: /tmp/tmpj36w6dam/h2o_yzysnake_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,01 secs
H2O_cluster_timezone:,America/Chicago
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.6
H2O_cluster_version_age:,10 days
H2O_cluster_name:,H2O_from_python_yzysnake_oq2gwv
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,7.803 Gb
H2O_cluster_total_cores:,32
H2O_cluster_allowed_cores:,32


In [35]:
# Convert pandas DataFrame to H2O Frame
data = h2o.H2OFrame(athletes_df)

# Define feature columns and target variable
feature_columns = [
    'age', 'height', 'weight', 'experience_start_with_coach',
    'experience_have_certificate', 'eat_on_diet', 'gender'
]
target_column = 'total_lift'

# Ensure the target variable is numeric
data[target_column] = data[target_column].asnumeric()

# Split the data into training and testing sets
train, test = data.split_frame(ratios=[0.8], seed=42)

# Run H2O AutoML with all features
aml = H2OAutoML(
    max_runtime_secs=360,      # Total time in seconds
    seed=42,
    exclude_algos=["DeepLearning"]  # Exclude if GPU not available
)

print("Running H2O AutoML with all features...")
aml.train(
    x=feature_columns,
    y=target_column,
    training_frame=train
)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Running H2O AutoML with all features...
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


key,value
Stacking strategy,cross_validation
Number of base models (used / total),20/44
# GBM base models (used / total),18/36
# XGBoost base models (used / total),2/5
# GLM base models (used / total),0/1
# DRF base models (used / total),0/2
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5
Metalearner fold_column,

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,62353.59,558.67554,62579.92,61501.004,63002.85,62186.293,62497.87
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mae,119.41707,2.0313985,119.01283,120.53866,120.697334,116.0255,120.81102
mean_residual_deviance,24086.396,933.9532,23746.18,24667.213,24806.812,22592.182,24619.598
mse,24086.396,933.9532,23746.18,24667.213,24806.812,22592.182,24619.598
null_deviance,372008768.0,6977288.5,383281920.0,371502848.0,365706912.0,366800608.0,372751520.0
r2,0.6876925,0.0134161,0.6999237,0.6846483,0.6703911,0.7024462,0.6810528
residual_deviance,116093624.0,4391832.5,115002752.0,117045928.0,120536296.0,109142832.0,118740320.0
rmse,155.17421,3.033766,154.09796,157.058,157.50179,150.30696,156.90634
rmsle,0.1870964,0.0187109,0.1781198,0.2199601,0.1847949,0.1754289,0.1771784


In [36]:
# View the AutoML leaderboard
leaderboard = aml.leaderboard
print("\nAutoML Leaderboard (All Features):")
print(leaderboard.head(rows=10))


AutoML Leaderboard (All Features):
model_id                                                             rmse      mse      mae     rmsle    mean_residual_deviance
StackedEnsemble_AllModels_5_AutoML_1_20241112_220908              155.201  24087.4  119.358  0.187681                   24087.4
StackedEnsemble_BestOfFamily_6_AutoML_1_20241112_220908           155.322  24124.8  119.463  0.187872                   24124.8
StackedEnsemble_BestOfFamily_4_AutoML_1_20241112_220908           155.413  24153.2  119.527  0.187955                   24153.2
StackedEnsemble_Best1000_1_AutoML_1_20241112_220908               155.415  24153.8  119.471  0.187911                   24153.8
StackedEnsemble_AllModels_3_AutoML_1_20241112_220908              155.434  24159.8  119.531  0.187948                   24159.8
StackedEnsemble_BestOfFamily_3_AutoML_1_20241112_220908           155.459  24167.6  119.566  0.187872                   24167.6
StackedEnsemble_AllModels_2_AutoML_1_20241112_220908              15

### 4

In [37]:
# Evaluate model performance on test data
perf = aml.leader.model_performance(test_data=test)
print("\nModel Performance on Test Data:")
print(perf)

# Get feature importance for the leader model
print("\nCalculating feature importances...")
if 'variable_importances' in aml.leader._model_json['output']:
    importances = aml.leader.varimp(use_pandas=True)
    print("\nFeature Importances:")
    print(importances)
else:
    print("Feature importance not available for the top model.")

# Get top 5 features
if 'importances' in locals():
    top_5_features = importances['variable'][:5].tolist()
    print(f"\nTop 5 Features: {top_5_features}")
else:
    # If feature importance is not available, use all features
    top_5_features = feature_columns
    print("\nUsing all features as top features.")


Model Performance on Test Data:
ModelMetricsRegressionGLM: stackedensemble
** Reported on test data. **

MSE: 23837.096097979236
RMSE: 154.39266853701065
MAE: 119.28531397122806
RMSLE: 0.17650166157052083
Mean Residual Deviance: 23837.096097979236
R^2: 0.6887005254128638
Null degrees of freedom: 5913
Residual degrees of freedom: 5893
Null deviance: 452858790.6387163
Residual deviance: 140972586.3234492
AIC: 76434.40095118352

Calculating feature importances...
Feature importance not available for the top model.

Using all features as top features.


### 5

In [38]:
# Prepare data using only the top 3 features
top_3_features = top_5_features[:3]
print(f"Using Top 3 Features: {top_3_features}")

# Run H2O AutoML with top 3 features
aml_top3 = H2OAutoML(
    max_runtime_secs=360,
    seed=42,
    exclude_algos=["DeepLearning"]  # Exclude if GPU not available
)

print("\nRunning H2O AutoML with top 3 features...")
aml_top3.train(
    x=top_3_features,
    y=target_column,
    training_frame=train
)

Using Top 3 Features: ['age', 'height', 'weight']

Running H2O AutoML with top 3 features...
AutoML progress: |███████████████████████████████████████████████████████████████| (done) 100%


key,value
Stacking strategy,cross_validation
Number of base models (used / total),14/44
# GBM base models (used / total),12/36
# XGBoost base models (used / total),2/5
# DRF base models (used / total),0/2
# GLM base models (used / total),0/1
Metalearner algorithm,GLM
Metalearner fold assignment scheme,Random
Metalearner nfolds,5
Metalearner fold_column,

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,64122.35,604.31805,64370.71,63130.195,64754.97,64113.29,64242.59
loglikelihood,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mae,144.43552,1.4548091,143.61261,143.95007,146.1337,142.72513,145.75604
mean_residual_deviance,34829.91,716.8769,34427.03,34875.15,35665.38,33856.445,35325.54
mse,34829.91,716.8769,34427.03,34875.15,35665.38,33856.445,35325.54
null_deviance,372003328.0,6143969.0,376695072.0,363077920.0,370424448.0,371041024.0,378778208.0
r2,0.5484429,0.0110087,0.5573731,0.5439957,0.5320914,0.5592475,0.5495068
residual_deviance,167889568.0,3930004.5,166730112.0,165482576.0,173298080.0,163526624.0,170410400.0
rmse,186.61981,1.9226604,185.54523,186.74889,188.8528,184.0012,187.9509
rmsle,0.2196389,0.0127611,0.2109691,0.2394518,0.2185897,0.2064797,0.2227042


In [39]:
# View the AutoML leaderboard for top 3 features
leaderboard_top3 = aml_top3.leaderboard
print("\nAutoML Leaderboard (Top 3 Features):")
print(leaderboard_top3.head(rows=10))


AutoML Leaderboard (Top 3 Features):
model_id                                                    rmse      mse      mae     rmsle    mean_residual_deviance
StackedEnsemble_AllModels_5_AutoML_2_20241112_222002     186.631  34831.1  144.443  0.219861                   34831.1
StackedEnsemble_AllModels_3_AutoML_2_20241112_222002     186.848  34912.3  144.578  0.219979                   34912.3
StackedEnsemble_Best1000_1_AutoML_2_20241112_222002      186.855  34914.9  144.576  0.219978                   34914.9
StackedEnsemble_BestOfFamily_5_AutoML_2_20241112_222002  186.927  34941.7  144.729  0.220143                   34941.7
StackedEnsemble_BestOfFamily_3_AutoML_2_20241112_222002  186.95   34950.3  144.706  0.2201                     34950.3
StackedEnsemble_AllModels_2_AutoML_2_20241112_222002     186.954  34951.7  144.705  0.220092                   34951.7
GBM_5_AutoML_2_20241112_222002                           186.971  34958    144.781  0.220182                   34958
GBM_grid_1_A

In [40]:
# Evaluate model performance on test data with top 3 features
perf_top3 = aml_top3.leader.model_performance(test_data=test)
print("\nModel Performance on Test Data (Top 3 Features):")
print(perf_top3)


Model Performance on Test Data (Top 3 Features):
ModelMetricsRegressionGLM: stackedensemble
** Reported on test data. **

MSE: 35092.30653892611
RMSE: 187.3294064980886
MAE: 145.59100384998297
RMSLE: 0.21320004931182612
Mean Residual Deviance: 35092.30653892611
R^2: 0.5417136155043435
Null degrees of freedom: 5913
Residual degrees of freedom: 5899
Null deviance: 452858790.6387163
Residual deviance: 207535900.871209
AIC: 78709.5747626964
