# Regression Analysis
In this notebook, we perform regression analysis on the cleaned dataset with the following regressors:

1. Random Forest Regressor
2. Gradient Boosting Regressor
3. Extra Trees Regressor

In [1]:
# Load packages
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the cleaned train/test CSVs produced by the Preprocessing/EDA notebook.
# Set google_drive = True when running on Colab with the files stored in
# Google Drive; otherwise they are read from the local data directory.
google_drive = False
if google_drive:
  from google.colab import drive
  drive.mount('/content/drive')
  data_dir = '/content/drive/MyDrive/'
else:
  # NOTE(review): this is an absolute path at the filesystem root; if the CSVs
  # live inside the pulled GitHub repository a relative path is probably
  # intended - confirm against the repo layout.
  data_dir = '/data/'

# Both branches resolve to the same file names, so read them in one place.
train_data = pd.read_csv(data_dir + 'train_cleaned.csv')
test_data = pd.read_csv(data_dir + 'test_cleaned.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Preview the first rows of the training data (includes the site_eui target column)
train_data.head()

Unnamed: 0,Year_Factor,State_Factor,building_class,facility_type,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,...,days_below_0F,days_above_80F,days_above_90F,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog,site_eui
0,1,0,0,13,61242.0,1942.0,11.0,2.4,36,50.5,...,0,14,0,0,0,1.0,1.0,1.0,109.142051,248.682615
1,1,0,0,55,274000.0,1955.0,45.0,1.8,36,50.5,...,0,14,0,0,0,1.0,62.779974,1.0,12.0,26.50015
2,1,0,0,48,280025.0,1951.0,97.0,1.8,36,50.5,...,0,14,0,0,0,1.0,62.779974,1.0,12.0,24.693619
3,1,0,0,6,55325.0,1980.0,46.0,1.8,36,50.5,...,0,14,0,0,0,1.0,62.779974,1.0,12.0,48.406926
4,1,0,0,56,66000.0,1985.0,100.0,2.4,36,50.5,...,0,14,0,0,0,1.0,1.0,1.0,109.142051,3.899395


In [4]:
# Preview the first rows of the test data that predictions are generated for
test_data.head()

Unnamed: 0,Year_Factor,State_Factor,building_class,facility_type,floor_area,year_built,energy_star_rating,ELEVATION,january_min_temp,january_avg_temp,...,days_below_10F,days_below_0F,days_above_80F,days_above_90F,days_above_100F,days_above_110F,direction_max_wind_speed,direction_peak_wind_speed,max_wind_speed,days_with_fog
0,7,0,0,13,28484.0,1994.0,37.0,2.4,38,50.596774,...,0,0,29,5,2,0,321.743363,310.19469,18.131327,150.755102
1,7,0,0,13,21906.0,1961.0,55.0,45.7,38,50.596774,...,0,0,29,5,2,0,321.743363,310.19469,18.131327,150.755102
2,7,0,0,13,16138.0,1950.0,1.0,59.1,38,50.596774,...,0,0,29,5,2,0,321.743363,310.19469,18.131327,150.755102
3,7,0,0,13,97422.0,1971.0,34.0,35.4,38,50.596774,...,0,0,29,5,2,0,321.743363,310.19469,18.131327,150.755102
4,7,0,0,13,61242.0,1942.0,35.0,1.8,38,50.596774,...,0,0,29,5,2,0,340.0,330.0,22.8,126.0


In [5]:
# List the training column names as a NumPy array
train_data.columns.values

array(['Year_Factor', 'State_Factor', 'building_class', 'facility_type',
       'floor_area', 'year_built', 'energy_star_rating', 'ELEVATION',
       'january_min_temp', 'january_avg_temp', 'january_max_temp',
       'february_min_temp', 'february_avg_temp', 'february_max_temp',
       'march_min_temp', 'march_avg_temp', 'march_max_temp',
       'april_min_temp', 'april_avg_temp', 'april_max_temp',
       'may_min_temp', 'may_avg_temp', 'may_max_temp', 'june_min_temp',
       'june_avg_temp', 'june_max_temp', 'july_min_temp', 'july_avg_temp',
       'july_max_temp', 'august_min_temp', 'august_avg_temp',
       'august_max_temp', 'september_min_temp', 'september_avg_temp',
       'september_max_temp', 'october_min_temp', 'october_avg_temp',
       'october_max_temp', 'november_min_temp', 'november_avg_temp',
       'november_max_temp', 'december_min_temp', 'december_avg_temp',
       'december_max_temp', 'cooling_degree_days', 'heating_degree_days',
       'precipitation_inches', 'snowf

In [6]:
# Assemble the feature columns (X) and the target column (Y = site_eui),
# then hold out 25% of the labelled rows for evaluation.
MONTHS = ['january', 'february', 'march', 'april', 'may', 'june',
          'july', 'august', 'september', 'october', 'november', 'december']

building_cols = ['Year_Factor', 'State_Factor', 'building_class', 'facility_type',
                 'floor_area', 'year_built', 'energy_star_rating', 'ELEVATION']

# <month>_min_temp, <month>_avg_temp, <month>_max_temp for each month, in calendar order
monthly_temp_cols = [month + '_' + stat + '_temp'
                     for month in MONTHS
                     for stat in ('min', 'avg', 'max')]

climate_cols = ['cooling_degree_days', 'heating_degree_days',
                'precipitation_inches', 'snowfall_inches', 'snowdepth_inches',
                'avg_temp', 'days_below_30F', 'days_below_20F', 'days_below_10F',
                'days_below_0F', 'days_above_80F', 'days_above_90F',
                'days_above_100F', 'days_above_110F', 'direction_max_wind_speed',
                'direction_peak_wind_speed', 'max_wind_speed', 'days_with_fog']

features = building_cols + monthly_temp_cols + climate_cols

# The target is kept as a single-column DataFrame; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_data[features],
    train_data[['site_eui']],
    test_size=0.25,
    random_state=10,
)

In [7]:
def evaluate_model(model, x, y):
  '''
  Compare true and predicted target values for a fitted model.

  Prints the root-mean-squared error (RMSE) and the R2 score, and also
  returns them so callers can collect metrics across models.

  Parameters
  ----------
  model : fitted estimator exposing ``predict``
  x : features passed to ``model.predict``
  y : true target values aligned with ``x``

  Returns
  -------
  tuple
    ``(rmse, r2_val)``
  '''
  y_pred = model.predict(x)
  # `squared=False` was deprecated in scikit-learn 1.4 and removed in 1.6, so
  # take the square root explicitly. Rooting each output's MSE before
  # averaging matches what `squared=False` computed; for a single target
  # this is simply sqrt(MSE).
  per_output_mse = mean_squared_error(y, y_pred, multioutput='raw_values')
  rmse = float(np.mean(np.sqrt(per_output_mse)))
  print("RMSE:", rmse )
  r2_val = r2_score(y, y_pred)
  print("R2:", r2_val)
  return rmse, r2_val

In [8]:
def generate_predictions(model,x):
  '''
  Run the given model on a set of feature rows.

  Passes ``x`` to ``model.predict`` and hands back whatever
  that call returns, unchanged.
  '''
  return model.predict(x)

In [9]:
# Fit a RandomForestRegressor with default hyperparameters as a baseline.
# random_state is fixed so the metrics below are reproducible across re-runs.
model = RandomForestRegressor(random_state=10)
# y_train is a single-column DataFrame; pass the target as a 1-D array.
# Series.ravel() is deprecated in recent pandas releases, so use to_numpy().
model.fit(X_train, y_train['site_eui'].to_numpy())

# Evaluate Results: first on the training split, then on the held-out split
evaluate_model(model, X_train, y_train)
evaluate_model(model, X_test, y_test)

# Predict site_eui for the rows in test_data
pred = generate_predictions(model,test_data)
pred

RMSE: 15.348558791285129
R2: 0.9297837539183385
RMSE: 43.387206319460056
R2: 0.46360774361401114


array([277.02464457, 209.6072686 , 449.1000774 , ...,  57.6763206 ,
        49.07286115,  39.25794467])

In [10]:
# Fit a GradientBoostingRegressor with default hyperparameters as a baseline.
# random_state is fixed so the metrics below are reproducible across re-runs.
model = GradientBoostingRegressor(random_state=10)
# y_train is a single-column DataFrame; pass the target as a 1-D array.
# Series.ravel() is deprecated in recent pandas releases, so use to_numpy().
model.fit(X_train, y_train['site_eui'].to_numpy())

# Evaluate Results: first on the training split, then on the held-out split
evaluate_model(model, X_train, y_train)
evaluate_model(model, X_test, y_test)

# Predict site_eui for the rows in test_data
pred = generate_predictions(model,test_data)
pred

RMSE: 43.45748195242993
R2: 0.4370996480295748
RMSE: 46.61641405034483
R2: 0.38079158383896217


array([247.45646213, 184.72831395, 394.66629399, ...,  30.99648502,
        31.29315945,  33.4399374 ])

In [11]:
# Fit an ExtraTreesRegressor with default hyperparameters as a baseline.
# random_state is fixed so the metrics below are reproducible across re-runs.
model = ExtraTreesRegressor(random_state=10)
# y_train is a single-column DataFrame; pass the target as a 1-D array.
# Series.ravel() is deprecated in recent pandas releases, so use to_numpy().
model.fit(X_train, y_train['site_eui'].to_numpy())

# Evaluate Results: first on the training split, then on the held-out split
evaluate_model(model, X_train, y_train)
evaluate_model(model, X_test, y_test)

# Predict site_eui for the rows in test_data
pred = generate_predictions(model,test_data)
pred

RMSE: 1.3595870772617937
R2: 0.9994490445806484
RMSE: 47.23548792399181
R2: 0.3642359931738077


array([253.85717984, 240.00331588, 409.73803085, ...,  48.18022196,
        52.85069667,  39.63801282])

## Results

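The Extra Trees Regressor had the lowest error on the training split (RMSE 1.35, R2 0.99), but this reflects overfitting: on the held-out 25% split it had the weakest scores of the three models (RMSE 47.24, R2 0.36). Judged on held-out data, the Random Forest Regressor performed best (RMSE 43.39, R2 0.46), followed by the Gradient Boosting Regressor (RMSE 46.62, R2 0.38).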