# Lab 6: Stacking Regression Models

In [None]:
import pandas as pd
import numpy as np
import os 
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

: 

## Part 1: Load the Data Set

In [None]:
# Location of the Airbnb data set, stored as a gzip-compressed CSV
# (pandas infers the compression from the ".gz" suffix by default).
filename = 'data/airbnb_readytofit.csv.gz'
df = pd.read_csv(filepath_or_buffer=filename)

: 

## Part 2: Create Training and Test Data Sets

In [None]:
# Separate the frame into the feature matrix (every column except 'price')
# and the regression target (the 'price' column).
X = df.drop(columns='price')
y = df['price']

: 

In [None]:
# Hold out 30% of the examples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1234,
)

: 

## Part 3: Use the Stacking Ensemble Method to Train Four Regression Models and Evaluate the Performance

In [None]:
from sklearn.ensemble import StackingRegressor

: 

In [None]:
# Base learners for the stack, given as (name, estimator) pairs. Each name is the
# prefix used to address that estimator's hyperparameters (e.g. "DT__max_depth").
# The tree-based learners are seeded (same seed as the train/test split) so that
# repeated runs produce the same scores; LinearRegression takes no seed.
estimators = [
    ("DT", DecisionTreeRegressor(random_state=1234)),
    ("RF", RandomForestRegressor(random_state=1234)),
    ("GBDT", GradientBoostingRegressor(random_state=1234)),
    ("LR", LinearRegression()),
]

: 

In [None]:
# Stacked ensemble over the base learners in `estimators`. cv=5 sets the number of
# internal folds used to produce the base-learner predictions that train the final
# estimator; passthrough=False means the final estimator sees only those predictions,
# not the original features.
stacking_model = StackingRegressor(
    estimators=estimators,
    cv=5,
    passthrough=False,
)

: 

Let's train and evaluate this ensemble model using cross-validation:

In [None]:
print('Performing Cross-Validation...')

# scikit-learn reports this metric as a negated score (so that higher is better);
# flip the sign to recover one positive RMSE per fold.
neg_rmse_per_fold = cross_val_score(
    stacking_model,
    X,
    y,
    cv=3,
    scoring='neg_root_mean_squared_error',
)
rmse_scores = -neg_rmse_per_fold
rmse_avg = np.mean(rmse_scores)

print('End')
print('average score: {}'.format(rmse_avg))


: 

## Part 4: Improve the Performance of the Ensemble Model

In [None]:
# Optional hyperparameter search over the stacked ensemble.
# The search refits the whole stack for every parameter combination and fold, so it
# is switched off by default; set RUN_GRID_SEARCH = True to execute it.
RUN_GRID_SEARCH = False

if RUN_GRID_SEARCH:
    # Imported here because GridSearchCV is only needed when the search is enabled.
    from sklearn.model_selection import GridSearchCV

    # Keys are "<estimator name>__<hyperparameter>", using the names given in `estimators`.
    params = {
        "DT__max_depth": [2, 4, 8],
        "GBDT__n_estimators": [100, 300],
    }

    stack_grid = GridSearchCV(
        stacking_model,
        params,
        cv=3,
        verbose=4,
        scoring='neg_root_mean_squared_error',
        refit=True,
        n_jobs=-1,
    )
    stack_grid.fit(X_train, y_train)

    # Mean (negated) RMSE across folds for each parameter combination.
    print(stack_grid.cv_results_['mean_test_score'])
    print("best parameters:", stack_grid.best_params_)

    # best_score_ is a negated RMSE; flip the sign to report a positive error.
    rmse_stack_cv = -1 * stack_grid.best_score_
    print("[STACK] RMSE for the best model is : {:.2f}".format(rmse_stack_cv))

: 

In [None]:
# Base learners for the tuned stack: the decision tree is limited to max_depth=8 and
# the gradient-boosted trees use n_estimators=100; the other learners keep their defaults.
# NOTE(review): these values are presumably the ones selected by the grid search in
# Part 4 -- confirm against its output.
# The tree-based learners are seeded (same seed as the train/test split) so that
# repeated runs produce the same scores; LinearRegression takes no seed.
estimators_best = [
    ("DT", DecisionTreeRegressor(max_depth=8, random_state=1234)),
    ("RF", RandomForestRegressor(random_state=1234)),
    ("GBDT", GradientBoostingRegressor(n_estimators=100, random_state=1234)),
    ("LR", LinearRegression()),
]

: 

In [None]:
print('Implement Stacking...')

# Build the stack from the tuned base learners and fit it on the TRAINING split only.
# Fitting on the full data set (X, y) would expose the model to the rows in X_test,
# so any metric later computed on X_test would be optimistically biased.
stacking_best_model = StackingRegressor(estimators=estimators_best, cv=5, passthrough=False)
stacking_best_model.fit(X_train, y_train)

print('End')

: 

In [None]:
# 1. Use the fitted stacking model to make predictions on the test data
stacking_best_pred = stacking_best_model.predict(X_test)

# 2. Compute the RMSE as the square root of the MSE. The `squared=False` argument of
# mean_squared_error() is deprecated and has been removed in newer scikit-learn
# releases, so the root is taken explicitly to work across versions.
rmse = np.sqrt(mean_squared_error(y_test, stacking_best_pred))

# 3. Compute the R2 score using r2_score()
r2 = r2_score(y_test, stacking_best_pred)


print('Root Mean Squared Error: {0}'.format(rmse))
print('R2: {0}'.format(r2))

: 

## Part 5: Fit and Evaluate Individual Regression Models

### a. Fit and Evaluate a Linear Regression

In [None]:
# 1. Create the LinearRegression model object below and assign to variable 'lr_model'
lr_model = LinearRegression()

# 2. Fit the model to the training data below
lr_model.fit(X_train, y_train)

# 3. Call predict() to use the fitted model to make predictions on the test data. Save the results to variable
# 'y_lr_pred'
y_lr_pred = lr_model.predict(X_test)

# 4. Compute the RMSE and R2 (on y_test and y_lr_pred) and save the results to lr_rmse and lr_r2.
# The RMSE is taken as the square root of the MSE because the `squared=False` argument of
# mean_squared_error() is deprecated and removed in newer scikit-learn releases.
lr_rmse = np.sqrt(mean_squared_error(y_test, y_lr_pred))
lr_r2 = r2_score(y_test, y_lr_pred)


print('[LR] Root Mean Squared Error: {0}'.format(lr_rmse))
print('[LR] R2: {0}'.format(lr_r2))

: 

### b. Fit and Evaluate a Decision Tree 


In [None]:
# 1. Create the DecisionTreeRegressor model object with max_depth=8 and min_samples_leaf=50
# and assign to variable 'dt_model'. The model is seeded (same seed as the train/test
# split) so that repeated runs produce the same tree.
dt_model = DecisionTreeRegressor(max_depth=8, min_samples_leaf=50, random_state=1234)

# 2. Fit the model to the training data below
dt_model.fit(X_train, y_train)

# 3. Call predict() to use the fitted model to make predictions on the test data. Save the results to variable
# 'y_dt_pred'
y_dt_pred = dt_model.predict(X_test)

# 4. Compute the RMSE and R2 (on y_test and y_dt_pred) and save the results to dt_rmse and dt_r2.
# The RMSE is taken as the square root of the MSE because the `squared=False` argument of
# mean_squared_error() is deprecated and removed in newer scikit-learn releases.
dt_rmse = np.sqrt(mean_squared_error(y_test, y_dt_pred))
dt_r2 = r2_score(y_test, y_dt_pred)


print('[DT] Root Mean Squared Error: {0}'.format(dt_rmse))
print('[DT] R2: {0}'.format(dt_r2))


: 

### c. Fit and Evaluate a Gradient Boosted Decision Tree 

In [None]:
print('Begin GBDT Implementation...')

# 1. Create the GradientBoostingRegressor model object below and assign to variable 'gbdt_model'.
# The model is seeded (same seed as the train/test split) so that repeated runs
# produce the same scores.
gbdt_model = GradientBoostingRegressor(max_depth=2, n_estimators=300, random_state=1234)

# 2. Fit the model to the training data below
gbdt_model.fit(X_train, y_train)

# 3. Call predict() to use the fitted model to make predictions on the test data. Save the results to variable
# 'y_gbdt_pred'
y_gbdt_pred = gbdt_model.predict(X_test)

# 4. Compute the RMSE and R2 (on y_test and y_gbdt_pred) and save the results to gbdt_rmse and gbdt_r2.
# The RMSE is taken as the square root of the MSE because the `squared=False` argument of
# mean_squared_error() is deprecated and removed in newer scikit-learn releases.
gbdt_rmse = np.sqrt(mean_squared_error(y_test, y_gbdt_pred))
gbdt_r2 = r2_score(y_test, y_gbdt_pred)


print('End')

print('[GBDT] Root Mean Squared Error: {0}'.format(gbdt_rmse))
print('[GBDT] R2: {0}'.format(gbdt_r2))




: 

### d. Fit and Evaluate  a Random Forest

In [None]:
print('Begin RF Implementation...')

# 1. Create the RandomForestRegressor model object below and assign to variable 'rf_model'.
# The model is seeded (same seed as the train/test split) so that repeated runs
# produce the same scores.
rf_model = RandomForestRegressor(max_depth=32, n_estimators=300, random_state=1234)


# 2. Fit the model to the training data below
rf_model.fit(X_train, y_train)

# 3. Call predict() to use the fitted model to make predictions on the test data. Save the results to variable
# 'y_rf_pred'
y_rf_pred = rf_model.predict(X_test)

# 4. Compute the RMSE and R2 (on y_test and y_rf_pred) and save the results to rf_rmse and rf_r2.
# The RMSE is taken as the square root of the MSE because the `squared=False` argument of
# mean_squared_error() is deprecated and removed in newer scikit-learn releases.
rf_rmse = np.sqrt(mean_squared_error(y_test, y_rf_pred))
rf_r2 = r2_score(y_test, y_rf_pred)


print('End')

print('[RF] Root Mean Squared Error: {0}'.format(rf_rmse))
print('[RF] R2: {0}'.format(rf_r2))


: 

## Part 6: Visualize Model Performance

In [None]:
# Collect the test-set metrics in a fixed model order; the tick labels below
# must follow the same order.
RMSE_Results = [rmse, lr_rmse, dt_rmse, gbdt_rmse, rf_rmse]
R2_Results = [r2, lr_r2, dt_r2, gbdt_r2, rf_r2]

rg = np.arange(5)  # one x position per model
width = 0.35       # width of a single bar

# 1. Create bar plot with RMSE results (bars are centered on rg)
plt.figure(figsize=(10, 5))
plt.bar(rg, RMSE_Results, width, label='RMSE')

# 2. Create bar plot with R2 results (bars are centered on rg + width, i.e. directly
# to the right of the matching RMSE bar)
plt.bar(rg + width, R2_Results, width, label='R2')

# 3. Call plt.xticks() to add labels under the bars indicating which model the pair of RMSE
# and R2 bars correspond to. The pair spans the centers rg and rg + width, so its
# midpoint is rg + width / 2; placing the tick there centers the label under the pair
# instead of under the R2 bar alone.
plt.xticks(rg + width / 2, ['Stacked Ensemble', 'Linear Regression', 'Decision Tree', 'Gradient Boosting', 'Random Forest'])

# 4. Label the x and y axis of the plot: the x axis should be labeled "Models" and the y axis
# should be labeled "RMSE and R2"
plt.xlabel('Models')
plt.ylabel('RMSE and R2')

plt.ylim([0, 1])
plt.title('Model Performance')
plt.legend(loc='upper left', ncol=2)
plt.show()


: 

The stacked ensemble model performed significantly better than the individual models: it had the
lowest RMSE and the highest R2 score on the test set, which indicates the best performance of the models compared.
Moving forward, to improve the regression of price we could look into additional and alternative types of
base models, and test different hyperparameter combinations to find the best-performing ones.
We could also experiment with how the individual models' predictions are combined (for example, by trying
a different final estimator), since that determines how much weight each model receives in the stack.