# Stacking Models

In [14]:
# Import necessary packages
import numpy as np # Numerical computation package
import pandas as pd # Dataframe package
import matplotlib.pyplot as plt # Plotting package
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor # Random Forest estimator


from sklearn.metrics import mean_absolute_error # Metrics
from sklearn.model_selection import RandomizedSearchCV 
from sklearn.model_selection import GridSearchCV # Grid search function
from PIL import Image
from sklearn.linear_model import LinearRegression # OLS


np.random.seed(1) # Seed NumPy's global random number generator for reproducibility


from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone


In [4]:
class AveragingModels(RegressorMixin, TransformerMixin, BaseEstimator):
    """Ensemble regressor that averages the predictions of several base models.

    Parameters
    ----------
    models : iterable of estimators
        Base estimators to combine. They are not fitted themselves; ``fit``
        trains clones of them, so the objects passed in stay untouched.

    Attributes
    ----------
    models_ : list of estimators
        Fitted clones of ``models``. Only available after ``fit``.

    Notes
    -----
    The mixins are listed *before* ``BaseEstimator``: scikit-learn's developer
    guide asks for mixins on the left and ``BaseEstimator`` on the right so
    that the mixins take precedence in the method resolution order.

    NOTE(review): ``TransformerMixin`` is kept only for backward compatibility
    (``isinstance`` checks). No ``transform`` method is defined here, so the
    inherited ``fit_transform`` is presumably unusable - consider dropping
    the mixin.
    """

    def __init__(self, models):
        # Stored as-is, without validation or copying; cloning happens in fit().
        self.models = models

    def fit(self, Xtrain_subset, y_train_subset):
        """Fit a fresh clone of every base model on the given data.

        Parameters
        ----------
        Xtrain_subset : feature data, passed unchanged to each model's ``fit``.
        y_train_subset : target data, passed unchanged to each model's ``fit``.

        Returns
        -------
        self : AveragingModels
            The fitted ensemble, to allow call chaining.
        """
        # Clone first so the estimators supplied by the caller are never mutated.
        self.models_ = [clone(x) for x in self.models]

        # Train the cloned base models
        for model in self.models_:
            model.fit(Xtrain_subset, y_train_subset)

        return self

    def predict(self, Xtest_subset):
        """Predict with every fitted clone and return the row-wise mean.

        Parameters
        ----------
        Xtest_subset : feature data, passed unchanged to each model's ``predict``.

        Returns
        -------
        numpy.ndarray
            Mean over the stacked model predictions, one value per row.
        """
        # One column per model prediction; averaging over axis=1 blends them.
        predictions = np.column_stack([
            model.predict(Xtest_subset) for model in self.models_
        ])
        return np.mean(predictions, axis=1)

In [40]:
# Read the cleaned data set and discard the two columns that are not used
ped = pd.read_csv("data_clean_with_dummies.csv").drop(
    columns=["time", "Unnamed: 0"]
)

# Work on a 10 % random sample of the rows (fixed seed, so the sample is repeatable)
ped_subset = ped.sample(frac=0.1, random_state=1)

# Feature matrix: the 46 columns at positions 57..102 (57 + 46 = 103).
# The original note describes these as the squared features for k = 0.99 - TODO confirm.
feature_columns = ped_subset.columns[57:103]
X_subset = ped_subset[feature_columns]

# Target: the value to predict
y_subset = ped_subset["pedestrians count"]

# Hold out 30 % of the sample as the test set
Xtrain_subset, Xtest_subset, ytrain_subset, ytest_subset = train_test_split(
    X_subset,
    y_subset,
    test_size=0.3,
    random_state=72,
)
## Hyperparameter tuning
### Establish baseline
# Baseline: a Random Forest with default parameters, fitted on the training split
forest_baseline = RandomForestRegressor().fit(Xtrain_subset, ytrain_subset)

# Predictions of the baseline model on the training and the test split
baseline_pred_train = forest_baseline.predict(Xtrain_subset)
baseline_pred_test = forest_baseline.predict(Xtest_subset)

# Mean absolute error on the training and the test data
rf_mae_train_baseline_subset = mean_absolute_error(ytrain_subset, baseline_pred_train)
rf_mae_test_baseline_subset = mean_absolute_error(ytest_subset, baseline_pred_test)

# Score of the estimator (its own .score() method) on the training and the test data.
# NOTE: "basline" in the second name is the original spelling; it is kept
# because other cells may refer to this variable.
rf_score_train_baseline_subset = forest_baseline.score(Xtrain_subset, ytrain_subset)
rf_score_test_basline_subset = forest_baseline.score(Xtest_subset, ytest_subset)


In [21]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
# import xgboost as xgb
# import lightgbm as lgb