### Load libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import pandas as pd
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
pd.set_option('display.max_colwidth', 10000)
pd.set_option('display.max_rows', 1000)

In [2]:
from sklearn import datasets
from sklearn import linear_model, ensemble, tree
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import datetime

### Load full dataset

In [3]:
df = pd.read_csv("../data/train_data_with_lag.csv")

In [4]:
df['Year'].unique()

array([2016, 2017, 2018, 2019])

### Identify numerical vs. categorical variables

In [5]:
NUM_VARS = ['prev_year_no_of_sales', 'prev_year_avg_age',
       'prev_year_price_p_sf', 'prev_year_price_p_house', 'prev_year_avg_sf',
       'Total Population', 'Median Age', 'Median HH Income',
       'Total Housing Units', 'Median Number of Rooms', 'Median Year Built',
       'Median Gross Rent', 'Mean HH Size', 'Percent White', 'Percent Black',
       'Percent HH with Children', 'Percent Housing Vacant',
       'crime_count', 'crimes_per_capita', 'distance_miles']

CAT_VARS = ['pri_neigh', 'sec_neigh', 'side', 'station_name', 'public_schools']

### Identify features and target for model training

In [6]:
FEATURES = ['Norm prev_year_no_of_sales',
 'Norm prev_year_avg_age',
 'Norm prev_year_price_p_sf',
 'Norm prev_year_price_p_house',
 'Norm prev_year_avg_sf',
 'Norm Total Population',
 'Norm Median Age',
 'Norm Median HH Income',
 'Norm Total Housing Units',
 'Norm Median Number of Rooms',
 'Norm Median Year Built',
 'Norm Median Gross Rent',
 'Norm Mean HH Size',
 'Norm Percent White',
 'Norm Percent Black',
 'Norm Percent HH with Children',
 'Norm Percent Housing Vacant',
 'Norm crime_count',
 'Norm crimes_per_capita',
 'Norm distance_miles',
 'pri_neigh_Albany Park',
 'pri_neigh_Andersonville',
 'pri_neigh_Archer Heights',
 'pri_neigh_Armour Square',
 'pri_neigh_Ashburn',
 'pri_neigh_Auburn Gresham',
 'pri_neigh_Austin',
 'pri_neigh_Avalon Park',
 'pri_neigh_Avondale',
 'pri_neigh_Belmont Cragin',
 'pri_neigh_Beverly',
 'pri_neigh_Boystown',
 'pri_neigh_Bridgeport',
 'pri_neigh_Brighton Park',
 'pri_neigh_Bucktown',
 'pri_neigh_Burnside',
 'pri_neigh_Calumet Heights',
 'pri_neigh_Chatham',
 'pri_neigh_Chicago Lawn',
 'pri_neigh_Chinatown',
 'pri_neigh_Clearing',
 'pri_neigh_Douglas',
 'pri_neigh_Dunning',
 'pri_neigh_East Side',
 'pri_neigh_East Village',
 'pri_neigh_Edgewater',
 'pri_neigh_Edison Park',
 'pri_neigh_Englewood',
 'pri_neigh_Fuller Park',
 'pri_neigh_Gage Park',
 'pri_neigh_Galewood',
 'pri_neigh_Garfield Park',
 'pri_neigh_Garfield Ridge',
 'pri_neigh_Gold Coast',
 'pri_neigh_Grand Boulevard',
 'pri_neigh_Grand Crossing',
 'pri_neigh_Greektown',
 'pri_neigh_Hegewisch',
 'pri_neigh_Hermosa',
 'pri_neigh_Humboldt Park',
 'pri_neigh_Hyde Park',
 'pri_neigh_Irving Park',
 'pri_neigh_Jefferson Park',
 'pri_neigh_Kenwood',
 'pri_neigh_Lake View',
 'pri_neigh_Lincoln Park',
 'pri_neigh_Lincoln Square',
 'pri_neigh_Little Italy, UIC',
 'pri_neigh_Little Village',
 'pri_neigh_Logan Square',
 'pri_neigh_Loop',
 'pri_neigh_Lower West Side',
 'pri_neigh_Mckinley Park',
 'pri_neigh_Montclare',
 'pri_neigh_Morgan Park',
 'pri_neigh_Mount Greenwood',
 'pri_neigh_Near South Side',
 'pri_neigh_New City',
 'pri_neigh_North Center',
 'pri_neigh_North Lawndale',
 'pri_neigh_North Park',
 'pri_neigh_Norwood Park',
 "pri_neigh_O'Hare",
 'pri_neigh_Oakland',
 'pri_neigh_Old Town',
 'pri_neigh_Portage Park',
 'pri_neigh_Printers Row',
 'pri_neigh_Pullman',
 'pri_neigh_River North',
 'pri_neigh_Riverdale',
 'pri_neigh_Rogers Park',
 'pri_neigh_Roseland',
 'pri_neigh_Rush & Division',
 'pri_neigh_Sauganash,Forest Glen',
 'pri_neigh_Sheffield & DePaul',
 'pri_neigh_South Chicago',
 'pri_neigh_South Deering',
 'pri_neigh_South Shore',
 'pri_neigh_Streeterville',
 'pri_neigh_Ukrainian Village',
 'pri_neigh_United Center',
 'pri_neigh_Uptown',
 'pri_neigh_Washington Heights',
 'pri_neigh_Washington Park',
 'pri_neigh_West Elsdon',
 'pri_neigh_West Lawn',
 'pri_neigh_West Loop',
 'pri_neigh_West Pullman',
 'pri_neigh_West Ridge',
 'pri_neigh_West Town',
 'pri_neigh_Wicker Park',
 'pri_neigh_Woodlawn',
 'pri_neigh_Wrigleyville',
 'side_Central',
 'side_North',
 'side_South',
 'side_West',
 'public_schools_0.0',
 'public_schools_1.0',
 'public_schools_2.0',
 'public_schools_3.0',
 'public_schools_4.0',
 'public_schools_5.0']

TARGETS = 'price_p_house'

### Create functions for data processing

In [7]:
def process_bool_and_missing(train, test, features):
    
    for f in features:
        if train[f].dtype == 'bool':
            #print(f, "is bool, converting to int")
            train[f] = train[f].astype(int)
            test[f] = test[f].astype(int)
            
        if train[f].dtype in ('float64', 'int64'):   
            #print(f, "training data's mean:", train[f].mean(),
                #"will replace missing values of", f)
            train[f][train[f].isna()] = train[f].mean()
            test[f][test[f].isna()] = train[f].mean()

    return train, test


def normalize_features(train, test, features):
    
    for feature in features:
        scaler = StandardScaler()
        scaler.fit(pd.DataFrame(train.loc[:, feature]))
        n_feature = 'Norm ' + feature
        train[n_feature] = scaler.transform(pd.DataFrame(train.loc[:, feature]))
        test[n_feature] = scaler.transform(pd.DataFrame(test.loc[:, feature]))

    return train, test


def one_hot_encoding_features(train, test, features, prefix):

    train = pd.get_dummies(train, columns = features, prefix = prefix)
    test = pd.get_dummies(test, columns = features, prefix = prefix)
    
    for v in test.columns:
        if v not in train.columns:
            test = test.drop(columns=[v])
        
    for v in train.columns:
            if v not in test.columns:
                test[v] = 0

    return train, test


def prepare_train_test(train, test, num, cat):

    train, test = process_bool_and_missing(train, test, num)
    train, test = normalize_features(train, test, num)
    train, test = one_hot_encoding_features(train, test, cat, cat)

    return train, test


def temporal_train_test_split(df, train_yr, test_yr, num, cat):
    
    train = df.loc[df.Year.isin(train_yr), :]
    test = df.loc[df.Year.isin(test_yr), :]
    #print('TRAIN: ', train_yr, 'TEST: ', test_yr)
    #print('Training size: ', train.shape) 
    #print('Testing size: ', test.shape) 
    train, test = prepare_train_test(train, test, num, cat)
    
    return train, test

### Create functions for building and evaluating regressor

In [8]:
def build_regressors(train, features, targets, model, params):
       
    #print("Training model:", model, "|", params) 
    model.set_params(**params)
    train_features = train[features]
    train_targets = train[targets]
    model.fit(train_features, train_targets)

    return model  


def evaluate_regressors(df, features, targets, model):
    
    actual = df[targets]
    predict = model.predict(df[features])      
    #score = model.score(df[features], df[targets])
    mse = mean_squared_error(actual, predict)
            
    return mse

### Create functions for Cross Validation

In [9]:
def k_fold_CV(train, features, targets, model_class, params, cv=5, scoring='neg_mean_squared_error'):
    
    start = datetime.datetime.now() # Begin timer 
    
    grid_model = GridSearchCV(
                    estimator=model_class, 
                    param_grid=params, 
                    cv=cv,
                    scoring=scoring,
                    return_train_score=True,
                    refit=True)
       
    grid_model_results = grid_model.fit(train[features], train[targets])
    ranked_cv = pd.DataFrame(grid_model.cv_results_).sort_values(by=['rank_test_score'])
    cv_results = ranked_cv[['params','rank_test_score', 'mean_train_score', 'mean_test_score']]
    
    stop = datetime.datetime.now() # End timer
    print("ALL FOLDS Time Elapsed:", stop - start)  
    
    return cv_results, grid_model

### Create functions for Forward Chaining

In [10]:
def forward_chaining(df, features, targets, model_class, params, num, cat):
    
    start = datetime.datetime.now() # Begin timer 
    
    year_list = df['Year'].unique().tolist()
    results = pd.DataFrame()

    # splits the dataframe into train/test set using forward chaining
    for idx, yr in enumerate(year_list[:-1]):
        train_yr = year_list[:idx+1]
        test_yr = [year_list[idx+1]]
        print("TRAIN YEARS: ", train_yr)
        print("TEST YEARS: ", test_yr)
        
        train, test = temporal_train_test_split(df, train_yr, test_yr, num, cat)
        cv_results, grid_model = k_fold_CV(train, features, targets, model_class, params)
        
        cv_results = cv_results.rename(
            columns={"rank_test_score": "rank_by_CV_mean_test_rmse",
                     "mean_train_score": "CV_mean_train_rmse",
                     "mean_test_score": "CV_mean_test_rmse"})
        
        cv_results['CV_mean_train_rmse'] = np.sqrt(-cv_results['CV_mean_train_rmse'])
        cv_results['CV_mean_test_rmse'] = np.sqrt(-cv_results['CV_mean_test_rmse'])
        cv_results['mean_train_rmse'] = 0
        cv_results['mean_test_rmse'] = 0

        cv_results['Train Years'] = str(train_yr)
        cv_results['Test Years'] = str(test_yr)
        
        for i in range(len(cv_results['params'])):
            p = cv_results['params'].iloc[i]
            refit_model = build_regressors(train, features, targets, model_class, p)
            mean_train_mse = evaluate_regressors(train, features, targets, refit_model)
            mean_test_mse = evaluate_regressors(test, features, targets, refit_model)
            
            cv_results['mean_train_rmse'].iloc[i] = np.sqrt(mean_train_mse)
            cv_results['mean_test_rmse'].iloc[i] = np.sqrt(mean_test_mse)
        
        results = results.append(cv_results, ignore_index=True)
    
    stop = datetime.datetime.now() # End timer
    print("ALL TEMPORAL SPLITS Time Elapsed:", stop - start)  
    
    return results

### Configure the models and params to tune

In [12]:
MODELS = {
    'LinearRegression': linear_model.LinearRegression(), 
    'Ridge': linear_model.Ridge(),
    'Lasso': linear_model.Lasso(), 
    'ElasticNet': linear_model.ElasticNet(),
    'DecisionTree': tree.DecisionTreeRegressor(n_jobs=2),
    'RandomForest': ensemble.RandomForestRegressor(n_jobs=2),
    'Boosting': ensemble.GradientBoostingRegressor(n_jobs=2)
}

GRID = {
    'LinearRegression': {},
    
    'Ridge': {'max_iter': [10000], 'random_state': [0],
              'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]},  
    
    'Lasso': {'max_iter': [10000], 'random_state': [0],
              'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]},
    
    'ElasticNet': {'max_iter': [10000], 'random_state': [0],
                   'alpha': [0.01, 0.1, 1, 10, 100, 1000],
                   'l1_ratio': [0, 0.2, 0.4, 0.6, 0.8, 1]},
    
    'DecisionTree': {'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
                     'max_features': ['auto', 'sqrt'],
                     'min_samples_leaf': [1, 2, 4],
                     'min_samples_split': [2, 5, 10]},
    
    'RandomForest': {'max_depth': [20, 30, 40, 50, 60, 70, 80, 90, 100],
                     'max_features': ['auto', 'sqrt'],
                     'min_samples_split': [2, 5, 10],
                     'n_estimators': [10, 100, 500]},
    
    'Boosting': {"learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
                 "min_samples_split": np.linspace(0.1, 0.5, 3),
                 "min_samples_leaf": np.linspace(0.1, 0.5, 3),
                 "max_depth": [3, 5, 8],
                 "max_features": ['auto', 'sqrt'],
                 "criterion": ['friedman_mse'],
                 "subsample": [0.5, 0.75, 1.0],
                 "n_estimators": [10, 100, 500]}
} 

In [13]:
linear = forward_chaining(df, FEATURES, TARGETS, MODELS['LinearRegression'],
                          GRID['LinearRegression'], NUM_VARS, CAT_VARS)

TRAIN YEARS:  [2016]
TEST YEARS:  [2017]
ALL FOLDS Time Elapsed: 0:00:00.152435
TRAIN YEARS:  [2016, 2017]
TEST YEARS:  [2018]
ALL FOLDS Time Elapsed: 0:00:00.275232
TRAIN YEARS:  [2016, 2017, 2018]
TEST YEARS:  [2019]
ALL FOLDS Time Elapsed: 0:00:00.419781
ALL TEMPORAL SPLITS Time Elapsed: 0:00:01.986531


In [14]:
linear

Unnamed: 0,params,rank_by_CV_mean_test_rmse,CV_mean_train_rmse,CV_mean_test_rmse,mean_train_rmse,mean_test_rmse,Train Years,Test Years
0,{},1,115024.277257,9948330000000000.0,116783.935319,112808.322608,[2016],[2017]
1,{},1,111231.800815,1802549000000000.0,112679.638741,123131.266713,"[2016, 2017]",[2018]
2,{},1,113020.164975,3206674000000000.0,114506.559024,105077.473098,"[2016, 2017, 2018]",[2019]


In [15]:
ridge = forward_chaining(df, FEATURES, TARGETS, MODELS['Ridge'],
                          GRID['Ridge'], NUM_VARS, CAT_VARS)   

TRAIN YEARS:  [2016]
TEST YEARS:  [2017]
ALL FOLDS Time Elapsed: 0:00:00.475987
TRAIN YEARS:  [2016, 2017]
TEST YEARS:  [2018]
ALL FOLDS Time Elapsed: 0:00:00.967181
TRAIN YEARS:  [2016, 2017, 2018]
TEST YEARS:  [2019]
ALL FOLDS Time Elapsed: 0:00:00.888180
ALL TEMPORAL SPLITS Time Elapsed: 0:00:03.805627


In [16]:
ridge

Unnamed: 0,params,rank_by_CV_mean_test_rmse,CV_mean_train_rmse,CV_mean_test_rmse,mean_train_rmse,mean_test_rmse,Train Years,Test Years
0,"{'alpha': 0.1, 'max_iter': 10000, 'random_state': 0}",1,115033.383808,158368.029406,116789.927505,112753.062497,[2016],[2017]
1,"{'alpha': 0.01, 'max_iter': 10000, 'random_state': 0}",2,115024.281769,158532.329745,116783.994517,112802.643696,[2016],[2017]
2,"{'alpha': 0.001, 'max_iter': 10000, 'random_state': 0}",3,115024.183486,158557.098439,116783.932287,112808.25976,[2016],[2017]
3,"{'alpha': 1, 'max_iter': 10000, 'random_state': 0}",4,115548.613412,159355.598866,117186.34617,112690.27151,[2016],[2017]
4,"{'alpha': 1000, 'max_iter': 10000, 'random_state': 0}",5,140376.611143,163102.678413,141288.51094,131894.812536,[2016],[2017]
5,"{'alpha': 10, 'max_iter': 10000, 'random_state': 0}",6,121519.549415,163989.536524,123062.870372,116666.543925,[2016],[2017]
6,"{'alpha': 100, 'max_iter': 10000, 'random_state': 0}",7,131365.363808,164025.923901,133561.419623,124643.886085,[2016],[2017]
7,"{'alpha': 10000, 'max_iter': 10000, 'random_state': 0}",8,172607.206771,194023.460898,169687.175526,162285.493488,[2016],[2017]
8,"{'alpha': 1, 'max_iter': 10000, 'random_state': 0}",1,111393.262363,149290.800688,112795.131296,123116.912019,"[2016, 2017]",[2018]
9,"{'alpha': 0.1, 'max_iter': 10000, 'random_state': 0}",2,111234.019496,150066.603348,112681.063031,123121.646003,"[2016, 2017]",[2018]


In [17]:
lasso = forward_chaining(df, FEATURES, TARGETS, MODELS['Lasso'],
                          GRID['Lasso'], NUM_VARS, CAT_VARS)   

TRAIN YEARS:  [2016]
TEST YEARS:  [2017]


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


ALL FOLDS Time Elapsed: 0:00:42.371118


  positive)
  positive)
  positive)
  positive)


TRAIN YEARS:  [2016, 2017]
TEST YEARS:  [2018]


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


ALL FOLDS Time Elapsed: 0:02:32.664836


  positive)
  positive)
  positive)
  positive)


TRAIN YEARS:  [2016, 2017, 2018]
TEST YEARS:  [2019]


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


ALL FOLDS Time Elapsed: 0:04:24.104127


  positive)
  positive)
  positive)


ALL TEMPORAL SPLITS Time Elapsed: 0:09:36.250701


  positive)


In [18]:
lasso

Unnamed: 0,params,rank_by_CV_mean_test_rmse,CV_mean_train_rmse,CV_mean_test_rmse,mean_train_rmse,mean_test_rmse,Train Years,Test Years
0,"{'alpha': 10, 'max_iter': 10000, 'random_state': 0}",1,115039.766047,160067.075297,116800.757894,112706.360651,[2016],[2017]
1,"{'alpha': 0.1, 'max_iter': 10000, 'random_state': 0}",2,115024.183926,160226.139334,116783.933156,112807.745798,[2016],[2017]
2,"{'alpha': 1, 'max_iter': 10000, 'random_state': 0}",3,115024.343786,160400.390972,116784.10971,112797.086028,[2016],[2017]
3,"{'alpha': 100, 'max_iter': 10000, 'random_state': 0}",4,116198.216446,163196.153057,118034.186701,113181.591764,[2016],[2017]
4,"{'alpha': 1000, 'max_iter': 10000, 'random_state': 0}",5,131025.124897,167647.484731,135141.482969,125472.888948,[2016],[2017]
5,"{'alpha': 10000, 'max_iter': 10000, 'random_state': 0}",6,139682.875423,170629.857828,141960.23277,129267.883935,[2016],[2017]
6,"{'alpha': 0.01, 'max_iter': 10000, 'random_state': 0}",7,115024.182498,171230.608854,116783.931669,112808.788419,[2016],[2017]
7,"{'alpha': 0.001, 'max_iter': 10000, 'random_state': 0}",8,115024.182485,173328.473626,116783.931655,112808.88131,[2016],[2017]
8,"{'alpha': 10, 'max_iter': 10000, 'random_state': 0}",1,111249.224008,149220.896484,112698.037602,123069.118635,"[2016, 2017]",[2018]
9,"{'alpha': 1, 'max_iter': 10000, 'random_state': 0}",2,111231.972831,149609.457465,112679.807471,123124.034389,"[2016, 2017]",[2018]


In [19]:
enet = forward_chaining(df, FEATURES, TARGETS, MODELS['ElasticNet'],
                          GRID['ElasticNet'], NUM_VARS, CAT_VARS)   

TRAIN YEARS:  [2016]
TEST YEARS:  [2017]


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


ALL FOLDS Time Elapsed: 0:01:22.467411


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


TRAIN YEARS:  [2016, 2017]
TEST YEARS:  [2018]


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


ALL FOLDS Time Elapsed: 0:05:20.317856


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


TRAIN YEARS:  [2016, 2017, 2018]
TEST YEARS:  [2019]


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


ALL FOLDS Time Elapsed: 0:09:47.085610


  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)
  positive)


ALL TEMPORAL SPLITS Time Elapsed: 0:20:49.534156


  positive)


In [20]:
enet

Unnamed: 0,params,rank_by_CV_mean_test_rmse,CV_mean_train_rmse,CV_mean_test_rmse,mean_train_rmse,mean_test_rmse,Train Years,Test Years
0,"{'alpha': 10, 'l1_ratio': 1, 'max_iter': 10000, 'random_state': 0}",1,115039.766047,160067.075297,116800.757894,112706.360651,[2016],[2017]
1,"{'alpha': 0.1, 'l1_ratio': 1, 'max_iter': 10000, 'random_state': 0}",2,115024.183926,160226.139334,116783.933156,112807.745798,[2016],[2017]
2,"{'alpha': 1, 'l1_ratio': 1, 'max_iter': 10000, 'random_state': 0}",3,115024.343786,160400.390972,116784.10971,112797.086028,[2016],[2017]
3,"{'alpha': 0.01, 'l1_ratio': 0.8, 'max_iter': 10000, 'random_state': 0}",4,117675.658972,162065.393871,119794.012437,114272.895657,[2016],[2017]
4,"{'alpha': 1, 'l1_ratio': 0.6, 'max_iter': 10000, 'random_state': 0}",5,138308.845688,162288.909729,140664.050956,131195.205015,[2016],[2017]
5,"{'alpha': 1, 'l1_ratio': 0.8, 'max_iter': 10000, 'random_state': 0}",6,135362.116949,162306.512523,137981.849602,128329.122862,[2016],[2017]
6,"{'alpha': 100, 'l1_ratio': 1, 'max_iter': 10000, 'random_state': 0}",7,116198.216446,163196.153057,118034.186701,113181.591764,[2016],[2017]
7,"{'alpha': 0.1, 'l1_ratio': 0, 'max_iter': 10000, 'random_state': 0}",8,133161.837694,163252.516293,135976.450572,126519.694398,[2016],[2017]
8,"{'alpha': 1, 'l1_ratio': 0.4, 'max_iter': 10000, 'random_state': 0}",9,140779.824435,163316.322313,142948.501838,133765.827416,[2016],[2017]
9,"{'alpha': 0.01, 'l1_ratio': 0.6, 'max_iter': 10000, 'random_state': 0}",10,120050.982549,163466.914368,122494.804959,116241.459832,[2016],[2017]


In [21]:
dtree = forward_chaining(df, FEATURES, TARGETS, MODELS['DecisionTree'],
                          GRID['DecisionTree'], NUM_VARS, CAT_VARS) 

TRAIN YEARS:  [2016]
TEST YEARS:  [2017]
ALL FOLDS Time Elapsed: 0:00:24.844679
TRAIN YEARS:  [2016, 2017]
TEST YEARS:  [2018]
ALL FOLDS Time Elapsed: 0:01:03.887140
TRAIN YEARS:  [2016, 2017, 2018]
TEST YEARS:  [2019]
ALL FOLDS Time Elapsed: 0:01:14.902705
ALL TEMPORAL SPLITS Time Elapsed: 0:03:24.146212


In [22]:
dtree

Unnamed: 0,params,rank_by_CV_mean_test_rmse,CV_mean_train_rmse,CV_mean_test_rmse,mean_train_rmse,mean_test_rmse,Train Years,Test Years
0,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10}",1,82513.021878,136000.875557,82011.229783,125215.155984,[2016],[2017]
1,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 10}",2,67586.205876,136008.685863,69738.271905,121060.095892,[2016],[2017]
2,"{'max_depth': 90, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10}",3,47398.436391,136633.93274,46424.919785,140926.32899,[2016],[2017]
3,"{'max_depth': 40, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10}",4,47398.436391,137431.342365,46424.919785,140266.259512,[2016],[2017]
4,"{'max_depth': 50, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10}",5,47398.436391,137971.539376,46424.919785,139237.653966,[2016],[2017]
5,"{'max_depth': 60, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10}",6,47398.436391,138085.280929,46424.919785,139832.522043,[2016],[2017]
6,"{'max_depth': 60, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10}",7,79316.537118,138105.831359,77787.164512,128037.089952,[2016],[2017]
7,"{'max_depth': 70, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10}",8,47398.436391,138143.092319,46424.919785,139882.957454,[2016],[2017]
8,"{'max_depth': 70, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10}",9,79316.537118,138195.834401,77787.164512,127425.300689,[2016],[2017]
9,"{'max_depth': 50, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10}",10,79316.537118,138200.213971,77787.164512,128005.62848,[2016],[2017]


In [23]:
rf = forward_chaining(df, FEATURES, TARGETS, MODELS['RandomForest'],
                          GRID['RandomForest'], NUM_VARS, CAT_VARS) 

TRAIN YEARS:  [2016]
TEST YEARS:  [2017]
ALL FOLDS Time Elapsed: 1:18:55.042341
TRAIN YEARS:  [2016, 2017]
TEST YEARS:  [2018]
ALL FOLDS Time Elapsed: 1:38:35.816736
TRAIN YEARS:  [2016, 2017, 2018]
TEST YEARS:  [2019]
ALL FOLDS Time Elapsed: 7:50:16.643448
ALL TEMPORAL SPLITS Time Elapsed: 12:05:37.844443


In [24]:
rf

Unnamed: 0,params,rank_by_CV_mean_test_rmse,CV_mean_train_rmse,CV_mean_test_rmse,mean_train_rmse,mean_test_rmse,Train Years,Test Years
0,"{'max_depth': 50, 'max_features': 'auto', 'min_samples_split': 5, 'n_estimators': 10}",1,58752.02989,127008.349669,57524.723507,112585.972181,[2016],[2017]
1,"{'max_depth': 20, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 100}",2,44041.902449,127665.602127,43427.981254,104554.470152,[2016],[2017]
2,"{'max_depth': 90, 'max_features': 'auto', 'min_samples_split': 5, 'n_estimators': 10}",3,59612.486757,127746.430062,61183.415099,102516.136374,[2016],[2017]
3,"{'max_depth': 90, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 500}",4,43127.324012,128123.629555,42026.868409,104256.058537,[2016],[2017]
4,"{'max_depth': 80, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 100}",5,43229.531541,128154.491162,43911.694293,104082.710451,[2016],[2017]
5,"{'max_depth': 30, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 500}",6,43043.759922,128161.657638,43034.504743,104600.713121,[2016],[2017]
6,"{'max_depth': 70, 'max_features': 'auto', 'min_samples_split': 5, 'n_estimators': 100}",7,52664.443199,128237.753578,52144.727135,104818.162087,[2016],[2017]
7,"{'max_depth': 30, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 100}",8,43357.767153,128282.620947,42390.119513,104062.332791,[2016],[2017]
8,"{'max_depth': 100, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 100}",9,44437.210332,128314.627669,42822.018062,105066.934137,[2016],[2017]
9,"{'max_depth': 70, 'max_features': 'auto', 'min_samples_split': 2, 'n_estimators': 500}",10,43010.922399,128348.306728,42896.801813,103231.367972,[2016],[2017]


In [None]:
boost = forward_chaining(df, FEATURES, TARGETS, MODELS['Boosting'],
                          GRID['Boosting'], NUM_VARS, CAT_VARS) 

TRAIN YEARS:  [2016]
TEST YEARS:  [2017]
ALL FOLDS Time Elapsed: 1:56:12.811048
TRAIN YEARS:  [2016, 2017]
TEST YEARS:  [2018]
ALL FOLDS Time Elapsed: 3:46:10.218008
TRAIN YEARS:  [2016, 2017, 2018]
TEST YEARS:  [2019]


In [None]:
boost