In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
import pandas as pd
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
pd.set_option('display.max_colwidth', 10000)

In [2]:
from sklearn import datasets
from sklearn import linear_model, ensemble, tree
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
import datetime

In [3]:
df = pd.read_csv("clean_data.csv")

In [4]:
NUM_VARS = ['prev_year_no_of_sales', 'prev_year_avg_age',
       'prev_year_price_p_sf', 'prev_year_price_p_house', 'prev_year_avg_sf',
       'Total Population', 'Median Age', 'Median HH Income',
       'Total Housing Units', 'Median Number of Rooms', 'Median Year Built',
       'Median Gross Rent', 'Mean HH Size', 'Percent White', 'Percent Black',
       'Percent HH with Children', 'Percent Housing Vacant',
       'crime_count', 'crimes_per_capita', 'distance_miles']

CAT_VARS = ['pri_neigh', 'sec_neigh', 'side', 'station_name', 'public_schools']

FEATURES = ['Norm prev_year_no_of_sales',
 'Norm prev_year_avg_age',
 'Norm prev_year_price_p_sf',
 'Norm prev_year_price_p_house',
 'Norm prev_year_avg_sf',
 'Norm Total Population',
 'Norm Median Age',
 'Norm Median HH Income',
 'Norm Total Housing Units',
 'Norm Median Number of Rooms',
 'Norm Median Year Built',
 'Norm Median Gross Rent',
 'Norm Mean HH Size',
 'Norm Percent White',
 'Norm Percent Black',
 'Norm Percent HH with Children',
 'Norm Percent Housing Vacant',
 'Norm crime_count',
 'Norm crimes_per_capita',
 'Norm distance_miles',
 'pri_neigh_Albany Park',
 'pri_neigh_Andersonville',
 'pri_neigh_Archer Heights',
 'pri_neigh_Armour Square',
 'pri_neigh_Ashburn',
 'pri_neigh_Auburn Gresham',
 'pri_neigh_Austin',
 'pri_neigh_Avalon Park',
 'pri_neigh_Avondale',
 'pri_neigh_Belmont Cragin',
 'pri_neigh_Beverly',
 'pri_neigh_Boystown',
 'pri_neigh_Bridgeport',
 'pri_neigh_Brighton Park',
 'pri_neigh_Bucktown',
 'pri_neigh_Burnside',
 'pri_neigh_Calumet Heights',
 'pri_neigh_Chatham',
 'pri_neigh_Chicago Lawn',
 'pri_neigh_Chinatown',
 'pri_neigh_Clearing',
 'pri_neigh_Douglas',
 'pri_neigh_Dunning',
 'pri_neigh_East Side',
 'pri_neigh_East Village',
 'pri_neigh_Edgewater',
 'pri_neigh_Edison Park',
 'pri_neigh_Englewood',
 'pri_neigh_Fuller Park',
 'pri_neigh_Gage Park',
 'pri_neigh_Galewood',
 'pri_neigh_Garfield Park',
 'pri_neigh_Garfield Ridge',
 'pri_neigh_Gold Coast',
 'pri_neigh_Grand Boulevard',
 'pri_neigh_Grand Crossing',
 'pri_neigh_Greektown',
 'pri_neigh_Hegewisch',
 'pri_neigh_Hermosa',
 'pri_neigh_Humboldt Park',
 'pri_neigh_Hyde Park',
 'pri_neigh_Irving Park',
 'pri_neigh_Jefferson Park',
 'pri_neigh_Kenwood',
 'pri_neigh_Lake View',
 'pri_neigh_Lincoln Park',
 'pri_neigh_Lincoln Square',
 'pri_neigh_Little Italy, UIC',
 'pri_neigh_Little Village',
 'pri_neigh_Logan Square',
 'pri_neigh_Loop',
 'pri_neigh_Lower West Side',
 'pri_neigh_Mckinley Park',
 'pri_neigh_Montclare',
 'pri_neigh_Morgan Park',
 'pri_neigh_Mount Greenwood',
 'pri_neigh_Near South Side',
 'pri_neigh_New City',
 'pri_neigh_North Center',
 'pri_neigh_North Lawndale',
 'pri_neigh_North Park',
 'pri_neigh_Norwood Park',
 "pri_neigh_O'Hare",
 'pri_neigh_Oakland',
 'pri_neigh_Old Town',
 'pri_neigh_Portage Park',
 'pri_neigh_Printers Row',
 'pri_neigh_Pullman',
 'pri_neigh_River North',
 'pri_neigh_Riverdale',
 'pri_neigh_Rogers Park',
 'pri_neigh_Roseland',
 'pri_neigh_Rush & Division',
 'pri_neigh_Sauganash,Forest Glen',
 'pri_neigh_Sheffield & DePaul',
 'pri_neigh_South Chicago',
 'pri_neigh_South Deering',
 'pri_neigh_South Shore',
 'pri_neigh_Streeterville',
 'pri_neigh_Ukrainian Village',
 'pri_neigh_United Center',
 'pri_neigh_Uptown',
 'pri_neigh_Washington Heights',
 'pri_neigh_Washington Park',
 'pri_neigh_West Elsdon',
 'pri_neigh_West Lawn',
 'pri_neigh_West Loop',
 'pri_neigh_West Pullman',
 'pri_neigh_West Ridge',
 'pri_neigh_West Town',
 'pri_neigh_Wicker Park',
 'pri_neigh_Woodlawn',
 'pri_neigh_Wrigleyville',
 'side_Central',
 'side_North',
 'side_South',
 'side_West',
 'station_name_18th',
 'station_name_35th-Bronzeville-IIT',
 'station_name_35th/Archer',
 'station_name_43rd',
 'station_name_47th',
 'station_name_51st',
 'station_name_63rd',
 'station_name_69th',
 'station_name_79th',
 'station_name_87th',
 'station_name_95th/Dan Ryan',
 'station_name_Addison',
 'station_name_Argyle',
 'station_name_Armitage',
 'station_name_Ashland',
 'station_name_Ashland/63rd',
 'station_name_Austin',
 'station_name_Belmont',
 'station_name_Berwyn',
 'station_name_Bryn Mawr',
 'station_name_California',
 'station_name_Central',
 'station_name_Central Park',
 'station_name_Cermak-Chinatown',
 'station_name_Cermak-McCormick Place',
 'station_name_Chicago',
 'station_name_Cicero',
 'station_name_Clark/Division',
 'station_name_Clinton',
 'station_name_Conservatory',
 'station_name_Cottage Grove',
 'station_name_Cumberland',
 'station_name_Damen',
 'station_name_Diversey',
 'station_name_Division',
 'station_name_Francisco',
 'station_name_Fullerton',
 'station_name_Garfield',
 'station_name_Grand',
 'station_name_Granville',
 'station_name_Halsted',
 'station_name_Harlem',
 'station_name_Harlem/Lake',
 'station_name_Harrison',
 'station_name_Howard',
 'station_name_Illinois Medical District',
 'station_name_Indiana',
 'station_name_Irving Park',
 'station_name_Jarvis',
 'station_name_Jefferson Park',
 'station_name_Kedzie',
 'station_name_Kedzie-Homan',
 'station_name_Kimball',
 'station_name_King Drive',
 'station_name_Kostner',
 'station_name_LaSalle',
 'station_name_Laramie',
 'station_name_Lawrence',
 'station_name_Logan Square',
 'station_name_Loyola',
 'station_name_Merchandise Mart',
 'station_name_Midway',
 'station_name_Monroe',
 'station_name_Montrose',
 'station_name_Morgan',
 'station_name_Morse',
 'station_name_North/Clybourn',
 'station_name_Oak Park',
 'station_name_Oakton-Skokie',
 'station_name_Paulina',
 'station_name_Polk',
 'station_name_Pulaski',
 'station_name_Quincy/Wells',
 'station_name_Racine',
 'station_name_Ridgeland',
 'station_name_Rockwell',
 'station_name_Roosevelt',
 'station_name_Sedgwick',
 'station_name_Sheridan',
 'station_name_South Boulevard',
 'station_name_Southport',
 'station_name_Sox-35th',
 'station_name_State/Lake',
 'station_name_Thorndale',
 'station_name_UIC-Halsted',
 'station_name_Washington/Wabash',
 'station_name_Wellington',
 'station_name_Western',
 'station_name_Wilson',
 'public_schools_0.0',
 'public_schools_1.0',
 'public_schools_2.0',
 'public_schools_3.0',
 'public_schools_4.0',
 'public_schools_5.0']

In [5]:
def process_bool_and_missing(train_df, test_df, features):
    '''
    Purpose: apply filters to numeric features in the df

    Inputs:
        df (dataframe)
        filter_info (dict): of the form {'column_name': ['value1', 'value2']}

    Returns: (dataframe) filtered dataframe,
      or None if a specified column does not exist
    '''
    
    train = train_df
    test = test_df
    
    for f in features:
        if train[f].dtype == 'bool':
            #print(f, "is bool, converting to int")
            train[f] = train[f].astype(int)
            test[f] = test[f].astype(int)
            
        if train[f].dtype in ('float64', 'int64'):   
            #print(f, "training data's mean:", train[f].mean(),
                #"will replace missing values of", f)
            train[f][train[f].isna()] = train[f].mean()
            test[f][test[f].isna()] = train[f].mean()

    return train, test


def normalize_features(train_df, test_df, features):
    '''
    Purpose: normalize the set of features listed, using training set
    mean and standard deviation

    Inputs:
    train, test (df): train and test sets

    Returns: modify the existing train and test sets with new normalized 
    variables as new variables
    '''
    
    train = train_df
    test = test_df
    
    for feature in features:
        scaler = StandardScaler()
        scaler.fit(pd.DataFrame(train.loc[:, feature]))
        n_feature = 'Norm ' + feature
        train[n_feature] = scaler.transform(pd.DataFrame(train.loc[:, feature]))
        test[n_feature] = scaler.transform(pd.DataFrame(test.loc[:, feature]))

    return train, test


def one_hot_encoding_features(train_df, test_df, features, prefix):
    '''
    Purpose: Encode categorical variables

    Inputs:
    train, test (df): train and test sets
    features (list): list of features to encode

    Returns: modify the existing train and test sets

    '''

    train = pd.get_dummies(train_df, columns = features, prefix = prefix)
    test = pd.get_dummies(test_df, columns = features, prefix = prefix)
    
    for v in test.columns:
        if v not in train.columns:
            test = test.drop(columns=[v])
        
    for v in train.columns:
            if v not in test.columns:
                test[v] = 0

    return train, test


def prepare_train_test(train_df, test_df, num, cat):
    train, test = process_bool_and_missing(train_df, test_df, num)
    train, test = normalize_features(train, test, num)
    train, test = one_hot_encoding_features(train, test, cat, cat)

    return train, test


def temporal_train_test_split(df, train_yr, test_yr, num, cat):
    
    train = df.loc[df.Year.isin(train_yr), :]
    test = df.loc[df.Year.isin(test_yr), :]
    #print('TRAIN: ', train_yr, 'TEST: ', test_yr)
    #print('Training size: ', train.shape) 
    #print('Testing size: ', test.shape) 
    
    train, test = prepare_train_test(train, test, num, cat)
    
    return train, test

In [6]:
### Build Classifiers
def build_classifiers(df, features, targets, model, params):
    '''
    Purpose: Apply machine learning model to training data

    Inputs:
    train_features (df): dataframe of training features data
    train_targets (array): 1d array of training target data
    model: class of model to fit
    params: params of the model

    Returns: model object
    '''
    
    # Begin timer 
    start = datetime.datetime.now()
       
    # Create model 
    print("Training model:", model, "|", params) 
    model.set_params(**params)
            
    # Fit model on training set 
    train_features = df[features]
    train_targets = df[targets]
    model.fit(train_features, train_targets)
                
    # End timer
    stop = datetime.datetime.now()
    print("1 Fold Time Elapsed:", stop - start)  

    return model  


### Evaluate Classifiers
def evaluate_classifiers(df, features, targets, model):
    '''
    Purpose: Evaluate a built model on some data using 
    sklearn built in mean accuracy score

    Inputs:
    features (df): dataframe of features data
    targets (array): 1d array of target data
    model: model object built from previous steps

    Returns: (float) mean accuracy score
    '''    

    # Predict on features 
    model.predict(df[features])
            
    # Evaluate predictions 
    score = model.score(df[features], df[targets])
            
    return score

In [7]:
### K-Fold Cross validation 
def k_fold_CV(train_df, features, targets, model_class, params, cv=5, scoring='r2'):
    # Begin timer 
    start = datetime.datetime.now()
    
    grid_model = GridSearchCV(
                    estimator=model_class, 
                    param_grid=params, 
                    cv=cv,
                    scoring=scoring,
                    return_train_score=True,
                    refit=True)
       
    grid_model_results = grid_model.fit(train_df[features], train_df[targets])
    ranked_cv = pd.DataFrame(grid_model.cv_results_).sort_values(by=['rank_test_score'])
    cv_results = ranked_cv[['params','rank_test_score', 'mean_train_score', 'mean_test_score']]
    
    # End timer
    stop = datetime.datetime.now()
    print("ALL FOLDS Time Elapsed:", stop - start)  
    
    return cv_results, grid_model

In [8]:
### Loop through multiple years
def forward_chaining(df, features, targets, model_class, params, num, cat):
    # Begin timer 
    start = datetime.datetime.now()
    
    year_list = df['Year'].unique().tolist()
    results = pd.DataFrame()

    # splits the dataframe into train/test set using forward chaining
    for idx, yr in enumerate(year_list[:-1]):
        train_yr = year_list[:idx+1]
        test_yr = [year_list[idx+1]]
        print("TRAIN YEARS: ", train_yr)
        
        train, test = temporal_train_test_split(df, train_yr, test_yr, num, cat)

        cv_results, grid_model = k_fold_CV(train, features, targets, model_class, params)
        
        cv_results = cv_results[cv_results['rank_test_score'] <= 1].rename(
            columns={"mean_test_score": "mean_test_CV_score", "mean_train_score": "mean_train_CV_score"})

        train_mean_score = evaluate_classifiers(train, features, targets, grid_model)
        test_mean_score = evaluate_classifiers(test, features, targets, grid_model)

        cv_results['Model'] = model_class
        cv_results['Train Years'] = str(train_yr)
        cv_results['Test Years'] = str(test_yr)
        cv_results['train_mean_score'] = train_mean_score
        cv_results['test_mean_score'] = test_mean_score
        
        # Store results in results
        results = results.append(cv_results, ignore_index=True)
    
    # End timer
    stop = datetime.datetime.now()
    print("ALL TEMPORAL SPLITS Time Elapsed:", stop - start)  
    
    return results

In [9]:
# Config: Dictionaries of models and hyperparameters
MODELS = {
    'LinearRegression': linear_model.LinearRegression(), 
    'Ridge': linear_model.Ridge(),
    'Lasso': linear_model.Lasso(), 
    'ElasticNet': linear_model.ElasticNet(),
    'DecisionTree': tree.DecisionTreeRegressor(),
    'RandomForest': ensemble.RandomForestRegressor(),
    'Boosting': ensemble.GradientBoostingRegressor()
}

GRID = {
    'LinearRegression': {},
    
    'Lasso': {'max_iter': [10000], 'random_state': [0],
              'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]},
    
    'Ridge': {'max_iter': [10000], 'random_state': [0],
              'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]},
    
    'ElasticNet': {'max_iter': [10000], 'random_state': [0],
                   'alpha': [0.01, 0.1, 1, 10, 100, 1000],
                   'l1_ratio': [0, 0.2, 0.4, 0.6, 0.8, 1]},
    
    'DecisionTree': {'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                     'max_features': ['auto', 'log2', 'sqrt'],
                     'min_samples_leaf': [1, 2, 4],
                     'min_samples_split': [2, 5, 10]},
    
    'RandomForest': {'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                     'max_features': ['auto', 'log2', 'sqrt'],
                     'min_samples_leaf': [1, 2, 4],
                     'min_samples_split': [2, 5, 10]},
    
    'Boosting': {"learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
                 "min_samples_split": np.linspace(0.1, 0.5, 12),
                 "min_samples_leaf": np.linspace(0.1, 0.5, 12),
                 "max_depth":[3, 5, 8],
                 "max_features":["log2","sqrt"],
                 "criterion": ["friedman_mse",  "mae"],
                 "subsample": [0.5, 0.618, 0.8, 0.85, 0.9, 0.95, 1.0],
                 "n_estimators": [10]}
} 

In [10]:
def grid_search(df, models, grid, features, targets, num, cat):
    # Begin timer 
    start = datetime.datetime.now()
    results = pd.DataFrame()
    
    for model_key in models.keys(): 
        model_class = models[model_key]
        print(model_key)
        params = grid[model_key]
        print(params)
        
        results.append(forward_chaining(df, features, targets, model_class, params, num, cat))            

    # End timer
    stop = datetime.datetime.now()
    print("TOTAL TRAINING Time Elapsed:", stop - start)    
    
    return results

In [14]:
TARGETS = 'price_p_sf'

In [15]:
#r = grid_search(df, MODELS, GRID, FEATURES, TARGETS, NUM_VARS, CAT_VARS)

In [12]:
ln = forward_chaining(df, FEATURES, TARGETS, linear_model.LinearRegression(), {}, NUM_VARS, CAT_VARS)   

TRAIN YEARS:  [2014]




ALL FOLDS Time Elapsed: 0:00:00.156163
TRAIN YEARS:  [2014, 2015]




ALL FOLDS Time Elapsed: 0:00:00.239414
TRAIN YEARS:  [2014, 2015, 2016]




ALL FOLDS Time Elapsed: 0:00:00.319715
TRAIN YEARS:  [2014, 2015, 2016, 2017]




ALL FOLDS Time Elapsed: 0:00:00.410462
TRAIN YEARS:  [2014, 2015, 2016, 2017, 2018]
ALL FOLDS Time Elapsed: 0:00:00.472298
ALL TEMPORAL SPLITS Time Elapsed: 0:00:25.057833


In [13]:
ln

Unnamed: 0,params,rank_test_score,mean_train_CV_score,mean_test_CV_score,Model,Train Years,Test Years,train_mean_score,test_mean_score
0,{},1,0.818948,-1.188224e+23,"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n normalize=False)",[2014],[2015],0.790161,0.676262
1,{},1,0.765356,-1.135634e+24,"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n normalize=False)","[2014, 2015]",[2016],0.739858,0.703754
2,{},1,0.76483,-1.712271e+23,"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n normalize=False)","[2014, 2015, 2016]",[2017],0.737013,0.774041
3,{},1,0.772619,-2.317124e+22,"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n normalize=False)","[2014, 2015, 2016, 2017]",[2018],0.746323,0.797976
4,{},1,0.778325,-4.699478e+22,"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n normalize=False)","[2014, 2015, 2016, 2017, 2018]",[2019],0.756984,0.788879


In [14]:
rid = forward_chaining(df, FEATURES, TARGETS, linear_model.Ridge(), {'max_iter': [1000], 'random_state': [0],
              'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}, NUM_VARS, CAT_VARS)   

TRAIN YEARS:  [2014]




ALL FOLDS Time Elapsed: 0:00:00.376665
TRAIN YEARS:  [2014, 2015]




ALL FOLDS Time Elapsed: 0:00:00.576172
TRAIN YEARS:  [2014, 2015, 2016]




ALL FOLDS Time Elapsed: 0:00:00.830539
TRAIN YEARS:  [2014, 2015, 2016, 2017]
ALL FOLDS Time Elapsed: 0:00:01.038418
TRAIN YEARS:  [2014, 2015, 2016, 2017, 2018]
ALL FOLDS Time Elapsed: 0:00:01.201408
ALL TEMPORAL SPLITS Time Elapsed: 0:00:28.810368


In [15]:
rid

Unnamed: 0,params,rank_test_score,mean_train_CV_score,mean_test_CV_score,Model,Train Years,Test Years,train_mean_score,test_mean_score
0,"{'alpha': 10, 'max_iter': 1000, 'random_state': 0}",1,0.809471,-1.510088,"Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,\n normalize=False, random_state=None, solver='auto', tol=0.001)",[2014],[2015],0.782154,0.679778
1,"{'alpha': 10, 'max_iter': 1000, 'random_state': 0}",1,0.760442,-1.081328,"Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,\n normalize=False, random_state=None, solver='auto', tol=0.001)","[2014, 2015]",[2016],0.736288,0.712912
2,"{'alpha': 10, 'max_iter': 1000, 'random_state': 0}",1,0.761054,-1.357256,"Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,\n normalize=False, random_state=None, solver='auto', tol=0.001)","[2014, 2015, 2016]",[2017],0.73492,0.775508
3,"{'alpha': 10, 'max_iter': 1000, 'random_state': 0}",1,0.769818,-0.870922,"Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,\n normalize=False, random_state=None, solver='auto', tol=0.001)","[2014, 2015, 2016, 2017]",[2018],0.744878,0.802383
4,"{'alpha': 10, 'max_iter': 1000, 'random_state': 0}",1,0.776134,-0.66354,"Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,\n normalize=False, random_state=None, solver='auto', tol=0.001)","[2014, 2015, 2016, 2017, 2018]",[2019],0.755941,0.788358


In [16]:
las = forward_chaining(df, FEATURES, TARGETS, linear_model.Lasso(), {'max_iter': [1000], 'random_state': [0],
              'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}, NUM_VARS, CAT_VARS)   

TRAIN YEARS:  [2014]




ALL FOLDS Time Elapsed: 0:00:03.030022
TRAIN YEARS:  [2014, 2015]




ALL FOLDS Time Elapsed: 0:00:08.516211
TRAIN YEARS:  [2014, 2015, 2016]




ALL FOLDS Time Elapsed: 0:00:15.428023
TRAIN YEARS:  [2014, 2015, 2016, 2017]




ALL FOLDS Time Elapsed: 0:00:17.743797
TRAIN YEARS:  [2014, 2015, 2016, 2017, 2018]




ALL FOLDS Time Elapsed: 0:00:20.661713
ALL TEMPORAL SPLITS Time Elapsed: 0:01:30.281903


In [17]:
las

Unnamed: 0,params,rank_test_score,mean_train_CV_score,mean_test_CV_score,Model,Train Years,Test Years,train_mean_score,test_mean_score
0,"{'alpha': 10, 'max_iter': 1000, 'random_state': 0}",1,0.782305,-0.70099,"Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,\n normalize=False, positive=False, precompute=False, random_state=None,\n selection='cyclic', tol=0.0001, warm_start=False)",[2014],[2015],0.749277,0.655555
1,"{'alpha': 10, 'max_iter': 1000, 'random_state': 0}",1,0.728687,-0.59677,"Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,\n normalize=False, positive=False, precompute=False, random_state=None,\n selection='cyclic', tol=0.0001, warm_start=False)","[2014, 2015]",[2016],0.706655,0.703544
2,"{'alpha': 10, 'max_iter': 1000, 'random_state': 0}",1,0.730012,-0.91969,"Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,\n normalize=False, positive=False, precompute=False, random_state=None,\n selection='cyclic', tol=0.0001, warm_start=False)","[2014, 2015, 2016]",[2017],0.70818,0.760765
3,"{'alpha': 10, 'max_iter': 1000, 'random_state': 0}",1,0.739608,-0.573897,"Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,\n normalize=False, positive=False, precompute=False, random_state=None,\n selection='cyclic', tol=0.0001, warm_start=False)","[2014, 2015, 2016, 2017]",[2018],0.718275,0.791188
4,"{'alpha': 10, 'max_iter': 1000, 'random_state': 0}",1,0.74887,-0.309108,"Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,\n normalize=False, positive=False, precompute=False, random_state=None,\n selection='cyclic', tol=0.0001, warm_start=False)","[2014, 2015, 2016, 2017, 2018]",[2019],0.73114,0.782647


In [18]:
ela = forward_chaining(df, FEATURES, TARGETS, linear_model.ElasticNet(), {'max_iter': [1000], 'random_state': [0],
                   'alpha': [0.01, 0.1, 1, 10, 100, 1000],
                   'l1_ratio': [0, 0.2, 0.4, 0.6, 0.8, 1]}, NUM_VARS, CAT_VARS)   

TRAIN YEARS:  [2014]






ALL FOLDS Time Elapsed: 0:00:13.706697
TRAIN YEARS:  [2014, 2015]






ALL FOLDS Time Elapsed: 0:00:40.417742
TRAIN YEARS:  [2014, 2015, 2016]






ALL FOLDS Time Elapsed: 0:01:10.402643
TRAIN YEARS:  [2014, 2015, 2016, 2017]






ALL FOLDS Time Elapsed: 0:01:22.095864
TRAIN YEARS:  [2014, 2015, 2016, 2017, 2018]






ALL FOLDS Time Elapsed: 0:01:33.917145
ALL TEMPORAL SPLITS Time Elapsed: 0:05:25.770539


In [19]:
ela

Unnamed: 0,params,rank_test_score,mean_train_CV_score,mean_test_CV_score,Model,Train Years,Test Years,train_mean_score,test_mean_score
0,"{'alpha': 10, 'l1_ratio': 1, 'max_iter': 1000, 'random_state': 0}",1,0.782305,-0.70099,"ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,\n max_iter=1000, normalize=False, positive=False, precompute=False,\n random_state=None, selection='cyclic', tol=0.0001, warm_start=False)",[2014],[2015],0.749277,0.655555
1,"{'alpha': 10, 'l1_ratio': 1, 'max_iter': 1000, 'random_state': 0}",1,0.728687,-0.59677,"ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,\n max_iter=1000, normalize=False, positive=False, precompute=False,\n random_state=None, selection='cyclic', tol=0.0001, warm_start=False)","[2014, 2015]",[2016],0.706655,0.703544
2,"{'alpha': 10, 'l1_ratio': 1, 'max_iter': 1000, 'random_state': 0}",1,0.730012,-0.91969,"ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,\n max_iter=1000, normalize=False, positive=False, precompute=False,\n random_state=None, selection='cyclic', tol=0.0001, warm_start=False)","[2014, 2015, 2016]",[2017],0.70818,0.760765
3,"{'alpha': 10, 'l1_ratio': 1, 'max_iter': 1000, 'random_state': 0}",1,0.739608,-0.573897,"ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,\n max_iter=1000, normalize=False, positive=False, precompute=False,\n random_state=None, selection='cyclic', tol=0.0001, warm_start=False)","[2014, 2015, 2016, 2017]",[2018],0.718275,0.791188
4,"{'alpha': 10, 'l1_ratio': 1, 'max_iter': 1000, 'random_state': 0}",1,0.74887,-0.309108,"ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,\n max_iter=1000, normalize=False, positive=False, precompute=False,\n random_state=None, selection='cyclic', tol=0.0001, warm_start=False)","[2014, 2015, 2016, 2017, 2018]",[2019],0.73114,0.782647


In [20]:
rf = forward_chaining(df, FEATURES, TARGETS, ensemble.RandomForestRegressor(), {'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                     'max_features': ['auto', 'log2', 'sqrt'],
                     'min_samples_leaf': [1, 2, 4],
                     'min_samples_split': [2, 5, 10],
                     'n_estimators': [10]}, NUM_VARS, CAT_VARS) 

TRAIN YEARS:  [2014]






















































































ALL FOLDS Time Elapsed: 0:03:31.909889
TRAIN YEARS:  [2014, 2015]


























































































ALL FOLDS Time Elapsed: 0:08:18.881823
TRAIN YEARS:  [2014, 2015, 2016]


























































































ALL FOLDS Time Elapsed: 0:11:53.523877
TRAIN YEARS:  [2014, 2015, 2016, 2017]


























































































ALL FOLDS Time Elapsed: 0:17:47.505420
TRAIN YEARS:  [2014, 2015, 2016, 2017, 2018]


























































































ALL FOLDS Time Elapsed: 0:21:54.160296
ALL TEMPORAL SPLITS Time Elapsed: 1:03:59.609984


In [21]:
rf

Unnamed: 0,params,rank_test_score,mean_train_CV_score,mean_test_CV_score,Model,Train Years,Test Years,train_mean_score,test_mean_score
0,"{'max_depth': 40, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 5}",1,0.858538,0.662995,"RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n max_features='auto', max_leaf_nodes=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,\n oob_score=False, random_state=None, verbose=0, warm_start=False)",[2014],[2015],0.856522,0.734477
1,"{'max_depth': 90, 'max_features': 'auto', 'min_samples_leaf': 2, 'min_samples_split': 2}",1,0.90681,0.622157,"RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n max_features='auto', max_leaf_nodes=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,\n oob_score=False, random_state=None, verbose=0, warm_start=False)","[2014, 2015]",[2016],0.910332,0.83625
2,"{'max_depth': 30, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2}",1,0.95861,0.664019,"RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n max_features='auto', max_leaf_nodes=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,\n oob_score=False, random_state=None, verbose=0, warm_start=False)","[2014, 2015, 2016]",[2017],0.962942,0.841596
3,"{'max_depth': 20, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2}",1,0.96893,0.713889,"RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n max_features='auto', max_leaf_nodes=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,\n oob_score=False, random_state=None, verbose=0, warm_start=False)","[2014, 2015, 2016, 2017]",[2018],0.973454,0.864366
4,"{'max_depth': 70, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10}",1,0.893142,0.715957,"RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n max_features='auto', max_leaf_nodes=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,\n oob_score=False, random_state=None, verbose=0, warm_start=False)","[2014, 2015, 2016, 2017, 2018]",[2019],0.87926,0.789399


In [23]:
boost = forward_chaining(df, FEATURES, TARGETS, ensemble.GradientBoostingRegressor(), {"learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
                 "min_samples_split": np.linspace(0.1, 0.5, 3),
                 "min_samples_leaf": np.linspace(0.1, 0.5, 3),
                 "max_depth": [3, 5, 8],
                 "max_features": ["auto", "log2", "sqrt"],
                 "criterion": ["friedman_mse"],
                 "subsample": [0.5, 0.75, 1.0],
                 "n_estimators": [10]}, NUM_VARS, CAT_VARS) 

TRAIN YEARS:  [2014]




ALL FOLDS Time Elapsed: 0:03:45.610793
TRAIN YEARS:  [2014, 2015]




ALL FOLDS Time Elapsed: 0:07:05.134863
TRAIN YEARS:  [2014, 2015, 2016]




ALL FOLDS Time Elapsed: 0:10:56.948446
TRAIN YEARS:  [2014, 2015, 2016, 2017]




ALL FOLDS Time Elapsed: 0:17:27.476921
TRAIN YEARS:  [2014, 2015, 2016, 2017, 2018]
ALL FOLDS Time Elapsed: 0:23:35.568734
ALL TEMPORAL SPLITS Time Elapsed: 1:03:17.935390


In [24]:
boost

Unnamed: 0,params,rank_test_score,mean_train_CV_score,mean_test_CV_score,Model,Train Years,Test Years,train_mean_score,test_mean_score
0,"{'criterion': 'friedman_mse', 'learning_rate': 0.2, 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 0.1, 'min_samples_split': 0.30000000000000004, 'n_estimators': 10, 'subsample': 1.0}",1,0.397684,-2.944447,"GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n learning_rate=0.1, loss='ls', max_depth=3, max_features=None,\n max_leaf_nodes=None, min_impurity_decrease=0.0,\n min_impurity_split=None, min_samples_leaf=1,\n min_samples_split=2, min_weight_fraction_leaf=0.0,\n n_estimators=100, n_iter_no_change=None, presort='auto',\n random_state=None, subsample=1.0, tol=0.0001,\n validation_fraction=0.1, verbose=0, warm_start=False)",[2014],[2015],0.391688,0.347288
1,"{'criterion': 'friedman_mse', 'learning_rate': 0.2, 'max_depth': 3, 'max_features': 'auto', 'min_samples_leaf': 0.1, 'min_samples_split': 0.1, 'n_estimators': 10, 'subsample': 1.0}",1,0.385294,-2.279337,"GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n learning_rate=0.1, loss='ls', max_depth=3, max_features=None,\n max_leaf_nodes=None, min_impurity_decrease=0.0,\n min_impurity_split=None, min_samples_leaf=1,\n min_samples_split=2, min_weight_fraction_leaf=0.0,\n n_estimators=100, n_iter_no_change=None, presort='auto',\n random_state=None, subsample=1.0, tol=0.0001,\n validation_fraction=0.1, verbose=0, warm_start=False)","[2014, 2015]",[2016],0.374486,0.400796
2,"{'criterion': 'friedman_mse', 'learning_rate': 0.2, 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 0.1, 'min_samples_split': 0.1, 'n_estimators': 10, 'subsample': 1.0}",1,0.397747,-2.046747,"GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n learning_rate=0.1, loss='ls', max_depth=3, max_features=None,\n max_leaf_nodes=None, min_impurity_decrease=0.0,\n min_impurity_split=None, min_samples_leaf=1,\n min_samples_split=2, min_weight_fraction_leaf=0.0,\n n_estimators=100, n_iter_no_change=None, presort='auto',\n random_state=None, subsample=1.0, tol=0.0001,\n validation_fraction=0.1, verbose=0, warm_start=False)","[2014, 2015, 2016]",[2017],0.386377,0.449639
3,"{'criterion': 'friedman_mse', 'learning_rate': 0.2, 'max_depth': 3, 'max_features': 'auto', 'min_samples_leaf': 0.1, 'min_samples_split': 0.1, 'n_estimators': 10, 'subsample': 1.0}",1,0.413288,-1.572094,"GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n learning_rate=0.1, loss='ls', max_depth=3, max_features=None,\n max_leaf_nodes=None, min_impurity_decrease=0.0,\n min_impurity_split=None, min_samples_leaf=1,\n min_samples_split=2, min_weight_fraction_leaf=0.0,\n n_estimators=100, n_iter_no_change=None, presort='auto',\n random_state=None, subsample=1.0, tol=0.0001,\n validation_fraction=0.1, verbose=0, warm_start=False)","[2014, 2015, 2016, 2017]",[2018],0.402514,0.483854
4,"{'criterion': 'friedman_mse', 'learning_rate': 0.2, 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 0.1, 'min_samples_split': 0.5, 'n_estimators': 10, 'subsample': 1.0}",1,0.429661,-1.143361,"GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n learning_rate=0.1, loss='ls', max_depth=3, max_features=None,\n max_leaf_nodes=None, min_impurity_decrease=0.0,\n min_impurity_split=None, min_samples_leaf=1,\n min_samples_split=2, min_weight_fraction_leaf=0.0,\n n_estimators=100, n_iter_no_change=None, presort='auto',\n random_state=None, subsample=1.0, tol=0.0001,\n validation_fraction=0.1, verbose=0, warm_start=False)","[2014, 2015, 2016, 2017, 2018]",[2019],0.418118,0.505604


In [12]:
TARGETS = 'price_p_house'

In [17]:
#r = grid_search(df, MODELS, GRID, FEATURES, TARGETS, NUM_VARS, CAT_VARS)

In [18]:
ln = forward_chaining(df, FEATURES, TARGETS, linear_model.LinearRegression(), {}, NUM_VARS, CAT_VARS)   

TRAIN YEARS:  [2014]




ALL FOLDS Time Elapsed: 0:00:00.554248
TRAIN YEARS:  [2014, 2015]




ALL FOLDS Time Elapsed: 0:00:00.953561
TRAIN YEARS:  [2014, 2015, 2016]




ALL FOLDS Time Elapsed: 0:00:01.507644
TRAIN YEARS:  [2014, 2015, 2016, 2017]




ALL FOLDS Time Elapsed: 0:00:00.957846
TRAIN YEARS:  [2014, 2015, 2016, 2017, 2018]
ALL FOLDS Time Elapsed: 0:00:01.720600
ALL TEMPORAL SPLITS Time Elapsed: 0:00:33.332213


In [19]:
ln

Unnamed: 0,params,rank_test_score,mean_train_CV_score,mean_test_CV_score,Model,Train Years,Test Years,train_mean_score,test_mean_score
0,{},1,0.634876,-2.535861e+21,"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n normalize=False)",[2014],[2015],0.585634,0.639241
1,{},1,0.666236,-7.34262e+21,"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n normalize=False)","[2014, 2015]",[2016],0.63612,0.735081
2,{},1,0.69362,-3.094175e+22,"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n normalize=False)","[2014, 2015, 2016]",[2017],0.671494,0.74023
3,{},1,0.707405,-9.326507e+21,"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n normalize=False)","[2014, 2015, 2016, 2017]",[2018],0.68997,0.721431
4,{},1,0.71184,-8.835863e+20,"LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,\n normalize=False)","[2014, 2015, 2016, 2017, 2018]",[2019],0.699822,0.724247


In [20]:
rid = forward_chaining(df, FEATURES, TARGETS, linear_model.Ridge(), {'max_iter': [1000], 'random_state': [0],
              'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}, NUM_VARS, CAT_VARS)   

TRAIN YEARS:  [2014]




ALL FOLDS Time Elapsed: 0:00:01.064960
TRAIN YEARS:  [2014, 2015]




ALL FOLDS Time Elapsed: 0:00:02.333089
TRAIN YEARS:  [2014, 2015, 2016]
ALL FOLDS Time Elapsed: 0:00:02.394718
TRAIN YEARS:  [2014, 2015, 2016, 2017]
ALL FOLDS Time Elapsed: 0:00:02.694495
TRAIN YEARS:  [2014, 2015, 2016, 2017, 2018]
ALL FOLDS Time Elapsed: 0:00:03.472305
ALL TEMPORAL SPLITS Time Elapsed: 0:00:39.422055


In [21]:
rid

Unnamed: 0,params,rank_test_score,mean_train_CV_score,mean_test_CV_score,Model,Train Years,Test Years,train_mean_score,test_mean_score
0,"{'alpha': 1000, 'max_iter': 1000, 'random_state': 0}",1,0.444194,0.154528,"Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,\n normalize=False, random_state=None, solver='auto', tol=0.001)",[2014],[2015],0.414091,0.643564
1,"{'alpha': 10, 'max_iter': 1000, 'random_state': 0}",1,0.640108,0.347671,"Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,\n normalize=False, random_state=None, solver='auto', tol=0.001)","[2014, 2015]",[2016],0.614098,0.73951
2,"{'alpha': 10, 'max_iter': 1000, 'random_state': 0}",1,0.677509,0.397468,"Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,\n normalize=False, random_state=None, solver='auto', tol=0.001)","[2014, 2015, 2016]",[2017],0.658126,0.748928
3,"{'alpha': 100, 'max_iter': 1000, 'random_state': 0}",1,0.663945,0.450855,"Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,\n normalize=False, random_state=None, solver='auto', tol=0.001)","[2014, 2015, 2016, 2017]",[2018],0.651049,0.726203
4,"{'alpha': 100, 'max_iter': 1000, 'random_state': 0}",1,0.67665,0.486798,"Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,\n normalize=False, random_state=None, solver='auto', tol=0.001)","[2014, 2015, 2016, 2017, 2018]",[2019],0.668793,0.722991


In [None]:
las = forward_chaining(df, FEATURES, TARGETS, linear_model.Lasso(), {'max_iter': [1000], 'random_state': [0],
              'alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]}, NUM_VARS, CAT_VARS)   

TRAIN YEARS:  [2014]




ALL FOLDS Time Elapsed: 0:00:09.425237
TRAIN YEARS:  [2014, 2015]




ALL FOLDS Time Elapsed: 0:00:25.717115
TRAIN YEARS:  [2014, 2015, 2016]




ALL FOLDS Time Elapsed: 0:02:11.810884
TRAIN YEARS:  [2014, 2015, 2016, 2017]




ALL FOLDS Time Elapsed: 0:02:36.298673
TRAIN YEARS:  [2014, 2015, 2016, 2017, 2018]




ALL FOLDS Time Elapsed: 0:02:49.817125
ALL TEMPORAL SPLITS Time Elapsed: 0:08:43.028058


In [None]:
las

Unnamed: 0,params,rank_test_score,mean_train_CV_score,mean_test_CV_score,Model,Train Years,Test Years,train_mean_score,test_mean_score
0,"{'alpha': 1000, 'max_iter': 1000, 'random_state': 0}",1,0.522026,0.106505,"Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,\n normalize=False, positive=False, precompute=False, random_state=None,\n selection='cyclic', tol=0.0001, warm_start=False)",[2014],[2015],0.481258,0.67645
1,"{'alpha': 100, 'max_iter': 1000, 'random_state': 0}",1,0.651777,0.321803,"Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,\n normalize=False, positive=False, precompute=False, random_state=None,\n selection='cyclic', tol=0.0001, warm_start=False)","[2014, 2015]",[2016],0.62175,0.738688
2,"{'alpha': 100, 'max_iter': 1000, 'random_state': 0}",1,0.680302,0.40262,"Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,\n normalize=False, positive=False, precompute=False, random_state=None,\n selection='cyclic', tol=0.0001, warm_start=False)","[2014, 2015, 2016]",[2017],0.659128,0.748955
3,"{'alpha': 100, 'max_iter': 1000, 'random_state': 0}",1,0.695372,0.458028,"Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,\n normalize=False, positive=False, precompute=False, random_state=None,\n selection='cyclic', tol=0.0001, warm_start=False)","[2014, 2015, 2016, 2017]",[2018],0.679001,0.730499
4,"{'alpha': 100, 'max_iter': 1000, 'random_state': 0}",1,0.701075,0.493108,"Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,\n normalize=False, positive=False, precompute=False, random_state=None,\n selection='cyclic', tol=0.0001, warm_start=False)","[2014, 2015, 2016, 2017, 2018]",[2019],0.689731,0.727735


In [13]:
ela = forward_chaining(df, FEATURES, TARGETS, linear_model.ElasticNet(), {'max_iter': [1000], 'random_state': [0],
                   'alpha': [0.01, 0.1, 1, 10, 100, 1000],
                   'l1_ratio': [0, 0.2, 0.4, 0.6, 0.8, 1]}, NUM_VARS, CAT_VARS)   

TRAIN YEARS:  [2014]






ALL FOLDS Time Elapsed: 0:00:19.763484
TRAIN YEARS:  [2014, 2015]






ALL FOLDS Time Elapsed: 0:01:08.713461
TRAIN YEARS:  [2014, 2015, 2016]






ALL FOLDS Time Elapsed: 0:05:33.547939
TRAIN YEARS:  [2014, 2015, 2016, 2017]






ALL FOLDS Time Elapsed: 0:06:29.021749
TRAIN YEARS:  [2014, 2015, 2016, 2017, 2018]






ALL FOLDS Time Elapsed: 0:07:22.420944
ALL TEMPORAL SPLITS Time Elapsed: 0:21:21.709146


In [14]:
ela

Unnamed: 0,params,rank_test_score,mean_train_CV_score,mean_test_CV_score,Model,Train Years,Test Years,train_mean_score,test_mean_score
0,"{'alpha': 1, 'l1_ratio': 0.6, 'max_iter': 1000, 'random_state': 0}",1,0.455093,0.167606,"ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,\n max_iter=1000, normalize=False, positive=False, precompute=False,\n random_state=None, selection='cyclic', tol=0.0001, warm_start=False)",[2014],[2015],0.417158,0.648173
1,"{'alpha': 0.01, 'l1_ratio': 0.8, 'max_iter': 1000, 'random_state': 0}",1,0.64526,0.347684,"ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,\n max_iter=1000, normalize=False, positive=False, precompute=False,\n random_state=None, selection='cyclic', tol=0.0001, warm_start=False)","[2014, 2015]",[2016],0.615838,0.740293
2,"{'alpha': 100, 'l1_ratio': 1, 'max_iter': 1000, 'random_state': 0}",1,0.680302,0.40262,"ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,\n max_iter=1000, normalize=False, positive=False, precompute=False,\n random_state=None, selection='cyclic', tol=0.0001, warm_start=False)","[2014, 2015, 2016]",[2017],0.659128,0.748955
3,"{'alpha': 100, 'l1_ratio': 1, 'max_iter': 1000, 'random_state': 0}",1,0.695372,0.458028,"ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,\n max_iter=1000, normalize=False, positive=False, precompute=False,\n random_state=None, selection='cyclic', tol=0.0001, warm_start=False)","[2014, 2015, 2016, 2017]",[2018],0.679001,0.730499
4,"{'alpha': 100, 'l1_ratio': 1, 'max_iter': 1000, 'random_state': 0}",1,0.701075,0.493108,"ElasticNet(alpha=1.0, copy_X=True, fit_intercept=True, l1_ratio=0.5,\n max_iter=1000, normalize=False, positive=False, precompute=False,\n random_state=None, selection='cyclic', tol=0.0001, warm_start=False)","[2014, 2015, 2016, 2017, 2018]",[2019],0.689731,0.727735


In [15]:
rf = forward_chaining(df, FEATURES, TARGETS, ensemble.RandomForestRegressor(), {'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
                     'max_features': ['auto', 'log2', 'sqrt'],
                     'min_samples_leaf': [1, 2, 4],
                     'min_samples_split': [2, 5, 10],
                     'n_estimators': [10]}, NUM_VARS, CAT_VARS) 

TRAIN YEARS:  [2014]




ALL FOLDS Time Elapsed: 0:02:39.739044
TRAIN YEARS:  [2014, 2015]
ALL FOLDS Time Elapsed: 0:05:47.274969
TRAIN YEARS:  [2014, 2015, 2016]
ALL FOLDS Time Elapsed: 0:11:25.780755
TRAIN YEARS:  [2014, 2015, 2016, 2017]
ALL FOLDS Time Elapsed: 0:13:15.255625
TRAIN YEARS:  [2014, 2015, 2016, 2017, 2018]
ALL FOLDS Time Elapsed: 0:16:38.404129
ALL TEMPORAL SPLITS Time Elapsed: 0:50:19.069200


In [16]:
rf

Unnamed: 0,params,rank_test_score,mean_train_CV_score,mean_test_CV_score,Model,Train Years,Test Years,train_mean_score,test_mean_score
0,"{'max_depth': 100, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 10}",1,0.90836,0.556431,"RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n max_features='auto', max_leaf_nodes=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,\n oob_score=False, random_state=None, verbose=0, warm_start=False)",[2014],[2015],0.941208,0.197087
1,"{'max_depth': 90, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 10}",1,0.781139,0.617078,"RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n max_features='auto', max_leaf_nodes=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,\n oob_score=False, random_state=None, verbose=0, warm_start=False)","[2014, 2015]",[2016],0.776277,0.77298
2,"{'max_depth': 40, 'max_features': 'auto', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 10}",1,0.838749,0.632002,"RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n max_features='auto', max_leaf_nodes=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,\n oob_score=False, random_state=None, verbose=0, warm_start=False)","[2014, 2015, 2016]",[2017],0.82324,0.756514
3,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 10}",1,0.866749,0.650428,"RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n max_features='auto', max_leaf_nodes=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,\n oob_score=False, random_state=None, verbose=0, warm_start=False)","[2014, 2015, 2016, 2017]",[2018],0.85984,0.733458
4,"{'max_depth': 10, 'max_features': 'auto', 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 10}",1,0.840163,0.650294,"RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,\n max_features='auto', max_leaf_nodes=None,\n min_impurity_decrease=0.0, min_impurity_split=None,\n min_samples_leaf=1, min_samples_split=2,\n min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,\n oob_score=False, random_state=None, verbose=0, warm_start=False)","[2014, 2015, 2016, 2017, 2018]",[2019],0.837477,0.705727


In [17]:
boost = forward_chaining(df, FEATURES, TARGETS, ensemble.GradientBoostingRegressor(), {"learning_rate": [0.01, 0.025, 0.05, 0.075, 0.1, 0.15, 0.2],
                 "min_samples_split": np.linspace(0.1, 0.5, 3),
                 "min_samples_leaf": np.linspace(0.1, 0.5, 3),
                 "max_depth": [3, 5, 8],
                 "max_features": ["auto", "log2", "sqrt"],
                 "criterion": ["friedman_mse"],
                 "subsample": [0.5, 0.75, 1.0],
                 "n_estimators": [10]}, NUM_VARS, CAT_VARS) 

TRAIN YEARS:  [2014]




ALL FOLDS Time Elapsed: 0:03:47.307614
TRAIN YEARS:  [2014, 2015]




ALL FOLDS Time Elapsed: 0:06:59.853003
TRAIN YEARS:  [2014, 2015, 2016]




ALL FOLDS Time Elapsed: 0:10:40.834236
TRAIN YEARS:  [2014, 2015, 2016, 2017]
ALL FOLDS Time Elapsed: 0:14:10.906503
TRAIN YEARS:  [2014, 2015, 2016, 2017, 2018]
ALL FOLDS Time Elapsed: 0:18:10.728207
ALL TEMPORAL SPLITS Time Elapsed: 0:54:17.170616


In [18]:
boost

Unnamed: 0,params,rank_test_score,mean_train_CV_score,mean_test_CV_score,Model,Train Years,Test Years,train_mean_score,test_mean_score
0,"{'criterion': 'friedman_mse', 'learning_rate': 0.2, 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 0.1, 'min_samples_split': 0.1, 'n_estimators': 10, 'subsample': 0.75}",1,0.501581,0.483335,"GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n learning_rate=0.1, loss='ls', max_depth=3, max_features=None,\n max_leaf_nodes=None, min_impurity_decrease=0.0,\n min_impurity_split=None, min_samples_leaf=1,\n min_samples_split=2, min_weight_fraction_leaf=0.0,\n n_estimators=100, n_iter_no_change=None, presort='auto',\n random_state=None, subsample=1.0, tol=0.0001,\n validation_fraction=0.1, verbose=0, warm_start=False)",[2014],[2015],0.454205,0.669186
1,"{'criterion': 'friedman_mse', 'learning_rate': 0.2, 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 0.1, 'min_samples_split': 0.1, 'n_estimators': 10, 'subsample': 1.0}",1,0.586886,0.5131,"GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n learning_rate=0.1, loss='ls', max_depth=3, max_features=None,\n max_leaf_nodes=None, min_impurity_decrease=0.0,\n min_impurity_split=None, min_samples_leaf=1,\n min_samples_split=2, min_weight_fraction_leaf=0.0,\n n_estimators=100, n_iter_no_change=None, presort='auto',\n random_state=None, subsample=1.0, tol=0.0001,\n validation_fraction=0.1, verbose=0, warm_start=False)","[2014, 2015]",[2016],0.556198,0.674719
2,"{'criterion': 'friedman_mse', 'learning_rate': 0.2, 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 0.1, 'min_samples_split': 0.1, 'n_estimators': 10, 'subsample': 1.0}",1,0.619789,0.514804,"GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n learning_rate=0.1, loss='ls', max_depth=3, max_features=None,\n max_leaf_nodes=None, min_impurity_decrease=0.0,\n min_impurity_split=None, min_samples_leaf=1,\n min_samples_split=2, min_weight_fraction_leaf=0.0,\n n_estimators=100, n_iter_no_change=None, presort='auto',\n random_state=None, subsample=1.0, tol=0.0001,\n validation_fraction=0.1, verbose=0, warm_start=False)","[2014, 2015, 2016]",[2017],0.596509,0.688073
3,"{'criterion': 'friedman_mse', 'learning_rate': 0.2, 'max_depth': 5, 'max_features': 'auto', 'min_samples_leaf': 0.1, 'min_samples_split': 0.1, 'n_estimators': 10, 'subsample': 1.0}",1,0.637482,0.535094,"GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n learning_rate=0.1, loss='ls', max_depth=3, max_features=None,\n max_leaf_nodes=None, min_impurity_decrease=0.0,\n min_impurity_split=None, min_samples_leaf=1,\n min_samples_split=2, min_weight_fraction_leaf=0.0,\n n_estimators=100, n_iter_no_change=None, presort='auto',\n random_state=None, subsample=1.0, tol=0.0001,\n validation_fraction=0.1, verbose=0, warm_start=False)","[2014, 2015, 2016, 2017]",[2018],0.61798,0.699262
4,"{'criterion': 'friedman_mse', 'learning_rate': 0.2, 'max_depth': 8, 'max_features': 'auto', 'min_samples_leaf': 0.1, 'min_samples_split': 0.1, 'n_estimators': 10, 'subsample': 1.0}",1,0.648896,0.545903,"GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,\n learning_rate=0.1, loss='ls', max_depth=3, max_features=None,\n max_leaf_nodes=None, min_impurity_decrease=0.0,\n min_impurity_split=None, min_samples_leaf=1,\n min_samples_split=2, min_weight_fraction_leaf=0.0,\n n_estimators=100, n_iter_no_change=None, presort='auto',\n random_state=None, subsample=1.0, tol=0.0001,\n validation_fraction=0.1, verbose=0, warm_start=False)","[2014, 2015, 2016, 2017, 2018]",[2019],0.633147,0.707291
