### Imputing and Cleaning Zillow Rent Index Table

In [182]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns
import re

In [183]:
x2011 = pd.read_csv('zip_codes_2011.csv').drop(['do_date'], axis = 1).dropna(how = 'all', axis = 1)
x2012 = pd.read_csv('zip_codes_2012.csv').drop(['do_date'], axis = 1).dropna(how = 'all', axis = 1)
x2013 = pd.read_csv('zip_codes_2013.csv').drop(['do_date'], axis = 1).dropna(how = 'all', axis = 1)
x2014 = pd.read_csv('zip_codes_2014.csv').drop(['do_date'], axis = 1).dropna(how = 'all', axis = 1)
x2015 = pd.read_csv('zip_codes_2015.csv').drop(['do_date'], axis = 1).dropna(how = 'all', axis = 1)
x2016 = pd.read_csv('zip_codes_2016.csv').drop(['do_date'], axis = 1).dropna(how = 'all', axis = 1)
x2017 = pd.read_csv('zip_codes_2017.csv').drop(['do_date'], axis = 1).dropna(how = 'all', axis = 1)

x2011.columns = x2011.columns.str.replace('geo_id', 'zip')
x2012.columns = x2012.columns.str.replace('geo_id', 'zip')
x2013.columns = x2013.columns.str.replace('geo_id', 'zip')
x2014.columns = x2014.columns.str.replace('geo_id', 'zip')
x2015.columns = x2015.columns.str.replace('geo_id', 'zip')
x2016.columns = x2016.columns.str.replace('geo_id', 'zip')
x2017.columns = x2017.columns.str.replace('geo_id', 'zip')

# Select only the columns that appear in all years

In [184]:
mutual_columns = set(x2011.columns).intersection(set(x2012.columns)).intersection(set(x2013.columns)).\
intersection(set(x2014.columns)).intersection(set(x2015.columns)).intersection(set(x2016.columns)).\
intersection(set(x2017.columns))

In [185]:
x2011 = x2011[mutual_columns]
x2012 = x2012[mutual_columns]
x2013 = x2013[mutual_columns]
x2014 = x2014[mutual_columns]
x2015 = x2015[mutual_columns]
x2016 = x2016[mutual_columns]
x2017 = x2017[mutual_columns]

# Join years 2011-2017 into one dataframe

In [186]:
census = x2011.merge(x2012, on = 'zip', suffixes = ('_2011','_2012')).\
merge(x2013, on = 'zip').merge(x2014, on = 'zip', suffixes = ('_2013','_2014')).\
merge(x2015, on = 'zip').merge(x2016, on = 'zip', suffixes = ('_2015','_2016')).set_index('zip')

census.head()

Unnamed: 0_level_0,two_parents_mother_in_labor_force_families_with_young_children_2011,family_households_2011,commuters_drove_alone_2011,in_grades_1_to_4_2011,income_45000_49999_2011,income_50000_59999_2011,vacant_housing_units_for_sale_2011,male_21_2011,male_45_64_less_than_9_grade_2011,mortgaged_housing_units_2011,...,male_22_to_24_2016,graduate_professional_degree_2016,rent_burden_not_computed_2016,rent_under_10_percent_2016,unemployed_pop_2016,occupation_natural_resources_construction_maintenance_2016,mobile_homes_2016,male_45_64_high_school_2016,female_30_to_34_2016,income_less_10000_2016
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
99363,0,0,0,0,0,0,0,0,0,0,...,0,0.0,0,0,26,29,11,0,26,0
14881,0,0,0,0,0,0,0,0,0,0,...,0,14.0,0,0,23,0,77,0,0,0
91046,0,15,0,0,0,0,0,0,0,0,...,0,0.0,0,0,0,0,0,0,0,0
35052,0,0,0,0,0,0,0,0,0,0,...,0,7.0,0,0,0,0,0,0,0,0
20317,0,0,0,0,0,13,0,0,0,0,...,0,92.0,27,29,0,0,0,5,0,81


# Zillow Rent Index (Clean, Impute, Take Median)

In [187]:
zillow = pd.read_csv('Zip_Zri_AllHomesPlusMultifamily.csv')
zillow.columns = zillow.columns.str.replace('RegionName', 'zip')

In [188]:
zillow.head()

Unnamed: 0,RegionID,zip,City,State,Metro,CountyName,SizeRank,2010-09,2010-10,2010-11,...,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12,2020-01
0,61639,10025,New York,NY,New York-Newark-Jersey City,New York County,1,3031.0,3058.0,3031.0,...,3785.0,3788.0,3786.0,3784.0,3766.0,3779.0,3843.0,3873.0,3835.0,
1,84654,60657,Chicago,IL,Chicago-Naperville-Elgin,Cook County,2,1790.0,1787.0,1784.0,...,2039.0,2070.0,2105.0,2140.0,2168.0,2185.0,2125.0,,2053.0,1996.0
2,61637,10023,New York,NY,New York-Newark-Jersey City,New York County,3,3269.0,3304.0,3320.0,...,3874.0,3898.0,3917.0,3929.0,3931.0,3963.0,,4123.0,4079.0,
3,91982,77494,Katy,TX,Houston-The Woodlands-Sugar Land,Harris County,4,1547.0,1549.0,1560.0,...,1765.0,1755.0,1751.0,1752.0,1754.0,1759.0,1764.0,1769.0,1776.0,1778.0
4,84616,60614,Chicago,IL,Chicago-Naperville-Elgin,Cook County,5,1922.0,1925.0,1921.0,...,2245.0,2289.0,2332.0,2372.0,2398.0,2412.0,2348.0,2356.0,2311.0,


In [189]:
zri2011 = pd.concat([zillow.zip,zillow.iloc[:,11:23]], axis = 1).set_index('zip').dropna(how = 'all')
zri2012 = pd.concat([zillow.zip,zillow.iloc[:,23:35]], axis = 1).set_index('zip').dropna(how = 'all')
zri2013 = pd.concat([zillow.zip,zillow.iloc[:,35:47]], axis = 1).set_index('zip').dropna(how = 'all')
zri2014 = pd.concat([zillow.zip,zillow.iloc[:,47:59]], axis = 1).set_index('zip').dropna(how = 'all')
zri2015 = pd.concat([zillow.zip,zillow.iloc[:,59:71]], axis = 1).set_index('zip').dropna(how = 'all')
zri2016 = pd.concat([zillow.zip,zillow.iloc[:,71:83]], axis = 1).set_index('zip').dropna(how = 'all')
zri2017 = pd.concat([zillow.zip,zillow.iloc[:,83:95]], axis = 1).set_index('zip').dropna(how = 'all')
zri2018 = pd.concat([zillow.zip,zillow.iloc[:,95:107]], axis = 1).set_index('zip').dropna(how = 'all')
zri2019 = pd.concat([zillow.zip,zillow.iloc[:,107:119]], axis = 1).set_index('zip').dropna(how = 'all')

## Delete Any year that has more than 3 missing values

In [190]:
zri2011 = zri2011[zri2011.isna().sum(axis = 1) <= 3]
zri2012 = zri2012[zri2012.isna().sum(axis = 1) <= 3]
zri2013 = zri2013[zri2013.isna().sum(axis = 1) <= 3]
zri2014 = zri2014[zri2014.isna().sum(axis = 1) <= 3]
zri2015 = zri2015[zri2015.isna().sum(axis = 1) <= 3]
zri2016 = zri2016[zri2016.isna().sum(axis = 1) <= 3]
zri2017 = zri2017[zri2017.isna().sum(axis = 1) <= 3]
zri2018 = zri2018[zri2018.isna().sum(axis = 1) <= 3]
zri2019 = zri2019[zri2019.isna().sum(axis = 1) <= 3]

In [191]:
print(zri2011.shape[0])
print(zri2012.shape[0])
print(zri2013.shape[0])
print(zri2014.shape[0])
print(zri2015.shape[0])
print(zri2016.shape[0])
print(zri2017.shape[0])
print(zri2018.shape[0])
print(zri2019.shape[0])

10635
10997
11065
11856
12193
12437
12539
12616
12579


## Linearly Interpolate Missing Values

In [192]:
zri2011 = zri2011.interpolate(method='linear',limit_direction='both', axis=1)
zri2012 = zri2012.interpolate(method='linear',limit_direction='both', axis=1)
zri2013 = zri2013.interpolate(method='linear',limit_direction='both', axis=1)
zri2014 = zri2014.interpolate(method='linear',limit_direction='both', axis=1)
zri2015 = zri2015.interpolate(method='linear',limit_direction='both', axis=1)
zri2016 = zri2016.interpolate(method='linear',limit_direction='both', axis=1)
zri2017 = zri2017.interpolate(method='linear',limit_direction='both', axis=1)
zri2018 = zri2018.interpolate(method='linear',limit_direction='both', axis=1)
zri2019 = zri2019.interpolate(method='linear',limit_direction='both', axis=1)

In [193]:
zillow_monthly = pd.concat([zri2011,zri2012,zri2013, zri2014, zri2015, zri2016, zri2017, zri2018, zri2019], join = 'inner', axis = 1)


In [194]:
zillow_monthly.shape[0]

8752

In [195]:
zillow_2011 = zri2011.median(axis = 1)
zillow_2012 = zri2012.median(axis = 1)
zillow_2013 = zri2013.median(axis = 1)
zillow_2014 = zri2014.median(axis = 1)
zillow_2015 = zri2015.median(axis = 1)
zillow_2016 = zri2016.median(axis = 1)
zillow_2017 = zri2017.median(axis = 1)
zillow_2018 = zri2018.median(axis = 1)
zillow_2019 = zri2019.median(axis = 1)

In [196]:
zillow_median = pd.concat([zillow_2011,zillow_2012,zillow_2013,zillow_2014,zillow_2015,zillow_2016,zillow_2017,zillow_2018,zillow_2019], join = 'inner',axis = 1)
zillow_median.columns = ['zri_2011','zri_2012','zri_2013', 'zri_2014', 'zri_2015', 'zri_2016', 'zri_2017', 'zri_2018', 'zri_2019']



In [197]:
zillow_median.head()

Unnamed: 0_level_0,zri_2011,zri_2012,zri_2013,zri_2014,zri_2015,zri_2016,zri_2017,zri_2018,zri_2019
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
60657,1773.5,1882.0,1966.0,2003.5,2075.0,2143.0,2121.5,2091.0,2079.5
77494,1591.0,1623.5,1780.0,1944.5,1918.5,1826.5,1763.0,1789.5,1764.5
60614,1882.0,2016.5,2138.0,2197.5,2270.0,2349.0,2332.5,2309.0,2321.5
77084,1075.0,1107.0,1209.5,1320.5,1422.0,1414.5,1371.0,1417.5,1400.0
79936,987.0,1009.0,960.0,981.0,1016.5,1010.5,992.5,974.5,943.5


### Try Mean Instead of Median

In [198]:
zillow_2011 = zri2011.mean(axis = 1)
zillow_2012 = zri2012.mean(axis = 1)
zillow_2013 = zri2013.mean(axis = 1)
zillow_2014 = zri2014.mean(axis = 1)
zillow_2015 = zri2015.mean(axis = 1)
zillow_2016 = zri2016.mean(axis = 1)
zillow_2017 = zri2017.mean(axis = 1)
zillow_2018 = zri2018.mean(axis = 1)
zillow_2019 = zri2019.mean(axis = 1)

In [199]:
zillow_mean = pd.concat([zillow_2011,zillow_2012,zillow_2013,zillow_2014,zillow_2015,zillow_2016,zillow_2017,zillow_2018,zillow_2019], join = 'inner',axis = 1)
zillow_mean.columns = ['zri_2011','zri_2012','zri_2013', 'zri_2014', 'zri_2015', 'zri_2016', 'zri_2017', 'zri_2018', 'zri_2019']



In [200]:
zillow_mean.head()

Unnamed: 0_level_0,zri_2011,zri_2012,zri_2013,zri_2014,zri_2015,zri_2016,zri_2017,zri_2018,zri_2019
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
60657,1787.25,1866.833333,1955.666667,2005.666667,2073.5,2140.416667,2127.416667,2098.333333,2085.833333
77494,1584.5,1630.583333,1786.833333,1956.833333,1921.916667,1819.833333,1770.083333,1788.166667,1763.416667
60614,1897.583333,2014.666667,2148.0,2202.583333,2271.0,2346.5,2344.0,2314.25,2306.583333
77084,1064.666667,1116.333333,1205.916667,1327.916667,1417.166667,1414.083333,1372.75,1414.416667,1404.25
79936,999.166667,1005.833333,961.666667,984.916667,1014.833333,1011.916667,992.583333,973.916667,947.833333


# Join with Census (drop all the NAs for simplicity and purity)

In [201]:
total = pd.concat([zillow_mean,census], join = 'inner', axis = 1).dropna(how = 'any')

# Create Train and Test Datasets

In [202]:
x_train = pd.concat([total.iloc[:,2:5],total.iloc[:,9:708]], join = 'inner', axis = 1)
y_train = total.iloc[:,5]
x_test = pd.concat([total.iloc[:,3:6],total.iloc[:,242:941]], join = 'inner', axis = 1)
y_test = total.iloc[:,6]

# Log Y and X

In [203]:
x_train = np.log(x_train+1)
x_test = np.log(x_test+1)
y_train = np.log(y_train)
y_test = np.log(y_test)


# Regressions (OLS, Lasso, Ridge, Gradient Boosting Regressor)

In [204]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.ensemble import GradientBoostingRegressor


model_ols = LinearRegression()
model_lasso = Lasso()
model_ridge = Ridge()
model_gradient = GradientBoostingRegressor()

In [205]:
model_ols.fit(x_train, y_train)
model_lasso.fit(x_train, y_train)
model_ridge.fit(x_train, y_train)
model_gradient.fit(x_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

### Train Score

In [206]:
#MEDIAN
print('Simple Linear Train Score: %.3f'%model_ols.score(x_train, y_train))
print('Lasso Train Score: %.3f'%model_lasso.score(x_train, y_train))
print('Ridge Train Score: %.3f'%model_ridge.score(x_train, y_train))
print('Gradient Regressor Train Score: %.3f'%model_gradient.score(x_train, y_train))

Simple Linear Train Score: 0.995
Lasso Train Score: 0.000
Ridge Train Score: 0.995
Gradient Regressor Train Score: 0.995


In [207]:
#MEAN

print('Simple Linear Train Score: %.3f'%model_ols.score(x_train, y_train))
print('Lasso Train Score: %.3f'%model_lasso.score(x_train, y_train))
print('Ridge Train Score: %.3f'%model_ridge.score(x_train, y_train))
print('Gradient Regressor Train Score: %.3f'%model_gradient.score(x_train, y_train))

Simple Linear Train Score: 0.995
Lasso Train Score: 0.000
Ridge Train Score: 0.995
Gradient Regressor Train Score: 0.995


### Test Score

In [208]:
#MEDIAN

print('Simple Linear Test Score: %.3f'%model_ols.score(x_test, y_test))
print('Lasso Test Score: %.3f'%model_lasso.score(x_test, y_test))
print('Ridge Test Score: %.3f'%model_ridge.score(x_test, y_test))
print('Gradient Regressor Test Score: %.3f'%model_gradient.score(x_test, y_test))

Simple Linear Test Score: 0.992
Lasso Test Score: -0.001
Ridge Test Score: 0.991
Gradient Regressor Test Score: 0.991


In [209]:
#MEAN

print('Simple Linear Test Score: %.3f'%model_ols.score(x_test, y_test))
print('Lasso Test Score: %.3f'%model_lasso.score(x_test, y_test))
print('Ridge Test Score: %.3f'%model_ridge.score(x_test, y_test))
print('Gradient Regressor Test Score: %.3f'%model_gradient.score(x_test, y_test))

Simple Linear Test Score: 0.992
Lasso Test Score: -0.001
Ridge Test Score: 0.991
Gradient Regressor Test Score: 0.991


### Kaggle Score

In [210]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [211]:
#MEDIAN

print('Simple Linear Kaggle Score: %.5f' % sqrt(mean_squared_error(model_ols.predict(x_test), y_test)))
print('Lasso Kaggle Score: %.5f' % sqrt(mean_squared_error(model_lasso.predict(x_test), y_test)))
print('Ridge Kaggle Score: %.5f' % sqrt(mean_squared_error(model_ridge.predict(x_test), y_test)))
print('Gradient Regressor Kaggle Score: %.5f' % sqrt(mean_squared_error(model_gradient.predict(x_test), y_test)))

Simple Linear Kaggle Score: 0.03276
Lasso Kaggle Score: 0.35816
Ridge Kaggle Score: 0.03415
Gradient Regressor Kaggle Score: 0.03368


In [212]:
#MEAN

print('Simple Linear Kaggle Score: %.5f' % sqrt(mean_squared_error(model_ols.predict(x_test), y_test)))
print('Lasso Kaggle Score: %.5f' % sqrt(mean_squared_error(model_lasso.predict(x_test), y_test)))
print('Ridge Kaggle Score: %.5f' % sqrt(mean_squared_error(model_ridge.predict(x_test), y_test)))
print('Gradient Regressor Kaggle Score: %.5f' % sqrt(mean_squared_error(model_gradient.predict(x_test), y_test)))

Simple Linear Kaggle Score: 0.03276
Lasso Kaggle Score: 0.35816
Ridge Kaggle Score: 0.03415
Gradient Regressor Kaggle Score: 0.03368


# Feature Importance (top 20)

In [213]:
sorted(list(zip(x_train,model_gradient.feature_importances_)), key = lambda x: x[1], reverse = True)[0:20]

[('zri_2015', 0.9948638707055147),
 ('owner_occupied_housing_units_upper_value_quartile_2011',
  0.0015618771865639365),
 ('zri_2014', 0.0006057610455675675),
 ('median_rent_2011', 0.0005570436492386293),
 ('owner_occupied_housing_units_median_value_2011', 0.00032814561426754043),
 ('median_rent_2013', 0.00031326239399802173),
 ('median_rent_2012', 0.0001972205529216066),
 ('percent_income_spent_on_rent_2013', 9.005914230185862e-05),
 ('zri_2013', 5.6846575475015904e-05),
 ('median_income_2011', 5.23907287779398e-05),
 ('amerindian_including_hispanic_2013', 5.012168469714293e-05),
 ('armed_forces_2011', 5.002187004812972e-05),
 ('income_per_capita_2013', 4.521090974996427e-05),
 ('armed_forces_2013', 4.277123257486114e-05),
 ('amerindian_including_hispanic_2011', 3.951963863258817e-05),
 ('dwellings_3_to_4_units_2013', 3.7942375758645516e-05),
 ('employed_agriculture_forestry_fishing_hunting_mining_2013',
  3.7622373438149226e-05),
 ('hispanic_male_45_54_2011', 3.59644138298343e-05),
 

In [214]:
pd.DataFrame({'feature':[i[0] for i in sorted(list(zip(x_train,model_gradient.feature_importances_)), key = lambda x: x[1], reverse = True)[0:20]],
             'score':[i[1] for i in sorted(list(zip(x_train,model_gradient.feature_importances_)), key = lambda x: x[1], reverse = True)[0:20]]})

Unnamed: 0,feature,score
0,zri_2015,0.994864
1,owner_occupied_housing_units_upper_value_quart...,0.001562
2,zri_2014,0.000606
3,median_rent_2011,0.000557
4,owner_occupied_housing_units_median_value_2011,0.000328
5,median_rent_2013,0.000313
6,median_rent_2012,0.000197
7,percent_income_spent_on_rent_2013,9e-05
8,zri_2013,5.7e-05
9,median_income_2011,5.2e-05


In [215]:
pd.DataFrame({'feature':[i[0] for i in sorted(list(zip(x_train,model_gradient.feature_importances_)), key = lambda x: x[1], reverse = True)[0:20]],
             'score':[i[1] for i in sorted(list(zip(x_train,model_gradient.feature_importances_)), key = lambda x: x[1], reverse = True)[0:20]]})

Unnamed: 0,feature,score
0,zri_2015,0.994864
1,owner_occupied_housing_units_upper_value_quart...,0.001562
2,zri_2014,0.000606
3,median_rent_2011,0.000557
4,owner_occupied_housing_units_median_value_2011,0.000328
5,median_rent_2013,0.000313
6,median_rent_2012,0.000197
7,percent_income_spent_on_rent_2013,9e-05
8,zri_2013,5.7e-05
9,median_income_2011,5.2e-05


# Regression without Zillow Rent Indexes from Previous Years

In [216]:
x_train_new = total.iloc[:,9:708]
x_test_new = total.iloc[:,242:941]

In [217]:
x_train_new = np.log(x_train_new+1)
x_test_new = np.log(x_test_new+1)

In [218]:
y_train_new = y_train.copy()
y_test_new = y_test.copy()

In [219]:
model_ols_new = LinearRegression()
model_lasso_new = Lasso()
model_ridge_new = Ridge()
model_gradient_new = GradientBoostingRegressor()

In [220]:
model_ols_new.fit(x_train_new, y_train_new)
model_lasso_new.fit(x_train_new, y_train_new)
model_ridge_new.fit(x_train_new, y_train_new)
model_gradient_new.fit(x_train_new, y_train_new)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

### New Train Scores

In [221]:
print('Simple Linear Train Score: %.3f'%model_ols_new.score(x_train_new, y_train_new))
print('Lasso Train Score: %.3f'%model_lasso_new.score(x_train_new, y_train_new))
print('Ridge Train Score: %.3f'%model_ridge_new.score(x_train_new, y_train_new))
print('Gradient Regressor Train Score: %.3f'%model_gradient_new.score(x_train_new, y_train_new))

Simple Linear Train Score: 0.892
Lasso Train Score: 0.000
Ridge Train Score: 0.891
Gradient Regressor Train Score: 0.904


### New Test Scores

In [222]:
print('Simple Linear Test Score: %.3f'%model_ols_new.score(x_test_new, y_test_new))
print('Lasso Test Score: %.3f'%model_lasso_new.score(x_test_new, y_test_new))
print('Ridge Test Score: %.3f'%model_ridge_new.score(x_test_new, y_test_new))
print('Gradient Regressor Test Score: %.3f'%model_gradient_new.score(x_test_new, y_test_new))

Simple Linear Test Score: 0.875
Lasso Test Score: -0.001
Ridge Test Score: 0.877
Gradient Regressor Test Score: 0.892


### New Kaggle Scores

In [223]:
print('Simple Linear Kaggle Score: %.5f' % sqrt(mean_squared_error(model_ols_new.predict(x_test_new), y_test_new)))
print('Lasso Kaggle Score: %.5f' % sqrt(mean_squared_error(model_lasso_new.predict(x_test_new), y_test_new)))
print('Ridge Kaggle Score: %.5f' % sqrt(mean_squared_error(model_ridge_new.predict(x_test_new), y_test_new)))
print('Gradient Regressor Kaggle Score: %.5f' % sqrt(mean_squared_error(model_gradient_new.predict(x_test_new), y_test_new)))

Simple Linear Kaggle Score: 0.12652
Lasso Kaggle Score: 0.35816
Ridge Kaggle Score: 0.12544
Gradient Regressor Kaggle Score: 0.11787


### New Features Importances

In [224]:
sorted(list(zip(x_train_new,model_gradient_new.feature_importances_)), key = lambda x: x[1],reverse = True)[0:20]

[('owner_occupied_housing_units_median_value_2011', 0.6665790675283833),
 ('median_rent_2013', 0.10221472594656697),
 ('owner_occupied_housing_units_upper_value_quartile_2011',
  0.05794873412187845),
 ('median_rent_2012', 0.030662086813726157),
 ('owner_occupied_housing_units_lower_value_quartile_2011',
  0.029695178909727428),
 ('median_rent_2011', 0.014280569201056812),
 ('owner_occupied_housing_units_lower_value_quartile_2012',
  0.008931338014579994),
 ('owner_occupied_housing_units_lower_value_quartile_2013',
  0.007092196549106286),
 ('owner_occupied_housing_units_upper_value_quartile_2013',
  0.005874948545046292),
 ('hispanic_male_55_64_2011', 0.0035127789618525944),
 ('hispanic_male_55_64_2013', 0.0031262242919087107),
 ('hispanic_male_45_54_2013', 0.003115499904768904),
 ('owner_occupied_housing_units_median_value_2013', 0.0027096855050482292),
 ('hispanic_male_45_54_2011', 0.002427924014860179),
 ('commute_60_89_mins_2013', 0.0019723963355146178),
 ('not_us_citizen_pop_2013

In [225]:
pd.DataFrame({'feature':[i[0] for i in sorted(list(zip(x_train_new,model_gradient_new.feature_importances_)), key = lambda x: x[1], reverse = True)[0:20]],
             'score':[i[1] for i in sorted(list(zip(x_train_new,model_gradient_new.feature_importances_)), key = lambda x: x[1], reverse = True)[0:20]]})

Unnamed: 0,feature,score
0,owner_occupied_housing_units_median_value_2011,0.666579
1,median_rent_2013,0.102215
2,owner_occupied_housing_units_upper_value_quart...,0.057949
3,median_rent_2012,0.030662
4,owner_occupied_housing_units_lower_value_quart...,0.029695
5,median_rent_2011,0.014281
6,owner_occupied_housing_units_lower_value_quart...,0.008931
7,owner_occupied_housing_units_lower_value_quart...,0.007092
8,owner_occupied_housing_units_upper_value_quart...,0.005875
9,hispanic_male_55_64_2011,0.003513


# Try predicting 2019 (based on 2014, 2015, 2016) 
## none of the 2014, 2015, 2016 were used in original training
## predicting on entirely new data

In [226]:
total.iloc[:,5:8]

Unnamed: 0_level_0,zri_2016,zri_2017,zri_2018
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
60657,2140.416667,2127.416667,2098.333333
77494,1819.833333,1770.083333,1788.166667
60614,2346.500000,2344.000000,2314.250000
77084,1414.083333,1372.750000,1414.416667
79936,1011.916667,992.583333,973.916667
...,...,...,...
6763,1455.750000,1417.583333,1497.666667
3836,1251.291667,1269.000000,1408.833333
8740,1545.250000,1585.833333,1624.583333
56443,1311.250000,1282.916667,1317.250000


In [227]:
x_test_2019 = pd.concat([total.iloc[:,5:8],total.iloc[:,708:]], join = 'inner', axis = 1)
y_test_2019 = total.iloc[:,8]

### Log Them

In [228]:
x_test_2019 = np.log(x_test_2019+1)
y_test_2019 = np.log(y_test_2019)

### Scores

In [229]:
#MEAN
print('Simple Linear Test Score: %.3f'%model_ols.score(x_test_2019, y_test_2019))
print('Lasso Test Score: %.3f'%model_lasso.score(x_test_2019, y_test_2019))
print('Ridge Test Score: %.3f'%model_ridge.score(x_test_2019, y_test_2019))
print('Gradient Regressor Test Score: %.3f'%model_gradient.score(x_test_2019, y_test_2019))

Simple Linear Test Score: 0.991
Lasso Test Score: -0.009
Ridge Test Score: 0.991
Gradient Regressor Test Score: 0.990


### Kaggle Scores

In [230]:
print('Simple Linear Kaggle Score: %.5f' % sqrt(mean_squared_error(model_ols.predict(x_test_2019), y_test_2019)))
print('Lasso Kaggle Score: %.5f' % sqrt(mean_squared_error(model_lasso.predict(x_test_2019), y_test_2019)))
print('Ridge Kaggle Score: %.5f' % sqrt(mean_squared_error(model_ridge.predict(x_test_2019), y_test_2019)))
print('Gradient Regressor Kaggle Score: %.5f' % sqrt(mean_squared_error(model_gradient.predict(x_test_2019), y_test_2019)))

Simple Linear Kaggle Score: 0.03381
Lasso Kaggle Score: 0.36112
Ridge Kaggle Score: 0.03476
Gradient Regressor Kaggle Score: 0.03664


# Predictions just for New York City
## Eugene Told us that performance for separate regions will be worse that for the dataset overall

In [231]:
nyc_zip = pd.read_csv('zip_borough.csv').set_index('zip')

In [232]:
x_test_nyc = pd.concat([nyc_zip, x_test_2019], join='inner', axis = 1).drop(['borough'], axis = 1)

In [233]:
y_test_nyc = pd.concat([nyc_zip, y_test_2019] ,join = 'inner',axis = 1).drop(['borough'], axis = 1)

In [234]:
print('Simple Linear Kaggle Score: %.5f' % sqrt(mean_squared_error(model_ols.predict(x_test_nyc), y_test_nyc)))
print('Lasso Kaggle Score: %.5f' % sqrt(mean_squared_error(model_lasso.predict(x_test_nyc), y_test_nyc)))
print('Ridge Kaggle Score: %.5f' % sqrt(mean_squared_error(model_ridge.predict(x_test_nyc), y_test_nyc)))
print('Gradient Regressor Kaggle Score: %.5f' % sqrt(mean_squared_error(model_gradient.predict(x_test_nyc), y_test_nyc)))

Simple Linear Kaggle Score: 0.03288
Lasso Kaggle Score: 0.48209
Ridge Kaggle Score: 0.03630
Gradient Regressor Kaggle Score: 0.03447


In [235]:
[i[0] for i in sorted(list(zip(x_train,model_gradient.feature_importances_)), key = lambda x: x[1], reverse = True)[0:20]]

['zri_2015',
 'owner_occupied_housing_units_upper_value_quartile_2011',
 'zri_2014',
 'median_rent_2011',
 'owner_occupied_housing_units_median_value_2011',
 'median_rent_2013',
 'median_rent_2012',
 'percent_income_spent_on_rent_2013',
 'zri_2013',
 'median_income_2011',
 'amerindian_including_hispanic_2013',
 'armed_forces_2011',
 'income_per_capita_2013',
 'armed_forces_2013',
 'amerindian_including_hispanic_2011',
 'dwellings_3_to_4_units_2013',
 'employed_agriculture_forestry_fishing_hunting_mining_2013',
 'hispanic_male_45_54_2011',
 'employed_agriculture_forestry_fishing_hunting_mining_2012',
 'amerindian_pop_2013']

In [236]:
predictions = pd.DataFrame(model_gradient.predict(x_test_2019),columns = ['prediction'], index = y_test_2019.index)

In [237]:
pred_NYC = pd.concat([nyc_zip, predictions], join = 'inner', axis = 1).drop(['borough'], axis = 1)

In [238]:
sqrt(mean_squared_error(pred_NYC, y_test_nyc))

0.03447219010230633

## Predicting Zillow Rent Index using `only` Zillow Rent Index from previous years

In [239]:
zillow_mean.head()

Unnamed: 0_level_0,zri_2011,zri_2012,zri_2013,zri_2014,zri_2015,zri_2016,zri_2017,zri_2018,zri_2019
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
60657,1787.25,1866.833333,1955.666667,2005.666667,2073.5,2140.416667,2127.416667,2098.333333,2085.833333
77494,1584.5,1630.583333,1786.833333,1956.833333,1921.916667,1819.833333,1770.083333,1788.166667,1763.416667
60614,1897.583333,2014.666667,2148.0,2202.583333,2271.0,2346.5,2344.0,2314.25,2306.583333
77084,1064.666667,1116.333333,1205.916667,1327.916667,1417.166667,1414.083333,1372.75,1414.416667,1404.25
79936,999.166667,1005.833333,961.666667,984.916667,1014.833333,1011.916667,992.583333,973.916667,947.833333


In [240]:
zri_x_train = zillow_mean.iloc[:,0:3]  #2011-2013
zri_y_train = zillow_mean.iloc[:,5]    #2016
zri_x_train.head()

Unnamed: 0_level_0,zri_2011,zri_2012,zri_2013
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
60657,1787.25,1866.833333,1955.666667
77494,1584.5,1630.583333,1786.833333
60614,1897.583333,2014.666667,2148.0
77084,1064.666667,1116.333333,1205.916667
79936,999.166667,1005.833333,961.666667


In [241]:
zri_x_test = zillow_mean.iloc[:,3:6] #2014-2017
zri_y_test = zillow_mean.iloc[:,8]   #2019
zri_x_test.head()    

Unnamed: 0_level_0,zri_2014,zri_2015,zri_2016
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
60657,2005.666667,2073.5,2140.416667
77494,1956.833333,1921.916667,1819.833333
60614,2202.583333,2271.0,2346.5
77084,1327.916667,1417.166667,1414.083333
79936,984.916667,1014.833333,1011.916667


In [242]:
zri_x_train = np.log(zri_x_train+1)
zri_x_test = np.log(zri_x_test+1)
zri_y_train = np.log(zri_y_train)
zri_y_test = np.log(zri_y_test)

In [243]:
zri_model_ols = LinearRegression()
zri_model_lasso = Lasso()
zri_model_ridge = Ridge()
zri_model_gradient = GradientBoostingRegressor()

zri_model_ols.fit(zri_x_train, zri_y_train)
zri_model_lasso.fit(zri_x_train, zri_y_train)
zri_model_ridge.fit(zri_x_train, zri_y_train)
zri_model_gradient.fit(zri_x_train, zri_y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

### Train Score

In [244]:
#MEAN

print("Simple Linear Train Score: %.3f" %zri_model_ols.score(zri_x_train, zri_y_train))
print("Lasso Train Score: %.3f" %zri_model_lasso.score(zri_x_train, zri_y_train))
print("Ridge Train Score: %.3f" %zri_model_ridge.score(zri_x_train, zri_y_train))
print("Gradient Regressor Train Score: %.3f" %zri_model_gradient.score(zri_x_train, zri_y_train))

Simple Linear Train Score: 0.957
Lasso Train Score: 0.000
Ridge Train Score: 0.957
Gradient Regressor Train Score: 0.961


### Test Score

In [245]:
#MEAN

print("Simple Linear Test Score: %.3f" %zri_model_ols.score(zri_x_test, zri_y_test))
print("Lasso Test Score: %.3f" %zri_model_lasso.score(zri_x_test, zri_y_test))
print("Ridge Test Score: %.3f" %zri_model_ridge.score(zri_x_test, zri_y_test))
print("Gradient Regressor Test Score: %.3f" %zri_model_gradient.score(zri_x_test, zri_y_test))

Simple Linear Test Score: 0.927
Lasso Test Score: -0.010
Ridge Test Score: 0.925
Gradient Regressor Test Score: 0.924


### Kaggle Score

In [246]:
print('Simple Linear Kaggle Score: %.5f' % sqrt(mean_squared_error(zri_model_ols.predict(zri_x_test), zri_y_test)))
print('Lasso Kaggle Score: %.5f' % sqrt(mean_squared_error(zri_model_lasso.predict(zri_x_test), zri_y_test)))
print('Ridge Kaggle Score: %.5f' % sqrt(mean_squared_error(zri_model_ridge.predict(zri_x_test), zri_y_test)))
print('Gradient Regressor Kaggle Score: %.5f' % sqrt(mean_squared_error(zri_model_gradient.predict(zri_x_test), zri_y_test)))

Simple Linear Kaggle Score: 0.09745
Lasso Kaggle Score: 0.36156
Ridge Kaggle Score: 0.09837
Gradient Regressor Kaggle Score: 0.09928


# Check the score if we take more recent Zillow Rent Index (2016,2017,2018 to predict 2019)

In [247]:
ultimate_x_train = zillow_mean.iloc[:,4:7]
ultimate_y_train = zillow_mean.iloc[:,7]
print(ultimate_x_train.head())

ultimate_x_test = zillow_mean.iloc[:,5:8]
ultimate_y_test = zillow_mean.iloc[:,8]
print(ultimate_x_test.head())

          zri_2015     zri_2016     zri_2017
zip                                         
60657  2073.500000  2140.416667  2127.416667
77494  1921.916667  1819.833333  1770.083333
60614  2271.000000  2346.500000  2344.000000
77084  1417.166667  1414.083333  1372.750000
79936  1014.833333  1011.916667   992.583333
          zri_2016     zri_2017     zri_2018
zip                                         
60657  2140.416667  2127.416667  2098.333333
77494  1819.833333  1770.083333  1788.166667
60614  2346.500000  2344.000000  2314.250000
77084  1414.083333  1372.750000  1414.416667
79936  1011.916667   992.583333   973.916667


In [248]:
ultimate_x_train = np.log(ultimate_x_train+1)
ultimate_y_train = np.log(ultimate_y_train)

ultimate_x_test = np.log(ultimate_x_test+1)
ultimate_y_test = np.log(ultimate_y_test) 

In [249]:
ultimate_model_ols = LinearRegression()
ultimate_model_lasso = Lasso()
ultimate_model_ridge = Ridge()
ultimate_model_gradient = GradientBoostingRegressor()

ultimate_model_ols.fit(ultimate_x_train, ultimate_y_train)
ultimate_model_lasso.fit(ultimate_x_train, ultimate_y_train)
ultimate_model_ridge.fit(ultimate_x_train, ultimate_y_train)
ultimate_model_gradient.fit(ultimate_x_train, ultimate_y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

### Train Score

In [250]:
print("Simple Linear Train Score: %.4f" %ultimate_model_ols.score(ultimate_x_train, ultimate_y_train))
print("Lasso Train Score: %.4f" %ultimate_model_lasso.score(ultimate_x_train, ultimate_y_train))
print("Ridge Train Score: %.4f" %ultimate_model_ridge.score(ultimate_x_train, ultimate_y_train))
print("Gradient Regressor Train Score: %.4f" %ultimate_model_gradient.score(ultimate_x_train, ultimate_y_train))

Simple Linear Train Score: 0.9941
Lasso Train Score: 0.0000
Ridge Train Score: 0.9940
Gradient Regressor Train Score: 0.9946


### Test Score

In [251]:
print("Simple Linear Test Score: %.4f" %ultimate_model_ols.score(ultimate_x_test, ultimate_y_test))
print("Lasso Test Score: %.4f" %ultimate_model_lasso.score(ultimate_x_test, ultimate_y_test))
print("Ridge Test Score: %.4f" %ultimate_model_ridge.score(ultimate_x_test, ultimate_y_test))
print("Gradient Regressor Test Score: %.4f" %ultimate_model_gradient.score(ultimate_x_test, ultimate_y_test))

Simple Linear Test Score: 0.9924
Lasso Test Score: -0.0007
Ridge Test Score: 0.9922
Gradient Regressor Test Score: 0.9919


### Kaggle Score

In [252]:
print('Simple Linear Kaggle Score: %.5f' % sqrt(mean_squared_error(ultimate_model_ols.predict(ultimate_x_test), ultimate_y_test)))
print('Lasso Kaggle Score: %.5f' % sqrt(mean_squared_error(ultimate_model_lasso.predict(ultimate_x_test), ultimate_y_test)))
print('Ridge Kaggle Score: %.5f' % sqrt(mean_squared_error(ultimate_model_ridge.predict(ultimate_x_test), ultimate_y_test)))
print('Gradient Regressor Kaggle Score: %.5f' % sqrt(mean_squared_error(ultimate_model_gradient.predict(ultimate_x_test), ultimate_y_test)))



Simple Linear Kaggle Score: 0.03147
Lasso Kaggle Score: 0.35994
Ridge Kaggle Score: 0.03184
Gradient Regressor Kaggle Score: 0.03247


### Prediction Accuracy on Zip Code Level

In [253]:
ultimate_predictions = pd.Series(np.exp(ultimate_model_ols.predict(ultimate_x_test)), index = ultimate_y_test.index)


In [254]:
(1 - np.abs((zillow_mean.iloc[:, 8]/ultimate_predictions).dropna()-1)).mean()*100

97.66216933514417

## Turn New York Results Back into Monthly
## Check accuracy for each zip code

In [255]:
zillow_monthly.head()

Unnamed: 0_level_0,2011-01,2011-02,2011-03,2011-04,2011-05,2011-06,2011-07,2011-08,2011-09,2011-10,...,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
60657,1755.0,1742.0,1735.0,1741.0,1742.0,1757.0,1790.0,1817.0,1839.0,1841.0,...,2017.0,2039.0,2070.0,2105.0,2140.0,2168.0,2185.0,2125.0,2089.0,2053.0
77494,1542.0,1547.0,1559.0,1578.0,1591.0,1597.0,1604.0,1609.0,1607.0,1598.0,...,1769.0,1765.0,1755.0,1751.0,1752.0,1754.0,1759.0,1764.0,1769.0,1776.0
60614,1870.0,1841.0,1825.0,1823.0,1829.0,1850.0,1894.0,1937.0,1966.0,1977.0,...,2213.0,2245.0,2289.0,2332.0,2372.0,2398.0,2412.0,2348.0,2356.0,2311.0
77084,993.0,1015.0,1043.0,1060.0,1070.0,1072.0,1078.0,1082.0,1088.0,1092.0,...,1393.0,1389.0,1389.0,1392.0,1396.0,1401.0,1408.0,1421.0,1419.0,1436.0
79936,1071.0,1045.0,1019.0,996.0,980.0,976.0,977.0,979.0,981.0,986.0,...,943.0,938.0,936.0,938.0,940.0,943.0,944.0,949.0,962.0,977.0


In [256]:
nyc_zip = pd.read_csv('zip_borough.csv').set_index('zip')

real_values = pd.concat([nyc_zip, zillow_monthly], join = 'inner', axis = 1).iloc[:,-12:]
real_values.head()

Unnamed: 0_level_0,2019-01,2019-02,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
10004,3869.0,3879.0,3892.0,3906.0,3937.0,3958.0,3963.0,3993.0,3999.0,4001.0,4053.0,4074.0
10005,3918.0,3926.0,3929.0,3934.0,3941.0,3950.0,3961.0,3972.0,3981.0,3979.0,4014.0,4033.0
10006,3682.0,3691.0,3689.0,3701.0,3722.0,3750.0,3786.0,3809.0,3824.0,3791.0,3752.0,3825.0
10018,3529.0,3529.0,3549.0,3572.0,3584.0,3608.0,3637.0,3642.0,3634.0,3593.0,3620.0,3567.0
10035,2760.0,2774.0,2754.0,2747.0,2735.0,2723.0,2714.0,2718.0,2724.0,2660.0,2679.0,2686.0


In [257]:
real_mean = pd.concat([nyc_zip, zillow_mean], join = 'inner', axis = 1).iloc[:,9]
real_mean

zip
10004    3960.333333
10005    3961.500000
10006    3751.833333
10018    3588.666667
10035    2722.833333
            ...     
11435    2011.666667
11436    2219.250000
11691    2165.250000
11692    2178.333333
11693    2139.291667
Name: zri_2019, Length: 124, dtype: float64

In [258]:
predicted_mean = pd.Series(np.exp(model_ols.predict(x_test_2019)),index = y_test_2019.index)
predicted_mean = pd.concat([nyc_zip, predicted_mean], join = 'inner', axis = 1).iloc[:,1]
predicted_mean

zip
10004    3580.787512
10005    3756.303237
10018    3511.030817
10035    2965.391833
10301    2032.756492
            ...     
11434    2253.099410
11435    2105.227033
11436    2295.577935
11691    2244.908698
11692    2239.027303
Name: 0, Length: 119, dtype: float64

In [259]:
real_values = pd.concat([real_values, predicted_mean], join = 'inner', axis = 1).iloc[:,:-1]

In [260]:
real_values.apply(lambda x: x/x.mean(), axis = 1).iloc[0,:]*real_mean.iloc[0,]

2019-01    3869.0
2019-02    3879.0
2019-03    3892.0
2019-04    3906.0
2019-05    3937.0
2019-06    3958.0
2019-07    3963.0
2019-08    3993.0
2019-09    3999.0
2019-10    4001.0
2019-11    4053.0
2019-12    4074.0
Name: 10004, dtype: float64

In [261]:
predicted_monthly = real_values.copy()

for i in range(real_values.shape[0]):
    predicted_monthly.iloc[i,:] = real_values.apply(lambda x: x/x.mean(), axis = 1).iloc[i,:]*predicted_mean.iloc[i,]
    
    

In [262]:
predicted_monthly.head()

Unnamed: 0_level_0,2019-01,2019-02,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
10004,3498.207276,3507.248908,3519.003029,3531.661313,3559.690371,3578.677798,3583.198614,3610.323509,3615.748488,3617.556814,3664.573298,3683.560725
10005,3715.056439,3722.642057,3725.486664,3730.227675,3736.865091,3745.398912,3755.829136,3766.259361,3774.793181,3772.896777,3806.083856,3824.099699
10018,3452.654956,3452.654956,3472.222284,3494.724711,3506.465107,3529.9459,3558.318526,3563.210357,3555.383426,3515.270405,3541.686297,3489.832879
10035,3005.869422,3021.116586,2999.334924,2991.711342,2978.642344,2965.573347,2955.771599,2960.127931,2966.66243,2896.96111,2917.653689,2925.277271
10301,1986.848256,2006.91743,2015.948559,2013.941641,2008.924348,2008.924348,2017.955476,2038.02465,2059.097283,2052.073072,2073.145705,2111.277136


In [263]:
real_values.head()

Unnamed: 0_level_0,2019-01,2019-02,2019-03,2019-04,2019-05,2019-06,2019-07,2019-08,2019-09,2019-10,2019-11,2019-12
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
10004,3869.0,3879.0,3892.0,3906.0,3937.0,3958.0,3963.0,3993.0,3999.0,4001.0,4053.0,4074.0
10005,3918.0,3926.0,3929.0,3934.0,3941.0,3950.0,3961.0,3972.0,3981.0,3979.0,4014.0,4033.0
10018,3529.0,3529.0,3549.0,3572.0,3584.0,3608.0,3637.0,3642.0,3634.0,3593.0,3620.0,3567.0
10035,2760.0,2774.0,2754.0,2747.0,2735.0,2723.0,2714.0,2718.0,2724.0,2660.0,2679.0,2686.0
10301,1980.0,2000.0,2009.0,2007.0,2002.0,2002.0,2011.0,2031.0,2052.0,2045.0,2066.0,2104.0


### Prediction Accuracy  for each zipcode

In [264]:
round(pd.DataFrame(1 - np.abs((((real_mean/predicted_mean).dropna()-1))), columns = ['Accuracy %'])*100, 2)


Unnamed: 0_level_0,Accuracy %
zip,Unnamed: 1_level_1
10004,89.40
10005,94.54
10018,97.79
10035,91.82
10301,99.66
...,...
11434,98.39
11435,95.56
11436,96.68
11691,96.45


### Average Monthly Prediciton Accuracy for New York Zip Codes (%)

In [265]:
(1 - np.abs((((real_mean/predicted_mean).dropna()-1)))).mean()*100


97.17051631635708

### Median Monthly Prediction Accuracy

In [266]:
(1 - np.abs((((real_mean/predicted_mean).dropna()-1)))).median()*100

97.13189485163232

## Check prediction Accuracy for Entire Dataset
## On Individual Zip Code Level

In [267]:
predicted_all = pd.Series(np.exp(model_ols.predict(x_test_2019)),index = y_test_2019.index)

In [268]:
(1 - np.abs((zillow_mean.iloc[:,-1]/predicted_all).dropna()-1))*100

zip
1001     99.982725
1007     97.548709
1013     95.792145
1020     98.606227
1028     98.899665
           ...    
99567    96.936665
99577    96.446087
99645    95.505453
99654    96.438357
99705    99.810261
Length: 8044, dtype: float64

### Average Monthly Prediciton Accuracy

In [269]:
((1 - np.abs((zillow_mean.iloc[:,-1]/predicted_all).dropna()-1))*100).mean()

97.42427190419566

# Use Only Top 10 most important features

In [270]:
feat_import = sorted(list(zip(x_train.columns,model_gradient.feature_importances_)), key = lambda x: x[1],reverse = True)[0:10]



In [271]:
top_features = [i[0] for i in feat_import]
top_features

['zri_2015',
 'owner_occupied_housing_units_upper_value_quartile_2011',
 'zri_2014',
 'median_rent_2011',
 'owner_occupied_housing_units_median_value_2011',
 'median_rent_2013',
 'median_rent_2012',
 'percent_income_spent_on_rent_2013',
 'zri_2013',
 'median_income_2011']

In [272]:
top_x_train = x_train[top_features]
top_y_train = y_train

In [273]:
new_top = []

for i in top_features:
    i = re.sub('2015','2018',i)
    i = re.sub('2014','2017',i)
    i = re.sub('2013','2016',i)
    i = re.sub('2012','2015',i)
    i = re.sub('2011','2014',i)
    new_top.append(i)

new_top

['zri_2018',
 'owner_occupied_housing_units_upper_value_quartile_2014',
 'zri_2017',
 'median_rent_2014',
 'owner_occupied_housing_units_median_value_2014',
 'median_rent_2016',
 'median_rent_2015',
 'percent_income_spent_on_rent_2016',
 'zri_2016',
 'median_income_2014']

In [274]:
top_x_test = x_test_2019[new_top]
top_y_test = y_test_2019

### Already Logged

In [275]:
top_model_ols = LinearRegression()
top_model_lasso = Lasso()
top_model_ridge = Ridge()
top_model_gradient = GradientBoostingRegressor()

top_model_ols.fit(top_x_train, top_y_train)
top_model_lasso.fit(top_x_train, top_y_train)
top_model_ridge.fit(top_x_train, top_y_train)
top_model_gradient.fit(top_x_train, top_y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

### Train Scores

In [276]:
print("Simple Linear Train Score: %.4f" %top_model_ols.score(top_x_train, top_y_train))
print("Lasso Train Score: %.4f" %top_model_lasso.score(top_x_train, top_y_train))
print("Ridge Train Score: %.4f" %top_model_ridge.score(top_x_train, top_y_train))
print("Gradient Regressor Train Score: %.4f" %top_model_gradient.score(top_x_train, top_y_train))

Simple Linear Train Score: 0.9935
Lasso Train Score: 0.0000
Ridge Train Score: 0.9934
Gradient Regressor Train Score: 0.9942


### Test Scores

In [277]:

print('Simple Linear Test Score: %.4f'%top_model_ols.score(top_x_test, top_y_test))
print('Lasso Test Score: %.4f'%top_model_lasso.score(top_x_test, top_y_test))
print('Ridge Test Score: %.4f'%top_model_ridge.score(top_x_test, top_y_test))
print('Gradient Regressor Test Score: %.4f'%top_model_gradient.score(top_x_test, top_y_test))

Simple Linear Test Score: 0.9922
Lasso Test Score: -0.0094
Ridge Test Score: 0.9915
Gradient Regressor Test Score: 0.9899


### Kaggle Scores

In [278]:
print('Simple Linear Kaggle Score: %.5f' % sqrt(mean_squared_error(top_model_ols.predict(top_x_test), top_y_test)))
print('Lasso Kaggle Score: %.5f' % sqrt(mean_squared_error(top_model_lasso.predict(top_x_test), top_y_test)))
print('Ridge Kaggle Score: %.5f' % sqrt(mean_squared_error(top_model_ridge.predict(top_x_test), top_y_test)))
print('Gradient Regressor Kaggle Score: %.5f' % sqrt(mean_squared_error(top_model_gradient.predict(top_x_test), top_y_test)))


Simple Linear Kaggle Score: 0.03172
Lasso Kaggle Score: 0.36112
Ridge Kaggle Score: 0.03323
Gradient Regressor Kaggle Score: 0.03619


### Average Prediction Accuracy (Mean of ZRI)

In [279]:
top_predictions = pd.Series(np.exp(top_model_ols.predict(top_x_test)), index = top_y_test.index)

In [280]:
(1 - np.abs((zillow_mean.iloc[:, 8]/top_predictions).dropna()-1))

zip
1001     0.992872
1007     0.981271
1013     0.958552
1020     0.995817
1028     0.971192
           ...   
99567    0.962697
99577    0.958447
99645    0.965252
99654    0.972182
99705    0.999911
Length: 8044, dtype: float64

In [281]:
(1 - np.abs((zillow_mean.iloc[:, 8]/top_predictions).dropna()-1)).mean()*100

97.6195257831968

# Train on 2013, 2014, 2015 and predict 2018 (top 10 features)
## Test on  2014, 2015, 2016 and predict 2019

In [282]:
#top features for 2011,2012,2013 to predict 2016 model
top_features

['zri_2015',
 'owner_occupied_housing_units_upper_value_quartile_2011',
 'zri_2014',
 'median_rent_2011',
 'owner_occupied_housing_units_median_value_2011',
 'median_rent_2013',
 'median_rent_2012',
 'percent_income_spent_on_rent_2013',
 'zri_2013',
 'median_income_2011']

### Change them to features from 2013,2014, 2015 (and zri 2015,2016,2017)

In [283]:

updated_top = []

for i in top_features:
    i = re.sub('2015','2017',i)
    i = re.sub('2014','2016',i)
    i = re.sub('2013','2015',i)
    i = re.sub('2012','2014',i)
    i = re.sub('2011','2013',i)
    updated_top.append(i)

updated_top

['zri_2017',
 'owner_occupied_housing_units_upper_value_quartile_2013',
 'zri_2016',
 'median_rent_2013',
 'owner_occupied_housing_units_median_value_2013',
 'median_rent_2015',
 'median_rent_2014',
 'percent_income_spent_on_rent_2015',
 'zri_2015',
 'median_income_2013']

In [284]:
super_x_train = total[updated_top]
super_x_test = total[new_top]

In [285]:
super_y_train = total.iloc[:,7]
super_y_test = total.iloc[:,8]

### Log Everything

In [286]:
super_x_train = np.log(super_x_train+1)
super_x_test = np.log(super_x_test+1)
super_y_train = np.log(super_y_train)
super_y_test = np.log(super_y_test)

In [287]:
super_model_ols = LinearRegression()
super_model_lasso = Lasso()
super_model_ridge = Ridge()
super_model_gradient = GradientBoostingRegressor()

super_model_ols.fit(super_x_train, super_y_train)
super_model_lasso.fit(super_x_train, super_y_train)
super_model_ridge.fit(super_x_train, super_y_train)
super_model_gradient.fit(super_x_train, super_y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

### Train Score

In [288]:
print("Simple Regression Train Score is: %.4f" %super_model_ols.score(super_x_train, super_y_train))
print("Lasso Train Score is: %.4f" %super_model_lasso.score(super_x_train, super_y_train))
print("Ridge Train Score is: %.4f" %super_model_ridge.score(super_x_train, super_y_train))
print("Gradient Regressor Train Score is: %.4f" %super_model_gradient.score(super_x_train, super_y_train))

Simple Regression Train Score is: 0.9945
Lasso Train Score is: 0.0000
Ridge Train Score is: 0.9944
Gradient Regressor Train Score is: 0.9952


### Test Score

In [289]:
print("Simple Regression Test Score is: %.4f" %super_model_ols.score(super_x_test, super_y_test))
print("Lasso Test Score is: %.4f" %super_model_lasso.score(super_x_test, super_y_test))
print("Ridge Test Score is: %.4f" %super_model_ridge.score(super_x_test, super_y_test))
print("Gradient Regressor Test Score is: %.4f" %super_model_gradient.score(super_x_test, super_y_test))

Simple Regression Test Score is: 0.9927
Lasso Test Score is: -0.0007
Ridge Test Score is: 0.9926
Gradient Regressor Test Score is: 0.9922


### Kaggle Score

In [290]:
print('Simple Linear Kaggle Score: %.5f' % sqrt(mean_squared_error(super_model_ols.predict(super_x_test), super_y_test)))
print('Lasso Kaggle Score: %.5f' % sqrt(mean_squared_error(super_model_lasso.predict(super_x_test), super_y_test)))
print('Ridge Kaggle Score: %.5f' % sqrt(mean_squared_error(super_model_ridge.predict(super_x_test), super_y_test)))
print('Gradient Regressor Kaggle Score: %.5f' % sqrt(mean_squared_error(super_model_gradient.predict(super_x_test), super_y_test)))


Simple Linear Kaggle Score: 0.03062
Lasso Kaggle Score: 0.35957
Ridge Kaggle Score: 0.03102
Gradient Regressor Kaggle Score: 0.03166


### Average Prediction Accuracy (Mean of ZRI)

In [291]:
super_predictions = pd.Series(np.exp(super_model_ols.predict(super_x_test)), index = super_y_test.index)



In [292]:
(1 - np.abs((zillow_mean.iloc[:, 8]/super_predictions).dropna()-1)).mean()*100

97.71347051921157

# Adding IRS Data

In [293]:
irs2011 = pd.read_csv('11zpallagi.csv')
irs2012 = pd.read_csv('12zpallagi.csv')
irs2013 = pd.read_csv('13zpallagi.csv')
irs2014 = pd.read_csv('14zpallagi.csv')
irs2015 = pd.read_csv('15zpallagi.csv')
irs2016 = pd.read_csv('16zpallagi.csv')
irs2017 = pd.read_csv('17zpallagi.csv')

In [294]:
irs_mutual_columns = set(irs2013.columns).\
intersection(set(irs2014.columns)).intersection(set(irs2015.columns)).intersection(set(irs2016.columns)).\
intersection(set(irs2017.columns))

In [295]:
irs2013 = irs2013[irs_mutual_columns].drop(['STATEFIPS','STATE'], axis = 1)
irs2014 = irs2014[irs_mutual_columns].drop(['STATEFIPS','STATE'], axis = 1)
irs2015 = irs2015[irs_mutual_columns].drop(['STATEFIPS','STATE'], axis = 1)
irs2016 = irs2016[irs_mutual_columns].drop(['STATEFIPS','STATE'], axis = 1)
irs2017 = irs2017[irs_mutual_columns].drop(['STATEFIPS','STATE'], axis = 1)

In [296]:
irs2017.columns = list(map(lambda x: x + '_2017', irs2017.columns))

In [297]:
irs2017.columns = irs2017.columns.str.replace('zipcode_2017', 'zipcode')

## Each zipcode has 6 lines for 6 income brackets

## We add all the values to have total values

In [298]:
irs2013 = irs2013.groupby('zipcode').sum().reset_index()
irs2014 = irs2014.groupby('zipcode').sum().reset_index()
irs2015 = irs2015.groupby('zipcode').sum().reset_index()
irs2016 = irs2016.groupby('zipcode').sum().reset_index()
irs2017 = irs2017.groupby('zipcode').sum().reset_index()

In [299]:
IRS = irs2013.merge(irs2014, on = 'zipcode', suffixes = ('_2013','_2014')).\
merge(irs2015, on = 'zipcode').merge(irs2016, on = 'zipcode', suffixes = ('_2015','_2016')).\
merge(irs2017, on = 'zipcode').set_index('zipcode')

In [300]:
IRS.head()

Unnamed: 0_level_0,N00300_2013,PREP_2013,A01400_2013,N02650_2013,N18300_2013,N07300_2013,A03220_2013,N03300_2013,A19700_2013,agi_stub_2013,...,A18300_2017,A07100_2017,N03150_2017,A59660_2017,A18450_2017,A03230_2017,A04470_2017,A10300_2017,A02900_2017,A02650_2017
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,43523970.0,79049580.0,207388781.0,143724750.0,44050210.0,7010520.0,914929.0,808490.0,179233983.0,1071,...,615964939.0,61751724.0,2404180.0,64981097.0,19486012.0,1999524.0,1374069000.0,1664300000.0,151010505.0,11219490000.0
1001,3380.0,4970.0,12310.0,8780.0,3190.0,420.0,79.0,30.0,5028.0,21,...,26808.0,2778.0,160.0,1872.0,139.0,99.0,58309.0,61395.0,5987.0,535877.0
1002,4730.0,4750.0,19342.0,9570.0,3950.0,1290.0,111.0,230.0,15619.0,21,...,67002.0,4637.0,240.0,1711.0,248.0,359.0,118096.0,132658.0,18171.0,872920.0
1005,900.0,1230.0,3497.0,2230.0,840.0,80.0,32.0,0.0,1133.0,21,...,7750.0,922.0,40.0,489.0,45.0,0.0,16356.0,15507.0,1686.0,143893.0
1007,3630.0,3810.0,7883.0,7300.0,3180.0,420.0,98.0,60.0,4269.0,21,...,38217.0,3355.0,170.0,1053.0,102.0,172.0,71582.0,72788.0,7930.0,573583.0


In [301]:
IRS_total = pd.concat([zillow_mean,IRS], join = 'inner', axis = 1)

# Train 2013, 2014, 2015 to predict 2018
## Test 2014, 2015, 2016 to predict 2019 

In [302]:
irs_x_train = pd.concat([IRS_total.iloc[:,4:7],IRS_total.iloc[:,9:333]], join = 'inner', axis = 1)
irs_x_test = pd.concat([IRS_total.iloc[:,5:8],IRS_total.iloc[:,117:441]], join = 'inner', axis = 1)

In [303]:
irs_y_train = IRS_total.iloc[:,7]
irs_y_test = IRS_total.iloc[:,8]

In [304]:
good = list(set(irs_x_train[(irs_x_train < 0).sum(axis = 1) == 0].index).intersection(set(irs_x_test[(irs_x_test < 0).sum(axis = 1) == 0].index)))

    
len(good)
    

8609

In [305]:
irs_x_train = irs_x_train.loc[good,]
irs_x_test = irs_x_test.loc[good,]
irs_y_train = irs_y_train[good]
irs_y_test = irs_y_test[good]

### Log Everything

In [306]:
irs_x_train = np.log(irs_x_train + 1) 
irs_x_test = np.log(irs_x_test + 1)
irs_y_train = np.log(irs_y_train)
irs_y_test = np.log(irs_y_test)

### Modelling

In [307]:
irs_model_ols = LinearRegression()
irs_model_lasso = Lasso()
irs_model_ridge = Ridge()
irs_model_gradient = GradientBoostingRegressor()

irs_model_ols.fit(irs_x_train, irs_y_train)
irs_model_lasso.fit(irs_x_train, irs_y_train)
irs_model_ridge.fit(irs_x_train, irs_y_train)
irs_model_gradient.fit(irs_x_train, irs_y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

### Train Score

In [308]:
print("Simple Linear Train Score %.4f" %irs_model_ols.score(irs_x_train, irs_y_train))
print("Lasso Train Score %.4f" %irs_model_lasso.score(irs_x_train, irs_y_train))
print("Ridge Train Score %.4f" %irs_model_ridge.score(irs_x_train, irs_y_train))
print("Gradient Regressor Train Score %.4f" %irs_model_gradient.score(irs_x_train, irs_y_train))

Simple Linear Train Score 0.9951
Lasso Train Score 0.0000
Ridge Train Score 0.9949
Gradient Regressor Train Score 0.9951


### Test Score

In [309]:
print("Simple Linear Test Score %.4f" %irs_model_ols.score(irs_x_test, irs_y_test))
print("Lasso Test Score %.4f" %irs_model_lasso.score(irs_x_test, irs_y_test))
print("Ridge Tset Score %.4f" %irs_model_ridge.score(irs_x_test, irs_y_test))
print("Gradient Regressor Test Score %.4f" %irs_model_gradient.score(irs_x_test, irs_y_test))

Simple Linear Test Score 0.9920
Lasso Test Score -0.0007
Ridge Tset Score 0.9918
Gradient Regressor Test Score 0.9920


### Kaggle Score

In [310]:
print('Simple Linear Kaggle Score: %.5f' % sqrt(mean_squared_error(irs_model_ols.predict(irs_x_test), irs_y_test)))
print('Lasso Kaggle Score: %.5f' % sqrt(mean_squared_error(irs_model_lasso.predict(irs_x_test), irs_y_test)))
print('Ridge Kaggle Score: %.5f' % sqrt(mean_squared_error(irs_model_ridge.predict(irs_x_test), irs_y_test)))
print('Gradient Regressor Kaggle Score: %.5f' % sqrt(mean_squared_error(irs_model_gradient.predict(irs_x_test), irs_y_test)))


Simple Linear Kaggle Score: 0.03211
Lasso Kaggle Score: 0.35992
Ridge Kaggle Score: 0.03249
Gradient Regressor Kaggle Score: 0.03209


### Accuracy on Individual Zip Code Level

In [311]:
irs_pred = pd.Series(np.exp(irs_model_gradient.predict(irs_x_test)), index = irs_y_test.index)

In [312]:
(1 - np.abs((((zillow_mean.iloc[:,8]/irs_pred).dropna()-1)))).mean()*100

97.60823282393247

## Most important features top 20


In [313]:
sorted(list(zip(irs_x_train,model_gradient.feature_importances_)), key = lambda x: x[1], reverse = True)[:20]

[('zri_2017', 0.9948638707055147),
 ('N00300_2015', 0.0015618771865639365),
 ('zri_2016', 0.0006057610455675675),
 ('N03230_2014', 0.0005570436492386293),
 ('A02900_2013', 0.00032814561426754043),
 ('zri_2015', 5.6846575475015904e-05),
 ('A10600_2013', 5.23907287779398e-05),
 ('N01000_2014', 5.002187004812972e-05),
 ('A03230_2014', 3.951963863258817e-05),
 ('N05800_2014', 3.59644138298343e-05),
 ('A59720_2014', 3.1988812382353485e-05),
 ('N11902_2013', 2.9518603238510594e-05),
 ('A11902_2015', 1.5269406535607972e-05),
 ('A07180_2014', 1.4867773773053184e-05),
 ('N09600_2013', 1.4741614384466008e-05),
 ('N07230_2013', 1.3807294342471386e-05),
 ('N26270_2014', 1.3478398364001393e-05),
 ('N06500_2015', 1.2914974604256662e-05),
 ('A04800_2014', 9.94740228530818e-06),
 ('N10300_2015', 9.105441956929732e-06)]

# Train model again with 20 top feature

## Train 2013, 2014, 2015 to predict 2018
## Test 2014, 2015, 2016 to predict 2019 

In [314]:
irs_top = [i[0] for i in sorted(list(zip(irs_x_train,model_gradient.feature_importances_)), key = lambda x: x[1], reverse = True)[:20]]


In [315]:
irs_new_top = []

for i in irs_top:
    i = re.sub('2017','2018',i)
    i = re.sub('2016','2017',i)
    i = re.sub('2015','2016',i)
    i = re.sub('2014','2015',i)
    i = re.sub('2013','2014',i)
    irs_new_top.append(i)

irs_new_top

['zri_2018',
 'N00300_2016',
 'zri_2017',
 'N03230_2015',
 'A02900_2014',
 'zri_2016',
 'A10600_2014',
 'N01000_2015',
 'A03230_2015',
 'N05800_2015',
 'A59720_2015',
 'N11902_2014',
 'A11902_2016',
 'A07180_2015',
 'N09600_2014',
 'N07230_2014',
 'N26270_2015',
 'N06500_2016',
 'A04800_2015',
 'N10300_2016']

In [316]:
top_irs_x_train = irs_x_train[irs_top]
top_irs_x_test = irs_x_test[irs_new_top]

In [317]:
top_irs_y_train = irs_y_train.copy()
top_irs_y_test = irs_y_test.copy()

In [318]:
top_irs_model_ols = LinearRegression()
top_irs_model_lasso = Lasso()
top_irs_model_ridge = Ridge()
top_irs_model_gradient = GradientBoostingRegressor()

top_irs_model_ols.fit(top_irs_x_train, top_irs_y_train)
top_irs_model_lasso.fit(top_irs_x_train, top_irs_y_train)
top_irs_model_ridge.fit(top_irs_x_train, top_irs_y_train)
top_irs_model_gradient.fit(top_irs_x_train, top_irs_y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

### Train Score

In [319]:
print("Simple Linear Train Score %.4f" %top_irs_model_ols.score(top_irs_x_train, top_irs_y_train))
print("Lasso Train Score %.4f" %top_irs_model_lasso.score(top_irs_x_train, top_irs_y_train))
print("Ridge Train Score %.4f" %top_irs_model_ridge.score(top_irs_x_train, top_irs_y_train))
print("Gradient Regressor Train Score %.4f" %top_irs_model_gradient.score(top_irs_x_train, top_irs_y_train))

Simple Linear Train Score 0.9942
Lasso Train Score 0.0000
Ridge Train Score 0.9940
Gradient Regressor Train Score 0.9947


### Test Score

In [320]:
print("Simple Linear Test Score %.4f" %top_irs_model_ols.score(top_irs_x_test, top_irs_y_test))
print("Lasso Test Score %.4f" %top_irs_model_lasso.score(top_irs_x_test, top_irs_y_test))
print("Ridge Test Score %.4f" %top_irs_model_ridge.score(top_irs_x_test, top_irs_y_test))
print("Gradient Regressor Test Score %.4f" %top_irs_model_gradient.score(top_irs_x_test, top_irs_y_test))

Simple Linear Test Score 0.9924
Lasso Test Score -0.0007
Ridge Test Score 0.9922
Gradient Regressor Test Score 0.9919


### Kaggle Score

In [321]:
print('Simple Linear Kaggle Score: %.5f' % sqrt(mean_squared_error(top_irs_model_ols.predict(top_irs_x_test), top_irs_y_test)))
print('Lasso Kaggle Score: %.5f' % sqrt(mean_squared_error(top_irs_model_lasso.predict(top_irs_x_test), top_irs_y_test)))
print('Ridge Kaggle Score: %.5f' % sqrt(mean_squared_error(top_irs_model_ridge.predict(top_irs_x_test), top_irs_y_test)))
print('Gradient Regressor Kaggle Score: %.5f' % sqrt(mean_squared_error(top_irs_model_gradient.predict(top_irs_x_test), top_irs_y_test)))


Simple Linear Kaggle Score: 0.03131
Lasso Kaggle Score: 0.35992
Ridge Kaggle Score: 0.03170
Gradient Regressor Kaggle Score: 0.03242


### Average Prediction Accuracy on Zip Code Level

In [322]:
top_irs_pred = pd.Series(np.exp(top_irs_model_ols.predict(top_irs_x_test)), index = top_irs_y_test.index)

In [323]:
(1 - np.abs((((zillow_mean.iloc[:,8]/top_irs_pred).dropna()-1)))).mean()*100

97.67931067665997

# Prediction Accuracy for the average of Census and IRS

In [324]:
ensemble_pred = pd.concat([super_predictions, top_irs_pred], join = 'inner', axis = 1).apply(np.mean, axis = 1)

In [325]:
ensemble_real_pred = zillow_mean.iloc[:,8][pd.concat([super_predictions, top_irs_pred], join = 'inner', axis = 1).apply(np.mean, axis = 1).index]

In [326]:
(1 - np.abs((((ensemble_real_pred/ensemble_pred).dropna()-1)))).mean()*100

97.73114730561025

In [327]:
sqrt(mean_squared_error(ensemble_pred, ensemble_real_pred))

47.40554807945078

## Kaggle Score for Average of Census Predictions and IRS predictions

In [328]:
sqrt(mean_squared_error(np.log(ensemble_pred),np.log(ensemble_real_pred)))

0.030486831621902008

## Second Level Model

In [329]:
second_pred = pd.concat([super_predictions, top_irs_pred], join = 'inner', axis = 1)
second_pred.columns = ['census','irs']

In [330]:
second_total = pd.concat([second_pred, ensemble_real_pred], join ='inner', axis = 1)
second_total.head()

Unnamed: 0,census,irs,zri_2019
60657,2099.718002,2087.066512,2085.833333
77494,1793.032662,1803.066136,1763.416667
60614,2313.650874,2304.574368,2306.583333
77084,1427.91569,1430.365711,1404.25
79936,985.924263,980.331482,947.833333


In [331]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(second_total.iloc[:,:2], second_total.iloc[:,2], test_size=0.20, random_state=42)

X_train = np.log(X_train)
X_test = np.log(X_test)
Y_train = np.log(Y_train)
Y_test = np.log(Y_test)

second_model_ols = LinearRegression()
second_model_ridge = Ridge()
second_model_gradient = GradientBoostingRegressor()

second_model_ols.fit(X_train, Y_train)
second_model_ridge.fit(X_train, Y_train)
second_model_gradient.fit(X_train, Y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

### Train Score

In [332]:
print(second_model_ols.score(X_train, Y_train))
print(second_model_ridge.score(X_train, Y_train))
print(second_model_gradient.score(X_train, Y_train))

0.9930849865351372
0.9930749312454448
0.9939108379023943


### Test Score

In [333]:
print(second_model_ols.score(X_test, Y_test))
print(second_model_ridge.score(X_test, Y_test))
print(second_model_gradient.score(X_test, Y_test))

0.992965212556897
0.9929688256524375
0.9927442259674047


### Kaggle Score

In [334]:
print(sqrt(mean_squared_error(second_model_ols.predict(X_test), Y_test)))
print(sqrt(mean_squared_error(second_model_ridge.predict(X_test), Y_test)))
print(sqrt(mean_squared_error(second_model_gradient.predict(X_test), Y_test)))

0.030167967981251573
0.030160219790511297
0.030638143298960804


### Accuracy

In [335]:
(1 - np.abs(np.exp(Y_test)/pd.Series(np.exp(second_model_ols.predict(X_test)), index = Y_test.index) - 1).mean())*100

97.7998369731858

# Check Accuracy Just for New York City Zip Codes

In [336]:
nyc_ensemble_pred = pd.concat([nyc_zip,ensemble_pred],join = 'inner',axis = 1).drop(['borough'], axis = 1)

In [337]:
nyc_ensemble_real = pd.concat([nyc_zip,ensemble_real_pred],join = 'inner',axis = 1).drop(['borough'], axis = 1)

In [338]:
nyc_ensemble_pred.columns = ['zri_2019']

In [339]:
(1 - np.abs((((nyc_ensemble_real.zri_2019/nyc_ensemble_pred.zri_2019).dropna()-1)))).mean()*100

98.10369832366912

# Train New Models with Data From both Census and IRS
## top 10 features from Census and top 20 from IRS

In [340]:
super_x_train.head() #best Census Features

Unnamed: 0_level_0,zri_2017,owner_occupied_housing_units_upper_value_quartile_2013,zri_2016,median_rent_2013,owner_occupied_housing_units_median_value_2013,median_rent_2015,median_rent_2014,percent_income_spent_on_rent_2015,zri_2015,median_income_2013
zip,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
60657,7.663134,13.297326,7.669223,7.037906,12.880294,7.107425,7.079184,3.310543,7.637475,11.236987
77494,7.479347,12.769683,7.50705,7.05445,12.476485,7.134094,7.056175,3.198673,7.561598,11.747412
60614,7.760041,13.566666,7.761106,7.117206,13.100733,7.166266,7.134891,3.238678,7.728416,11.364611
77084,7.2253,11.959539,7.254944,6.761573,11.730461,6.785588,6.781058,3.328627,7.25712,11.047487
79936,6.901318,11.970357,6.920589,6.650279,11.709322,6.6995,6.690842,3.453157,6.923465,10.85948


In [341]:
top_irs_x_train.head()  #best IRS Features

Unnamed: 0,zri_2017,N00300_2015,zri_2016,N03230_2014,A02900_2013,zri_2015,A10600_2013,N01000_2014,A03230_2014,N05800_2014,A59720_2014,N11902_2013,A11902_2015,A07180_2014,N09600_2013,N07230_2013,N26270_2014,N06500_2015,A04800_2014,N10300_2015
32771,7.092504,8.407602,7.069094,5.442418,9.779906,6.990793,12.2135,8.006701,6.202536,9.740439,9.549452,9.811427,10.986682,6.563856,5.602119,7.390799,7.340187,9.656371,13.728684,9.78307
32773,7.084017,7.403061,7.057468,4.394449,8.611958,6.976504,11.120031,6.734592,5.288267,9.172742,9.221676,9.361429,10.439337,5.880533,3.433987,6.878326,6.111467,9.047939,12.602608,9.197356
98310,7.30418,7.804251,7.250872,4.26268,8.234034,7.21297,10.874039,7.03966,5.003946,8.906664,8.091015,8.902592,9.807527,5.075174,0.0,6.154858,5.442418,8.866582,12.567607,8.905309
98311,7.350516,8.253488,7.298614,4.875197,8.730367,7.233155,11.304695,7.359468,5.587249,9.192278,8.054205,9.12598,10.109119,5.631212,3.713572,6.769642,5.831882,9.15599,12.998549,9.192278
98312,7.294547,8.266421,7.234418,4.875197,8.921725,7.190864,11.478738,7.534228,5.402677,9.332646,8.434464,9.281823,10.292586,5.594711,4.110874,6.685861,6.175867,9.301186,13.14505,9.339701


In [342]:
full_x_train = super_x_train.merge(top_irs_x_train, left_index= True, right_index = True, suffixes = ['','_extra']).\
drop(['zri_2015_extra','zri_2016_extra','zri_2017_extra'], axis = 1)

In [343]:
full_y_train = pd.concat([super_y_train,top_irs_y_train], join = 'inner', axis = 1).iloc[:,0]

In [344]:
full_x_test = super_x_test.merge(top_irs_x_test, left_index= True, right_index = True, suffixes = ['','_extra']).\
drop(['zri_2016_extra','zri_2017_extra','zri_2018_extra'], axis = 1)

In [345]:
full_y_test = pd.concat([super_y_test,top_irs_y_test], join = 'inner', axis = 1).iloc[:,0]

### Everything is already logged and cleaned

In [346]:
full_model_ols = LinearRegression()
full_model_lasso = Lasso()
full_model_ridge = Ridge()
full_model_gradient = GradientBoostingRegressor()


full_model_ols.fit(full_x_train, full_y_train)
full_model_lasso.fit(full_x_train, full_y_train)
full_model_ridge.fit(full_x_train, full_y_train)
full_model_gradient.fit(full_x_train, full_y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

### Train Scores

In [347]:
print("Simple Linear Train Score %.4f" %full_model_ols.score(full_x_train, full_y_train))
print("Lasso Train Score %.4f" %full_model_lasso.score(full_x_train, full_y_train))
print("Ridge Train Score %.4f" %full_model_ridge.score(full_x_train, full_y_train))
print("Gradient Regressor Train Score %.4f" %full_model_gradient.score(full_x_train, full_y_train))

Simple Linear Train Score 0.9946
Lasso Train Score 0.0000
Ridge Train Score 0.9945
Gradient Regressor Train Score 0.9953


### Test Score

In [348]:
print("Simple Linear Test Score %.4f" %full_model_ols.score(full_x_test, full_y_test))
print("Lasso Test Score %.4f" %full_model_lasso.score(full_x_test, full_y_test))
print("Ridge Test Score %.4f" %full_model_ridge.score(full_x_test, full_y_test))
print("Gradient Regressor Test Score %.4f" %full_model_gradient.score(full_x_test, full_y_test))

Simple Linear Test Score 0.9928
Lasso Test Score -0.0007
Ridge Test Score 0.9926
Gradient Regressor Test Score 0.9922


### Kaggle Score

In [349]:
print('Simple Linear Kaggle Score: %.5f' % sqrt(mean_squared_error(full_model_ols.predict(full_x_test), full_y_test)))
print('Lasso Kaggle Score: %.5f' % sqrt(mean_squared_error(full_model_lasso.predict(full_x_test), full_y_test)))
print('Ridge Kaggle Score: %.5f' % sqrt(mean_squared_error(full_model_ridge.predict(full_x_test), full_y_test)))
print('Gradient Regressor Kaggle Score: %.5f' % sqrt(mean_squared_error(full_model_gradient.predict(full_x_test), full_y_test)))


Simple Linear Kaggle Score: 0.03042
Lasso Kaggle Score: 0.35944
Ridge Kaggle Score: 0.03085
Gradient Regressor Kaggle Score: 0.03166


### Average Prediction Accuracy

In [350]:
full_predictions = pd.Series(np.exp(full_model_ols.predict(full_x_test)), index = full_y_test.index)

In [351]:
(1 - np.abs(np.exp(full_y_test)/full_predictions.dropna()-1)).mean()*100

97.73274005052698

In [353]:
sqrt(mean_squared_error(np.exp(full_y_test),full_predictions))

47.539837848986885

## Most important features (top 10)

In [1062]:
list(zip(full_x_train,full_model_gradient.feature_importances_))[0:10]

[('zri_2017', 0.9964972298282635),
 ('owner_occupied_housing_units_upper_value_quartile_2013',
  0.00016489029873185417),
 ('median_rent_2013', 0.0009996365066169484),
 ('zri_2016', 0.0007572246541866352),
 ('owner_occupied_housing_units_median_value_2013', 0.0002071511528764383),
 ('median_rent_2015', 9.15080094538218e-05),
 ('median_rent_2014', 8.605613455256225e-05),
 ('percent_income_spent_on_rent_2015', 3.889426338914603e-05),
 ('zri_2015', 0.0006904212360972952),
 ('median_income_2013', 9.947224404335491e-05)]

## Train Model with just top 10 features

In [1214]:
top_10_full_train = [i[0] for i in list(zip(full_x_train,full_model_gradient.feature_importances_))[0:10]]

In [1215]:
top_10_full_test = []

for i in top_10_full_train:
    i = re.sub('2017','2018',i)
    i = re.sub('2016','2017',i)
    i = re.sub('2015','2016',i)
    i = re.sub('2014','2015',i)
    i = re.sub('2013','2014',i)
    top_10_full_test.append(i)

top_10_full_test

['zri_2018',
 'owner_occupied_housing_units_upper_value_quartile_2014',
 'median_rent_2014',
 'zri_2017',
 'owner_occupied_housing_units_median_value_2014',
 'median_rent_2016',
 'median_rent_2015',
 'percent_income_spent_on_rent_2016',
 'zri_2016',
 'median_income_2014']

In [1216]:
top_full_x_train = full_x_train[top_10_full_train]

In [1217]:
top_full_x_test = full_x_test[top_10_full_test]

In [1218]:
top_full_y_train = full_y_train

In [1219]:
top_full_y_test = full_y_test

In [1220]:
top_full_model_ols = LinearRegression()
top_full_model_lasso = Lasso()
top_full_model_ridge = Ridge()
top_full_model_gradient = GradientBoostingRegressor()


top_full_model_ols.fit(top_full_x_train, top_full_y_train)
top_full_model_lasso.fit(top_full_x_train, top_full_y_train)
top_full_model_ridge.fit(top_full_x_train, top_full_y_train)
top_full_model_gradient.fit(top_full_x_train, top_full_y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

### Train Score

In [1221]:
print("Simple Linear Train Score %.4f" %top_full_model_ols.score(top_full_x_train, top_full_y_train))
print("Lasso Train Score %.4f" %top_full_model_lasso.score(top_full_x_train, top_full_y_train))
print("Ridge Train Score %.4f" %top_full_model_ridge.score(top_full_x_train, top_full_y_train))
print("Gradient Train Score %.4f" %top_full_model_gradient.score(top_full_x_train, top_full_y_train))

Simple Linear Train Score 0.9945
Lasso Train Score 0.0000
Ridge Train Score 0.9944
Gradient Train Score 0.9952


### Test Score

In [1222]:
print("Simple Linear Test Score %.4f" %top_full_model_ols.score(top_full_x_test, top_full_y_test))
print("Lasso Test Score %.4f" %top_full_model_lasso.score(top_full_x_test, top_full_y_test))
print("Ridge Test Score %.4f" %top_full_model_ridge.score(top_full_x_test, top_full_y_test))
print("Gradient Test Score %.4f" %top_full_model_gradient.score(top_full_x_test, top_full_y_test))

Simple Linear Test Score 0.9928
Lasso Test Score -0.0007
Ridge Test Score 0.9926
Gradient Test Score 0.9922


### Kaggle Score

In [1223]:
print('Simple Linear Kaggle Score: %.5f' % sqrt(mean_squared_error(top_full_model_ols.predict(top_full_x_test), top_full_y_test)))
print('Lasso Kaggle Score: %.5f' % sqrt(mean_squared_error(top_full_model_lasso.predict(top_full_x_test), top_full_y_test)))
print('Ridge Kaggle Score: %.5f' % sqrt(mean_squared_error(top_full_model_ridge.predict(top_full_x_test), top_full_y_test)))
print('Gradient Regressor Kaggle Score: %.5f' % sqrt(mean_squared_error(top_full_model_gradient.predict(top_full_x_test), top_full_y_test)))


Simple Linear Kaggle Score: 0.03052
Lasso Kaggle Score: 0.35944
Ridge Kaggle Score: 0.03092
Gradient Regressor Kaggle Score: 0.03169


### Average Prediction Accuracy on Zip Code Level

In [1224]:
top_full_predictions = pd.Series(np.exp(top_full_model_ols.predict(top_full_x_test)), index = top_full_y_test.index)



In [1225]:
(1 - np.abs(np.exp(top_full_y_test)/top_full_predictions.dropna()-1)).mean()*100

97.7212230760227

# Let's Make a 2nd Level Lasso and Ridge from Simple Linear and Ridge

In [1155]:
train_full_model_pred_simple = pd.Series(full_model_ols.predict(full_x_train), index = full_y_train.index)
train_full_model_pred_ridge =pd.Series(full_model_ridge.predict(full_x_train), index = full_y_train.index)

In [1032]:
second_x_train = pd.concat([train_full_model_pred_simple,train_full_model_pred_ridge], join = 'inner',axis = 1)
second_y_train = full_y_train

In [1033]:
test_full_model_pred_simple = pd.Series(full_model_ols.predict(full_x_test), index = full_y_test.index)
test_full_model_pred_ridge =pd.Series(full_model_ridge.predict(full_x_test), index = full_y_test.index)

In [1034]:
second_x_test = pd.concat([test_full_model_pred_simple,test_full_model_pred_ridge], join = 'inner',axis = 1)
second_y_test = full_y_test

In [1043]:
from sklearn.model_selection import GridSearchCV

second_lasso = Lasso()

params = {'alpha':(0.001, 0.005, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8,0.9,0.99),\
          'fit_intercept': [True, False], 'max_iter': [5000]}

grid_search_lasso = GridSearchCV( estimator = second_lasso, param_grid = params)

grid_search_lasso.fit(second_x_train, second_y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=1000, normalize=False, positive=False,
                             precompute=False, random_state=None,
                             selection='cyclic', tol=0.0001, warm_start=False),
             iid='warn', n_jobs=None,
             param_grid={'alpha': (0.001, 0.005, 0.01, 0.1, 0.2, 0.3, 0.4, 0.5,
                                   0.6, 0.7, 0.8, 0.9, 0.99),
                         'fit_intercept': [True, False], 'max_iter': [5000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=0)

In [1047]:
grid_search_lasso.best_estimator_.score(second_x_train, second_y_train)

0.9947113580023164

In [1067]:
full_model_ols.score(full_x_train, full_y_train)

0.9947115077838292

In [1068]:
grid_search_lasso.best_estimator_.score(second_x_test, second_y_test)

0.9928502445606265

In [1069]:
full_model_ols.score(full_x_test, full_y_test)

0.9928522766888772

## 2nd Level Ridge

In [1128]:
params3 = {'alpha':np.linspace(-1,1, 1000)}
second_ridge = Ridge()

grid_search_ridge = GridSearchCV(estimator = second_ridge, param_grid = params3)

grid_search_ridge.fit(second_x_train, second_y_train)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True,
                             max_iter=None, normalize=False, random_state=None,
                             solver='auto', tol=0.001),
             iid='warn', n_jobs=None,
             param_grid={'alpha': array([-1.        , -0.997998  , -0.995996  , -0.99399399, -0.99199199,
       -0.98998999, -0.98798799, -0.98598599, -0.98398398, -0.98198198,
       -0.9799...
        0.94194194,  0.94394394,  0.94594595,  0.94794795,  0.94994995,
        0.95195195,  0.95395395,  0.95595596,  0.95795796,  0.95995996,
        0.96196196,  0.96396396,  0.96596597,  0.96796797,  0.96996997,
        0.97197197,  0.97397397,  0.97597598,  0.97797798,  0.97997998,
        0.98198198,  0.98398398,  0.98598599,  0.98798799,  0.98998999,
        0.99199199,  0.99399399,  0.995996  ,  0.997998  ,  1.        ])},
             pre_dispatch='2*n_jobs', refit=True, return_train_sc

In [1129]:
grid_search_ridge.best_estimator_.score(second_x_train, second_y_train)

0.9947072704037324

In [1130]:
grid_search_ridge.best_estimator_.score(second_x_test, second_y_test)

0.9928348563868646

## Voting Regressor 1st Level

In [1]:
from sklearn.ensemble import VotingRegressor

VR = VotingRegressor([('ols',full_model_ols),('rg', full_model_ridge),('gr',full_model_gradient)])



NameError: name 'full_model_ols' is not defined

In [None]:
VR.fit(full_x_train, full_y_train)

In [1133]:
VR.score(full_x_train, full_y_train)

0.9950172051314589

In [1134]:
VR.score(full_x_test, full_y_test)

0.9927464250302286

In [1135]:
full_model_ols.score(full_x_test, full_y_test)

0.9928522766888772