In [1]:
import matplotlib.cm as cm
import matplotlib.font_manager as fm
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from patsy import dmatrices
from scipy.stats import pearsonr, ttest_rel

%matplotlib inline

In [4]:
# Load the listings table from the HDF5 file. The store is opened read-only and
# closed as soon as the table has been read, so no file handle is left open for
# the rest of the session.
with pd.HDFStore('data/filtered_listings.h5', mode='r') as store:
    rents = store['rents']

In [53]:
# Boolean flags for listings from January, February and March 2017.
# `rents` also holds 2016 rows (see the `year` column), so the year is tested
# explicitly instead of being implied by the month number alone.
in_2017 = rents['year'] == 2017
rents['y17jan'] = in_2017 & (rents['month'] == 1)
rents['y17feb'] = in_2017 & (rents['month'] == 2)
rents['y17mar'] = in_2017 & (rents['month'] == 3)

In [54]:
# Summary statistics (count, mean, std, quartiles, min/max) for the full listings table.
rents.describe()

Unnamed: 0,rent,bedrooms,sqft,rent_sqft,longitude,latitude,bathrooms,week,month,year,median_income,population
count,2908942.0,2908942.0,2908942.0,2908942.0,2908942.0,2908942.0,1726125.0,2908942.0,2908942.0,2908942.0,2908942.0,2908942.0
mean,1493.998,1.754201,1004.015,1.595298,-97.22085,36.71698,1.473283,25.80334,6.328678,2016.533,61445.0,4517404.0
std,862.3287,0.935379,431.9104,0.8620291,17.28631,5.212311,0.5973946,21.87013,5.013247,0.4989424,11451.5,3539324.0
min,78.0,0.0,216.0,0.06198347,-159.6241,-84.50782,0.0,1.0,1.0,2016.0,43423.0,375751.0
25%,920.0,1.0,725.0,0.9961538,-115.3218,33.25834,1.0,5.0,1.0,2016.0,52689.0,2069681.0
50%,1291.0,2.0,925.0,1.352273,-95.5607,36.2604,1.0,10.0,3.0,2017.0,60072.0,3263431.0
75%,1795.0,2.0,1158.0,1.95,-81.28679,39.8977,2.0,49.0,12.0,2017.0,66870.0,6033737.0
max,9999.0,9.0,4498.0,8.431373,2.317761,165.383,8.0,52.0,12.0,2017.0,91193.0,20092880.0


In [57]:
# Listings whose region is 'sfbay', followed by summary statistics for that subset.
is_sfbay = rents['region'] == 'sfbay'
sfbay = rents[is_sfbay]
sfbay.describe()

Unnamed: 0,rent,bedrooms,sqft,rent_sqft,longitude,latitude,bathrooms,week,month,year,median_income,population
count,128379.0,128379.0,128379.0,128379.0,128379.0,128379.0,66682.0,128379.0,128379.0,128379.0,128379.0,128379.0
mean,2862.479152,1.763474,1011.005881,3.050018,-122.129394,37.636303,1.489487,28.248787,6.897577,2016.489208,80600.0,8607423.0
std,1120.002811,0.97211,451.229335,1.033937,0.863566,0.334485,0.613397,21.131186,4.877076,0.499885,0.0,0.0
min,100.0,0.0,216.0,0.075,-124.0317,23.152315,0.0,1.0,1.0,2016.0,80600.0,8607423.0
25%,2145.0,1.0,715.0,2.4,-122.328508,37.376501,1.0,8.0,2.0,2016.0,80600.0,8607423.0
50%,2600.0,2.0,922.0,2.880435,-122.072385,37.627998,1.0,46.0,11.0,2016.0,80600.0,8607423.0
75%,3321.0,2.0,1175.0,3.548896,-121.959339,37.800253,2.0,49.0,12.0,2017.0,80600.0,8607423.0
max,9995.0,8.0,4440.0,8.348018,2.317761,48.887051,6.0,52.0,12.0,2017.0,80600.0,8607423.0


In [58]:
# Outlier bounds for the sfbay subset: the values found at the 0.2% and 99.8%
# positions of each sorted column (rent_sqft, rent, sqft).
dset = sfbay
lower_percentile, upper_percentile = 0.002, 0.998

# positional ranks of the two cut points within a sorted column (truncated to int)
n_rows = len(dset)
lower = int(n_rows * lower_percentile)
upper = int(n_rows * upper_percentile)

# rent per square foot at the two cut points
rent_sqft_sorted = dset['rent_sqft'].sort_values()
lower_rent_sqft = rent_sqft_sorted.iloc[lower]
upper_rent_sqft = rent_sqft_sorted.iloc[upper]

# rent at the two cut points
rent_sorted = dset['rent'].sort_values()
lower_rent = rent_sorted.iloc[lower]
upper_rent = rent_sorted.iloc[upper]

# square footage at the two cut points
sqft_sorted = dset['sqft'].sort_values()
lower_sqft = sqft_sorted.iloc[lower]
upper_sqft = sqft_sorted.iloc[upper]

for label, bounds in (
    ('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft]),
    ('valid rent range:', [lower_rent, upper_rent]),
    ('valid sqft range:', [lower_sqft, upper_sqft]),
):
    print(label, bounds)

valid rent_sqft range: [0.51446945337620598, 7.2709163346613499]
valid rent range: [740.0, 8750.0]
valid sqft range: [266.0, 3500.0]


In [59]:
# Boolean masks keeping rows strictly between the lower and upper bounds.
# This cell reuses `dset` and the lower_*/upper_* values computed earlier, so
# those must currently hold the sfbay values when it runs.
rent_sqft_mask = dset['rent_sqft'].gt(lower_rent_sqft) & dset['rent_sqft'].lt(upper_rent_sqft)
rent_mask = dset['rent'].gt(lower_rent) & dset['rent'].lt(upper_rent)
sqft_mask = dset['sqft'].gt(lower_sqft) & dset['sqft'].lt(upper_sqft)

# a listing is kept only when it passes all three masks
sfbay_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
sfbay_filtered.shape[0]

127046

In [60]:
# OLS of log(rent) on log(sqft), bedrooms and bathrooms for the filtered sfbay listings.
# statsmodels (sm), numpy (np) and patsy's dmatrices come from the import cell at
# the top of the notebook; np must be in scope because the formula calls np.log.
# The formula literal is kept exactly as written; its line continuation only
# appends trailing spaces to the string.
formula = 'np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 '
# NOTE(review): patsy drops rows with a missing value in any formula term by
# default, so fewer rows than len(sfbay_filtered) may enter the fit
# (compare "No. Observations" in the summary).
y, X = dmatrices(formula, data=sfbay_filtered, return_type='dataframe')

mod = sm.OLS(y, X)
res = mod.fit()

# keep the pieces of the fit under their own names
observed = y
predicted = res.fittedvalues
residuals = res.resid

print(res.summary())

                            OLS Regression Results                            
Dep. Variable:           np.log(rent)   R-squared:                       0.349
Model:                            OLS   Adj. R-squared:                  0.349
Method:                 Least Squares   F-statistic:                 1.178e+04
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:18:33   Log-Likelihood:                -8872.7
No. Observations:               65988   AIC:                         1.775e+04
Df Residuals:                   65984   BIC:                         1.779e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        4.2813      0.034    124.822   

In [61]:
# Listings whose region is 'detroit', followed by summary statistics for that subset.
is_detroit = rents['region'] == 'detroit'
detroit = rents[is_detroit]
detroit.describe()

Unnamed: 0,rent,bedrooms,sqft,rent_sqft,longitude,latitude,bathrooms,week,month,year,median_income,population
count,32713.0,32713.0,32713.0,32713.0,32713.0,32713.0,19846.0,32713.0,32713.0,32713.0,32713.0,32713.0
mean,983.096659,1.937639,1018.82053,1.000261,-83.205387,42.454045,1.334551,25.245438,6.185064,2016.542231,52462.0,4296611.0
std,392.477472,0.859662,346.534819,0.342152,0.868552,0.272896,0.509521,22.119471,5.085776,0.498221,0.0,0.0
min,79.0,0.0,240.0,0.092402,-122.5937,26.287874,0.0,1.0,1.0,2016.0,52462.0,4296611.0
25%,755.0,1.0,815.0,0.824,-83.311592,42.325125,1.0,4.0,1.0,2016.0,52462.0,4296611.0
50%,880.0,2.0,950.0,0.938889,-83.212336,42.479646,1.0,10.0,3.0,2017.0,52462.0,4296611.0
75%,1075.0,2.0,1137.0,1.097946,-83.0388,42.5877,1.5,49.0,12.0,2017.0,52462.0,4296611.0
max,6995.0,7.0,4468.0,5.752467,-68.7918,44.8242,6.0,52.0,12.0,2017.0,52462.0,4296611.0


In [62]:
# Outlier bounds for the detroit subset: the values found at the 0.2% and 99.8%
# positions of each sorted column (rent_sqft, rent, sqft).
dset = detroit
lower_percentile, upper_percentile = 0.002, 0.998

# positional ranks of the two cut points within a sorted column (truncated to int)
n_rows = len(dset)
lower = int(n_rows * lower_percentile)
upper = int(n_rows * upper_percentile)

# rent per square foot at the two cut points
rent_sqft_sorted = dset['rent_sqft'].sort_values()
lower_rent_sqft = rent_sqft_sorted.iloc[lower]
upper_rent_sqft = rent_sqft_sorted.iloc[upper]

# rent at the two cut points
rent_sorted = dset['rent'].sort_values()
lower_rent = rent_sorted.iloc[lower]
upper_rent = rent_sorted.iloc[upper]

# square footage at the two cut points
sqft_sorted = dset['sqft'].sort_values()
lower_sqft = sqft_sorted.iloc[lower]
upper_sqft = sqft_sorted.iloc[upper]

for label, bounds in (
    ('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft]),
    ('valid rent range:', [lower_rent, upper_rent]),
    ('valid sqft range:', [lower_sqft, upper_sqft]),
):
    print(label, bounds)

valid rent_sqft range: [0.33333333333333298, 3.2418952618453898]
valid rent range: [400.0, 3655.0]
valid sqft range: [425.0, 3450.0]


In [63]:
# Boolean masks keeping rows strictly between the lower and upper bounds.
# This cell reuses `dset` and the lower_*/upper_* values computed earlier, so
# those must currently hold the detroit values when it runs.
rent_sqft_mask = dset['rent_sqft'].gt(lower_rent_sqft) & dset['rent_sqft'].lt(upper_rent_sqft)
rent_mask = dset['rent'].gt(lower_rent) & dset['rent'].lt(upper_rent)
sqft_mask = dset['sqft'].gt(lower_sqft) & dset['sqft'].lt(upper_sqft)

# a listing is kept only when it passes all three masks
detroit_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
detroit_filtered.shape[0]

32274

In [64]:
# OLS of log(rent) on log(sqft), bedrooms and bathrooms for the filtered detroit listings.
# The formula literal is kept exactly as written; its line continuation only
# appends trailing spaces to the string.
formula = 'np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 '
# NOTE(review): patsy drops rows with a missing value in any formula term by
# default, so fewer rows than len(detroit_filtered) may enter the fit.
y, X = dmatrices(formula, data=detroit_filtered, return_type='dataframe')

mod = sm.OLS(y, X)
res = mod.fit()

# keep the pieces of the fit under their own names
observed = y
predicted = res.fittedvalues
residuals = res.resid

print(res.summary())

                            OLS Regression Results                            
Dep. Variable:           np.log(rent)   R-squared:                       0.364
Model:                            OLS   Adj. R-squared:                  0.364
Method:                 Least Squares   F-statistic:                     3737.
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:19:33   Log-Likelihood:                 485.68
No. Observations:               19571   AIC:                            -963.4
Df Residuals:                   19567   BIC:                            -931.8
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        2.8048      0.065     43.349   

In [65]:
# Listings whose region is 'denver', followed by summary statistics for that subset.
is_denver = rents['region'] == 'denver'
denver = rents[is_denver]
denver.describe()

Unnamed: 0,rent,bedrooms,sqft,rent_sqft,longitude,latitude,bathrooms,week,month,year,median_income,population
count,114153.0,114153.0,114153.0,114153.0,114153.0,114153.0,69617.0,114153.0,114153.0,114153.0,114153.0,114153.0
mean,1486.556981,1.629848,964.084509,1.632192,-104.958663,39.711213,1.43024,25.843245,6.346824,2016.534152,66870.0,2754258.0
std,549.367331,0.875,438.035934,0.435288,0.492116,0.208812,0.594603,21.86221,5.004334,0.498834,0.0,0.0
min,79.0,0.0,220.0,0.087004,-148.924302,25.468229,0.0,1.0,1.0,2016.0,66870.0,2754258.0
25%,1143.0,1.0,700.0,1.36612,-105.020542,39.654307,1.0,5.0,1.0,2016.0,66870.0,2754258.0
50%,1370.0,2.0,865.0,1.554455,-104.9686,39.710799,1.0,10.0,3.0,2017.0,66870.0,2754258.0
75%,1698.0,2.0,1100.0,1.83224,-104.884853,39.757006,2.0,49.0,12.0,2017.0,66870.0,2754258.0
max,7900.0,6.0,4497.0,7.531561,-71.2983,60.924252,8.0,52.0,12.0,2017.0,66870.0,2754258.0


In [66]:
# Outlier bounds for the denver subset: the values found at the 0.2% and 99.8%
# positions of each sorted column (rent_sqft, rent, sqft).
dset = denver
lower_percentile, upper_percentile = 0.002, 0.998

# positional ranks of the two cut points within a sorted column (truncated to int)
n_rows = len(dset)
lower = int(n_rows * lower_percentile)
upper = int(n_rows * upper_percentile)

# rent per square foot at the two cut points
rent_sqft_sorted = dset['rent_sqft'].sort_values()
lower_rent_sqft = rent_sqft_sorted.iloc[lower]
upper_rent_sqft = rent_sqft_sorted.iloc[upper]

# rent at the two cut points
rent_sorted = dset['rent'].sort_values()
lower_rent = rent_sorted.iloc[lower]
upper_rent = rent_sorted.iloc[upper]

# square footage at the two cut points
sqft_sorted = dset['sqft'].sort_values()
lower_sqft = sqft_sorted.iloc[lower]
upper_sqft = sqft_sorted.iloc[upper]

for label, bounds in (
    ('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft]),
    ('valid rent range:', [lower_rent, upper_rent]),
    ('valid sqft range:', [lower_sqft, upper_sqft]),
):
    print(label, bounds)

valid rent_sqft range: [0.41025641025641002, 3.5729166666666701]
valid rent range: [600.0, 5000.0]
valid sqft range: [300.0, 3727.0]


In [67]:
# Boolean masks keeping rows strictly between the lower and upper bounds.
# This cell reuses `dset` and the lower_*/upper_* values computed earlier, so
# those must currently hold the denver values when it runs.
rent_sqft_mask = dset['rent_sqft'].gt(lower_rent_sqft) & dset['rent_sqft'].lt(upper_rent_sqft)
rent_mask = dset['rent'].gt(lower_rent) & dset['rent'].lt(upper_rent)
sqft_mask = dset['sqft'].gt(lower_sqft) & dset['sqft'].lt(upper_sqft)

# a listing is kept only when it passes all three masks
denver_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
denver_filtered.shape[0]

113023

In [68]:
# OLS of log(rent) on log(sqft), bedrooms and bathrooms for the filtered denver listings.
# The formula literal is kept exactly as written; its line continuation only
# appends trailing spaces to the string.
formula = 'np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 '
# NOTE(review): patsy drops rows with a missing value in any formula term by
# default, so fewer rows than len(denver_filtered) may enter the fit.
y, X = dmatrices(formula, data=denver_filtered, return_type='dataframe')

mod = sm.OLS(y, X)
res = mod.fit()

# keep the pieces of the fit under their own names
observed = y
predicted = res.fittedvalues
residuals = res.resid

print(res.summary())

                            OLS Regression Results                            
Dep. Variable:           np.log(rent)   R-squared:                       0.543
Model:                            OLS   Adj. R-squared:                  0.543
Method:                 Least Squares   F-statistic:                 2.735e+04
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:21:03   Log-Likelihood:                 10979.
No. Observations:               68950   AIC:                        -2.195e+04
Df Residuals:                   68946   BIC:                        -2.191e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        2.8404      0.027    105.564   

In [69]:
# Listings whose region is 'newyork', followed by summary statistics for that subset.
is_newyork = rents['region'] == 'newyork'
newyork = rents[is_newyork]
newyork.describe()

Unnamed: 0,rent,bedrooms,sqft,rent_sqft,longitude,latitude,bathrooms,week,month,year,median_income,population
count,37388.0,37388.0,37388.0,37388.0,37388.0,37388.0,18535.0,37388.0,37388.0,37388.0,37388.0,37388.0
mean,2695.793249,1.713518,942.38967,3.05173,-73.856081,40.810525,1.248395,28.774313,7.047368,2016.477426,67066.0,20092883.0
std,1300.740147,1.046317,376.712246,1.31079,0.52835,0.354437,0.514592,20.799308,4.777683,0.499497,0.0,0.0
min,90.0,0.0,220.0,0.069231,-123.1147,9.001601,0.0,1.0,1.0,2016.0,67066.0,20092883.0
25%,1850.0,1.0,700.0,2.058824,-73.982037,40.706279,1.0,8.0,2.0,2016.0,67066.0,20092883.0
50%,2399.0,2.0,890.0,2.774459,-73.947049,40.746282,1.0,46.0,11.0,2016.0,67066.0,20092883.0
75%,3175.0,2.0,1100.0,3.958333,-73.806137,40.8633,1.5,48.0,12.0,2017.0,67066.0,20092883.0
max,9995.0,8.0,4400.0,8.415789,-68.441734,45.5486,8.0,52.0,12.0,2017.0,67066.0,20092883.0


In [70]:
# Outlier bounds for the newyork subset: the values found at the 0.2% and 99.8%
# positions of each sorted column (rent_sqft, rent, sqft).
dset = newyork
lower_percentile, upper_percentile = 0.002, 0.998

# positional ranks of the two cut points within a sorted column (truncated to int)
n_rows = len(dset)
lower = int(n_rows * lower_percentile)
upper = int(n_rows * upper_percentile)

# rent per square foot at the two cut points
rent_sqft_sorted = dset['rent_sqft'].sort_values()
lower_rent_sqft = rent_sqft_sorted.iloc[lower]
upper_rent_sqft = rent_sqft_sorted.iloc[upper]

# rent at the two cut points
rent_sorted = dset['rent'].sort_values()
lower_rent = rent_sorted.iloc[lower]
upper_rent = rent_sorted.iloc[upper]

# square footage at the two cut points
sqft_sorted = dset['sqft'].sort_values()
lower_sqft = sqft_sorted.iloc[lower]
upper_sqft = sqft_sorted.iloc[upper]

for label, bounds in (
    ('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft]),
    ('valid rent range:', [lower_rent, upper_rent]),
    ('valid sqft range:', [lower_sqft, upper_sqft]),
):
    print(label, bounds)

valid rent_sqft range: [0.57894736842105299, 7.5]
valid rent range: [650.0, 9000.0]
valid sqft range: [300.0, 3100.0]


In [71]:
# Boolean masks keeping rows strictly between the lower and upper bounds.
# This cell reuses `dset` and the lower_*/upper_* values computed earlier, so
# those must currently hold the newyork values when it runs.
rent_sqft_mask = dset['rent_sqft'].gt(lower_rent_sqft) & dset['rent_sqft'].lt(upper_rent_sqft)
rent_mask = dset['rent'].gt(lower_rent) & dset['rent'].lt(upper_rent)
sqft_mask = dset['sqft'].gt(lower_sqft) & dset['sqft'].lt(upper_sqft)

# a listing is kept only when it passes all three masks
newyork_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
newyork_filtered.shape[0]

36927

In [72]:
# OLS of log(rent) on log(sqft), bedrooms and bathrooms for the filtered newyork listings.
# The formula literal is kept exactly as written; its line continuation only
# appends trailing spaces to the string.
formula = 'np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 '
# NOTE(review): patsy drops rows with a missing value in any formula term by
# default, so fewer rows than len(newyork_filtered) may enter the fit.
y, X = dmatrices(formula, data=newyork_filtered, return_type='dataframe')

mod = sm.OLS(y, X)
res = mod.fit()

# keep the pieces of the fit under their own names
observed = y
predicted = res.fittedvalues
residuals = res.resid

print(res.summary())

                            OLS Regression Results                            
Dep. Variable:           np.log(rent)   R-squared:                       0.242
Model:                            OLS   Adj. R-squared:                  0.242
Method:                 Least Squares   F-statistic:                     1948.
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:21:59   Log-Likelihood:                -7747.4
No. Observations:               18282   AIC:                         1.550e+04
Df Residuals:                   18278   BIC:                         1.553e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        6.9553      0.074     93.531   

In [73]:
# Listings whose region is 'houston', followed by summary statistics for that subset.
is_houston = rents['region'] == 'houston'
houston = rents[is_houston]
houston.describe()

Unnamed: 0,rent,bedrooms,sqft,rent_sqft,longitude,latitude,bathrooms,week,month,year,median_income,population
count,106567.0,106567.0,106567.0,106567.0,106567.0,106567.0,62335.0,106567.0,106567.0,106567.0,106567.0,106567.0
mean,1290.172098,1.626348,1014.23374,1.34406,-95.456052,29.811908,1.434387,25.829319,6.328695,2016.522066,60072.0,6490180.0
std,512.344019,0.827799,428.149051,0.458061,0.493181,0.329684,0.611912,22.055356,5.034224,0.499515,0.0,0.0
min,85.0,0.0,225.0,0.065238,-158.1781,-11.469258,0.0,1.0,1.0,2016.0,60072.0,6490180.0
25%,918.0,1.0,720.0,1.023529,-95.5569,29.723139,1.0,4.0,1.0,2016.0,60072.0,6490180.0
50%,1200.0,2.0,940.0,1.258268,-95.447,29.7477,1.0,10.0,3.0,2017.0,60072.0,6490180.0
75%,1500.0,2.0,1160.0,1.610145,-95.401,29.852387,2.0,48.0,12.0,2017.0,60072.0,6490180.0
max,9500.0,7.0,4490.0,7.073846,-50.560455,47.6019,6.0,52.0,12.0,2017.0,60072.0,6490180.0


In [74]:
# Outlier bounds for the houston subset: the values found at the 0.2% and 99.8%
# positions of each sorted column (rent_sqft, rent, sqft).
dset = houston
lower_percentile, upper_percentile = 0.002, 0.998

# positional ranks of the two cut points within a sorted column (truncated to int)
n_rows = len(dset)
lower = int(n_rows * lower_percentile)
upper = int(n_rows * upper_percentile)

# rent per square foot at the two cut points
rent_sqft_sorted = dset['rent_sqft'].sort_values()
lower_rent_sqft = rent_sqft_sorted.iloc[lower]
upper_rent_sqft = rent_sqft_sorted.iloc[upper]

# rent at the two cut points
rent_sorted = dset['rent'].sort_values()
lower_rent = rent_sorted.iloc[lower]
upper_rent = rent_sorted.iloc[upper]

# square footage at the two cut points
sqft_sorted = dset['sqft'].sort_values()
lower_sqft = sqft_sorted.iloc[lower]
upper_sqft = sqft_sorted.iloc[upper]

for label, bounds in (
    ('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft]),
    ('valid rent range:', [lower_rent, upper_rent]),
    ('valid sqft range:', [lower_sqft, upper_sqft]),
):
    print(label, bounds)

valid rent_sqft range: [0.29004221504842298, 3.2615101289134398]
valid rent range: [385.0, 3648.0]
valid sqft range: [420.0, 3763.0]


In [75]:
# Boolean masks keeping rows strictly between the lower and upper bounds.
# This cell reuses `dset` and the lower_*/upper_* values computed earlier, so
# those must currently hold the houston values when it runs.
rent_sqft_mask = dset['rent_sqft'].gt(lower_rent_sqft) & dset['rent_sqft'].lt(upper_rent_sqft)
rent_mask = dset['rent'].gt(lower_rent) & dset['rent'].lt(upper_rent)
sqft_mask = dset['sqft'].gt(lower_sqft) & dset['sqft'].lt(upper_sqft)

# a listing is kept only when it passes all three masks
houston_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
houston_filtered.shape[0]

105437

In [76]:
# OLS of log(rent) on log(sqft), bedrooms and bathrooms for the filtered houston listings.
# The formula literal is kept exactly as written; its line continuation only
# appends trailing spaces to the string.
formula = 'np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 '
# NOTE(review): patsy drops rows with a missing value in any formula term by
# default, so fewer rows than len(houston_filtered) may enter the fit.
y, X = dmatrices(formula, data=houston_filtered, return_type='dataframe')

mod = sm.OLS(y, X)
res = mod.fit()

# keep the pieces of the fit under their own names
observed = y
predicted = res.fittedvalues
residuals = res.resid

print(res.summary())

                            OLS Regression Results                            
Dep. Variable:           np.log(rent)   R-squared:                       0.395
Model:                            OLS   Adj. R-squared:                  0.395
Method:                 Least Squares   F-statistic:                 1.342e+04
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:23:19   Log-Likelihood:                -7892.0
No. Observations:               61753   AIC:                         1.579e+04
Df Residuals:                   61749   BIC:                         1.583e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        0.4165      0.040     10.475   

In [77]:
# Listings whose region is 'losangeles', followed by summary statistics for that subset.
is_losangeles = rents['region'] == 'losangeles'
losangeles = rents[is_losangeles]
losangeles.describe()

Unnamed: 0,rent,bedrooms,sqft,rent_sqft,longitude,latitude,bathrooms,week,month,year,median_income,population
count,152270.0,152270.0,152270.0,152270.0,152270.0,152270.0,86283.0,152270.0,152270.0,152270.0,152270.0,152270.0
mean,2528.276089,1.624962,1021.838484,2.626747,-118.333378,34.094184,1.563066,26.31473,6.44314,2016.528036,60514.0,13262220.0
std,1279.880402,1.012909,494.005066,0.973024,0.566361,0.228387,0.680458,21.513275,4.948505,0.499215,0.0,0.0
min,84.0,0.0,217.0,0.075,-134.527588,29.770021,0.0,1.0,1.0,2016.0,60514.0,13262220.0
25%,1725.0,1.0,700.0,1.992857,-118.451747,34.0268,1.0,6.0,2.0,2016.0,60514.0,13262220.0
50%,2200.0,2.0,928.0,2.452771,-118.3663,34.0737,1.0,10.0,3.0,2017.0,60514.0,13262220.0
75%,2899.0,2.0,1200.0,3.08,-118.258012,34.162477,2.0,49.0,12.0,2017.0,60514.0,13262220.0
max,9999.0,8.0,4490.0,8.420561,-72.9862,48.1664,7.0,52.0,12.0,2017.0,60514.0,13262220.0


In [78]:
# Outlier bounds for the losangeles subset: the values found at the 0.2% and 99.8%
# positions of each sorted column (rent_sqft, rent, sqft).
dset = losangeles
lower_percentile, upper_percentile = 0.002, 0.998

# positional ranks of the two cut points within a sorted column (truncated to int)
n_rows = len(dset)
lower = int(n_rows * lower_percentile)
upper = int(n_rows * upper_percentile)

# rent per square foot at the two cut points
rent_sqft_sorted = dset['rent_sqft'].sort_values()
lower_rent_sqft = rent_sqft_sorted.iloc[lower]
upper_rent_sqft = rent_sqft_sorted.iloc[upper]

# rent at the two cut points
rent_sorted = dset['rent'].sort_values()
lower_rent = rent_sorted.iloc[lower]
upper_rent = rent_sorted.iloc[upper]

# square footage at the two cut points
sqft_sorted = dset['sqft'].sort_values()
lower_sqft = sqft_sorted.iloc[lower]
upper_sqft = sqft_sorted.iloc[upper]

for label, bounds in (
    ('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft]),
    ('valid rent range:', [lower_rent, upper_rent]),
    ('valid sqft range:', [lower_sqft, upper_sqft]),
):
    print(label, bounds)

valid rent_sqft range: [0.392063492063492, 7.4645257654966404]
valid rent range: [500.0, 9510.0]
valid sqft range: [300.0, 4000.0]


In [79]:
# Boolean masks keeping rows strictly between the lower and upper bounds.
# This cell reuses `dset` and the lower_*/upper_* values computed earlier, so
# those must currently hold the losangeles values when it runs.
rent_sqft_mask = dset['rent_sqft'].gt(lower_rent_sqft) & dset['rent_sqft'].lt(upper_rent_sqft)
rent_mask = dset['rent'].gt(lower_rent) & dset['rent'].lt(upper_rent)
sqft_mask = dset['sqft'].gt(lower_sqft) & dset['sqft'].lt(upper_sqft)

# a listing is kept only when it passes all three masks
losangeles_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
losangeles_filtered.shape[0]

150355

In [80]:
# OLS of log(rent) on log(sqft), bedrooms and bathrooms for the filtered losangeles listings.
# The formula literal is kept exactly as written; its line continuation only
# appends trailing spaces to the string.
formula = 'np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 '
# NOTE(review): patsy drops rows with a missing value in any formula term by
# default, so fewer rows than len(losangeles_filtered) may enter the fit.
y, X = dmatrices(formula, data=losangeles_filtered, return_type='dataframe')

mod = sm.OLS(y, X)
res = mod.fit()

# keep the pieces of the fit under their own names
observed = y
predicted = res.fittedvalues
residuals = res.resid

print(res.summary())

                            OLS Regression Results                            
Dep. Variable:           np.log(rent)   R-squared:                       0.440
Model:                            OLS   Adj. R-squared:                  0.440
Method:                 Least Squares   F-statistic:                 2.233e+04
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:24:08   Log-Likelihood:                -20025.
No. Observations:               85290   AIC:                         4.006e+04
Df Residuals:                   85286   BIC:                         4.009e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        2.7641      0.031     88.693   

In [81]:
# Listings whose region is 'seattle', followed by summary statistics for that subset.
is_seattle = rents['region'] == 'seattle'
seattle = rents[is_seattle]
seattle.describe()

Unnamed: 0,rent,bedrooms,sqft,rent_sqft,longitude,latitude,bathrooms,week,month,year,median_income,population
count,160932.0,160932.0,160932.0,160932.0,160932.0,160932.0,98920.0,160932.0,160932.0,160932.0,160932.0,160932.0
mean,1747.559665,1.594207,921.990934,2.074973,-122.307227,47.551757,1.3838,25.065848,6.170327,2016.552606,71273.0,3671478.0
std,649.299259,0.932514,419.891009,0.781669,0.412373,0.339398,0.554367,21.737498,4.978763,0.497226,0.0,0.0
min,90.0,0.0,216.0,0.075,-139.87793,28.3957,0.0,1.0,1.0,2016.0,71273.0,3671478.0
25%,1315.0,1.0,669.0,1.491818,-122.356771,47.4465,1.0,5.0,2.0,2016.0,71273.0,3671478.0
50%,1604.0,2.0,850.0,1.882083,-122.313255,47.615191,1.0,10.0,3.0,2017.0,71273.0,3671478.0
75%,1995.0,2.0,1050.0,2.625,-122.209637,47.673004,2.0,49.0,12.0,2017.0,71273.0,3671478.0
max,9000.0,8.0,4488.0,8.064516,-81.4666,49.032579,8.0,52.0,12.0,2017.0,71273.0,3671478.0


In [82]:
# Outlier bounds for the seattle subset: the values found at the 0.2% and 99.8%
# positions of each sorted column (rent_sqft, rent, sqft).
dset = seattle
lower_percentile, upper_percentile = 0.002, 0.998

# positional ranks of the two cut points within a sorted column (truncated to int)
n_rows = len(dset)
lower = int(n_rows * lower_percentile)
upper = int(n_rows * upper_percentile)

# rent per square foot at the two cut points
rent_sqft_sorted = dset['rent_sqft'].sort_values()
lower_rent_sqft = rent_sqft_sorted.iloc[lower]
upper_rent_sqft = rent_sqft_sorted.iloc[upper]

# rent at the two cut points
rent_sorted = dset['rent'].sort_values()
lower_rent = rent_sorted.iloc[lower]
upper_rent = rent_sorted.iloc[upper]

# square footage at the two cut points
sqft_sorted = dset['sqft'].sort_values()
lower_sqft = sqft_sorted.iloc[lower]
upper_sqft = sqft_sorted.iloc[upper]

for label, bounds in (
    ('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft]),
    ('valid rent range:', [lower_rent, upper_rent]),
    ('valid sqft range:', [lower_sqft, upper_sqft]),
):
    print(label, bounds)

valid rent_sqft range: [0.45000000000000001, 5.0374531835205998]
valid rent range: [620.0, 5500.0]
valid sqft range: [247.0, 3500.0]


In [83]:
# Boolean masks keeping rows strictly between the lower and upper bounds.
# This cell reuses `dset` and the lower_*/upper_* values computed earlier, so
# those must currently hold the seattle values when it runs.
rent_sqft_mask = dset['rent_sqft'].gt(lower_rent_sqft) & dset['rent_sqft'].lt(upper_rent_sqft)
rent_mask = dset['rent'].gt(lower_rent) & dset['rent'].lt(upper_rent)
sqft_mask = dset['sqft'].gt(lower_sqft) & dset['sqft'].lt(upper_sqft)

# a listing is kept only when it passes all three masks
seattle_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
seattle_filtered.shape[0]

159209

In [84]:
# OLS of log(rent) on log(sqft), bedrooms and bathrooms for the filtered seattle listings.
# The formula literal is kept exactly as written; its line continuation only
# appends trailing spaces to the string.
formula = 'np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 '
# NOTE(review): patsy drops rows with a missing value in any formula term by
# default, so fewer rows than len(seattle_filtered) may enter the fit.
y, X = dmatrices(formula, data=seattle_filtered, return_type='dataframe')

mod = sm.OLS(y, X)
res = mod.fit()

# keep the pieces of the fit under their own names
observed = y
predicted = res.fittedvalues
residuals = res.resid

print(res.summary())

                            OLS Regression Results                            
Dep. Variable:           np.log(rent)   R-squared:                       0.231
Model:                            OLS   Adj. R-squared:                  0.231
Method:                 Least Squares   F-statistic:                     9787.
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:29:41   Log-Likelihood:                -15411.
No. Observations:               97975   AIC:                         3.083e+04
Df Residuals:                   97971   BIC:                         3.087e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        4.1082      0.032    128.447   

In [85]:
# Listings whose region is 'chicago', followed by summary statistics for that subset.
is_chicago = rents['region'] == 'chicago'
chicago = rents[is_chicago]
chicago.describe()

Unnamed: 0,rent,bedrooms,sqft,rent_sqft,longitude,latitude,bathrooms,week,month,year,median_income,population
count,64279.0,64279.0,64279.0,64279.0,64279.0,64279.0,36749.0,64279.0,64279.0,64279.0,64279.0,64279.0
mean,1747.34831,1.696884,1029.364458,1.824347,-87.739262,41.89031,1.361302,27.553524,6.657166,2016.488713,61598.0,9554598.0
std,831.031145,1.001307,443.002202,0.739249,0.584812,0.344643,0.561647,22.254252,5.103591,0.499876,0.0,0.0
min,90.0,0.0,220.0,0.1,-118.546343,25.92242,0.0,1.0,1.0,2016.0,61598.0,9554598.0
25%,1195.0,1.0,735.0,1.241667,-87.732006,41.870957,1.0,4.0,1.0,2016.0,61598.0,9554598.0
50%,1595.0,2.0,963.0,1.681818,-87.656049,41.899132,1.0,45.0,11.0,2016.0,61598.0,9554598.0
75%,2100.0,2.0,1200.0,2.355114,-87.630654,41.9543,2.0,49.0,12.0,2017.0,61598.0,9554598.0
max,9950.0,8.0,4411.0,8.15,-73.9566,45.376717,5.5,52.0,12.0,2017.0,61598.0,9554598.0


In [86]:
# Outlier bounds for the chicago subset: the values found at the 0.2% and 99.8%
# positions of each sorted column (rent_sqft, rent, sqft).
dset = chicago
lower_percentile, upper_percentile = 0.002, 0.998

# positional ranks of the two cut points within a sorted column (truncated to int)
n_rows = len(dset)
lower = int(n_rows * lower_percentile)
upper = int(n_rows * upper_percentile)

# rent per square foot at the two cut points
rent_sqft_sorted = dset['rent_sqft'].sort_values()
lower_rent_sqft = rent_sqft_sorted.iloc[lower]
upper_rent_sqft = rent_sqft_sorted.iloc[upper]

# rent at the two cut points
rent_sorted = dset['rent'].sort_values()
lower_rent = rent_sorted.iloc[lower]
upper_rent = rent_sorted.iloc[upper]

# square footage at the two cut points
sqft_sorted = dset['sqft'].sort_values()
lower_sqft = sqft_sorted.iloc[lower]
upper_sqft = sqft_sorted.iloc[upper]

for label, bounds in (
    ('valid rent_sqft range:', [lower_rent_sqft, upper_rent_sqft]),
    ('valid rent range:', [lower_rent, upper_rent]),
    ('valid sqft range:', [lower_sqft, upper_sqft]),
):
    print(label, bounds)

valid rent_sqft range: [0.33333333333333298, 4.3046153846153796]
valid rent range: [450.0, 5900.0]
valid sqft range: [275.0, 3400.0]


In [87]:
# Boolean masks keeping rows strictly between the lower and upper bounds.
# This cell reuses `dset` and the lower_*/upper_* values computed earlier, so
# those must currently hold the chicago values when it runs.
rent_sqft_mask = dset['rent_sqft'].gt(lower_rent_sqft) & dset['rent_sqft'].lt(upper_rent_sqft)
rent_mask = dset['rent'].gt(lower_rent) & dset['rent'].lt(upper_rent)
sqft_mask = dset['sqft'].gt(lower_sqft) & dset['sqft'].lt(upper_sqft)

# a listing is kept only when it passes all three masks
chicago_filtered = pd.DataFrame(dset[rent_sqft_mask & rent_mask & sqft_mask])
chicago_filtered.shape[0]

63508

In [88]:
# OLS of log(rent) on log(sqft), bedrooms and bathrooms for the filtered chicago listings.
# The formula literal is kept exactly as written; its line continuation only
# appends trailing spaces to the string.
formula = 'np.log(rent) ~ np.log(sqft) + bedrooms + bathrooms \
                 '
# NOTE(review): patsy drops rows with a missing value in any formula term by
# default, so fewer rows than len(chicago_filtered) may enter the fit.
y, X = dmatrices(formula, data=chicago_filtered, return_type='dataframe')

mod = sm.OLS(y, X)
res = mod.fit()

# keep the pieces of the fit under their own names
observed = y
predicted = res.fittedvalues
residuals = res.resid

print(res.summary())

                            OLS Regression Results                            
Dep. Variable:           np.log(rent)   R-squared:                       0.325
Model:                            OLS   Adj. R-squared:                  0.325
Method:                 Least Squares   F-statistic:                     5818.
Date:                Wed, 15 Mar 2017   Prob (F-statistic):               0.00
Time:                        21:30:12   Log-Likelihood:                -12975.
No. Observations:               36324   AIC:                         2.596e+04
Df Residuals:                   36320   BIC:                         2.599e+04
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------
Intercept        3.2384      0.053     61.129   