In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math
from scipy import stats

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

import re

In [3]:
# NOTE(review): data_path is not referenced by any cell visible here; kept in case a later cell reads it.
data_path = "./main_nyc_rent_float_vals.csv"

# Training set used to fit the Sq-Ft imputation model.
# read_csv already returns a DataFrame, so no extra wrapping is needed.
fit_df = pd.read_csv('./fit_toy_set.csv')

In [4]:
# Fit a linear model that predicts Sq-Ft from beds, baths and rent
# (the fitted `model` is what later cells use to impute missing sq-ft data).

# Feature matrix and target; `y` is deliberately a one-column DataFrame (2-D).
X = fit_df.loc[:, ['Num_Beds', 'Num_Baths', 'Rent']]
y = fit_df.loc[:, ['Sq-Ft']]

# Hold out 30% of the rows for evaluation; fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

model = LinearRegression()
model.fit(x_train, y_train)


LinearRegression()

In [5]:
# Score of the fitted model on the held-out split.
print(model.score(x_test, y_test))

# Flatten both arrays to 1-D so predictions and ground truth line up element-wise.
preds = model.predict(x_test).flatten().round(2)
actual = y_test.to_numpy().flatten()

# Eyeball the first 20 predictions against the true values.
for _label, _values in (('Predicted:', preds), ('Actual: ', actual)):
    print(_label, _values[:20])

0.8762176895226317
Predicted: [ 520.72  735.95 1275.4   693.59 1215.49  682.14  861.03  683.77  528.08
  645.59 1589.57  654.3   939.49  752.47  525.62  851.22 1447.71  521.53
 1447.71  820.52]
Actual:  [ 535.25  691.5  1284.25  500.   1294.25  754.    893.    635.    535.25
  636.   1400.    809.    991.    691.5   500.    884.75 1292.5   535.25
 1292.5   631.  ]


In [6]:
confidence = 0.95

# Absolute prediction error per held-out row.
abs_errors = np.abs(preds - actual)

# Student-t interval around the mean absolute error, with n - 1 degrees of
# freedom and the standard error of the mean as the scale.
stats.t.interval(
    confidence,
    abs_errors.size - 1,
    loc=np.mean(abs_errors),
    scale=stats.sem(abs_errors),
)

(75.43828522435494, 84.32754430914208)

In [7]:
# Locations of the combined per-borough listing files.
bronx = "./CombinedData/bronxcombined.csv"
brooklyn = "./CombinedData/brooklyncombined.csv"
manhattan = "./CombinedData/manhattancombined.csv"
staten = "./CombinedData/statenislandcombined.csv"
queens = "./CombinedData/queenscombined.csv"

# One DataFrame per borough; read_csv already returns a DataFrame.
bronx_df = pd.read_csv(bronx)
brook_df = pd.read_csv(brooklyn)
manhat_df = pd.read_csv(manhattan)
queens_df = pd.read_csv(queens)
staten_df = pd.read_csv(staten)

In [8]:
def preprocess_sqft(sqft):
    """Return ``sqft`` as a string with every '+' and ',' removed.

    The value is first converted with ``str()``, so non-string inputs
    (ints, floats, NaN) pass through as their string form, e.g. ``850`` ->
    ``'850'`` and ``float('nan')`` -> ``'nan'``.

    Parameters
    ----------
    sqft : any
        Raw cell value, e.g. ``'1,200+'``.

    Returns
    -------
    str
        The cleaned text. It is NOT converted to a number here; callers
        wrap the result in ``float()``.
    """
    # Both removals are plain literals, so str.replace is enough. This also
    # avoids the invalid "\+" escape the old non-raw regex pattern relied on
    # (DeprecationWarning, and a SyntaxWarning from Python 3.12).
    return str(sqft).replace("+", "").replace(",", "")

In [9]:
# Quick sanity check: the stray ',' and '+' are stripped and the remainder parses as a float.
ans = preprocess_sqft(',1+.000')
print(ans, float(ans), sep='\n')

1.000
1.0


In [10]:
# Strip '+' and ',' from the raw 'Sq-Ft' values and cast them to float, for every borough.
for _borough_df in (bronx_df, brook_df, manhat_df, queens_df, staten_df):
    _borough_df['Sq-Ft'] = _borough_df['Sq-Ft'].apply(
        lambda raw: float(preprocess_sqft(raw))
    )

In [11]:
# Same cleanup for 'Num_Beds': preprocess_sqft (despite its name) just strips
# '+' and ',' so the value can be cast to float.
for _borough_df in (bronx_df, brook_df, manhat_df, queens_df, staten_df):
    _borough_df['Num_Beds'] = _borough_df['Num_Beds'].apply(
        lambda raw: float(preprocess_sqft(raw))
    )

In [12]:
# Same cleanup for 'Num_Baths': strip '+' and ',' via preprocess_sqft, then cast to float.
for _borough_df in (bronx_df, brook_df, manhat_df, queens_df, staten_df):
    _borough_df['Num_Baths'] = _borough_df['Num_Baths'].apply(
        lambda raw: float(preprocess_sqft(raw))
    )

In [13]:
# fill in missing square feet
predictors = ['Num_Beds', 'Num_Baths', 'Rent']

# Predicted square footage for every listing in each borough, flattened to
# 1-D and rounded to 2 decimals. Each borough keeps its own *_preds name.
bronx_preds, brook_preds, manhat_preds, queens_preds, staten_preds = [
    model.predict(borough[predictors]).flatten().round(2)
    for borough in (bronx_df, brook_df, manhat_df, queens_df, staten_df)
]

# Keep a recorded 'Sq-Ft' only when it is greater than 20; every other row
# (including NaN, for which the comparison is False) takes the prediction.
for _borough_df, _borough_preds in (
    (bronx_df, bronx_preds),
    (brook_df, brook_preds),
    (manhat_df, manhat_preds),
    (queens_df, queens_preds),
    (staten_df, staten_preds),
):
    _borough_df['Sq-Ft'] = np.where(
        _borough_df['Sq-Ft'] > 20,
        _borough_df['Sq-Ft'],
        _borough_preds,
    )



In [14]:
# Display the first 30 Bronx predictions as a spot check of the model output.
bronx_preds[:30].flatten()

array([ 841.96,  848.76,  631.43,  833.22,  675.  ,  628.16,  642.88,
        846.31,  571.95,  629.79,  622.43,  581.76,  825.86, 1171.26,
       1240.69,  677.69, 1182.71,  626.52, 1439.31, 1048.1 ,  637.15,
        847.94,  838.13, 1064.46,  847.78,  625.7 ,  570.38,  686.32,
        570.38,  580.62])

In [15]:
# Write each borough's frame (with 'Sq-Ft' filled in) to its own CSV, without the index column.
# NOTE(review): assumes the ./ImputedSets directory already exists — confirm before running.
for _out_path, _borough_df in (
    ('./ImputedSets/bronx_imputed.csv', bronx_df),
    ('./ImputedSets/brook_imputed.csv', brook_df),
    ('./ImputedSets/manhat_imputed.csv', manhat_df),
    ('./ImputedSets/queens_imputed.csv', queens_df),
    ('./ImputedSets/staten_imputed.csv', staten_df),
):
    _borough_df.to_csv(_out_path, index=False)