In [11]:
import pandas as pd
import numpy as np
import os
import utilities
import taylors_pipes as tpipe
import preprocessing as prepro


from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score


# Read in Data 
#download_path = "/Users/tayma/datasets" # Windows 
download_path = "/Users/taylormurray/datasets" #MaciOS
file_path = os.path.join(download_path, 'housing.csv') #joins the download_path and 'housing.csv' and returns it as a new path
db = pd.read_csv(file_path)

# Create a copy of data
data = db.copy()

# We select our features here (not in pipeline since we are ignoring columns)

selected_features =['id',
                    'price',
                    'sqfeet', 
                    'beds', 'baths',
                    'type',
                    'cats_allowed',
                    'dogs_allowed',
                    'smoking_allowed',
                    'laundry_options',
                    'parking_options',
                    'state']

data = data[selected_features]

# Removal of bad values
# Columns with the bad data (might be able to get rid of this)
bad_data_columns = ['price', 
                    'sqfeet', 
                    'baths']

# Dictionary containing the columns with bad data as keys whose items are
# thresholds where values less than the threshold are considered bad data in their respective column.
bad_data_to_replace = { 'price' : 200,
                        'sqfeet' : 50,
                        'baths' : 0 } 

for col,thresh in bad_data_to_replace.items():
    col_mask = data[col] < thresh
    data = data[~col_mask]
    
# IQR Range
# We filter out rows outside the IQR range for each numeric column

data = prepro.IQRFlag(data, disclude = ['id'],invert = True,filter = True )

data


Unnamed: 0,id,price,sqfeet,beds,baths,type,cats_allowed,dogs_allowed,smoking_allowed,laundry_options,parking_options,state
0,7049044568,1148,1078,3,2.0,apartment,1,1,0,w/d in unit,carport,ca
1,7049047186,1200,1001,2,2.0,condo,0,0,0,w/d hookups,carport,ca
2,7043634882,1813,1683,2,2.0,apartment,1,1,1,w/d in unit,attached garage,ca
3,7049045324,1095,708,1,1.0,apartment,1,1,1,w/d in unit,carport,ca
4,7049043759,289,250,0,1.0,apartment,1,1,1,laundry on site,,ca
...,...,...,...,...,...,...,...,...,...,...,...,...
384972,7049053337,1295,957,2,2.0,apartment,1,1,1,w/d in unit,carport,ca
384973,7049052968,1549,1034,2,2.0,apartment,1,1,0,w/d in unit,,ca
384974,7049050454,1249,840,2,1.0,apartment,1,1,1,laundry on site,off-street parking,ca
384975,7049050149,1429,976,2,2.0,apartment,1,1,1,w/d in unit,carport,ca


In [12]:
# Split the data into a training set and a test set

split_data = utilities.split(data, id_column = 'id', test_size = 0.2, id_index = True)
test_set, train_set = split_data

# Create a label pd.Series 
y = train_set['price']

# Drop label column from training set
X = train_set.drop('price', axis =1)
print(X)

# Create list of estimators for scikit Pipeline




# pipe.ReplaceNA parameter setup.
# Set up a dictionary of whose keys are column names
# and values are what NA values in the column will be 
# replaced by. For this data set the only columns with 
# NA values are in 'laundry_options' and 'parking_options'.
# So, we only consider these columns.

columns_to_na_replace= {'laundry_options' : 'no laundry on site',
                        'parking_options' : 'no parking'}

# pipe.LogTransform parameter setup
# We apply this only for the 'sqfeet' category
# where we drop replace the orginal column and the base of the log is e

col_to_log = ['sqfeet']

# pipe.StandardizeColumns parameter setup.
# Select the columns to standardize. Here we select
# all numeric columns since we are doing a linear
# regression model

columns_to_standardize = ['sqfeet']

# pipe.OneHotEncode parameter setup.
# Select column to OneHotEncode.
# Determine if we drop original column.
# We do this for two columns separately: 'laundry_options' and 'parking_options'.


# Choose model
model = LinearRegression()



estimators = [('replace_na', tpipe.ReplaceNA(columns_to_replacement = columns_to_na_replace)),
                ('log', tpipe.LogTransform(col_to_log,replace=True, verbose =True)),
                ('standardize', tpipe.StandardizeColumns(columns= columns_to_standardize, verbose = True)),
                ('onehot1', tpipe.OneHotEncode(col = 'parking_options', drop = True)),
                ('onehot2', tpipe.OneHotEncode(col = 'laundry_options', drop = True)),
                ('onehot3', tpipe.OneHotEncode(col = 'type', drop = True)),
                ('onehot4', tpipe.OneHotEncode(col = 'state', drop = True)),
                ('chosen_model', model)]


# Feed estimators into Pipeline

pipe = Pipeline(estimators)
fitted = pipe.fit(X,y)

scores = cross_val_score(pipe, X,y,cv = 5, scoring = 'neg_root_mean_squared_error')
print(scores)
## Does very poorly


            sqfeet  beds  baths       type  cats_allowed  dogs_allowed  \
id                                                                       
7049044568    1078     3    2.0  apartment             1             1   
7043634882    1683     2    2.0  apartment             1             1   
7049045324     708     1    1.0  apartment             1             1   
7049043759     250     0    1.0  apartment             1             1   
7046327064     720     1    1.0  apartment             1             1   
...            ...   ...    ...        ...           ...           ...   
7049054520     850     2    1.0  apartment             0             0   
7049053337     957     2    2.0  apartment             1             1   
7049052968    1034     2    2.0  apartment             1             1   
7049050454     840     2    1.0  apartment             1             1   
7049050010     957     2    2.0  apartment             1             1   

            smoking_allowed  laundry_

In [None]:
predicts = fitted.predict(X)
print(predicts)
print(y)
print(X.index.equals(y.index))

[1702.6875 2126.3125 1519.1875 ... 1790.4375 1385.3125 1663.4375]
id
7049044568    1148
7043634882    1813
7049045324    1095
7049043759     289
7046327064    1093
              ... 
7049054520    1200
7049053337    1295
7049052968    1549
7049050454    1249
7049050010    1295
Name: price, Length: 274633, dtype: int64
True


In [None]:
base_model = LinearRegression()
scores = cross_val_score(base_model, X.select_dtypes(include =np.number), y, cv = 5, scoring = 'neg_root_mean_squared_error')
print(scores)

[-416.08441628 -350.96949977 -335.76736906 -323.74178773 -367.89131626]


In [16]:
from sklearn.preprocessing import PowerTransformer

model = LinearRegression()

estimators = [('replace_na', tpipe.ReplaceNA(columns_to_replacement = columns_to_na_replace)),
                ('standardize', tpipe.StandardizeColumns(columns= columns_to_standardize, verbose = True)),
                ('onehot1', tpipe.OneHotEncode(col = 'parking_options', drop = True)),
                ('onehot2', tpipe.OneHotEncode(col = 'laundry_options', drop = True)),
                ('onehot3', tpipe.OneHotEncode(col = 'type', drop = True)),
                ('onehot4', tpipe.OneHotEncode(col = 'state', drop = True)),
                ('chosen_model', model)]
pipe = Pipeline(estimators)
scores = cross_val_score(pipe, X,y,cv = 5, scoring = 'neg_root_mean_squared_error')
print(scores)

[-8.98905973e+13 -1.65097628e+14 -1.55574961e+14 -2.11046024e+14
 -1.15117553e+14]


In [None]:
model = LinearRegression()
estimators = [('replace_na', tpipe.ReplaceNA(columns_to_replacement = columns_to_na_replace)),
                ('log', tpipe.LogTransform(columns = ['sqfeet'], replace = True, offset=2, verbose = True)),
                ('onehot1', tpipe.OneHotEncode(col = 'parking_options', drop = True)),
                ('onehot2', tpipe.OneHotEncode(col = 'laundry_options', drop = True)),
                ('onehot3', tpipe.OneHotEncode(col = 'type', drop = True)),
                ('onehot4', tpipe.OneHotEncode(col = 'state', drop = True)),
                ('chosen_model', model)]

pipe = Pipeline(estimators)
scores = cross_val_score(pipe, X,y,cv = 5, scoring = 'neg_root_mean_squared_error')
print(scores)

[-1.09899612e+14 -3.00968290e+14 -1.19968530e+13 -8.51089060e+13
 -2.18723344e+14]


In [18]:
model = LinearRegression()
estimators = [('replace_na', tpipe.ReplaceNA(columns_to_replacement = columns_to_na_replace)),
                ('onehot1', tpipe.OneHotEncode(col = 'parking_options', drop = True)),
                ('onehot2', tpipe.OneHotEncode(col = 'laundry_options', drop = True)),
                ('onehot3', tpipe.OneHotEncode(col = 'type', drop = True)),
                ('onehot4', tpipe.OneHotEncode(col = 'state', drop = True)),
                ('chosen_model', model)]

pipe = Pipeline(estimators)
scores = cross_val_score(pipe, X,y,cv = 5, scoring = 'neg_root_mean_squared_error')
print(scores)

[-5.60165419e+09 -3.22498619e+02 -3.18039142e+02 -2.82045890e+02
 -3.11729336e+02]


In [None]:
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
estimators = [('replace_na', tpipe.ReplaceNA(columns_to_replacement = columns_to_na_replace)),
                ('onehot1', tpipe.OneHotEncode(col = 'parking_options', drop = True)),
                ('onehot2', tpipe.OneHotEncode(col = 'laundry_options', drop = True)),
                ('onehot3', tpipe.OneHotEncode(col = 'type', drop = True)),
                ('onehot4', tpipe.OneHotEncode(col = 'state', drop = True)),
                ('chosen_model', model)]

pipe = Pipeline(estimators)
scores = cross_val_score(pipe, X,y,cv = 5, scoring = 'neg_root_mean_squared_error')



KeyboardInterrupt: 

In [24]:
print(scores)

baseline_rmse = np.sqrt(np.mean((y-y.mean())**2))
print(baseline_rmse)

[-365.7381124  -320.1581199  -305.69674207 -297.10049032 -329.91834875]
381.22532227632354


In [26]:
model = RandomForestRegressor()
estimators = [('replace_na', tpipe.ReplaceNA(columns_to_replacement = columns_to_na_replace)),
                ('log', tpipe.LogTransform(columns = ['sqfeet'], replace = True, offset=2, verbose = True)),
                ('onehot1', tpipe.OneHotEncode(col = 'parking_options', drop = True)),
                ('onehot2', tpipe.OneHotEncode(col = 'laundry_options', drop = True)),
                ('onehot3', tpipe.OneHotEncode(col = 'type', drop = True)),
                ('onehot4', tpipe.OneHotEncode(col = 'state', drop = True)),
                ('chosen_model', model)]

pipe = Pipeline(estimators)
scores = cross_val_score(pipe, X,y,cv = 5, scoring = 'neg_root_mean_squared_error')

In [27]:
print(scores)

[-365.72355326 -320.74872903 -306.46351773 -296.43863289 -330.45046042]


In [28]:
model = RandomForestRegressor(n_estimators= 800)
estimators = [('replace_na', tpipe.ReplaceNA(columns_to_replacement = columns_to_na_replace)),
                ('onehot1', tpipe.OneHotEncode(col = 'parking_options', drop = True)),
                ('onehot2', tpipe.OneHotEncode(col = 'laundry_options', drop = True)),
                ('onehot3', tpipe.OneHotEncode(col = 'type', drop = True)),
                ('onehot4', tpipe.OneHotEncode(col = 'state', drop = True)),
                ('chosen_model', model)]

pipe = Pipeline(estimators)
scores = cross_val_score(pipe, X,y,cv = 5, scoring = 'neg_root_mean_squared_error')


In [29]:
print(scores)

[-365.20279626 -319.45973948 -305.32699637 -296.20649902 -329.99536349]


In [33]:
model = RandomForestRegressor(min_samples_leaf= 2)
estimators = [('replace_na', tpipe.ReplaceNA(columns_to_replacement = columns_to_na_replace)),
                ('onehot1', tpipe.OneHotEncode(col = 'parking_options', drop = True)),
                ('onehot2', tpipe.OneHotEncode(col = 'laundry_options', drop = True)),
                ('onehot3', tpipe.OneHotEncode(col = 'type', drop = True)),
                ('onehot4', tpipe.OneHotEncode(col = 'state', drop = True)),
                ('chosen_model', model)]

pipe = Pipeline(estimators)
scores = cross_val_score(pipe, X,y,cv = 5, scoring = 'neg_root_mean_squared_error')

In [34]:
print(scores)

[-365.09724565 -320.98982349 -303.88100748 -296.73221637 -328.23889443]


In [31]:
model = RandomForestRegressor(max_features = 4)
estimators = [('replace_na', tpipe.ReplaceNA(columns_to_replacement = columns_to_na_replace)),
                ('onehot1', tpipe.OneHotEncode(col = 'parking_options', drop = True)),
                ('onehot2', tpipe.OneHotEncode(col = 'laundry_options', drop = True)),
                ('onehot3', tpipe.OneHotEncode(col = 'type', drop = True)),
                ('onehot4', tpipe.OneHotEncode(col = 'state', drop = True)),
                ('chosen_model', model)]

pipe = Pipeline(estimators)
scores = cross_val_score(pipe, X,y,cv = 5, scoring = 'neg_root_mean_squared_error')

In [32]:
print(scores)

[-326.60480493 -319.11328809 -297.23824874 -280.94524406 -305.31092167]


In [5]:
model = RandomForestRegressor(max_features = 50)
estimators = [('replace_na', tpipe.ReplaceNA(columns_to_replacement = columns_to_na_replace)),
                ('onehot1', tpipe.OneHotEncode(col = 'parking_options', drop = True)),
                ('onehot2', tpipe.OneHotEncode(col = 'laundry_options', drop = True)),
                ('onehot3', tpipe.OneHotEncode(col = 'type', drop = True)),
                ('onehot4', tpipe.OneHotEncode(col = 'state', drop = True)),
                ('chosen_model', model)]

pipe = Pipeline(estimators)
scores = cross_val_score(pipe, X,y,cv = 5, scoring = 'neg_root_mean_squared_error')

In [6]:
print(scores)

[-360.61250414 -317.51504765 -303.49490045 -293.62599454 -326.05229173]


In [7]:
model = RandomForestRegressor(max_features =25)
estimators = [('replace_na', tpipe.ReplaceNA(columns_to_replacement = columns_to_na_replace)),
                ('onehot1', tpipe.OneHotEncode(col = 'parking_options', drop = True)),
                ('onehot2', tpipe.OneHotEncode(col = 'laundry_options', drop = True)),
                ('onehot3', tpipe.OneHotEncode(col = 'type', drop = True)),
                ('onehot4', tpipe.OneHotEncode(col = 'state', drop = True)),
                ('chosen_model', model)]

pipe = Pipeline(estimators)
scores = cross_val_score(pipe, X,y,cv = 5, scoring = 'neg_root_mean_squared_error')

In [8]:
print(scores)

[-353.79206298 -315.14586752 -297.94529726 -289.1669099  -318.82300482]


In [27]:
# Graphic display of these cross_val_scores rmse with respect to changes in hyperparameter
    

# Make a dataframe capturing this information
import time




hyper_dic = {'max_features':[1,1,1,1,1,
                            1,1,1,1,1,
                            5,5,5,5,5,
                            10,10,10,10,10,
                            15,15,15,15,15], 
            'min_samples_leaf' : [2,2,2,2,2,
                                4,4,4,4,4,
                                2,2,2,2,2,
                                2,2,2,2,2,
                                2,2,2,2,2], 
            'n_estimators' :[1,20,30,40,50,
                            1,20,30,40,50,
                            1,20,30,40,50,
                            1,20,30,40,50,
                            1,20,30,40,50]}
hyper_cross_val = pd.DataFrame(hyper_dic)


def cross_val_tree_reg(p):
    max_features = p['max_features']
    min_samples_leaf = p['min_samples_leaf']
    n_estimators = p['n_estimators']
    model = RandomForestRegressor(n_estimators= n_estimators, max_features = max_features, min_samples_leaf=min_samples_leaf)
    estimators = [('replace_na', tpipe.ReplaceNA(columns_to_replacement = columns_to_na_replace)),
                ('onehot1', tpipe.OneHotEncode(col = 'parking_options', drop = True)),
                ('onehot2', tpipe.OneHotEncode(col = 'laundry_options', drop = True)),
                ('onehot3', tpipe.OneHotEncode(col = 'type', drop = True)),
                ('onehot4', tpipe.OneHotEncode(col = 'state', drop = True)),
                ('chosen_model', model)]
    pipe = Pipeline(estimators)
    start_time = time.time()
    scores = cross_val_score(pipe, X,y, cv=5, scoring = 'neg_root_mean_squared_error')
    end_time = time.time()
    elapsed_time = end_time - start_time
    scores = -scores
    return sum(scores)/len(scores), elapsed_time
    
hyper_cross_val['cross_val_scores_average']= hyper_cross_val.apply(cross_val_tree_reg, axis =1)



In [28]:
hyper_cross_val

Unnamed: 0,max_features,min_samples_leaf,n_estimators,cross_val_scores_average
0,1,2,1,"(360.56244661460846, 6.28127121925354)"
1,1,2,20,"(312.88098295610456, 15.392810821533203)"
2,1,2,30,"(312.2647030397251, 20.02912402153015)"
3,1,2,40,"(311.89742889554816, 25.074212789535522)"
4,1,2,50,"(313.25072577393314, 29.41161012649536)"
5,1,4,1,"(358.0956571664798, 5.884401798248291)"
6,1,4,20,"(323.7997668116952, 11.47649598121643)"
7,1,4,30,"(325.6551245660824, 14.60989785194397)"
8,1,4,40,"(324.3673997934271, 17.80042004585266)"
9,1,4,50,"(321.7797215139191, 21.181796073913574)"


0    0.5
1    0.5
2    0.5
3    0.5
4    0.5
dtype: float64