In [None]:
import pandas as pd
import numpy as np
import os
import utilities
import taylors_pipes as tpipe
import preprocessing as prepro


from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

# Read in data
# The dataset folder is resolved relative to the current user's home
# directory, so the same cell runs on macOS, Linux and Windows without
# editing a per-machine absolute path.
download_path = os.path.join(os.path.expanduser("~"), "datasets")
# Full path to the CSV: <home>/datasets/housing.csv
file_path = os.path.join(download_path, 'housing.csv')
db = pd.read_csv(file_path)

# Work on a copy so the frame loaded into `db` is left untouched by the
# filtering steps that follow.
data = db.copy()

# Restrict the frame to the columns used downstream. This is done here rather
# than inside the pipeline because all remaining columns are simply ignored.
# 'id' is passed to the train/test split later on and 'price' is the label.
selected_features = [
    'id',
    'price',
    'sqfeet',
    'beds',
    'baths',
    'type',
    'cats_allowed',
    'dogs_allowed',
    'smoking_allowed',
    'laundry_options',
    'parking_options',
    'state',
]

data = data.loc[:, selected_features]

# Removal of bad values
# Lower bound per column: a row whose value is below the bound for that
# column is considered bad data and is dropped. (Despite the variable name,
# nothing is replaced -- the offending rows are removed.)
bad_data_to_replace = dict(price=200, sqfeet=50, baths=0)

# Names of the columns with bad data, in the same order as the bounds above.
# Not read anywhere in the visible code (might be able to get rid of this).
bad_data_columns = list(bad_data_to_replace)

# The "< bound" test is negated instead of testing ">=", so rows whose value
# is NaN are kept: NaN < bound evaluates to False.
for col, thresh in bad_data_to_replace.items():
    col_mask = data[col].lt(thresh)
    data = data.loc[~col_mask]
    
# Interquartile-range filter on 'sqfeet'
# Intent: delete the rows whose 'sqfeet' value lies outside the IQR range.
# NOTE(review): the return value is discarded, so this call only affects
# `data` if prepro.IQRFilter modifies the frame in place -- confirm, and
# assign the result back to `data` if it returns a new frame instead.
prepro.IQRFilter(data, col='sqfeet', invert=True)



In [13]:
# Partition the rows into a test set and a training set, passing 'id' as the
# id column and test_size=0.2 to the project's split helper.
# NOTE(review): the unpacking below assumes utilities.split returns
# (test, train) in that order -- confirm against utilities.split.
split_data = utilities.split(data, id_column='id', test_size=0.2)
test_set, train_set = split_data

# Feature frame: every training column except the label.
# NOTE(review): 'id' is not dropped here, so unless utilities.split removes
# it, it remains in X and is treated like any other numeric feature --
# confirm that this is intended.
X = train_set.drop(columns='price')

# Label pd.Series for the training rows
y = train_set['price']

# Parameter setup for the individual pipeline steps (the estimator list
# itself is assembled after these parameters are defined).


# tpipe.ReplaceNA parameter setup.
# Dictionary whose keys are column names and whose values are what the NA
# entries of that column will be replaced by. Per the original analysis, the
# only columns of this data set with NA values are 'laundry_options' and
# 'parking_options', so only these are listed.
columns_to_na_replace = dict(
    laundry_options='no laundry on site',
    parking_options='no parking',
)

# tpipe.LogTransform parameter setup.
# Applied to the 'sqfeet' column only. The intent is that the transformed
# values replace the original column and that the logarithm is base e.
col_to_log = ['sqfeet']

# tpipe.StandardizeColumns parameter setup.
# All numeric columns of X are standardized because the model is a linear
# regression.
columns_to_standardize = X.select_dtypes(include=[np.number]).columns

# tpipe.OneHotEncode parameter setup.
# Select the column to one-hot encode and
# decide whether the original column is dropped.
# This is done separately for four columns: 'parking_options', 'laundry_options', 'type' and 'state'.


# Final step of the pipeline: a scikit-learn LinearRegression model
model = LinearRegression()

# Categorical columns to one-hot encode, in the order the steps run. Each
# column gets its own step, named 'onehot1', 'onehot2', ... and created with
# drop=True.
onehot_columns = ['parking_options', 'laundry_options', 'type', 'state']
onehot_steps = [
    (f'onehot{position}', tpipe.OneHotEncode(col=column, drop=True))
    for position, column in enumerate(onehot_columns, start=1)
]

# (name, estimator) pairs in execution order: NA replacement, log transform,
# standardization, the one-hot encoders, and finally the model.
estimators = [
    ('replace_na', tpipe.ReplaceNA(columns_to_replacement=columns_to_na_replace)),
    ('log', tpipe.LogTransform(col_to_log, replace=True)),
    ('standardize', tpipe.StandardizeColumns(columns=columns_to_standardize)),
    *onehot_steps,
    ('chosen_model', model),
]

# Build the scikit-learn Pipeline from the estimator list and fit it on the
# whole training set.
pipe = Pipeline(steps=estimators)
fitted = pipe.fit(X, y)

# Cross-validate the pipeline on the training data with cv=10. The scorer is
# the negated mean squared error, so the printed values are <= 0 and values
# closer to zero are better.
scores = cross_val_score(
    pipe,
    X,
    y,
    cv=10,
    scoring='neg_mean_squared_error',
)
print(scores)

# Disabled: in-sample predictions from the fitted pipeline
# predicts = fitted.predict(X)
# print(predicts)

[-8.65090353e+09 -2.50966718e+14 -1.24080749e+10 -3.13451444e+09
 -5.72364090e+09 -2.51117237e+10 -6.88037938e+08 -8.48949754e+08
 -5.94985997e+08 -1.60007138e+10]
