<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Project 2: Ames Housing Data and Kaggle Challenge (Kaggle Data Export)

## Import Libraries

In [1]:
# Imports:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import statistics

from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.dummy import DummyRegressor

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score, mean_squared_error

# Ignore warnings
# NOTE(review): filterwarnings("ignore") with no category silences every warning
# for the rest of the notebook, including pandas / scikit-learn deprecation notices.
import warnings
warnings.filterwarnings("ignore")

## Data Import & Cleaning

In [2]:
# Raise the pandas display limits so wide/long frames are shown in full
for option_name, limit in (("display.max_rows", 160), ("display.max_columns", 80)):
    pd.set_option(option_name, limit)

# Test set: the rows that predictions are generated for
test = pd.read_csv("../datasets/test.csv")

# Training set used to fit the model (contains the `saleprice` target)
train_final8 = pd.read_csv("../datasets/train_final8.csv")

# Column overview of the raw test set
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 80 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Id               878 non-null    int64  
 1   PID              878 non-null    int64  
 2   MS SubClass      878 non-null    int64  
 3   MS Zoning        878 non-null    object 
 4   Lot Frontage     718 non-null    float64
 5   Lot Area         878 non-null    int64  
 6   Street           878 non-null    object 
 7   Alley            58 non-null     object 
 8   Lot Shape        878 non-null    object 
 9   Land Contour     878 non-null    object 
 10  Utilities        878 non-null    object 
 11  Lot Config       878 non-null    object 
 12  Land Slope       878 non-null    object 
 13  Neighborhood     878 non-null    object 
 14  Condition 1      878 non-null    object 
 15  Condition 2      878 non-null    object 
 16  Bldg Type        878 non-null    object 
 17  House Style     

In [3]:
# Raw test-set columns needed to build the model's features.
model_input_cols = [
    "Overall Qual", "Gr Liv Area", "Total Bsmt SF", "Garage Area",
    "Exter Qual", "Exter Cond",
    "Bsmt Full Bath", "Full Bath", "Bsmt Half Bath", "Half Bath",
    "Year Built",
    "Bsmt Qual", "Bsmt Cond", "Bsmt Exposure",
    "Garage Finish", "Fireplace Qu", "Mas Vnr Area", "Heating QC",
    "Neighborhood", "BsmtFin SF 1",
    "Wood Deck SF", "Open Porch SF", "Enclosed Porch", "3Ssn Porch", "Screen Porch",
]

# Take an explicit copy: later cells assign into test_1, and an independent frame
# keeps those writes off `test` and avoids pandas' SettingWithCopyWarning
# (which this notebook's global warnings filter would otherwise hide).
test_1 = test.loc[:, model_input_cols].copy()


In [4]:
# Categorical columns: replace nulls with the string "None"
# (the ordinal lookup tables later in this notebook score "None" as 0)
for col_name in ["Bsmt Qual", "Bsmt Cond", "Bsmt Exposure", "Garage Finish", "Fireplace Qu"]:
    test_1[col_name] = test_1[col_name].fillna("None")

# Mas Vnr Area is numeric, so nulls become 0 rather than "None"
test_1["Mas Vnr Area"] = test_1["Mas Vnr Area"].fillna(0)

In [5]:
# test_final is a second name for the same frame as test_1 (no copy is made),
# so the column rename below is visible through both names.
test_final = test_1

# snake_case the column names: lower-case, spaces -> underscores
test_final.columns = test_final.columns.str.lower().str.replace(" ", "_", regex=False)

test_final.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 25 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   overall_qual    878 non-null    int64  
 1   gr_liv_area     878 non-null    int64  
 2   total_bsmt_sf   878 non-null    int64  
 3   garage_area     878 non-null    int64  
 4   exter_qual      878 non-null    object 
 5   exter_cond      878 non-null    object 
 6   bsmt_full_bath  878 non-null    int64  
 7   full_bath       878 non-null    int64  
 8   bsmt_half_bath  878 non-null    int64  
 9   half_bath       878 non-null    int64  
 10  year_built      878 non-null    int64  
 11  bsmt_qual       878 non-null    object 
 12  bsmt_cond       878 non-null    object 
 13  bsmt_exposure   878 non-null    object 
 14  garage_finish   878 non-null    object 
 15  fireplace_qu    878 non-null    object 
 16  mas_vnr_area    878 non-null    float64
 17  heating_qc      878 non-null    obj

### _Creating new features required in model_

In [6]:
# Ordinal lookup tables. Values that are not keys of the table map to NaN.
rate = {'Ex': 5, 'Gd': 4, 'TA': 3, 'Fa': 2, 'Po': 1, 'None': 0}   # quality / condition scale
rate_1 = {'Gd': 4, 'Av': 3, 'Mn': 2, 'No': 1, 'None': 0}          # basement exposure scale
rate_3 = {'Fin': 3, 'RFn': 2, 'Unf': 1, 'None': 0}                # garage finish scale

# Total area = above-ground living area + basement area
test_final["c_total_area"] = test_final["gr_liv_area"].add(test_final["total_bsmt_sf"])

# Exterior: ordinal quality and condition, then their sum
for target_col, source_col in (
    ("c_ord_exter_qual", "exter_qual"),
    ("c_ord_exter_cond", "exter_cond"),
):
    test_final[target_col] = test_final[source_col].map(rate)
test_final["c_tot_exter"] = test_final["c_ord_exter_qual"].add(test_final["c_ord_exter_cond"])

# Bathrooms: full and half counts (basement + above ground), then the grand total
test_final["c_total_full_bath"] = test_final["bsmt_full_bath"].add(test_final["full_bath"])
test_final["c_total_half_bath"] = test_final["bsmt_half_bath"].add(test_final["half_bath"])
test_final["c_total_bath"] = test_final["c_total_full_bath"].add(test_final["c_total_half_bath"])

# Basement: ordinal quality, condition and exposure, then their sum
for target_col, source_col, lookup in (
    ("c_ord_bsmt_qual", "bsmt_qual", rate),
    ("c_ord_bsmt_cond", "bsmt_cond", rate),
    ("c_ord_bsmt_exp", "bsmt_exposure", rate_1),
):
    test_final[target_col] = test_final[source_col].map(lookup)
test_final["c_tot_bsmt"] = (
    test_final["c_ord_bsmt_qual"]
    .add(test_final["c_ord_bsmt_cond"])
    .add(test_final["c_ord_bsmt_exp"])
)

# Remaining single-column ordinals: garage finish, fireplace quality, heating quality
for target_col, source_col, lookup in (
    ("c_ord_garage_finish", "garage_finish", rate_3),
    ("c_ord_fireplace_qu", "fireplace_qu", rate),
    ("c_ord_heating_qc", "heating_qc", rate),
):
    test_final[target_col] = test_final[source_col].map(lookup)

# Total porch area: the five porch/deck columns added left to right
porch_cols = ["wood_deck_sf", "open_porch_sf", "enclosed_porch", "3ssn_porch", "screen_porch"]
porch_total = test_final[porch_cols[0]]
for porch_col in porch_cols[1:]:
    porch_total = porch_total + test_final[porch_col]
test_final["c_total_porch_area"] = porch_total

### **_Creating OHE dummies for categorical datatypes_**

In [7]:
# One-hot encoder for the categorical column(s): returns a dense array and
# encodes categories unseen at fit time as all-zero rows instead of raising.
# NOTE(review): `sparse` was renamed `sparse_output` in scikit-learn 1.2 — revisit when upgrading.
ohe = OneHotEncoder(
    handle_unknown='ignore',
    sparse=False,
)

In [8]:
# Categorical column(s) of test_final to one-hot encode (only the neighborhood)
cat_col = ["neighborhood"]
X_test = test_final.loc[:, cat_col]

In [9]:
# Fit the encoder on the neighborhood column (fit only; the transform happens in the next cell).
# NOTE(review): the category levels are learned from the test data itself, not from the
# training data — confirm the resulting dummy columns match those the model was trained on.
ohe.fit(X_test)

OneHotEncoder(handle_unknown='ignore', sparse=False)

In [10]:
# One-hot encode the neighborhood column with the fitted encoder
# (dense array, because the encoder was created with sparse=False)
X_test_encoded = ohe.transform(X_test)

In [11]:
# Quick visual check of the encoded array
print(X_test_encoded)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [12]:
# Column names for the encoded array, one per learned category,
# prefixed with the source column name (e.g. "neighborhood_NridgHt")
column_name = ohe.get_feature_names_out(cat_col)

In [13]:
# Wrap the encoded array in a DataFrame labelled with the dummy column names
X_test_1 = pd.DataFrame(X_test_encoded,columns=column_name)

#### Combine OHE values with dataset

In [14]:
# Append the dummy columns to the engineered features, side by side.
# Both indexes are reset first so the rows pair up by position.
test_final1 = pd.concat(
    [
        test_final.reset_index(drop=True),
        X_test_1.reset_index(drop=True),
    ],
    axis="columns",
)

In [15]:
# Predictor columns fed to the model
model_features = [
    'overall_qual',
    'c_total_area',
    'garage_area',
    'c_tot_exter',
    'c_total_bath',
    'year_built',
    'c_tot_bsmt',
    'c_ord_garage_finish',
    'c_ord_fireplace_qu',
    'mas_vnr_area',
    'c_ord_heating_qc',
    'neighborhood_NridgHt',
    'bsmtfin_sf_1',
    'c_total_porch_area',
]

# Keep only those columns, in the order listed above
X_predict = test_final1[model_features]

X_predict.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878 entries, 0 to 877
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   overall_qual          878 non-null    int64  
 1   c_total_area          878 non-null    int64  
 2   garage_area           878 non-null    int64  
 3   c_tot_exter           878 non-null    int64  
 4   c_total_bath          878 non-null    int64  
 5   year_built            878 non-null    int64  
 6   c_tot_bsmt            878 non-null    int64  
 7   c_ord_garage_finish   878 non-null    int64  
 8   c_ord_fireplace_qu    878 non-null    int64  
 9   mas_vnr_area          878 non-null    float64
 10  c_ord_heating_qc      878 non-null    int64  
 11  neighborhood_NridgHt  878 non-null    float64
 12  bsmtfin_sf_1          878 non-null    int64  
 13  c_total_porch_area    878 non-null    int64  
dtypes: float64(2), int64(12)
memory usage: 96.2 KB


In [16]:
# Count of missing values per predictor column (all should be 0)
X_predict.isna().sum()

overall_qual            0
c_total_area            0
garage_area             0
c_tot_exter             0
c_total_bath            0
year_built              0
c_tot_bsmt              0
c_ord_garage_finish     0
c_ord_fireplace_qu      0
mas_vnr_area            0
c_ord_heating_qc        0
neighborhood_NridgHt    0
bsmtfin_sf_1            0
c_total_porch_area      0
dtype: int64

In [17]:
# Columns present in the training set but absent from the prediction set
# (only the target column is expected here)
set(train_final8.columns).difference(X_predict.columns)

{'saleprice'}

In [18]:
# Columns present in the prediction set but absent from the training set
# (expected to be empty)
set(X_predict.columns).difference(train_final8.columns)

set()

## Model Prep

### Assembling Predictor Variables (X) and Target (y)

**X, y variable**

In [19]:
# Assembling predictor variable (X): every column of the training set except the target.
# The target is dropped by name rather than by position, so this still selects the
# right columns if `saleprice` is ever not the last column of train_final8.
X_train = train_final8.drop(columns="saleprice")

In [20]:
# Target (y): the sale price column of the training set
y_train = train_final8.loc[:, "saleprice"]

#### Test-set predictors (`X_predict`)

In [21]:
# Display the test-set predictors that will be fed to the model
X_predict

Unnamed: 0,overall_qual,c_total_area,garage_area,c_tot_exter,c_total_bath,year_built,c_tot_bsmt,c_ord_garage_finish,c_ord_fireplace_qu,mas_vnr_area,c_ord_heating_qc,neighborhood_NridgHt,bsmtfin_sf_1,c_total_porch_area
0,6,2948,440,5,2,1910,6,1,0,0.0,4,0.0,0,172
1,5,3934,580,6,2,1977,8,3,0,0.0,3,0.0,0,170
2,7,2150,426,7,4,2006,11,2,4,0.0,5,0.0,554,124
3,5,1936,480,7,1,1923,7,1,0,0.0,3,0.0,0,184
4,6,2788,514,6,3,1963,8,2,4,247.0,4,0.0,609,261
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
873,6,2961,488,6,4,1974,7,1,3,0.0,3,0.0,931,96
874,6,3092,480,8,3,1966,7,2,4,410.0,5,0.0,575,230
875,5,2163,322,6,1,1968,7,1,3,0.0,5,0.0,250,63
876,4,1728,528,6,1,1971,7,1,0,0.0,3,0.0,616,0


### Feature Engineering - Polynomial Features

In [22]:
# Training feature names, used later to label the polynomial columns
features_X_train = list(X_train.columns)

In [23]:
# Polynomial expansion without the constant (bias) column; the degree is left at the
# estimator default (the tables displayed below show squares and pairwise products).
poly = PolynomialFeatures(include_bias=False)

In [24]:
# Fit the polynomial transformer on the training predictors only;
# the same fitted transformer is then applied to the prediction set.
poly.fit(X_train)

PolynomialFeatures(include_bias=False)

In [25]:
# Expand the training predictors into polynomial features
X_train_poly = poly.transform(X_train)

# Expand the prediction set with the same fitted transformer.
# The transformer was fitted on X_train, so X_predict is first reindexed to the
# training column order; its columns were listed by hand and nothing else
# guarantees the two orders match.
X_predict_poly = poly.transform(X_predict[X_train.columns])

In [26]:
# Label the polynomial training matrix with readable feature names.
# get_feature_names_out replaces PolynomialFeatures.get_feature_names, which was
# deprecated in scikit-learn 1.0 and removed in 1.2; it is also the method this
# notebook already uses on the one-hot encoder.
X_train_poly1 = pd.DataFrame(X_train_poly, columns=poly.get_feature_names_out(features_X_train))
X_train_poly1.head()

Unnamed: 0,overall_qual,c_total_area,garage_area,c_tot_exter,c_total_bath,year_built,c_tot_bsmt,c_ord_garage_finish,c_ord_fireplace_qu,mas_vnr_area,c_ord_heating_qc,neighborhood_NridgHt,bsmtfin_sf_1,c_total_porch_area,overall_qual^2,overall_qual c_total_area,overall_qual garage_area,overall_qual c_tot_exter,overall_qual c_total_bath,overall_qual year_built,overall_qual c_tot_bsmt,overall_qual c_ord_garage_finish,overall_qual c_ord_fireplace_qu,overall_qual mas_vnr_area,overall_qual c_ord_heating_qc,overall_qual neighborhood_NridgHt,overall_qual bsmtfin_sf_1,overall_qual c_total_porch_area,c_total_area^2,c_total_area garage_area,c_total_area c_tot_exter,c_total_area c_total_bath,c_total_area year_built,c_total_area c_tot_bsmt,c_total_area c_ord_garage_finish,c_total_area c_ord_fireplace_qu,c_total_area mas_vnr_area,c_total_area c_ord_heating_qc,c_total_area neighborhood_NridgHt,c_total_area bsmtfin_sf_1,...,year_built c_ord_heating_qc,year_built neighborhood_NridgHt,year_built bsmtfin_sf_1,year_built c_total_porch_area,c_tot_bsmt^2,c_tot_bsmt c_ord_garage_finish,c_tot_bsmt c_ord_fireplace_qu,c_tot_bsmt mas_vnr_area,c_tot_bsmt c_ord_heating_qc,c_tot_bsmt neighborhood_NridgHt,c_tot_bsmt bsmtfin_sf_1,c_tot_bsmt c_total_porch_area,c_ord_garage_finish^2,c_ord_garage_finish c_ord_fireplace_qu,c_ord_garage_finish mas_vnr_area,c_ord_garage_finish c_ord_heating_qc,c_ord_garage_finish neighborhood_NridgHt,c_ord_garage_finish bsmtfin_sf_1,c_ord_garage_finish c_total_porch_area,c_ord_fireplace_qu^2,c_ord_fireplace_qu mas_vnr_area,c_ord_fireplace_qu c_ord_heating_qc,c_ord_fireplace_qu neighborhood_NridgHt,c_ord_fireplace_qu bsmtfin_sf_1,c_ord_fireplace_qu c_total_porch_area,mas_vnr_area^2,mas_vnr_area c_ord_heating_qc,mas_vnr_area neighborhood_NridgHt,mas_vnr_area bsmtfin_sf_1,mas_vnr_area c_total_porch_area,c_ord_heating_qc^2,c_ord_heating_qc neighborhood_NridgHt,c_ord_heating_qc bsmtfin_sf_1,c_ord_heating_qc c_total_porch_area,neighborhood_NridgHt^2,neighborhood_NridgHt 
bsmtfin_sf_1,neighborhood_NridgHt c_total_porch_area,bsmtfin_sf_1^2,bsmtfin_sf_1 c_total_porch_area,c_total_porch_area^2
0,6.0,2204.0,475.0,7.0,3.0,1976.0,7.0,2.0,0.0,289.0,5.0,0.0,533.0,44.0,36.0,13224.0,2850.0,42.0,18.0,11856.0,42.0,12.0,0.0,1734.0,30.0,0.0,3198.0,264.0,4857616.0,1046900.0,15428.0,6612.0,4355104.0,15428.0,4408.0,0.0,636956.0,11020.0,0.0,1174732.0,...,9880.0,0.0,1053208.0,86944.0,49.0,14.0,0.0,2023.0,35.0,0.0,3731.0,308.0,4.0,0.0,578.0,10.0,0.0,1066.0,88.0,0.0,0.0,0.0,0.0,0.0,0.0,83521.0,1445.0,0.0,154037.0,12716.0,25.0,0.0,2665.0,220.0,0.0,0.0,0.0,284089.0,23452.0,1936.0
1,7.0,3035.0,559.0,7.0,4.0,1996.0,8.0,2.0,3.0,132.0,5.0,0.0,637.0,74.0,49.0,21245.0,3913.0,49.0,28.0,13972.0,56.0,14.0,21.0,924.0,35.0,0.0,4459.0,518.0,9211225.0,1696565.0,21245.0,12140.0,6057860.0,24280.0,6070.0,9105.0,400620.0,15175.0,0.0,1933295.0,...,9980.0,0.0,1271452.0,147704.0,64.0,16.0,24.0,1056.0,40.0,0.0,5096.0,592.0,4.0,6.0,264.0,10.0,0.0,1274.0,148.0,9.0,396.0,15.0,0.0,1911.0,222.0,17424.0,660.0,0.0,84084.0,9768.0,25.0,0.0,3185.0,370.0,0.0,0.0,0.0,405769.0,47138.0,5476.0
2,5.0,2114.0,246.0,7.0,2.0,1953.0,7.0,1.0,0.0,0.0,3.0,0.0,731.0,52.0,25.0,10570.0,1230.0,35.0,10.0,9765.0,35.0,5.0,0.0,0.0,15.0,0.0,3655.0,260.0,4468996.0,520044.0,14798.0,4228.0,4128642.0,14798.0,2114.0,0.0,0.0,6342.0,0.0,1545334.0,...,5859.0,0.0,1427643.0,101556.0,49.0,7.0,0.0,0.0,21.0,0.0,5117.0,364.0,1.0,0.0,0.0,3.0,0.0,731.0,52.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,2193.0,156.0,0.0,0.0,0.0,534361.0,38012.0,2704.0
3,5.0,1828.0,400.0,6.0,3.0,2006.0,8.0,3.0,0.0,0.0,4.0,0.0,0.0,100.0,25.0,9140.0,2000.0,30.0,15.0,10030.0,40.0,15.0,0.0,0.0,20.0,0.0,0.0,500.0,3341584.0,731200.0,10968.0,5484.0,3666968.0,14624.0,5484.0,0.0,0.0,7312.0,0.0,0.0,...,8024.0,0.0,0.0,200600.0,64.0,24.0,0.0,0.0,32.0,0.0,0.0,800.0,9.0,0.0,0.0,12.0,0.0,0.0,300.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,400.0,0.0,0.0,0.0,0.0,0.0,10000.0
4,6.0,2121.0,484.0,6.0,2.0,1900.0,7.0,1.0,0.0,0.0,3.0,0.0,0.0,59.0,36.0,12726.0,2904.0,36.0,12.0,11400.0,42.0,6.0,0.0,0.0,18.0,0.0,0.0,354.0,4498641.0,1026564.0,12726.0,4242.0,4029900.0,14847.0,2121.0,0.0,0.0,6363.0,0.0,0.0,...,5700.0,0.0,0.0,112100.0,49.0,7.0,0.0,0.0,21.0,0.0,0.0,413.0,1.0,0.0,0.0,3.0,0.0,0.0,59.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,177.0,0.0,0.0,0.0,0.0,0.0,3481.0


In [27]:
# Label the polynomial prediction matrix with the same feature names as the training matrix.
# get_feature_names_out replaces PolynomialFeatures.get_feature_names, which was
# deprecated in scikit-learn 1.0 and removed in 1.2.
X_predict_1 = pd.DataFrame(X_predict_poly, columns=poly.get_feature_names_out(features_X_train))
X_predict_1.head()

Unnamed: 0,overall_qual,c_total_area,garage_area,c_tot_exter,c_total_bath,year_built,c_tot_bsmt,c_ord_garage_finish,c_ord_fireplace_qu,mas_vnr_area,c_ord_heating_qc,neighborhood_NridgHt,bsmtfin_sf_1,c_total_porch_area,overall_qual^2,overall_qual c_total_area,overall_qual garage_area,overall_qual c_tot_exter,overall_qual c_total_bath,overall_qual year_built,overall_qual c_tot_bsmt,overall_qual c_ord_garage_finish,overall_qual c_ord_fireplace_qu,overall_qual mas_vnr_area,overall_qual c_ord_heating_qc,overall_qual neighborhood_NridgHt,overall_qual bsmtfin_sf_1,overall_qual c_total_porch_area,c_total_area^2,c_total_area garage_area,c_total_area c_tot_exter,c_total_area c_total_bath,c_total_area year_built,c_total_area c_tot_bsmt,c_total_area c_ord_garage_finish,c_total_area c_ord_fireplace_qu,c_total_area mas_vnr_area,c_total_area c_ord_heating_qc,c_total_area neighborhood_NridgHt,c_total_area bsmtfin_sf_1,...,year_built c_ord_heating_qc,year_built neighborhood_NridgHt,year_built bsmtfin_sf_1,year_built c_total_porch_area,c_tot_bsmt^2,c_tot_bsmt c_ord_garage_finish,c_tot_bsmt c_ord_fireplace_qu,c_tot_bsmt mas_vnr_area,c_tot_bsmt c_ord_heating_qc,c_tot_bsmt neighborhood_NridgHt,c_tot_bsmt bsmtfin_sf_1,c_tot_bsmt c_total_porch_area,c_ord_garage_finish^2,c_ord_garage_finish c_ord_fireplace_qu,c_ord_garage_finish mas_vnr_area,c_ord_garage_finish c_ord_heating_qc,c_ord_garage_finish neighborhood_NridgHt,c_ord_garage_finish bsmtfin_sf_1,c_ord_garage_finish c_total_porch_area,c_ord_fireplace_qu^2,c_ord_fireplace_qu mas_vnr_area,c_ord_fireplace_qu c_ord_heating_qc,c_ord_fireplace_qu neighborhood_NridgHt,c_ord_fireplace_qu bsmtfin_sf_1,c_ord_fireplace_qu c_total_porch_area,mas_vnr_area^2,mas_vnr_area c_ord_heating_qc,mas_vnr_area neighborhood_NridgHt,mas_vnr_area bsmtfin_sf_1,mas_vnr_area c_total_porch_area,c_ord_heating_qc^2,c_ord_heating_qc neighborhood_NridgHt,c_ord_heating_qc bsmtfin_sf_1,c_ord_heating_qc c_total_porch_area,neighborhood_NridgHt^2,neighborhood_NridgHt 
bsmtfin_sf_1,neighborhood_NridgHt c_total_porch_area,bsmtfin_sf_1^2,bsmtfin_sf_1 c_total_porch_area,c_total_porch_area^2
0,6.0,2948.0,440.0,5.0,2.0,1910.0,6.0,1.0,0.0,0.0,4.0,0.0,0.0,172.0,36.0,17688.0,2640.0,30.0,12.0,11460.0,36.0,6.0,0.0,0.0,24.0,0.0,0.0,1032.0,8690704.0,1297120.0,14740.0,5896.0,5630680.0,17688.0,2948.0,0.0,0.0,11792.0,0.0,0.0,...,7640.0,0.0,0.0,328520.0,36.0,6.0,0.0,0.0,24.0,0.0,0.0,1032.0,1.0,0.0,0.0,4.0,0.0,0.0,172.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16.0,0.0,0.0,688.0,0.0,0.0,0.0,0.0,0.0,29584.0
1,5.0,3934.0,580.0,6.0,2.0,1977.0,8.0,3.0,0.0,0.0,3.0,0.0,0.0,170.0,25.0,19670.0,2900.0,30.0,10.0,9885.0,40.0,15.0,0.0,0.0,15.0,0.0,0.0,850.0,15476356.0,2281720.0,23604.0,7868.0,7777518.0,31472.0,11802.0,0.0,0.0,11802.0,0.0,0.0,...,5931.0,0.0,0.0,336090.0,64.0,24.0,0.0,0.0,24.0,0.0,0.0,1360.0,9.0,0.0,0.0,9.0,0.0,0.0,510.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,510.0,0.0,0.0,0.0,0.0,0.0,28900.0
2,7.0,2150.0,426.0,7.0,4.0,2006.0,11.0,2.0,4.0,0.0,5.0,0.0,554.0,124.0,49.0,15050.0,2982.0,49.0,28.0,14042.0,77.0,14.0,28.0,0.0,35.0,0.0,3878.0,868.0,4622500.0,915900.0,15050.0,8600.0,4312900.0,23650.0,4300.0,8600.0,0.0,10750.0,0.0,1191100.0,...,10030.0,0.0,1111324.0,248744.0,121.0,22.0,44.0,0.0,55.0,0.0,6094.0,1364.0,4.0,8.0,0.0,10.0,0.0,1108.0,248.0,16.0,0.0,20.0,0.0,2216.0,496.0,0.0,0.0,0.0,0.0,0.0,25.0,0.0,2770.0,620.0,0.0,0.0,0.0,306916.0,68696.0,15376.0
3,5.0,1936.0,480.0,7.0,1.0,1923.0,7.0,1.0,0.0,0.0,3.0,0.0,0.0,184.0,25.0,9680.0,2400.0,35.0,5.0,9615.0,35.0,5.0,0.0,0.0,15.0,0.0,0.0,920.0,3748096.0,929280.0,13552.0,1936.0,3722928.0,13552.0,1936.0,0.0,0.0,5808.0,0.0,0.0,...,5769.0,0.0,0.0,353832.0,49.0,7.0,0.0,0.0,21.0,0.0,0.0,1288.0,1.0,0.0,0.0,3.0,0.0,0.0,184.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,552.0,0.0,0.0,0.0,0.0,0.0,33856.0
4,6.0,2788.0,514.0,6.0,3.0,1963.0,8.0,2.0,4.0,247.0,4.0,0.0,609.0,261.0,36.0,16728.0,3084.0,36.0,18.0,11778.0,48.0,12.0,24.0,1482.0,24.0,0.0,3654.0,1566.0,7772944.0,1433032.0,16728.0,8364.0,5472844.0,22304.0,5576.0,11152.0,688636.0,11152.0,0.0,1697892.0,...,7852.0,0.0,1195467.0,512343.0,64.0,16.0,32.0,1976.0,32.0,0.0,4872.0,2088.0,4.0,8.0,494.0,8.0,0.0,1218.0,522.0,16.0,988.0,16.0,0.0,2436.0,1044.0,61009.0,988.0,0.0,150423.0,64467.0,16.0,0.0,2436.0,1044.0,0.0,0.0,0.0,370881.0,158949.0,68121.0


### Instantiate Model and Cross Validation Check on Linear Regression Model

In [28]:
# Linear regression model (lr), created with default settings
lr = LinearRegression()

In [29]:
# Fit the regression on the full polynomial-expanded training set
lr.fit(X_train_poly1,y_train)

LinearRegression()

In [30]:
# Mean 3-fold cross-validation score on the training data
# (with no `scoring` argument the estimator's own score method is used)
cross_val_score(estimator=lr, X=X_train_poly1, y=y_train, cv=3).mean()

0.8858637344988631

### Predictions

In [31]:
# Predict sale prices for the polynomial-expanded test-set predictors
predictions = lr.predict(X_predict_1)

In [32]:
# Display the predicted sale prices
predictions

array([135689.2362099 , 165319.31418208, 176068.5282847 , 106735.05069377,
       190708.82529906, 106815.18831226,  98293.27695045, 152727.54070087,
       189830.45054082, 163848.07555681, 164700.29466232, 118306.41035105,
       153363.5812461 , 284653.29568654, 144601.37985978, 114398.55977449,
       154720.80268255, 120197.8721531 , 181924.57120939, 212137.12807133,
       147747.22812058, 135297.66947387, 212662.46785225, 152461.14671574,
       178570.00796243, 116342.20289915, 123749.64818375,  94818.83125483,
       158256.98745335,  72829.56009611, 107292.07681543,  99570.39432959,
       242053.88187876, 148016.19184081, 214977.00447929, 158169.55334876,
       113100.47173819,  95176.70250144,  92374.24246228, 195602.6308837 ,
       155379.37185752, 208539.76611486, 137888.03210014, 156719.83612584,
       244800.11145295,  95405.3641017 , 223639.36746593, 122946.42987996,
       125769.62044313, 133961.79234597, 112980.30701937, 182149.71457727,
       259329.50428012, 1

In [33]:
# Attach the predictions to the original test frame by position
# (assumes the predictions are still in the same row order as `test` — TODO confirm)
test["SalePrice"] = list(predictions)

In [34]:
# Keep only the two columns that are exported: Id and SalePrice
predictions_final = test[["Id", "SalePrice"]]

In [35]:
# Write the Id / SalePrice pairs to CSV, without the DataFrame index
predictions_final.to_csv(
    path_or_buf='../datasets/test_prediction_2.csv',
    index=False,
)

## Kaggle Score

The production model's Kaggle score (RMSE) was 27029 (private score: 24224).