In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error

In [2]:
amesData = pd.read_csv('train.csv')
kingData = pd.read_csv('kc_house_data.csv')

In [3]:
amesData.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [4]:
amesFeatures = ['LotArea', 'YearBuilt', 'YearRemodAdd', 'YrSold', 'TotalBsmtSF', 
                'GrLivArea', 'TotRmsAbvGrd', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
                'HalfBath', 'SalePrice']

In [5]:
# make a column that is Year sold - Year build/remodeled ==> gives relative age of building

In [6]:
amesFiltered = amesData.loc[:, amesFeatures]

In [7]:
# apply log transform on saleprice to normalize the data
amesFiltered['SalePrice'] = np.log1p(amesFiltered['SalePrice'])

In [8]:
amesFiltered['sqft_lot'] = amesFiltered['LotArea']

In [9]:
# Calculate price per LotArea AND price per TotalSF of the each property (GrLivArea + TotalBsmtSF)
amesFiltered['DollarPerArea'] =  amesFiltered['SalePrice'] / amesFiltered['sqft_lot']

In [10]:
amesFiltered['sqft_living'] = (amesFiltered['GrLivArea'] + amesFiltered['TotalBsmtSF']) 

In [11]:
amesFiltered['DollarPerSF'] = amesFiltered['SalePrice'] / amesFiltered['sqft_living']

In [12]:
amesFiltered['bedrooms'] = amesFiltered['TotRmsAbvGrd']

In [13]:
amesFiltered['bathrooms'] = amesFiltered['BsmtFullBath'] + amesFiltered['FullBath'] + (amesFiltered['BsmtHalfBath'] + amesFiltered['HalfBath']) / 2

In [14]:
amesFiltered['LastConstruction'] = abs(amesFiltered['YrSold'] - amesFiltered['YearRemodAdd'])

In [15]:
amesFiltered['BuildingAge'] = abs(amesFiltered['YrSold'] - amesFiltered['YearBuilt'])

In [16]:
# find the median of amesFiltered['DollarPerArea'] and amesFiltered['DollarPerSF']
# add this data as columns for the data frame
amesFiltered['MedianDollarPerArea'] = amesFiltered['DollarPerArea'] .median()
amesFiltered['MedianDollarPerSF'] = amesFiltered['DollarPerSF'] .median()

In [17]:
amesFiltered['SalePrice']

0       12.247699
1       12.109016
2       12.317171
3       11.849405
4       12.429220
          ...    
1455    12.072547
1456    12.254868
1457    12.493133
1458    11.864469
1459    11.901590
Name: SalePrice, Length: 1460, dtype: float64

In [18]:
amesFiltered.corr()['SalePrice']

LotArea                2.573201e-01
YearBuilt              5.865702e-01
YearRemodAdd           5.656078e-01
YrSold                -3.726291e-02
TotalBsmtSF            6.121342e-01
GrLivArea              7.009270e-01
TotRmsAbvGrd           5.344224e-01
BsmtFullBath           2.362242e-01
BsmtHalfBath          -5.149242e-03
FullBath               5.947707e-01
HalfBath               3.139822e-01
SalePrice              1.000000e+00
sqft_lot               2.573201e-01
DollarPerArea         -2.845571e-01
sqft_living            7.732772e-01
DollarPerSF           -6.897441e-01
bedrooms               5.344224e-01
bathrooms              6.730107e-01
LastConstruction      -5.681606e-01
BuildingAge           -5.872900e-01
MedianDollarPerArea   -8.498411e-14
MedianDollarPerSF      8.497074e-14
Name: SalePrice, dtype: float64

In [19]:
amesFeaturesFinal = ['MedianDollarPerArea', 'MedianDollarPerSF', 'sqft_lot', 'sqft_living', 'bedrooms', 'bathrooms', 'BuildingAge']

In [20]:
amesFeaturesData = amesFiltered.loc[:, amesFeaturesFinal]
amesFeaturesData

Unnamed: 0,MedianDollarPerArea,MedianDollarPerSF,sqft_lot,sqft_living,bedrooms,bathrooms,BuildingAge
0,0.001268,0.004845,8450,2566,8,3.5,5
1,0.001268,0.004845,9600,2524,6,2.5,31
2,0.001268,0.004845,11250,2706,6,3.5,7
3,0.001268,0.004845,9550,2473,7,2.0,91
4,0.001268,0.004845,14260,3343,9,3.5,8
...,...,...,...,...,...,...,...
1455,0.001268,0.004845,7917,2600,7,2.5,8
1456,0.001268,0.004845,13175,3615,7,3.0,32
1457,0.001268,0.004845,9042,3492,9,2.0,69
1458,0.001268,0.004845,9717,2156,5,2.0,60


In [21]:
lr = LinearRegression()

In [22]:
X_train, X_test, y_train, y_test = train_test_split(amesFeaturesData, amesFiltered['SalePrice'], test_size = 0.2, random_state = 0)
(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

((1168, 7), (292, 7), (1168,), (292,))

In [23]:
lr.fit(X_train, y_train)

LinearRegression()

In [24]:
scores = cross_val_score(lr, amesFeaturesData, amesFiltered['SalePrice'], cv=5, scoring='neg_root_mean_squared_error')
scores

array([-0.1837911 , -0.21468791, -0.18583975, -0.18747728, -0.26109104])

In [25]:
prediction = np.exp(lr.predict(X_test))
y_true = np.exp(y_test)

In [26]:
print("score: ", mean_squared_error(y_test, lr.predict(X_test))) # this isn't bad at all --> let's test on other data

score:  0.06979184385064718


In [27]:
kingData.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [28]:
kingFeatures = ['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'yr_built', 'yr_renovated']

In [29]:
kingFiltered = kingData.loc[:, kingFeatures]
kingFiltered

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,yr_built,yr_renovated
0,20141013T000000,221900.0,3,1.00,1180,5650,1955,0
1,20141209T000000,538000.0,3,2.25,2570,7242,1951,1991
2,20150225T000000,180000.0,2,1.00,770,10000,1933,0
3,20141209T000000,604000.0,4,3.00,1960,5000,1965,0
4,20150218T000000,510000.0,3,2.00,1680,8080,1987,0
...,...,...,...,...,...,...,...,...
21608,20140521T000000,360000.0,3,2.50,1530,1131,2009,0
21609,20150223T000000,400000.0,4,2.50,2310,5813,2014,0
21610,20140623T000000,402101.0,2,0.75,1020,1350,2009,0
21611,20150116T000000,400000.0,3,2.50,1600,2388,2004,0


In [30]:
# apply log transform on saleprice to normalize the data
kingFiltered['price'] = np.log1p(kingFiltered['price'])

In [31]:
kingFeaturesFinal = ['MedianDollarPerArea', 'MedianDollarPerSF', 'sqft_lot', 'sqft_living', 'bedrooms', 'bathrooms', 'BuildingAge']

In [32]:
kingFiltered['DollarPerArea'] = kingFiltered['sqft_lot'] / kingFiltered['price']
kingFiltered['DollarPerSF'] = kingFiltered['sqft_living'] / kingFiltered['price']

In [33]:
kingFiltered['YrSold'] = kingFiltered['date'].apply(lambda d: int(d[0:4]))
kingFiltered['LastConstruction'] = abs(kingFiltered['YrSold'] - kingFiltered['yr_renovated'])
kingFiltered['BuildingAge'] = abs(kingFiltered['YrSold'] - kingFiltered['yr_built'])

In [34]:
kingFiltered['MedianDollarPerArea'] = kingFiltered['DollarPerArea'] .median()
kingFiltered['MedianDollarPerSF'] = kingFiltered['DollarPerArea'] .median()

In [35]:
kingFeaturesData = kingFiltered.loc[:, kingFeaturesFinal]

In [36]:
scores2 = cross_val_score(lr, kingFeaturesData, kingFiltered['price'], cv=5, scoring='neg_root_mean_squared_error')
scores2

array([-0.36600477, -0.36551357, -0.35905838, -0.36872867, -0.35719581])

In [37]:
prediction2 = np.exp(lr.predict(kingFeaturesData))
y_true2 = np.exp(kingFiltered['price'])

In [38]:
print("score: ", mean_squared_error(kingFiltered['price'], lr.predict(kingFeaturesData))) 

score:  1.693944076206347
