In [125]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import math
import statistics

pd.set_option('display.max_colum', None)
pd.set_option('display.max_rows', 100)

In [126]:
def calculate_scores(y_test, y_pred):
    print('R2:', round(r2_score(y_test, y_pred), 3))
    print('MSE:', round(mean_squared_error(y_test, y_pred), 3))
    print('RMSE:', round(math.sqrt(mean_squared_error(y_test, y_pred)), 3))
    return [round(r2_score(y_test, y_pred), 3), round(mean_squared_error(y_test, y_pred), 3), round(math.sqrt(mean_squared_error(y_test, y_pred)), 3)]

In [127]:
train = pd.read_csv('house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('house-prices-advanced-regression-techniques/test.csv')

In [128]:
len(train)

1460

In [129]:
len(test)

1459

In [130]:
train.columns

Index(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive

In [131]:
len(train.columns)

81

In [132]:
train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000


In [133]:
(train.isnull().sum()/len(train)*100).sort_values(ascending=False)

PoolQC           99.520548
MiscFeature      96.301370
Alley            93.767123
Fence            80.753425
FireplaceQu      47.260274
LotFrontage      17.739726
GarageYrBlt       5.547945
GarageCond        5.547945
GarageType        5.547945
GarageFinish      5.547945
GarageQual        5.547945
BsmtFinType2      2.602740
BsmtExposure      2.602740
BsmtQual          2.534247
BsmtCond          2.534247
BsmtFinType1      2.534247
MasVnrArea        0.547945
MasVnrType        0.547945
Electrical        0.068493
Id                0.000000
Functional        0.000000
Fireplaces        0.000000
KitchenQual       0.000000
KitchenAbvGr      0.000000
BedroomAbvGr      0.000000
HalfBath          0.000000
FullBath          0.000000
BsmtHalfBath      0.000000
TotRmsAbvGrd      0.000000
GarageCars        0.000000
GrLivArea         0.000000
GarageArea        0.000000
PavedDrive        0.000000
WoodDeckSF        0.000000
OpenPorchSF       0.000000
EnclosedPorch     0.000000
3SsnPorch         0.000000
S

In [134]:
train.dropna(axis=1, inplace=True)

In [135]:
data = train[['LotArea', 'YearBuilt', 'RoofStyle', 'Exterior1st', 'Foundation', 'Heating', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath','HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'GarageCars', 'GarageArea', 'PoolArea', 'SaleCondition', 'SalePrice']]

In [136]:
catcolumns = ['YearBuilt', 'RoofStyle', 'Exterior1st', 'Foundation', 'Heating', 'SaleCondition']

In [137]:
le = {}
for x in catcolumns:
    le[x] = LabelEncoder()
    data[x] = le[x].fit_transform(data[x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[x] = le[x].fit_transform(data[x])


In [138]:
data

Unnamed: 0,LotArea,YearBuilt,RoofStyle,Exterior1st,Foundation,Heating,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,GarageCars,GarageArea,PoolArea,SaleCondition,SalePrice
0,8450,104,1,12,2,1,1710,1,0,2,1,3,1,2,548,0,4,208500
1,9600,77,1,8,1,1,1262,0,1,2,0,3,1,2,460,0,4,181500
2,11250,102,1,12,2,1,1786,1,0,2,1,3,1,2,608,0,4,223500
3,9550,19,1,13,0,1,1717,1,0,1,0,3,1,3,642,0,0,140000
4,14260,101,1,12,2,1,2198,1,0,2,1,4,1,3,836,0,4,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,100,1,12,2,1,1647,0,0,2,1,3,1,2,460,0,4,175000
1456,13175,79,1,9,1,1,2073,1,0,2,0,3,1,2,500,0,4,210000
1457,9042,44,1,5,4,1,2340,0,0,2,0,4,1,1,252,0,4,266500
1458,9717,51,3,8,1,1,1078,1,0,1,0,2,1,1,240,0,4,142125


In [139]:
X = data[data.columns[:-1]].values
y = data[data.columns[-1]].values

In [140]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

In [141]:
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
calculate_scores(y_test, y_pred)

R2: 0.768
MSE: 1778408511.546
RMSE: 42171.181


[0.768, 1778408511.546, 42171.181]

In [142]:
dt = DecisionTreeRegressor()
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
calculate_scores(y_test, y_pred)

R2: 0.65
MSE: 2681004176.185
RMSE: 51778.414


[0.65, 2681004176.185, 51778.414]

In [143]:
rf = RandomForestRegressor()
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
calculate_scores(y_test, y_pred)

R2: 0.851
MSE: 1139141967.35
RMSE: 33751.177


[0.851, 1139141967.35, 33751.177]

In [144]:
svm = SVR()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
calculate_scores(y_test, y_pred)

R2: -0.025
MSE: 7859309435.875
RMSE: 88652.746


[-0.025, 7859309435.875, 88652.746]

In [145]:
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
calculate_scores(y_test, y_pred)

R2: 0.583
MSE: 3199634873.375
RMSE: 56565.315


[0.583, 3199634873.375, 56565.315]

---

In [146]:
kf = KFold(n_splits=5)

In [147]:
results = []
for train, test in kf.split(X,y):
    rf = RandomForestRegressor()
    rf.fit(X[train], y[train])
    y_pred = rf.predict(X[test])
    rs = calculate_scores(y[test], y_pred)
    results.append(rs)
    print('\n')

R2: 0.868
MSE: 729213595.827
RMSE: 27003.955


R2: 0.727
MSE: 1791558726.458
RMSE: 42326.809


R2: 0.836
MSE: 1241730969.909
RMSE: 35238.203


R2: 0.843
MSE: 795576523.715
RMSE: 28205.966


R2: 0.757
MSE: 1630537253.697
RMSE: 40379.912




In [148]:
statistics.mean([x[0] for x in results]), statistics.mean([x[1] for x in results]), statistics.mean([x[2] for x in results])

(0.8062, 1237723413.9212, 34630.969)

In [149]:
st

NameError: name 'st' is not defined

In [162]:
train = pd.read_csv('house-prices-advanced-regression-techniques/train.csv')

In [164]:
train['MSZoning'].value_counts()

RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: MSZoning, dtype: int64

In [166]:
le = LabelEncoder()

In [167]:
train['MSZoning_le'] = le.fit_transform(train['MSZoning'])

In [168]:
train['MSZoning_le'].value_counts()

3    1151
4     218
1      65
2      16
0      10
Name: MSZoning_le, dtype: int64

In [169]:
le.classes_

array(['C (all)', 'FV', 'RH', 'RL', 'RM'], dtype=object)

---

In [175]:
pd.concat([data, pd.get_dummies(train['MSZoning'])], axis=1)

Unnamed: 0,LotArea,YearBuilt,RoofStyle,Exterior1st,Foundation,Heating,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,GarageCars,GarageArea,PoolArea,SaleCondition,SalePrice,C (all),FV,RH,RL,RM
0,8450,104,1,12,2,1,1710,1,0,2,1,3,1,2,548,0,4,208500,0,0,0,1,0
1,9600,77,1,8,1,1,1262,0,1,2,0,3,1,2,460,0,4,181500,0,0,0,1,0
2,11250,102,1,12,2,1,1786,1,0,2,1,3,1,2,608,0,4,223500,0,0,0,1,0
3,9550,19,1,13,0,1,1717,1,0,1,0,3,1,3,642,0,0,140000,0,0,0,1,0
4,14260,101,1,12,2,1,2198,1,0,2,1,4,1,3,836,0,4,250000,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,7917,100,1,12,2,1,1647,0,0,2,1,3,1,2,460,0,4,175000,0,0,0,1,0
1456,13175,79,1,9,1,1,2073,1,0,2,0,3,1,2,500,0,4,210000,0,0,0,1,0
1457,9042,44,1,5,4,1,2340,0,0,2,0,4,1,1,252,0,4,266500,0,0,0,1,0
1458,9717,51,3,8,1,1,1078,1,0,1,0,2,1,1,240,0,4,142125,0,0,0,1,0


In [171]:
train['MSZoning']

0       RL
1       RL
2       RL
3       RL
4       RL
        ..
1455    RL
1456    RL
1457    RL
1458    RL
1459    RL
Name: MSZoning, Length: 1460, dtype: object