In [85]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb
import sys
import sweetviz as sv
import sklearn

In [86]:
!{sys.executable} -m pip install sklearn



In [123]:
# Import data from csv
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")

all_data = pd.concat([train_data,test_data],axis=0)
original_data = all_data.copy()

In [124]:
# Missing numerical data
display(all_data.select_dtypes(exclude=['object']).isnull().sum())

# Missing categorical data
display(all_data.select_dtypes(include=['object']).isnull().sum())

Id                  0
MSSubClass          0
LotFrontage       486
LotArea             0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
MasVnrArea         23
BsmtFinSF1          1
BsmtFinSF2          1
BsmtUnfSF           1
TotalBsmtSF         1
1stFlrSF            0
2ndFlrSF            0
LowQualFinSF        0
GrLivArea           0
BsmtFullBath        2
BsmtHalfBath        2
FullBath            0
HalfBath            0
BedroomAbvGr        0
KitchenAbvGr        0
TotRmsAbvGrd        0
Fireplaces          0
GarageYrBlt       159
GarageCars          1
GarageArea          1
WoodDeckSF          0
OpenPorchSF         0
EnclosedPorch       0
3SsnPorch           0
ScreenPorch         0
PoolArea            0
MiscVal             0
MoSold              0
YrSold              0
SalePrice        1459
dtype: int64

MSZoning            4
Street              0
Alley            2721
LotShape            0
LandContour         0
Utilities           2
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
RoofStyle           0
RoofMatl            0
Exterior1st         1
Exterior2nd         1
MasVnrType         24
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           81
BsmtCond           82
BsmtExposure       82
BsmtFinType1       79
BsmtFinType2       80
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
KitchenQual         1
Functional          2
FireplaceQu      1420
GarageType        157
GarageFinish      159
GarageQual        159
GarageCond        159
PavedDrive          0
PoolQC           2909
Fence            2348
MiscFeature      2814
SaleType            1
SaleCondition       0
dtype: int64

In [125]:
corr = train_data.corr()

In [126]:
# MasVnrArea null values

MVA_null = all_data[all_data['MasVnrArea'].isnull()]
MVT_null = all_data[all_data['MasVnrType'].isnull()]

# 23 out of 24 MasVnrType null values correspond to MasVnrArea null values
display(MVT_null["MasVnrArea"])

234       NaN
529       NaN
650       NaN
936       NaN
973       NaN
977       NaN
1243      NaN
1278      NaN
231       NaN
246       NaN
422       NaN
532       NaN
544       NaN
581       NaN
851       NaN
865       NaN
880       NaN
889       NaN
908       NaN
1132      NaN
1150    198.0
1197      NaN
1226      NaN
1402      NaN
Name: MasVnrArea, dtype: float64

In [127]:
# get ID for mismatched row
display(MVT_null[MVT_null["MasVnrArea"].isnull() == False]["Id"])

1150    2611
Name: Id, dtype: int64

In [128]:
# change MasVnrType for entry 2611 to second most frequent, as mode is "None"
MVT_mode = all_data[all_data["MasVnrType"] != "None"]["MasVnrType"].mode()[0]
all_data.loc[(all_data["Id"] == 2611),'MasVnrType'] = MVT_mode

# examine MasVnrArea values for houses with MasVnrType "None"

all_data[all_data['MasVnrType'] == "None"]["MasVnrArea"].value_counts()



0.0      1735
1.0         3
288.0       1
344.0       1
312.0       1
285.0       1
Name: MasVnrArea, dtype: int64

In [129]:
# Most are zero as expected, but there are some nonzero values. 
all_data[(all_data['MasVnrType'] == "None") & (all_data["MasVnrArea"] != 0)]

# Replace MasVnrType to second most frequent
all_data.loc[((all_data['MasVnrType'] == "None") & (all_data["MasVnrArea"] != 0)),'MasVnrType'] = MVT_mode

In [130]:
# Set null MasVnrType to "None" and corresponding MasVnrArea to 0
all_data["MasVnrType"].fillna("None", inplace = True)
all_data["MasVnrArea"].fillna(0, inplace = True)

In [131]:
# Missing numerical data
display(all_data.select_dtypes(exclude=['object']).isnull().sum())

# Missing categorical data
display(all_data.select_dtypes(include=['object']).isnull().sum())

Id                  0
MSSubClass          0
LotFrontage       486
LotArea             0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
MasVnrArea          0
BsmtFinSF1          1
BsmtFinSF2          1
BsmtUnfSF           1
TotalBsmtSF         1
1stFlrSF            0
2ndFlrSF            0
LowQualFinSF        0
GrLivArea           0
BsmtFullBath        2
BsmtHalfBath        2
FullBath            0
HalfBath            0
BedroomAbvGr        0
KitchenAbvGr        0
TotRmsAbvGrd        0
Fireplaces          0
GarageYrBlt       159
GarageCars          1
GarageArea          1
WoodDeckSF          0
OpenPorchSF         0
EnclosedPorch       0
3SsnPorch           0
ScreenPorch         0
PoolArea            0
MiscVal             0
MoSold              0
YrSold              0
SalePrice        1459
dtype: int64

MSZoning            4
Street              0
Alley            2721
LotShape            0
LandContour         0
Utilities           2
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
RoofStyle           0
RoofMatl            0
Exterior1st         1
Exterior2nd         1
MasVnrType          0
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           81
BsmtCond           82
BsmtExposure       82
BsmtFinType1       79
BsmtFinType2       80
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
KitchenQual         1
Functional          2
FireplaceQu      1420
GarageType        157
GarageFinish      159
GarageQual        159
GarageCond        159
PavedDrive          0
PoolQC           2909
Fence            2348
MiscFeature      2814
SaleType            1
SaleCondition       0
dtype: int64

In [132]:
#GarageYrBlt null values

GYT_null = all_data[all_data["GarageYrBlt"].isnull()]
GYT_not_null = all_data[all_data["GarageYrBlt"].isnull()==False]

#GarageYrBlt null values correspond almost exactly to null values of GarageType, GarageFinish, GarageQual, GarageCond\

all_data[all_data["GarageYrBlt"].isnull() & all_data["GarageType"].isnull() & all_data["GarageFinish"].isnull() & all_data["GarageQual"].isnull() & all_data["GarageCond"].isnull()] 

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
39,40,90,RL,65.0,6040,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2008,WD,AdjLand,82000.0
48,49,190,RM,33.0,4456,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2009,New,Partial,113000.0
78,79,90,RL,72.0,10778,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2010,WD,Normal,136500.0
88,89,50,C (all),105.0,8470,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,,0,10,2009,ConLD,Abnorml,85000.0
89,90,20,RL,60.0,8070,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,123600.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1433,2894,50,C (all),60.0,8520,Grvl,,Reg,Bnk,AllPub,...,0,,,,0,4,2006,WD,Normal,
1449,2910,180,RM,21.0,1470,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2006,WD,Normal,
1453,2914,160,RM,21.0,1526,Pave,,Reg,Lvl,AllPub,...,0,,GdPrv,,0,6,2006,WD,Normal,
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,,,,0,6,2006,WD,Normal,


In [133]:
# Missing numerical data
display(all_data.select_dtypes(exclude=['object']).isnull().sum())

# Missing categorical data
display(all_data.select_dtypes(include=['object']).isnull().sum())

Id                  0
MSSubClass          0
LotFrontage       486
LotArea             0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
MasVnrArea          0
BsmtFinSF1          1
BsmtFinSF2          1
BsmtUnfSF           1
TotalBsmtSF         1
1stFlrSF            0
2ndFlrSF            0
LowQualFinSF        0
GrLivArea           0
BsmtFullBath        2
BsmtHalfBath        2
FullBath            0
HalfBath            0
BedroomAbvGr        0
KitchenAbvGr        0
TotRmsAbvGrd        0
Fireplaces          0
GarageYrBlt       159
GarageCars          1
GarageArea          1
WoodDeckSF          0
OpenPorchSF         0
EnclosedPorch       0
3SsnPorch           0
ScreenPorch         0
PoolArea            0
MiscVal             0
MoSold              0
YrSold              0
SalePrice        1459
dtype: int64

MSZoning            4
Street              0
Alley            2721
LotShape            0
LandContour         0
Utilities           2
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
RoofStyle           0
RoofMatl            0
Exterior1st         1
Exterior2nd         1
MasVnrType          0
ExterQual           0
ExterCond           0
Foundation          0
BsmtQual           81
BsmtCond           82
BsmtExposure       82
BsmtFinType1       79
BsmtFinType2       80
Heating             0
HeatingQC           0
CentralAir          0
Electrical          1
KitchenQual         1
Functional          2
FireplaceQu      1420
GarageType        157
GarageFinish      159
GarageQual        159
GarageCond        159
PavedDrive          0
PoolQC           2909
Fence            2348
MiscFeature      2814
SaleType            1
SaleCondition       0
dtype: int64

In [134]:
# Examine the mismatched rows
display(all_data[(all_data["GarageFinish"].isnull()) & (all_data["GarageType"].isnull() == False)][["Id","GarageArea","GarageCars","GarageType","GarageFinish","GarageQual","GarageCond"]])

# Clean up these entries
garage_columns = ["Id","GarageArea","GarageCars","GarageType","GarageFinish","GarageQual","GarageCond"]
Detchd_GF_mode = all_data[all_data["GarageType"] == "Detchd"]["GarageFinish"].mode()[0]
Detchd_GQ_mode = all_data[all_data["GarageType"] == "Detchd"]["GarageQual"].mode()[0]
Detchd_GCo_mode = all_data[all_data["GarageType"] == "Detchd"]["GarageCond"].mode()[0]
Detchd_GA_mode = all_data[all_data["GarageType"] == "Detchd"]["GarageArea"].mode()[0]
Detchd_GCa_mode = all_data[all_data["GarageType"] == "Detchd"]["GarageCars"].mode()[0]
all_data.loc[(all_data["Id"].isin((2127,2577))),"GarageFinish"] = Detchd_GF_mode
all_data.loc[(all_data["Id"].isin((2127,2577))),"GarageQual"] = Detchd_GQ_mode
all_data.loc[(all_data["Id"].isin((2127,2577))),"GarageCond"] = Detchd_GCo_mode
all_data.loc[(all_data["Id"] == 2577),"GarageArea"] = Detchd_GA_mode
all_data.loc[(all_data["Id"] == 2577),"GarageCars"] = Detchd_GCa_mode

# # Re-examine the mismatched rows
all_data.loc[(all_data["Id"].isin((2127,2577))), garage_columns]

Unnamed: 0,Id,GarageArea,GarageCars,GarageType,GarageFinish,GarageQual,GarageCond
666,2127,360.0,1.0,Detchd,,,
1116,2577,,,Detchd,,,


Unnamed: 0,Id,GarageArea,GarageCars,GarageType,GarageFinish,GarageQual,GarageCond
666,2127,360.0,1.0,Detchd,Unf,TA,TA
1116,2577,576.0,2.0,Detchd,Unf,TA,TA


In [135]:
# GarageYrBlt is highly correlated with YrBlt. 
corr["GarageYrBlt"]["YearBuilt"]

# Drop GarageYrBlt
all_data = all_data.drop("GarageYrBlt", axis=1)

# GarageCars and GarageArea are highly correlated and have similar correlation with target. 
# Drop GarageCars
all_data = all_data.drop("GarageCars", axis=1)

In [136]:
# Set null values for Alley, PoolQC, Fence, MiscFeature, GarageType, GarageFinish, GarageQual, GarageCond, FireplaceQu to NA
all_data["Alley"].fillna("NA", inplace = True)
all_data["PoolQC"].fillna("NA", inplace = True)
all_data["Fence"].fillna("NA", inplace = True)
all_data["MiscFeature"].fillna("NA", inplace = True)
all_data["GarageType"].fillna("NA", inplace = True)
all_data["GarageFinish"].fillna("NA", inplace = True)
all_data["GarageQual"].fillna("NA", inplace = True)
all_data["GarageCond"].fillna("NA", inplace = True)
all_data["FireplaceQu"].fillna("NA", inplace = True)

In [137]:
# Assume basement related null vales have no basement
all_data["BsmtQual"].fillna("Na", inplace = True)
all_data["BsmtCond"].fillna("Na", inplace = True)
all_data["BsmtExposure"].fillna("Na", inplace = True)
all_data["BsmtFinType1"].fillna("Na", inplace = True)
all_data["BsmtFinType2"].fillna("Na", inplace = True)

all_data["BsmtFinSF1"].fillna(0, inplace = True)
all_data["BsmtFinSF2"].fillna(0, inplace = True)
all_data["BsmtUnfSF"].fillna(0, inplace = True)
all_data["TotalBsmtSF"].fillna(0, inplace = True)
all_data["BsmtFullBath"].fillna(0, inplace = True)
all_data["BsmtHalfBath"].fillna(0, inplace = True)

In [138]:
# Missing numerical data
display(all_data.select_dtypes(exclude=['object']).isnull().sum())

# Missing categorical data
display(all_data.select_dtypes(include=['object']).isnull().sum())

Id                  0
MSSubClass          0
LotFrontage       486
LotArea             0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
MasVnrArea          0
BsmtFinSF1          0
BsmtFinSF2          0
BsmtUnfSF           0
TotalBsmtSF         0
1stFlrSF            0
2ndFlrSF            0
LowQualFinSF        0
GrLivArea           0
BsmtFullBath        0
BsmtHalfBath        0
FullBath            0
HalfBath            0
BedroomAbvGr        0
KitchenAbvGr        0
TotRmsAbvGrd        0
Fireplaces          0
GarageArea          0
WoodDeckSF          0
OpenPorchSF         0
EnclosedPorch       0
3SsnPorch           0
ScreenPorch         0
PoolArea            0
MiscVal             0
MoSold              0
YrSold              0
SalePrice        1459
dtype: int64

MSZoning         4
Street           0
Alley            0
LotShape         0
LandContour      0
Utilities        2
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
RoofStyle        0
RoofMatl         0
Exterior1st      1
Exterior2nd      1
MasVnrType       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinType2     0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       1
KitchenQual      1
Functional       2
FireplaceQu      0
GarageType       0
GarageFinish     0
GarageQual       0
GarageCond       0
PavedDrive       0
PoolQC           0
Fence            0
MiscFeature      0
SaleType         1
SaleCondition    0
dtype: int64

In [139]:
# Set all other categorical nulls to mode
all_data["MSZoning"].fillna(all_data["MSZoning"].mode()[0], inplace=True)
all_data["Utilities"].fillna(all_data["Utilities"].mode()[0], inplace=True)
all_data["Exterior1st"].fillna(all_data["Exterior1st"].mode()[0], inplace=True)
all_data["Exterior2nd"].fillna(all_data["Exterior2nd"].mode()[0], inplace=True)
all_data["Electrical"].fillna(all_data["Electrical"].mode()[0], inplace=True)
all_data["KitchenQual"].fillna(all_data["KitchenQual"].mode()[0], inplace=True)
all_data["Functional"].fillna(all_data["Functional"].mode()[0], inplace=True)
all_data["SaleType"].fillna(all_data["SaleType"].mode()[0], inplace=True)

In [140]:
# Examine 486 LotFrontage Null values
LF_null = all_data[all_data['LotFrontage'].isnull()]
LF_not_null = all_data[all_data['LotFrontage'].isnull() == False]

# Means of LotFrontage grouped by LotConfig
LF_not_null.groupby('LotConfig')['LotFrontage'].mean()

LotConfig
Corner     82.690418
CulDSac    56.775281
FR2        61.276923
FR3        79.300000
Inside     67.205693
Name: LotFrontage, dtype: float64

In [141]:
# Only 13 out of 486 null values have alleys attached
LF_null["Alley"].value_counts()

NA      473
Pave      7
Grvl      6
Name: Alley, dtype: int64

In [142]:
# Most are single family homes
LF_null.groupby('BldgType').size()

BldgType
1Fam      423
2fmCon      3
Duplex     17
Twnhs       5
TwnhsE     38
dtype: int64

In [143]:
# Most are Residential Low Density
LF_null.groupby('MSZoning').size()

# Indicates that they should have nonzero lot frontage

MSZoning
C (all)      2
FV          20
RH           3
RL         427
RM          34
dtype: int64

In [144]:
# Impute missing LotFrontage values using Regression on MSSubClass, MSZoning, LotArea, LotShape, LotConfig
from sklearn.linear_model import LinearRegression

LF_train = LF_not_null[['LotFrontage', 'MSZoning', 'LotArea', 'LotShape', 'LotConfig']]
LF_train = pd.get_dummies(LF_train, drop_first=True)
LF_train_x = LF_train.drop(['LotFrontage'], axis=1)
LF_train_x.insert(8, "LotConfig_Corner", [0]*LF_train_x.shape[0])
LF_train_x.insert(5, "LotShape_IR1", [0]*LF_train_x.shape[0])
LF_train_x.insert(5, "MSZoning_C (all)", [0]*LF_train_x.shape[0])
LF_train_y = LF_train[['LotFrontage']]
LF_test = LF_null[['MSZoning', 'LotArea', 'LotShape', 'LotConfig']]
LF_test = pd.get_dummies(LF_test)
LF_features = list(LF_train_x.columns)
model = LinearRegression()
model.fit(LF_train_x,LF_train_y)
print(list(zip(LF_features ,np.around(model.coef_[0],2))))
LF_imputed_values = model.predict(LF_test)
pointer = 0
for i in range(len(all_data)):
    if pd.isnull(all_data.iloc[i]["LotFrontage"]):
        all_data.LotFrontage.iloc[i]  = LF_imputed_values[pointer][0]
        pointer += 1


[('LotArea', 0.0), ('MSZoning_FV', -0.71), ('MSZoning_RH', -6.09), ('MSZoning_RL', 8.1), ('MSZoning_RM', -7.7), ('MSZoning_C (all)', 0.0), ('LotShape_IR1', -0.0), ('LotShape_IR2', -4.23), ('LotShape_IR3', 0.98), ('LotShape_Reg', -3.49), ('LotConfig_Corner', 0.0), ('LotConfig_CulDSac', -37.47), ('LotConfig_FR2', -21.35), ('LotConfig_FR3', 2.59), ('LotConfig_Inside', -13.81)]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [145]:
# Missing numerical data
display(all_data.select_dtypes(exclude=['object']).isnull().sum())

# Missing categorical data
display(all_data.select_dtypes(include=['object']).isnull().sum())

Id                  0
MSSubClass          0
LotFrontage         0
LotArea             0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
MasVnrArea          0
BsmtFinSF1          0
BsmtFinSF2          0
BsmtUnfSF           0
TotalBsmtSF         0
1stFlrSF            0
2ndFlrSF            0
LowQualFinSF        0
GrLivArea           0
BsmtFullBath        0
BsmtHalfBath        0
FullBath            0
HalfBath            0
BedroomAbvGr        0
KitchenAbvGr        0
TotRmsAbvGrd        0
Fireplaces          0
GarageArea          0
WoodDeckSF          0
OpenPorchSF         0
EnclosedPorch       0
3SsnPorch           0
ScreenPorch         0
PoolArea            0
MiscVal             0
MoSold              0
YrSold              0
SalePrice        1459
dtype: int64

MSZoning         0
Street           0
Alley            0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinType2     0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
KitchenQual      0
Functional       0
FireplaceQu      0
GarageType       0
GarageFinish     0
GarageQual       0
GarageCond       0
PavedDrive       0
PoolQC           0
Fence            0
MiscFeature      0
SaleType         0
SaleCondition    0
dtype: int64

In [146]:
all_data = pd.get_dummies(all_data)
all_data.drop(['Id'],axis=1,inplace=True)


In [147]:
train_df = all_data.iloc[:len(train_data),:]
test_df = all_data.iloc[len(train_data):,:]
test_df = test_df.drop(["SalePrice"],axis=1)

In [148]:
from sklearn.model_selection import train_test_split
all_train_x = train_df.drop(["SalePrice"],axis=1)
all_train_y = train_df["SalePrice"]
x_train, x_test, y_train, y_test = train_test_split(all_train_x, all_train_y, test_size=0.2, random_state=0)

In [149]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train[x_train.columns] = scaler.fit_transform(x_train)
x_test[x_test.columns] = scaler.transform(x_test)
y_train = y_train.astype(int)
y_test = y_test.astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[col] = igetitem(value, i)


In [150]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(x_train, y_train)

feature_imp = pd.DataFrame(model.feature_importances_, index=all_train_x.columns, columns=["importance"])
features = feature_imp.sort_values("importance", ascending=False).head(20).index
features = list(features)



In [151]:
# GrLivArea is correlated with TotRmsAbvGr and 2ndFlrSF
features.remove("TotRmsAbvGrd")
features.remove("2ndFlrSF")

# 1stFlSF is correlated with TotalBsmtSF
features.remove("TotalBsmtSF")


In [152]:
features

['1stFlrSF',
 'LotArea',
 'GrLivArea',
 'GarageArea',
 'BsmtUnfSF',
 'LotFrontage',
 'YearBuilt',
 'MoSold',
 'BsmtFinSF1',
 'YearRemodAdd',
 'OpenPorchSF',
 'WoodDeckSF',
 'YrSold',
 'MasVnrArea',
 'OverallQual',
 'BedroomAbvGr',
 'OverallCond']

In [153]:
x_train_1 = x_train[features]
x_test_1 = x_test[features]


In [154]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
model = LinearRegression()
model.fit(x_train_1,y_train)
coeffs = pd.DataFrame(model.coef_, x_train_1.columns, columns=['Coefficient'])
train_preds = model.predict(x_train_1)
test_preds = model.predict(x_test_1)
print('train error is ', model.score(x_train_1,y_train))
print('test error is ', model.score(x_test_1,y_test))

print('Root MSE:', np.sqrt(mean_squared_error(train_preds,y_train)))
print('Root MSE:', np.sqrt(mean_squared_error(test_preds,y_test)))

train error is  0.8346364059550262
test error is  0.6180577387376862
Root MSE: 31908.260137923375
Root MSE: 51357.92004345377


In [155]:
test_df[test_df.columns] =  scaler.transform(test_df)


In [160]:
test_df = test_df[features]
final_test_preds = model.predict(test_df)


In [170]:
final_test_preds = final_test_preds.round(decimals=2)


In [171]:
final_test_preds

array([128510.62, 179484.05, 181088.79, ..., 187678.47, 110384.45,
       240336.94])

In [172]:
result_df = pd.DataFrame(list(zip([i for i in range(1461,2920)],final_test_preds)),columns=['Id','SalePrice'])


In [177]:
result_df.to_csv(r'C:\Users\tdevanur\Desktop\results.csv', index = False, header=True)