In [1]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import seaborn as sns

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# import data
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# split labels from train data
labels = train['SalePrice']

In [3]:
# display 1000 columns for each dataframe
pd.set_option('display.max_columns', 1000)

In [6]:
# Find correlations between features and label
cor = train.corr()
correlation = cor['SalePrice'].sort_values()

EnclosedPorchArea      -0.146201
Kitchens               -0.109021
OverallRating          -0.083872
BuildingClass          -0.059458
BsmtHalfBaths          -0.039713
LowQualityFinishArea   -0.037948
YearSold               -0.023300
Misc                   -0.014690
ID                     -0.014099
BsmtFinish2Area        -0.007454
MonthSold               0.018922
PoolSize                0.022977
SeasonPorchArea         0.034537
Bedrooms                0.137496
ScreenPorchArea         0.138364
BsmtUnfinishedArea      0.158342
LotSize                 0.246017
BsmtFullBaths           0.275107
2ndFloorArea            0.280535
HalfBaths               0.305198
WoodDeckArea            0.306614
OpenPorchArea           0.309933
LotFrontage             0.336060
BsmtFinish1Area         0.434104
Fireplaces              0.475965
TotalRooms              0.493710
GarageYearBuilt         0.524158
YearRemodelled          0.536047
MasVnrArea              0.542733
FullBaths               0.544482
YearBuilt 

In [7]:
# appending train and test data so that we can clean them together
data = train.append(test)

In [62]:
# delete features that have more than 80% empty value
empty=data.isnull().sum().sort_values(ascending=False)
ratio=empty/len(data)
ratio = pd.DataFrame(ratio)
ratio.columns = ['empty ratio']
ratio = ratio[ratio['empty ratio']>0]
data = data.drop(ratio[ratio['empty ratio'] > 0.8].index,axis = 1)

In [63]:
# convert nan into none in time, type and quality features
feature1 = ["FireplaceQuality", "GarageCondition","GarageQuality", "GarageType",'BsmtExposure','BsmtFinishType2',\
            'BsmtFinishType1', "GarageYearBuilt",'GarageFinish', "MasVnrType",'BsmtCondition',"BsmtHeight"]
for i in feature1:
    data[i] = data[i].fillna('None')

In [64]:
# convert nan into 0 in features related to numbers
feature12 = ["GarageCars",'MasVnrArea','BsmtHalfBaths','BsmtFullBaths','GarageArea','BsmtFinish1Area',\
            'BsmtFinish2Area','BsmtUnfinishedArea','TotalBsmtArea']
for i in feature12:
    data[i] = data[i].fillna(0)

In [65]:
# convert nan into mean value in LotFrontage 
m = data['LotFrontage'].mean()
data['LotFrontage'] = data['LotFrontage'].fillna(m)

In [66]:
# drop features that are not closely correlated with sale price
d =['Foundation','MasVnrType','RoofType','Functional','Electrical','ProximityToMainRoad','TypeOfRoadAccess',\
'Shape','Flatness','HeatingType','SlopeOfProperty','Exterior2','GarageType','ProximityToMainRoad2',\
'Lot Config','SaleType','RoofMaterial','Exterior1','GarageFinish','ID']
data.drop(d, axis = 1, inplace = True)

In [73]:
# map categorical feature values to numerical values manually for special feature values
data['Paved Drive'] = data['Paved Drive'].map({'N':1, 'P':2, 'Y':3})
data['BsmtExposure'] = data['BsmtExposure'].map({'None':1, 'No':2, 'Mn':3, 'Av':4, 'Gd':5})
data['KitchenQuality'] = data['KitchenQuality'].map({'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5})

data['Neighborhood'] = data['Neighborhood'].map({'MeadowV':1,'IDOTRR':2, 'BrDale':2,'OldTown':3, 'Edwards':3, 'BrkSide':3,\
                                'Sawyer':4, 'Blueste':4, 'SWISU':4, 'NAmes':4,'NPkVill':5, 'Mitchel':5,\
                                'SawyerW':6, 'Gilbert':6, 'NWAmes':6,'Blmngtn':7, 'CollgCr':7, 'ClearCr':7, \
                                'Crawfor':7,'Veenker':8, 'Somerst':8, 'Timber':8,'Greens':8,'StoneBr':9,'NoRidge':10, 'NridgHt':10,\
                                 'GrnHill':11,'Landmrk':12})

data['HeatingQuality'] = data['HeatingQuality'].map({'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5})

data['ExteriorQual'] = data['ExteriorQual'].map({'Fa':1, 'TA':2, 'Gd':3, 'Ex':4})
data['UtilitiesAvailable'] = data['UtilitiesAvailable'].map({'NoSewr':1, 'NoSeWa':1, 'AllPub':2})
data['FireplaceQuality'] = data['FireplaceQuality'].map({'None':1, 'Po':2, 'Fa':3, 'TA':4, 'Gd':5, 'Ex':6})
data['BsmtCondition'] = data['BsmtCondition'].map({'None':1, 'Po':2, 'Fa':3, 'TA':4, 'Gd':5, 'Ex':6})
data['BsmtHeight'] = data['BsmtHeight'].map({'None':1, 'Po':2, 'Fa':3, 'TA':4, 'Gd':5, 'Ex':6})
data['BsmtFinishType1'] = data['BsmtFinishType1'].map({'None':1, 'Unf':2, 'LwQ':3, 'Rec':4, 'BLQ':5, 'ALQ':6, 'GLQ':7})
data['BsmtFinishType2'] = data['BsmtFinishType2'].map({'None':1, 'Unf':2, 'LwQ':3, 'Rec':4, 'BLQ':5, 'ALQ':6, 'GLQ':7})
data['ExteriorCond'] = data['ExteriorCond'].map({'None':1, 'Po':2, 'Fa':3, 'TA':4, 'Gd':5, 'Ex':6})
data['GarageCondition'] = data['GarageCondition'].map({'None':1, 'Po':2, 'Fa':3, 'TA':4, 'Gd':5, 'Ex':6})
data['ZoningClass'] = data['ZoningClass'].map({'C (all)':1,'A (agr)':1,'I (all)':1, 'RM':2, 'RH':2, 'RL':3, 'FV':4})

data['HouseStyle'] = data['HouseStyle'].map({'1.5Unf':1,'1.5Fin':2, '2.5Unf':2, 'SFoyer':2, '1Story':3, 'SLvl':3,\
                                           '2Story':4, '2.5Fin':4})
data['GarageQuality'] = data['GarageQuality'].map({'None':1, 'Po':2, 'Fa':3, 'TA':4, 'Gd':5, 'Ex':6})
data['Central Air'] = data['Central Air'].map({'N':1,'Y':2})
    
data['SaleCondition'] = data['SaleCondition'].map({'Abnorml':1, 'Partial':2, 'Normal':3, 'Family':4, 'Alloca':5, 'AdjLand':6})
data['HouseType'] = data['HouseType'].map({'1Fam':1, 'Partial':2, 'TwnhsE':3, 'Duplex':4, 'Twnhs':5, '2fmCon':6})


In [74]:
# map categorical feature values to numerical values manually for other feature values
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in ['YearSold','MonthSold','YearBuilt','YearRemodelled','GarageYearBuilt']:
    data[i] = data[i].astype(str)
    data[i] = le.fit_transform(data[i])

In [75]:
# split train and test data, then split train data again to get new train and validation data
train_data = data.iloc[0:2051]
test_data = data.iloc[2051:]
train_label = train_data['SalePrice']
train_data_sub, val_data, train_label_sub, val_label = train_test_split(train_data,train_label,test_size=0.3)
train_data_sub.drop(['SalePrice'], axis = 1, inplace = True)
val_data.drop(['SalePrice'], axis = 1, inplace = True)
test_data.drop(['SalePrice'], axis = 1, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [80]:
# linear regression
from sklearn import linear_model
li = linear_model.LinearRegression()
li.fit(train_data_sub,train_label_sub)
y_predicted = li.predict(val_data)
li.score(val_data,val_label)

0.8649153416348592

In [88]:
# lasso regression
la = linear_model.Lasso()
la.fit(train_data_sub,train_label_sub)
y_predicted = la.predict(val_data)
la.score(val_data,val_label)

  model = cd_fast.enet_coordinate_descent(


0.8648587867353876

In [87]:
# elastic net regression
en = linear_model.ElasticNet()
en.fit(train_data_sub,train_label_sub)
y_predicted = en.predict(val_data)
en.score(val_data,val_label)

  model = cd_fast.enet_coordinate_descent(


0.8568834099250598

In [84]:
# random forest regression
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()
rf.fit(train_data_sub,train_label_sub)
y_predicted = rf.predict(val_data)
rf.score(val_data,val_label)

0.8921713716674735

In [134]:
rf = RandomForestRegressor(max_features=20,max_depth=20,ccp_alpha=0.9)
rf.fit(train_data_sub,train_label_sub)
y_predicted = rf.predict(val_data)
rf.score(val_data,val_label)

0.8971621465343372

In [84]:
# create submission file
sub = pd.DataFrame()
test_ID = test['ID']
sub['Id'] = test_ID
sub['SalePrice'] = y_predicted
sub.to_csv('baseline_submission.csv',index=False)