# Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random
import warnings
warnings.filterwarnings('ignore')
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV 
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from scipy.stats import norm, skew

# Loading data

In [None]:
# Read the train and test CSV files from the current working directory.
train, test = (pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv"))

In [None]:
# Stack train and test so that cleaning and encoding are applied to both at once.
# ignore_index=True gives every row a unique label; without it the rows coming
# from `test` repeat the labels already used by `train`, which makes any
# label-based alignment on the combined frame ambiguous.
# Rows from `test` are told apart later by their missing SalePrice.
df = pd.concat([train, test], ignore_index=True)

In [None]:
# Preview the first rows of the combined train+test frame.
df.head()

In [None]:
# Column dtypes and non-null counts of the combined frame (shows which columns have NAs).
df.info()

# EDA 

Checking the skewness of data

In [None]:
# Skewness and kurtosis of the untransformed target.
print("Skewness: %f" % train.SalePrice.skew())
print("Kurtosis: %f" % train.SalePrice.kurt())

In [None]:
# Histogram/KDE of the target with a fitted normal curve overlaid for comparison.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
sns.distplot(train['SalePrice'], fit=norm, color='purple')
plt.xticks(rotation=90)
plt.show()

The data is right-skewed which may affect our model.
More info: https://towardsdatascience.com/skewed-data-a-problem-to-your-statistical-model-9a6b5bb74e37

Making log transformation to achieve normal distribution

In [None]:
# Replace SalePrice by its natural log in both the train frame and the combined frame.
# NOTE: not idempotent - re-running this cell applies the log a second time.
for frame in (train, df):
    frame["SalePrice"] = np.log(frame["SalePrice"])

In [None]:
# Same distribution plot and statistics as before, now on the log-transformed target.
sns.distplot(train['SalePrice'], fit=norm, color='purple')
plt.show()
print("Skewness: %f" % train.SalePrice.skew())
print("Kurtosis: %f" % train.SalePrice.kurt())

In [None]:
# SalePrice correlation matrix: the ten numeric columns most correlated with
# SalePrice (SalePrice itself included), ordered from weakest to strongest.

# Restrict to numeric columns explicitly: recent pandas no longer drops
# non-numeric columns silently in DataFrame.corr() and raises instead.
numeric_corr = train.select_dtypes(include=np.number).corr()
cols = numeric_corr.nlargest(10, 'SalePrice')['SalePrice'].index
cols = np.array(list(reversed(cols)))
cm = train[cols].corr()
# Mask the upper triangle (diagonal included) so each pair is shown once.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
mask = np.triu(np.ones_like(cm, dtype=bool))

f, ax = plt.subplots(figsize=(12, 9))
g1 = sns.heatmap(cm,cmap='mako_r',fmt='.2f', annot=True, mask=mask)
g1.set_xticklabels(g1.get_xticklabels(),rotation=70,fontsize=10)
g1.set_yticklabels(g1.get_yticklabels(),rotation=15,fontsize=10)
plt.title("Triangle Correlation Plot of Features")
plt.show()

Conclusions:
1. All of those features have influence on Sale Price
2. Pairs: GrLivArea and TotRmsAbvGrd, TotalBsmtSF and 1stFlrSF, GarageCars and GarageArea are highly correlated with each other
3. The heatmap does not include categorical data, which we will look at in the next steps

In [None]:
# Pairwise scatter plots of the two correlated living-area features against SalePrice.
sns.pairplot(train[["GrLivArea","TotRmsAbvGrd","SalePrice"]])
plt.show()

GrLivArea seems to explain SalePrice slightly better than TotRmsAbvGrd, which is why we will consider reducing the number of features by dropping TotRmsAbvGrd.

In [None]:
# Pairwise scatter plots of the two correlated basement/first-floor area features against SalePrice.
sns.pairplot(train[["TotalBsmtSF","1stFlrSF","SalePrice"]])
plt.show()

In [None]:
# Pairwise scatter plots of the two correlated garage-size features against SalePrice.
sns.pairplot(train[["GarageCars","GarageArea","SalePrice"]])
plt.show()

Similarly to the situation with GrLivArea, the GarageArea feature will be excluded.

# Feature Engineering

In [None]:
# Non-null counts per column of the combined frame, used to list the features with NAs below.
df.info()

# List of features containing NA's

1. MSZoning, 
2. LotFrontage, 
3. Alley (more than 2000), 
4. Utilities, 
5. Exterior1st, 
6. Exterior2nd, 
7. MasVnrType, 
8. MasVnrArea, 
9. BsmtQual, 
10. BsmtCond, 
11. BsmtExposure, 
12. BsmtFinType1, 
13. BsmtFinSF1, 
14. BsmtFinType2, 
15. BsmtFinSF2, 
16. BsmtUnfSF, 
17. TotalBsmtSF, 
18. Electrical, 
19. BsmtFullBath, 
20. BsmtHalfBath, 
21. KitchenQual, 
22. Functional, 
23. FireplaceQu (more than 1000), 
24. GarageType,
25. GarageYrBlt,
26. GarageFinish,
27. GarageCars,
28. GarageArea,
29. GarageQual,
30. GarageCond,
31. PoolQC(more than 2000), 
32. Fence(more than 2000), 
33. MiscFeature (more than 2000), 
34. SaleType

# Handling NAs

In [None]:
# MSZoning: fill missing values with the most frequent observed zone.
# The previous single random draw changed on every run and could pick a code that
# never occurs in the data. Assigning the whole column also avoids chained
# assignment (`df.col[mask] = ...`), which is not guaranteed to write back to df.
df["MSZoning"] = df["MSZoning"].fillna(df["MSZoning"].mode().iloc[0])

In [None]:
# LotFrontage: fill missing values with the column median (computed over train+test).
# Assigning the whole column avoids chained assignment (`df.col[mask] = ...`),
# which is not guaranteed to write back to df.
df["LotFrontage"] = df["LotFrontage"].fillna(df["LotFrontage"].median())

In [None]:
# Alley: a missing value is recoded as the explicit category 'None'.
# Assigning the whole column avoids chained assignment (`df.col[mask] = ...`),
# which is not guaranteed to write back to df.
df["Alley"] = df["Alley"].fillna("None")

In [None]:
# Utilities: fill missing values with the most frequent observed category.
# This replaces an unseeded random draw (different on every run) and avoids
# chained assignment, which is not guaranteed to write back to df.
df["Utilities"] = df["Utilities"].fillna(df["Utilities"].mode().iloc[0])

In [None]:
# Exterior1st / Exterior2nd: fill missing values with each column's most frequent
# observed category. The previous random draw was unseeded and its candidate list
# contained the misspelt code 'Plywodd', so it could add a category that does not
# exist in the data. Whole-column assignment also avoids chained assignment.
for col in ["Exterior1st", "Exterior2nd"]:
    df[col] = df[col].fillna(df[col].mode().iloc[0])


In [None]:
# Masonry veneer: most frequent type for the categorical column, median for the area.
# This replaces an unseeded random draw and avoids chained assignment
# (`df.col[mask] = ...`), which is not guaranteed to write back to df.
df["MasVnrType"] = df["MasVnrType"].fillna(df["MasVnrType"].mode().iloc[0])
df["MasVnrArea"] = df["MasVnrArea"].fillna(df["MasVnrArea"].median())

In [None]:
# Houses where every basement descriptor is missing are treated as having no
# basement: all five descriptors are set to the explicit category 'None'.
# The mask is computed ONCE, before any assignment. Previously it was recomputed
# for each column, so after BsmtQual had been filled the condition
# "BsmtQual is NA" no longer held and the other four columns were never updated.
bsmt_cat_cols = ["BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2"]
no_basement = df[bsmt_cat_cols].isna().all(axis=1)
df.loc[no_basement, bsmt_cat_cols] = "None"

In [None]:
# Remaining basement NAs (houses that do have some basement information).
# Categorical descriptors: most frequent real category. 'None' is excluded from the
# candidates so a partially described basement is not labelled "no basement".
# This replaces an unseeded random draw and avoids chained assignment.
for col in ["BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2"]:
    observed = df.loc[df[col] != "None", col].dropna()
    if not observed.empty:
        df[col] = df[col].fillna(observed.mode().iloc[0])

# Numeric basement areas: column median.
for col in ["BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF"]:
    df[col] = df[col].fillna(df[col].median())

In [None]:
# Electrical: fill missing values with the most frequent observed category.
# This replaces an unseeded random draw and avoids chained assignment.
df["Electrical"] = df["Electrical"].fillna(df["Electrical"].mode().iloc[0])

In [None]:
# Basement bathroom counts: fill missing values with the column median.
# Whole-column assignment avoids chained assignment (`df.col[mask] = ...`),
# which is not guaranteed to write back to df.
for col in ["BsmtFullBath", "BsmtHalfBath"]:
    df[col] = df[col].fillna(df[col].median())

In [None]:
# KitchenQual / Functional: fill missing values with each column's most frequent
# observed category. This replaces an unseeded random draw and avoids chained assignment.
for col in ["KitchenQual", "Functional"]:
    df[col] = df[col].fillna(df[col].mode().iloc[0])

In [None]:
# FireplaceQu: a missing quality on a house with zero fireplaces becomes 'None'.
# Rows with a fireplace but no recorded quality are left untouched, as before.
# .loc writes directly into df instead of relying on chained assignment.
no_fireplace = df["FireplaceQu"].isna() & (df["Fireplaces"] == 0)
df.loc[no_fireplace, "FireplaceQu"] = "None"

In [None]:
# Houses with no garage: categorical descriptors become 'None', numeric ones 0.
#
# Two defects of the previous version are fixed here:
#  * the mask was recomputed before every assignment, so once GarageType had been
#    set to 'None' the condition "GarageType is NA" was false and none of the
#    other six columns was ever updated;
#  * the mask required GarageCars and GarageArea to be missing. A house without a
#    garage may record those as 0 instead of NaN (NOTE(review): confirm against
#    your copy of the data), so 0 is now accepted as well. This is a superset of
#    the old condition.
garage_cat_cols = ["GarageType", "GarageFinish", "GarageQual", "GarageCond"]
no_garage = (
    df[garage_cat_cols].isna().all(axis=1)
    & df["GarageYrBlt"].isna()
    & df["GarageCars"].fillna(0).eq(0)
    & df["GarageArea"].fillna(0).eq(0)
)
df.loc[no_garage, garage_cat_cols] = "None"
df.loc[no_garage, ["GarageYrBlt", "GarageCars", "GarageArea"]] = 0

In [None]:
# Remaining garage NAs (houses that do have some garage information).
# Categorical descriptors: most frequent real category. 'None' is excluded from the
# candidates so a partially described garage is not labelled "no garage".
# This replaces an unseeded random draw and avoids chained assignment.
for col in ["GarageType", "GarageFinish", "GarageQual", "GarageCond"]:
    observed = df.loc[df[col] != "None", col].dropna()
    if not observed.empty:
        df[col] = df[col].fillna(observed.mode().iloc[0])

# Numeric garage fields: column median.
for col in ["GarageYrBlt", "GarageCars", "GarageArea"]:
    df[col] = df[col].fillna(df[col].median())

In [None]:
# PoolQC: a missing quality on a house with zero pool area becomes 'None'.
no_pool = df["PoolQC"].isna() & (df["PoolArea"] == 0)
df.loc[no_pool, "PoolQC"] = "None"

# Houses that have a pool but no recorded quality get the most frequent real
# quality ('None' excluded). This replaces an unseeded random draw and avoids
# chained assignment.
observed_pool_qc = df.loc[df["PoolQC"] != "None", "PoolQC"].dropna()
if not observed_pool_qc.empty:
    df["PoolQC"] = df["PoolQC"].fillna(observed_pool_qc.mode().iloc[0])

In [None]:
# Fence / MiscFeature: a missing value is recoded as the explicit category 'None'.
for col in ["Fence", "MiscFeature"]:
    df[col] = df[col].fillna("None")

# SaleType: most frequent observed category, replacing an unseeded random draw.
# Whole-column assignment avoids chained assignment.
df["SaleType"] = df["SaleType"].fillna(df["SaleType"].mode().iloc[0])

In [None]:
# Re-check non-null counts after imputation to see which columns still contain NAs.
df.info()

In [None]:
# Object-dtype (categorical) columns: once for all rows, once for the rows that
# have a SalePrice (the labelled rows).
df_obj = df.select_dtypes(include='object')
df_obj_t = df.loc[df.SalePrice.notna()].select_dtypes(include='object')

# One encoder instance, refit for every column it is applied to.
labelencoder = preprocessing.LabelEncoder()

In [None]:
# Integer-encode each categorical column independently, then attach the target.
# NOTE: df_obj and df_obj_t are encoded separately on different row sets, so the
# same category is not guaranteed to get the same integer code in both frames.
df_obj = df_obj.apply(lambda column: labelencoder.fit_transform(column))
df_obj["SalePrice"] = df["SalePrice"]

df_obj_t = df_obj_t.apply(lambda column: labelencoder.fit_transform(column))
df_obj_t["SalePrice"] = train["SalePrice"]

In [None]:
# Full correlation heatmap of the label-encoded categorical features and SalePrice.
fig = plt.figure(figsize=(13, 11))
g1 = sns.heatmap(df_obj_t.corr(), cmap='mako_r')
g1.set_xticklabels(g1.get_xticklabels(), fontsize=10, rotation=70)
g1.set_yticklabels(g1.get_yticklabels(), fontsize=10, rotation=15)
plt.title("Correlation Plot of Features")
plt.show()

In [None]:
# SalePrice correlation matrix for the encoded categorical features: the ten
# columns most correlated with SalePrice (SalePrice itself included), ordered
# from weakest to strongest.

cols_t = df_obj_t.corr().nlargest(10, 'SalePrice')['SalePrice'].index
cols_t = np.array(list(reversed(cols_t)))
cm_t = df_obj_t[cols_t].corr()
# Mask the upper triangle (diagonal included) so each pair is shown once.
# The builtin `bool` is used because the `np.bool` alias was removed from NumPy.
mask_t = np.triu(np.ones_like(cm_t, dtype=bool))

f, ax = plt.subplots(figsize=(12, 9))
g1 = sns.heatmap(cm_t,cmap='mako_r',fmt='.2f', annot=True, mask=mask_t)
g1.set_xticklabels(g1.get_xticklabels(),rotation=70,fontsize=10)
g1.set_yticklabels(g1.get_yticklabels(),rotation=15,fontsize=10)
plt.title("Triangle Correlation Plot of Features")
plt.show()

Conclusions:
1. Foundation has the highest correlation with SalePrice
2. The rest of the categorical features have similarly low correlation

In the next steps, for the first try, all features will be taken into account.

In [None]:
# Print the category -> integer code mapping for each categorical feature that
# is interpreted in the pair plot below. This replaces nine copy-pasted cells
# with one loop; the column name is printed next to each mapping so the output
# is self-describing. After the loop `labelencoder` and `le_name_mapping` hold
# the state for the last column ('Foundation'), exactly as before.
for column in ['HouseStyle', 'Neighborhood', 'SaleCondition', 'RoofStyle', 'PavedDrive',
               'Electrical', 'GarageQual', 'CentralAir', 'Foundation']:
    labelencoder.fit(df[column])
    le_name_mapping = dict(zip(labelencoder.classes_, labelencoder.transform(labelencoder.classes_)))
    print(column, le_name_mapping)

In [None]:
# Pairwise scatter plots of the encoded categorical features in cols_t (SalePrice included).
sns.pairplot(df_obj_t[cols_t])
plt.show()

Conclusions:
1. Split Foyer and houses with finished 2nd floor are achieving the highest prices
2. We are able to point out neighborhoods with high prices
3. Normal sales and houses which were unfinished during last assessment have the highest price
4. Gable and Hip roof styles increase the price
5. Paved drive increases the price
6. Standard electricity breakers have the highest popularity
7. Average Garage Quality increases the price
8. Houses with central air are achieving the highest prices
9. Brick and Tile or Poured Concrete foundation types have influence on sale price

In [None]:
# cols_t is ordered weakest -> strongest, so its last entry is SalePrice itself
# (correlation 1 with itself); keep only the nine categorical features before it.
cols2 = cols_t[:9]

In [None]:
# Keep positions 0,2,4,6,7,8,9 of the numeric top-10 list, i.e. drop positions 1, 3 and 5.
# NOTE(review): the selection is positional, so which features are dropped depends on the
# correlation ranking computed earlier - presumably the redundant partner of each highly
# correlated pair discussed above; confirm by printing `cols` before this cell.
# NOTE: not idempotent - `cols` is overwritten, so running this cell twice raises IndexError.
cols = cols[[0,2,4,6,7,8,9]]

In [None]:
# Numeric columns that are candidates for skew correction.
# Id (the row identifier written to the submission) and SalePrice are excluded on
# purpose: SalePrice has already been log-transformed and is inverted with np.exp
# at prediction time, so a second transform here would silently break that inverse.
num_data = df.select_dtypes(exclude="object").columns.drop(["Id", "SalePrice"], errors="ignore")
# Sample skewness of every numeric feature, ignoring missing values.
skewed_data = df[num_data].apply(lambda x: skew(x.dropna())).sort_values(ascending=False)
skewness = pd.DataFrame({'Skew' :skewed_data})
# Features with skewness above 0.75 are treated as strongly right-skewed.
posSkew = skewness[skewness['Skew'] > 0.75]
posSkew

In [None]:
#Make a list of skewed features
skewed_feat= list(posSkew.index)

In [None]:
# Apply log(1 + x) to every skewed feature of the combined frame.
# NOTE: not idempotent - re-running this cell transforms the columns again.
for parameter in skewed_feat:
    df[parameter] = df[parameter].transform(np.log1p)

In [None]:
# Model matrix: the selected numeric columns (`cols`, which still contains SalePrice)
# side by side with the selected label-encoded categorical columns (`cols2`).
dff = pd.concat([df[cols],df_obj[cols2]], axis=1)

In [None]:
# Check dtypes and non-null counts of the model matrix.
dff.info()

In [None]:
# Preview the first rows of the model matrix.
dff.head()

# Model fitting

### Preparing data for model testing

In [None]:
# Rows with a known SalePrice: the only ones usable for model selection.
dff_t = dff.loc[dff["SalePrice"].notna()]

In [None]:
# Split the labelled rows into the target and the feature matrix.
Y_t = dff_t["SalePrice"]
X_t = dff_t.drop(columns="SalePrice")

In [None]:
# Scaler shared by the model-selection split and, later, the final fit (it is refit there).
sc = StandardScaler()

In [None]:
# Hold out 33% of the labelled rows for model comparison; random_state fixes the split.
Xt_train, Xt_test, Yt_train, Yt_test = train_test_split(X_t, Y_t, test_size=0.33, random_state=0)

In [None]:
#Standarizing our data
Xt_train = sc.fit_transform(Xt_train)
Xt_test = sc.transform(Xt_test)

### SVR and RMSE result

In [None]:
# Support Vector Regression with an RBF kernel; also reused as the last step of the PCA pipeline.
regressor = svm.SVR(kernel='rbf') #Using Support Vector Regression

In [None]:
# Hyper-parameter grid for the RBF-kernel SVR.
param = {
    'C': [0.1, 1, 100, 1000],
    'epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
    'gamma': [0.0001, 0.001, 0.005, 0.1, 1, 3, 5],
}

# Exhaustive 5-fold cross-validated search, scored by negative MSE, on all cores.
grids = GridSearchCV(
    estimator=regressor,
    param_grid=param,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grids.fit(Xt_train, Yt_train)
print("Grid search best params: ", grids.best_params_)

In [None]:
# Fit the SVR with the hyper-parameters selected by the grid search above.
# Previously C, epsilon and gamma were hard-coded from an earlier run and could
# silently disagree with what the search actually selects on the current data.
# NOTE: run this cell right after the SVR grid search, while `grids` still holds that search.
regr_svr = svm.SVR(kernel='rbf', **grids.best_params_).fit(Xt_train, Yt_train)

In [None]:
# Predict log-SalePrice for the hold-out rows with the tuned SVR.
Y_pred_svr = regr_svr.predict(Xt_test)

In [None]:
# Hold-out RMSE (on the log-price scale) of the SVR.
# The square root is taken explicitly because the `squared=False` argument was
# deprecated and later removed from scikit-learn's mean_squared_error.
np.sqrt(mean_squared_error(Yt_test, Y_pred_svr))

### SVR with PCA and RMSE result

In [None]:
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

In [None]:
# PCA followed by the RBF-kernel SVR, tuned jointly.
pca = PCA()
pipe = Pipeline([('pca', pca), ('regressor', regressor)])

# Grid keys use the "<step name>__<parameter>" convention of Pipeline.
param_grid = {
    'pca__n_components': [2, 3, 5],
    'regressor__C': [0.1, 1, 100, 1000],
    'regressor__epsilon': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10],
    'regressor__gamma': [0.0001, 0.001, 0.005, 0.1, 1, 3, 5],
}

In [None]:
# Cross-validated search over the PCA+SVR pipeline (default number of folds), scored by negative MSE.
grid_pca = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_pca.fit(Xt_train, Yt_train)
print("Grid search best params: ", grid_pca.best_params_)

In [None]:
# Predict the hold-out rows with the best PCA+SVR pipeline found by the search.
Y_pred_pca = grid_pca.predict(Xt_test)

In [None]:
# Hold-out RMSE (on the log-price scale) of the PCA+SVR pipeline.
# The square root is taken explicitly because the `squared=False` argument was
# deprecated and later removed from scikit-learn's mean_squared_error.
np.sqrt(mean_squared_error(Yt_test, Y_pred_pca))

PCA is not improving the score, which might be because the new components are linear combinations of the original features.

### Linear Regression and RMSE result

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Ordinary least squares baseline; its options are tuned by the grid search in the next cell.
linear_regression = LinearRegression()

In [None]:
# Grid search over the LinearRegression options.
# `normalize` is no longer searched: the parameter was removed from
# LinearRegression in recent scikit-learn (passing it raises an error), and the
# features are already standardized with StandardScaler before this point.
parameters = {'fit_intercept': [True, False], 'copy_X': [True, False]}
grid_linear = GridSearchCV(linear_regression, parameters, cv=5, scoring='neg_mean_squared_error')
grid_linear = grid_linear.fit(Xt_train, Yt_train)
print("Grid search best params: ", grid_linear.best_params_)

In [None]:
# Fit the linear model on the standardized training split.
# `normalize=True` is dropped: the parameter was removed from LinearRegression in
# recent scikit-learn, and with an intercept the least-squares predictions do not
# depend on per-feature rescaling, so the fitted predictions are unchanged.
linear_regr = LinearRegression(copy_X=True, fit_intercept=True).fit(Xt_train, Yt_train)

In [None]:
# Predict log-SalePrice for the hold-out rows with the linear model.
Y_pred_linear = linear_regr.predict(Xt_test)

In [None]:
# Hold-out RMSE (on the log-price scale) of the linear regression.
# The square root is taken explicitly because the `squared=False` argument was
# deprecated and later removed from scikit-learn's mean_squared_error.
np.sqrt(mean_squared_error(Yt_test, Y_pred_linear))

### Random Forest Regressor and RMSE result

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Base random-forest estimator used by the grid searches.
# A fixed random_state (same seed as the train/test split) makes the selected
# hyper-parameters and the final predictions reproducible between runs.
random_forest = RandomForestRegressor(random_state=0)

In [None]:
# Grid search for the random forest.
# This cell used to be a verbatim copy of the SVR search (SVR estimator and the
# C / epsilon / gamma grid), so no random-forest hyper-parameter was ever tuned
# in this section. It now searches the random forest itself. Separate names are
# used so the SVR search objects `param` / `grids` are left untouched.
# max_features=1.0 means "use all features"; it replaces the string "auto",
# which recent scikit-learn no longer accepts.
forest_param = {
    "n_estimators": [10, 50, 100],
    "max_features": [1.0, "log2", "sqrt"],
    "bootstrap": [True, False],
}

forest_grids = GridSearchCV(random_forest, forest_param, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

forest_grids.fit(Xt_train, Yt_train)
print("Grid search best params: ", forest_grids.best_params_)

In [None]:
# Random forest used for the hold-out comparison (hyper-parameters fixed by hand).
# random_state makes the reported hold-out error reproducible between runs.
forest = RandomForestRegressor(
    bootstrap=True, max_features='sqrt', n_estimators=100, random_state=0
).fit(Xt_train, Yt_train)

In [None]:
# Predict log-SalePrice for the hold-out rows with the random forest.
Y_pred_forest = forest.predict(Xt_test)

In [None]:
# Hold-out RMSE (on the log-price scale) of the random forest.
# The square root is taken explicitly because the `squared=False` argument was
# deprecated and later removed from scikit-learn's mean_squared_error.
np.sqrt(mean_squared_error(Yt_test, Y_pred_forest))

# Choosing model with the lowest MSE

In [None]:
# Rows with a SalePrice are the training set; rows without one are to be predicted.
has_price = dff["SalePrice"].notna()
dff_train = dff[has_price]
dff_test = dff[~has_price]

In [None]:
# Target and features of the full training set, and features of the rows to predict.
Y_train = dff_train["SalePrice"]
X_train = dff_train.drop(columns="SalePrice")

X_test = dff_test.drop(columns="SalePrice")

In [None]:
#Standarizing the data
X_train = sc.fit_transform(X_train)

X_test = sc.transform(X_test)

In [None]:
# Final random-forest grid search, fitted on the FULL training set.
# max_features=1.0 means "use all features"; it replaces the string "auto",
# which recent scikit-learn no longer accepts.
para_grids = {
            "n_estimators" : [10,50,100],
            "max_features" : [1.0, "log2", "sqrt"],
            "bootstrap"    : [True, False]
        }


grid_forest_final = GridSearchCV(random_forest, para_grids,cv=5, scoring = 'neg_mean_squared_error')
# Fit on X_train / Y_train (all labelled rows, scaled with the refit scaler).
# The search used to be fitted on Xt_train / Yt_train, i.e. on only 67% of the
# data AND on features scaled by the earlier scaler fit, while the submission is
# predicted from X_test scaled by the refit scaler - train and predict inputs
# were therefore on different scales.
grid_forest_final.fit(X_train, Y_train)
print("Grid search best params: ", grid_forest_final.best_params_)

In [None]:
# Predict the rows without a SalePrice using the best estimator found by the final search.
Y_predicted=grid_forest_final.predict(X_test)

In [None]:
# Undo the np.log applied to SalePrice earlier so predictions are back on the price scale.
# NOTE: not idempotent - re-running this cell exponentiates the predictions again.
Y_predicted = np.exp(Y_predicted)

In [None]:
# Assemble the submission table: one predicted SalePrice per Id of the test file.
my_submission = pd.DataFrame({'Id': test.Id, 'SalePrice': Y_predicted})
my_submission

In [None]:
# Write the submission to the working directory without the DataFrame index column.
my_submission.to_csv('submission.csv', index=False)