In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [5]:
# Load the competition's train and test CSV files into DataFrames,
# then preview the first rows of the training data.
train_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/train.csv')
test_df = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
train_df.head()

In [6]:
# (rows, columns) of the training frame.
train_df.shape

In [7]:
# Summary statistics of the numeric columns. `describe` must be *called*;
# the bare attribute only displays the bound-method repr.
train_df.describe()

In [8]:
# Column names, non-null counts and dtypes of the training frame.
train_df.info()

In [9]:
# Per-column NULL counts of the training frame, keeping only the columns
# that actually contain NULLs, largest count first.
print('\n No. of NULL rows in every column:\n')
train_null_counts = train_df.isnull().sum()
train_null_counts[train_null_counts > 0].sort_values(ascending=False)

**Let's remove the columns with a high number of NULL values:**

In [10]:
# Drop the mostly-empty columns from the training frame.
# errors='ignore' keeps the cell re-runnable: a second execution would
# otherwise raise KeyError because the columns are already gone.
train_df = train_df.drop(
    columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'],
    errors='ignore',
)

In [11]:
# Per-column NULL counts of the test frame, keeping only the columns
# that actually contain NULLs, largest count first.
print('\n No. of NULL rows in every column:\n')
test_null_counts = test_df.isnull().sum()
test_null_counts[test_null_counts > 0].sort_values(ascending=False)

In [12]:
# Drop the same mostly-empty columns from the test frame.
# errors='ignore' keeps the cell re-runnable: a second execution would
# otherwise raise KeyError because the columns are already gone.
test_df = test_df.drop(
    columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'],
    errors='ignore',
)

Replace the remaining NULL values in each numeric column with that column's mean (computed on the training data).

In [13]:
# Impute missing numeric values with the TRAINING column means, for both
# frames, so the test set is filled with statistics learned from train only.
# numeric_only=True is required: on pandas >= 2.0 DataFrame.mean() raises a
# TypeError when the frame still contains object (string) columns.
train_means = train_df.mean(numeric_only=True)
train_df = train_df.fillna(train_means)
test_df = test_df.fillna(train_means)

In [14]:
# Columns of the training frame that still contain NULLs after imputation.
train_nulls_left = train_df.isnull().sum()
train_nulls_left[train_nulls_left > 0].sort_values(ascending=False)

In [15]:
# Columns of the test frame that still contain NULLs after imputation.
# The filter mask must come from test_df itself: the train mask has a
# different index (it includes SalePrice) and reports the wrong columns.
test_nulls_left = test_df.isnull().sum()
test_nulls_left[test_nulls_left > 0].sort_values(ascending=False)

In [16]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

# Distribution of the target (SalePrice) with a KDE overlay.
plt.figure(figsize=(15, 10))
sns.histplot(data=train_df, x='SalePrice', kde=True)
plt.title('Houses sale prices')

**Non-numeric columns may hold information that helps us predict a house's price more precisely. The problem is that ML models need their input in numeric form, so let's convert the data in these columns to numeric values.**

In [17]:
# One SalePrice box plot per categorical (object-dtype) column.
# The grid is sized from the actual number of columns, and enumerate()
# starts at the first column. The previous hard-coded range(1, 38)
# skipped cols[0] and broke whenever the column count changed.
categorical_cols_df = train_df.select_dtypes(include='object')
cols = categorical_cols_df.columns

n_grid_cols = 2
n_grid_rows = -(-len(cols) // n_grid_cols)  # ceiling division

fig = plt.figure(figsize=(30, 60))
fig.subplots_adjust(hspace=0.3)
for position, col in enumerate(cols, start=1):
    ax = fig.add_subplot(n_grid_rows, n_grid_cols, position)
    sns.boxplot(x=col, y='SalePrice', data=train_df, ax=ax)

**We can see that in many of the categorical columns the sale price differs considerably between categories. We can encode these categories as numbers and use them for better predictions.**

In [18]:
# figure(figsize = (8,8))
# sns.boxplot(x='Street',y='SalePrice', data=train_df)
# figure(figsize = (8,8))
# sns.boxplot(x='LotShape',y='SalePrice', data=train_df)
# figure(figsize = (8,8))
# sns.boxplot(x='LotConfig',y='SalePrice', data=train_df)
# figure(figsize = (8,8))
# sns.boxplot(x='LandSlope',y='SalePrice', data=train_df)
# figure(figsize = (8,8))
# sns.boxplot(x='Condition1',y='SalePrice', data=train_df)
# figure(figsize = (8,8))
# sns.boxplot(x='BldgType',y='SalePrice', data=train_df)
# figure(figsize = (8,8))
# sns.barplot(x='HouseStyle',y='SalePrice',data = train_df, order = order_HouseStyle)
# figure(figsize = (8,8))
# sns.barplot(x='MSZoning',y='SalePrice',data = train_df, order = order_MSZoning)
# plt.figure(figsize = (20,10))
# sns.barplot(x='Neighborhood',y='SalePrice',data = train_df, order = order_neighborhood)
def _encode(frame, column, mapping, fill=None):
    """Replace the labels in ``frame[column]`` with the codes in ``mapping``.

    The column is left untouched when it is already numeric, so re-running
    this cell does not map the integer codes through ``mapping`` a second
    time (which would turn every value into NaN).

    Parameters
    ----------
    frame : pandas.DataFrame, modified in place.
    column : name of the column to encode.
    mapping : dict of label -> numeric code; labels missing from it become NaN.
    fill : optional label used to fill missing values before encoding.
    """
    if pd.api.types.is_numeric_dtype(frame[column]):
        return
    labels = frame[column] if fill is None else frame[column].fillna(fill)
    frame[column] = labels.map(mapping)


# Category orderings by mean SalePrice (ascending), taken from the raw labels
# before anything is encoded.
order_HouseStyle = train_df.groupby('HouseStyle')['SalePrice'].mean().sort_values(ascending=True).index.values
order_MSZoning = train_df.groupby('MSZoning')['SalePrice'].mean().sort_values(ascending=True).index.values
order_neighborhood = train_df.groupby('Neighborhood')['SalePrice'].mean().sort_values(ascending=True).index.values

# Hand-written encodings. Several labels deliberately share a code
# (e.g. LotShape IR2 and IR3 are both 2).
category_codes = {
    'Street': {'Grvl': 0, 'Pave': 1},
    'LotShape': {'Reg': 0, 'IR1': 1, 'IR2': 2, 'IR3': 2},
    'Utilities': {'AllPub': 1, 'NoSeWa': 0},
    'LotConfig': {'Inside': 0, 'FR2': 0, 'Corner': 0, 'CulDSac': 1, 'FR3': 1},
    'LandSlope': {'Gtl': 0, 'Mod': 1, 'Sev': 1},
    'BldgType': {'1Fam': 1, '2fmCon': 0, 'Duplex': 0, 'TwnhsE': 1, 'Twnhs': 0},
    'HouseStyle': {'1.5Unf': 0, 'SFoyer': 1, '1.5Fin': 2, '2.5Unf': 3,
                   'SLvl': 4, '1Story': 5, '2Story': 6, '2.5Fin': 7},
}
for column, codes in category_codes.items():
    _encode(train_df, column, codes)
    _encode(test_df, column, codes)

# MSZoning: missing values are treated as 'RH' before encoding.
mszoning_codes = {'C (all)': 0, 'RM': 1, 'RH': 2, 'RL': 3, 'FV': 4}
_encode(train_df, 'MSZoning', mszoning_codes, fill='RH')
_encode(test_df, 'MSZoning', mszoning_codes, fill='RH')

# Neighborhood: rank by mean training SalePrice (0 = cheapest). The ranks are
# built from however many neighbourhoods exist instead of a hard-coded
# range(0, 25), and written back by assignment: the earlier
# `df[col].replace(..., inplace=True)` edits a temporary and leaves the frame
# unchanged under pandas Copy-on-Write.
# A neighbourhood that appears only in the test set becomes NaN.
neighborhood_rank = {name: rank for rank, name in enumerate(order_neighborhood)}
_encode(train_df, 'Neighborhood', neighborhood_rank)
_encode(test_df, 'Neighborhood', neighborhood_rank)

In [None]:
# Sanity-check the encodings: print the value counts of every encoded column
# for both frames. A notebook cell only displays its LAST bare expression,
# so the earlier list of value_counts() calls showed a single table.
# dropna=False surfaces labels that the encoding turned into NaN.
encoded_cols = ['Street', 'LotShape', 'Utilities', 'LotConfig', 'LandSlope',
                'BldgType', 'HouseStyle', 'MSZoning', 'Neighborhood']
for col in encoded_cols:
    for frame_name, frame in (('train_df', train_df), ('test_df', test_df)):
        print(f'--- {frame_name}[{col!r}] ---')
        print(frame[col].value_counts(dropna=False).sort_index().to_string())
        print()

In [None]:
# Number of missing MSZoning values in the test frame.
test_df['MSZoning'].isnull().sum()

In [None]:
# Same MSZoning null count as the previous cell (repeated check).
test_df['MSZoning'].isnull().sum()

In [19]:
# dtype of every column in the training frame.
train_df.dtypes

In [20]:
# Keep only the numeric (and boolean) columns of each frame.
# select_dtypes is the public equivalent of the private
# DataFrame._get_numeric_data(), which may change without notice.
df1 = train_df.select_dtypes(include=['number', 'bool'])
df1_test = test_df.select_dtypes(include=['number', 'bool'])
# The preview goes last so the notebook actually displays it.
df1.head()

In [21]:
import seaborn as sns
# Pairwise correlations of all numeric training columns, drawn as an
# annotated heatmap on a fixed [-1, 1] colour scale.
cor = df1.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(26, 16))
heatmap = sns.heatmap(cor, vmin=-1, vmax=1, annot=True, cmap='BrBG', ax=heatmap_ax)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 10}, pad=12);

In [22]:
# Keep the columns whose absolute correlation with SalePrice exceeds 0.05
# (SalePrice itself is included) and preview the reduced frame.
cor_target = cor["SalePrice"].abs()
relevant_features = cor_target[cor_target > 0.05]
relevant_df = df1.loc[:, relevant_features.index]
relevant_df.head()

In [23]:
# NULL count per column of the selected-feature frame.
relevant_df.isnull().sum()

In [None]:
# cleaned_df = relevant_df.dropna(axis=0, how="any", thresh=None, subset=None, inplace=False)
# cleaned_df.isnull().sum()

In [24]:
# Correlation heatmap restricted to the selected features.
# Note: this rebinds the notebook-level name `cor`.
cor = relevant_df.corr()
heatmap_fig, heatmap_ax = plt.subplots(figsize=(26, 16))
heatmap = sns.heatmap(cor, vmin=-1, vmax=1, annot=True, cmap='BrBG', ax=heatmap_ax)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize': 10}, pad=12);

Now, let's establish a baseline for our models. We will assume that every house sells for the mean price and check how far this assumption is from the truth.

In [39]:
# Baseline: mean absolute error of predicting the mean training price for
# every house. Computed with vectorised pandas operations instead of a
# per-row Python loop over .iloc.
sale_prices = train_df['SalePrice']
sum_prices = sale_prices.sum()
mean_price = sum_prices / len(sale_prices)
sum_error = (sale_prices - mean_price).abs().sum()

base_line_pred = sum_error / len(sale_prices)
base_line_pred

$57,473 is our baseline mean absolute error (MAE): the error we get by predicting the mean price for every house, without using any machine learning model.

In [26]:
# Names of the columns kept by the correlation filter.
relevant_df.columns

In [None]:
# Non-null counts and dtypes of the selected-feature frame.
relevant_df.info()

In [None]:
# Display the selected-feature frame.
relevant_df

In [25]:
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics

# Model inputs: every selected column except the target.
feature_cols = [
    'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea',
    'LotShape', 'LotConfig', 'LandSlope', 'Neighborhood',
    'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
    'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1',
    'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
    'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath',
    'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
    'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', 'ScreenPorch', 'PoolArea',
]
X = relevant_df.loc[:, feature_cols]
y = relevant_df.loc[:, "SalePrice"]

# Hold out 25% of the rows for evaluation; the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=1,
)

In [27]:
# RandomForestClassifier stays imported because later cells reference the name.
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor

# SalePrice is a continuous target, so this is a regression problem. A
# classifier would treat every distinct price as its own class and could only
# ever predict prices seen verbatim in training.
# max_features='sqrt' keeps the per-split feature subsampling that the
# classifier used by default.
rf = RandomForestRegressor(n_estimators=220, max_depth=40,
                           max_features='sqrt', random_state=1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

In [28]:
from sklearn import metrics
from sklearn.metrics import mean_squared_error,mean_absolute_error,explained_variance_score, r2_score
# Report the hold-out error metrics for the current y_pred.
holdout_mse = metrics.mean_squared_error(y_test, y_pred)
holdout_report = [
    ("Mean squared error is:", holdout_mse),
    ('Mean Absolute Error(MAE):', metrics.mean_absolute_error(y_test, y_pred)),
    ('Root Mean Squared Error (RMSE):', np.sqrt(holdout_mse)),
    ('Explained Variance Score (EVS):', explained_variance_score(y_test, y_pred)),
    ('R2:', metrics.r2_score(y_test, y_pred)),
]
for label, value in holdout_report:
    print(label, value)

In [30]:
# Hyper-parameter search space for the random forest.
n_estimators = [5, 20, 50, 100, 150, 200, 220, 250, 300]  # number of trees in the random forest
# Number of features considered at every split. The former 'auto' option was
# removed in scikit-learn 1.3; for a forest *regressor* it meant "all
# features", which is spelled 1.0 today.
max_features = [1.0, 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 120, num=10)]  # maximum number of levels allowed in each decision tree
min_samples_split = [2, 6, 10]  # minimum sample number to split a node
min_samples_leaf = [1, 3, 4]  # minimum sample number that can be stored in a leaf node
bootstrap = [True, False]  # method used to sample data points

random_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'min_samples_split': min_samples_split,
    'min_samples_leaf': min_samples_leaf,
    'bootstrap': bootstrap,
}

In [31]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

# Randomized search over `random_grid`: 100 sampled configurations, each
# scored with 5-fold cross-validation on the training split.
# The base estimator is seeded as well. With only the search seeded, the
# forests themselves stay random and best_params_ can change between runs.
rf = RandomForestRegressor(random_state=1)
rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                               n_iter=100, cv=5, verbose=2, random_state=35, n_jobs=-1)
rf_random.fit(X_train, y_train)

In [40]:
# Parameter combination selected by the randomized search.
rf_random.best_params_

In [41]:
from sklearn.ensemble import RandomForestRegressor

# Refit with the tuned hyper-parameters. They were searched for a
# RandomForestRegressor and the target is continuous, so the model must be a
# regressor; the classifier treated each distinct price as a class.
# max_features='sqrt' keeps the per-split feature subsampling that the
# classifier used by default. Without it, bootstrap=False would grow 220
# near-identical trees.
# NOTE(review): the values are hard-coded; confirm they still match
# rf_random.best_params_ after re-running the search.
rf = RandomForestRegressor(n_estimators=220,
                           max_depth=83,
                           min_samples_split=2,
                           max_features='sqrt',
                           bootstrap=False,
                           random_state=1)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

In [42]:
# Hold-out MSE and MAE for the current y_pred.
tuned_report = [
    ("Mean squared error is:", metrics.mean_squared_error(y_test, y_pred)),
    ('Mean Absolute Error(MAE):', metrics.mean_absolute_error(y_test, y_pred)),
]
for label, value in tuned_report:
    print(label, value)

**K Nearest Neighbors model**

In [43]:
from sklearn.preprocessing import StandardScaler

# Standardise the features: the scaler is fitted on the training split only
# and then applied to both splits, replacing X_train / X_test.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train

In [44]:
# KNeighborsClassifier stays imported because later cells reference the name.
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor

# Hold-out MAE for k = 2, 4, ..., 58. The target is a continuous price, so
# the neighbours' prices are averaged (regressor) rather than put to a
# majority vote over price "classes" (classifier).
mean_abs_errors_knn = []
for k in range(2, 60, 2):
    knn = KNeighborsRegressor(n_neighbors=k)
    knn.fit(X_train, y_train)
    y_pred = knn.predict(X_test)
    mae_knn = metrics.mean_absolute_error(y_test, y_pred)
    mean_abs_errors_knn.append(mae_knn)

mean_abs_errors_knn

In [45]:
# Hold-out MAE as a function of k, using the same k values as the sweep.
x_axis = list(range(2, 60, 2))
sns.lineplot(x=x_axis, y=mean_abs_errors_knn)

In [46]:
from sklearn.neighbors import KNeighborsRegressor

# Refit KNN with the k that gave the lowest hold-out MAE in the sweep. The
# previous hard-coded k=1 was never evaluated (the sweep starts at 2).
# A regressor is used because the target is continuous.
# Caveat: k is picked on the same hold-out set the MAE is reported on, so the
# number is optimistic.
candidate_ks = list(range(2, 60, 2))  # must mirror the range used by the sweep
assert len(candidate_ks) == len(mean_abs_errors_knn), "k grid and sweep results are out of sync"
best_k = candidate_ks[int(np.argmin(mean_abs_errors_knn))]

knn = KNeighborsRegressor(n_neighbors=best_k)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
mae_knn_best = metrics.mean_absolute_error(y_test, y_pred)
mae_knn_best

**Linear Regression model**

In [47]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the training split, scored by hold-out MAE.
line_fitter = LinearRegression().fit(X_train, y_train)
line_fitter_pred = line_fitter.predict(X_test)
line_fitter_pred_mae = metrics.mean_absolute_error(y_test, line_fitter_pred)
line_fitter_pred_mae

In [48]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

# Final random forest, trained on ALL labelled data.
# A regressor is used because SalePrice is continuous. max_features='sqrt'
# keeps the per-split feature subsampling the classifier used by default, so
# bootstrap=False does not produce 220 near-identical trees.
full_data_rf = RandomForestRegressor(n_estimators=220,
                                     max_depth=22,
                                     min_samples_split=2,
                                     max_features='sqrt',
                                     bootstrap=False,
                                     random_state=1)

# The Kaggle test set has no labels. The previous "MAE" compared test
# predictions with the first 1459 TRAINING prices (unrelated houses) and was
# meaningless. A 5-fold cross-validated MAE on the labelled data replaces it.
full_data_mae = -cross_val_score(full_data_rf, X, y, cv=5,
                                 scoring='neg_mean_absolute_error').mean()

full_data_rf.fit(X, y)
test_data = test_df[feature_cols]
test_preds = full_data_rf.predict(test_data)
full_data_mae

In [49]:
from sklearn.model_selection import cross_val_score

# Linear regression trained on ALL labelled data.
full_data_LR = LinearRegression()

# The Kaggle test set has no labels. The previous "MAE" compared test
# predictions with the first 1459 TRAINING prices (unrelated houses) and was
# meaningless. A 5-fold cross-validated MAE on the labelled data replaces it.
full_data_mae_LR = -cross_val_score(full_data_LR, X, y, cv=5,
                                    scoring='neg_mean_absolute_error').mean()

full_data_LR.fit(X, y)
test_data_LR = test_df[feature_cols]
test_preds_LR = full_data_LR.predict(test_data_LR)
full_data_mae_LR

In [50]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# KNN trained on ALL labelled data.
# - A regressor is used because the target is continuous.
# - X is unscaled here, unlike in the hold-out experiment, so the scaler is
#   bundled into a pipeline. Distances are otherwise dominated by
#   large-valued columns such as LotArea.
# - k is taken from `knn`, the model selected in the hold-out experiment.
full_data_KNN = make_pipeline(
    StandardScaler(),
    KNeighborsRegressor(n_neighbors=knn.n_neighbors),
)

# The Kaggle test set has no labels. The previous "MAE" compared test
# predictions with the first 1459 TRAINING prices (unrelated houses) and was
# meaningless. A 5-fold cross-validated MAE on the labelled data replaces it.
full_data_mae_KNN = -cross_val_score(full_data_KNN, X, y, cv=5,
                                     scoring='neg_mean_absolute_error').mean()

full_data_KNN.fit(X, y)
test_data_KNN = test_df[feature_cols]
test_preds_KNN = full_data_KNN.predict(test_data_KNN)
full_data_mae_KNN

When we tested our models on the test dataframe, the model that came closest was the Random Forest, so we will submit its predictions.

In [None]:
# Build the submission table (Id + predicted SalePrice), write it to
# submission.csv without the index column, and display it.
output = test_df[['Id']].assign(SalePrice=test_preds)
output.to_csv('submission.csv', index=False)
output