# Machine Learning Prediction 

After all the processes we made, our main goal is to predict the price of the houses based on the given features.
For this, we used three different regressions models to look for the best results:
    
    1. Linear regression
    2. Lasso 
    3. Bayesian Ridge
    
For each model, we preprocessed the data by encoding the categorical features and scaling the values.
The X represent all the parameters that were given on the assets. 
The Y are the sell prices value.

Eventually we took the model with the best price prediction results.

In [1]:
import pandas as pd
import scipy as sc
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Lasso # Lasso algorithm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge # Bayesian algorithm
from sklearn.linear_model import ElasticNet # ElasticNet algorithm

from sklearn.metrics import r2_score

#!pip install plotly
#!pip install -U kaleido

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
import matplotlib as mpl
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")


## Importing the data

In [2]:
forsale_df=pd.read_csv('RealEstateNewYork_Clean.csv',sep=',',low_memory=False)
forsale_df.drop(forsale_df.filter(regex="Unname"),axis=1, inplace=True)

In [3]:
forsale_df

Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,county,price_per_sqft
0,294900.0,3.0,2.0,2.0,2.0,single_family,74052.0,996.0,2011,16326 Ontario Shores Dr,NY,Sterling,Cayuga,296.084337
1,225000.0,3.0,2.0,1.0,2.0,single_family,30056.0,1224.0,1973,38 Pine Cir,NY,Newfield,Tompkins,183.823529
2,149000.0,4.0,2.0,2.0,2.0,single_family,223898.0,1608.0,1900,8 Gridleyville Rd,NY,Spencer,Tioga,92.661692
3,599999.0,4.0,2.0,0.0,2.0,single_family,7307.0,1827.0,1858,59 Hamilton Ave,NY,Oyster Bay,Nassau,328.406678
4,299900.0,3.0,2.0,1.0,2.0,single_family,8712.0,1589.0,1960,41 Bender Ln,NY,Bethlehem,Albany,188.735053
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7723,449000.0,2.0,1.0,0.0,2.0,coop,10550.0,1000.0,1963,2483 W 16th St Apt 12J,NY,Brooklyn,Kings,449.000000
7724,975000.0,1.0,2.0,0.0,2.0,condos,10550.0,826.0,1987,75 Wall St Apt 24K,NY,New York City,New York,1180.387409
7725,689000.0,3.0,3.0,1.0,2.0,single_family,7475.0,2100.0,2022,223 Endicott Ave,NY,Elmsford,Westchester,328.095238
7726,1862500.0,5.0,4.0,1.0,2.0,single_family,4920.0,2750.0,1955,147-40 8th Ave,NY,Whitestone,Queens,677.272727


## Linear Regression model

In [4]:
forsale_df_ml = forsale_df.copy()
forsale_df_ml.drop(["address","house_type","state","city","county"], axis=1, inplace=True)
forsale_df_ml['address'] = LabelEncoder().fit_transform(forsale_df['address'])
forsale_df_ml['house_type'] = LabelEncoder().fit_transform(forsale_df['house_type'])
forsale_df_ml['state'] = LabelEncoder().fit_transform(forsale_df['state'])
forsale_df_ml['city'] = LabelEncoder().fit_transform(forsale_df['city'])
forsale_df_ml['county'] = LabelEncoder().fit_transform(forsale_df['county'])
repc=forsale_df_ml.pop('price')
forsale_df_ml['price']=repc
forsale_df_ml

Unnamed: 0,beds,baths,garage,stories,lot_sqft,sqft,year_built,price_per_sqft,address,house_type,state,city,county,price
0,3.0,2.0,2.0,2.0,74052.0,996.0,2011,296.084337,1883,4,0,945,5,294900.0
1,3.0,2.0,1.0,2.0,30056.0,1224.0,1973,183.823529,4636,4,0,686,54,225000.0
2,4.0,2.0,2.0,2.0,223898.0,1608.0,1900,92.661692,6949,4,0,933,53,149000.0
3,4.0,2.0,0.0,2.0,7307.0,1827.0,1858,328.406678,6007,4,0,742,29,599999.0
4,3.0,2.0,1.0,2.0,8712.0,1589.0,1960,188.735053,4911,4,0,83,0,299900.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7723,2.0,1.0,0.0,2.0,10550.0,1000.0,1963,449.000000,3221,1,0,119,23,449000.0
7724,1.0,2.0,0.0,2.0,10550.0,826.0,1987,1180.387409,6793,0,0,679,30,975000.0
7725,3.0,3.0,1.0,2.0,7475.0,2100.0,2022,328.095238,2893,4,0,316,59,689000.0
7726,5.0,4.0,1.0,2.0,4920.0,2750.0,1955,677.272727,1496,4,0,1058,40,1862500.0


In [5]:
Xlin = forsale_df_ml.iloc[:,:-1]
ylin = forsale_df_ml.iloc[:,-1]

### Modeling and scores

In [6]:
Xlin_train, Xlin_test, ylin_train, ylin_test = train_test_split(Xlin, ylin,random_state=0 , test_size=0.4)

In [7]:
linear_model = LinearRegression(fit_intercept=False)

linear_model.fit(Xlin_train,ylin_train)

ylin_pred = linear_model.predict(Xlin_test)
ylin_train_pred = linear_model.predict(Xlin_train)
ylin_pred

array([  19342.98301713, 1264522.41433167, 1415511.10219062, ...,
        652883.41015392,   83648.15523203,   45423.95975931])

In [8]:
print("r2 score with LinearRegression model")
print("Accuracy on training data: " , r2_score(ylin_train, ylin_train_pred))
print("Accuracy on test data: ", r2_score(ylin_test, ylin_pred))

r2 score with LinearRegression model
Accuracy on training data:  0.8246892160595706
Accuracy on test data:  0.8042198286917305


## Lasso model

In [9]:
print (forsale_df.columns.tolist())

['price', 'beds', 'baths', 'garage', 'stories', 'house_type', 'lot_sqft', 'sqft', 'year_built', 'address', 'state', 'city', 'county', 'price_per_sqft']


In [10]:
forsale_cp_df_ml=forsale_df.copy()
featurs =forsale_cp_df_ml.drop(["price","county","address","state"],axis=1)

create dummies and combine

In [11]:
combine_dummies = pd.get_dummies(featurs)
result = combine_dummies.values

In [12]:
scaler = StandardScaler()
result = scaler.fit_transform(result)

In [13]:
Xlas= result[:forsale_cp_df_ml.shape[0]]
ylas=forsale_cp_df_ml['price'].values

### Modeling and scores

In [14]:
Xlas_train, Xlas_test, ylas_train, ylas_test = train_test_split(Xlas, ylas, random_state=42, test_size=0.4)

In [15]:
lasso_model= Lasso()

lasso_model.fit(Xlas_train, ylas_train)

ylas_pred = lasso_model.predict(Xlas_test)
ylas_train_pred = lasso_model.predict(Xlas_train)
ylas_pred

array([ 158210.85054795,  773248.28753809,  204106.35984996, ...,
        545702.27412501,  278655.31995597, 1015569.46360946])

In [16]:
print("r2 score with Lasso model")
print("Accuracy on training data: " , r2_score(ylas_train, ylas_train_pred))
print("Accuracy on test data: ", r2_score(ylas_test, ylas_pred))

r2 score with Lasso model
Accuracy on training data:  0.8596987055644911
Accuracy on test data:  0.8074682093893724


## BayesianRidge Model

In [17]:
print (forsale_df.columns.tolist())

['price', 'beds', 'baths', 'garage', 'stories', 'house_type', 'lot_sqft', 'sqft', 'year_built', 'address', 'state', 'city', 'county', 'price_per_sqft']


In [18]:
forsale_cp2_df_ml=forsale_df.copy()
featurs =forsale_cp2_df_ml.drop(["price","county","address","state"],axis=1)

In [19]:
Xbay= result[:forsale_cp2_df_ml.shape[0]]
ybay=forsale_cp2_df_ml['price'].values

### Modeling and scores

In [20]:
Xbay_train, Xbay_test, ybay_train, ybay_test = train_test_split(Xbay, ybay, random_state=42, test_size=0.4)

In [21]:
bayesian = BayesianRidge()
bayesian.fit(Xbay_train, ybay_train)
ybay_pred = bayesian.predict(Xbay_test)
ybay_train_pred = bayesian.predict(Xbay_train)

In [22]:
print("r2 score with BayesianRidge model")
print("Accuracy on training data: " , r2_score(ybay_train, ybay_train_pred))
print("Accuracy on test data: ", r2_score(ybay_test, ybay_pred))

r2 score with BayesianRidge model
Accuracy on training data:  0.8558451612918336
Accuracy on test data:  0.7998087860944406


### Add BEST predicted prices to original data frame

In [23]:
final_labels = lasso_model.predict(Xlas)
final_result = pd.DataFrame({'price': final_labels})

In [24]:
forsale_df_ml["predicted price"] = final_result['price']

In [25]:
price_avg=forsale_df_ml["price"].mean()
pred_price_avg=forsale_df_ml["predicted price"].mean()

print("True prices average:",round(price_avg,2))
print("Predicted prices average:",round(pred_price_avg,2))

True prices average: 663308.42
Predicted prices average: 662135.04


### Conclusions:

    1. We found which parameters influent the most the sell price of the asset.
    2. After deploying our predictive models we saw that the "before and after" feature engineering is crutial
