# Machine Learning Prediction 

In [1]:
import pandas as pd
import scipy as sc
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression 
from sklearn.linear_model import Lasso # Lasso algorithm
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import BayesianRidge # Bayesian algorithm
from sklearn.linear_model import ElasticNet # ElasticNet algorithm

from sklearn.metrics import r2_score

#!pip install plotly
#!pip install -U kaleido

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
import matplotlib as mpl
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")


## Importing the data

In [2]:
# Load the cleaned New York listings, then discard every column whose name
# matches "Unname" (returns a new frame instead of mutating in place).
forsale_df = pd.read_csv('RealEstateNewYork_Clean2.csv', sep=',', low_memory=False)
forsale_df = forsale_df.drop(columns=forsale_df.filter(regex="Unname").columns)

In [3]:
# Preview the loaded listings; a bare last expression uses the notebook's rich display.
forsale_df

Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,county,price_per_sqft
0,108900.0,4.0,2.0,2.0,2.0,single_family,108900.0,1800.0,1971,4758 Cory Corners Rd,NY,Marion,Wayne,60.500000
1,499000.0,3.0,2.0,2.0,2.0,single_family,22425.0,1826.0,1980,8105 McCamidge Dr,NY,Cicero,Onondaga,273.274918
2,499000.0,3.0,2.0,1.0,2.0,single_family,10454.0,2680.0,1947,349 Glen Ave,NY,Elmira,Chemung,186.194030
3,499000.0,3.0,2.0,2.0,2.0,single_family,7662.0,1312.0,1951,812 Delaware Rd,NY,Buffalo,Erie,380.335366
4,126324.0,3.0,2.0,3.0,2.0,single_family,126324.0,2116.0,1995,2430 Emerson Rd,NY,Weedsport,Cayuga,59.699433
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7719,499000.0,1.0,1.0,0.0,6.0,coop,10602.0,1744.0,1939,34-20 79th St Unit 2H,NY,New York City,Queens,286.123853
7720,499000.0,3.0,2.0,1.0,2.0,single_family,5000.0,912.0,1955,455 Stoneham St,NY,Staten Island,Richmond,547.149123
7721,499000.0,1.0,1.0,0.0,2.0,condos,10602.0,760.0,1982,3611 Shore Parkway Rd Unit 2D,NY,Brooklyn,Kings,656.578947
7722,499000.0,1.0,1.0,0.0,2.0,coop,10602.0,1744.0,1930,465 W 23rd St Apt 7A,NY,New York City,New York,286.123853


## Linear Regression model

In [4]:
# Build the modelling frame: numeric columns first, then the label-encoded
# categorical columns, and finally the target `price` as the last column.
# NOTE(review): in the data preview price_per_sqft looks like price / sqft
# (e.g. 108900 / 1800 = 60.5), i.e. it is derived from the target. Confirm
# whether it should really be kept as a feature.
categorical_cols = ["address", "house_type", "state", "city", "county"]

# Each column gets its own encoder, fitted on the full column.
encoded_cols = {
    col: LabelEncoder().fit_transform(forsale_df[col]) for col in categorical_cols
}

forsale_df_ml = (
    forsale_df
    .drop(columns=categorical_cols + ["price"])
    .assign(**encoded_cols, price=forsale_df["price"])
)
forsale_df_ml

Unnamed: 0,beds,baths,garage,stories,lot_sqft,sqft,year_built,price_per_sqft,address,house_type,state,city,county,price
0,4.0,2.0,2.0,2.0,108900.0,1800.0,1971,60.500000,5366,4,0,586,58,108900.0
1,3.0,2.0,2.0,2.0,22425.0,1826.0,1980,273.274918,7042,4,0,185,33,499000.0
2,3.0,2.0,1.0,2.0,10454.0,2680.0,1947,186.194030,4359,4,0,310,7,499000.0
3,3.0,2.0,2.0,2.0,7662.0,1312.0,1951,380.335366,7048,4,0,124,14,499000.0
4,3.0,2.0,3.0,2.0,126324.0,2116.0,1995,59.699433,3184,4,0,1013,5,126324.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7719,1.0,1.0,0.0,6.0,10602.0,1744.0,1939,286.123853,4298,1,0,674,40,499000.0
7720,3.0,2.0,1.0,2.0,5000.0,912.0,1955,547.149123,5253,4,0,937,42,499000.0
7721,1.0,1.0,0.0,2.0,10602.0,760.0,1982,656.578947,4507,0,0,117,23,499000.0
7722,1.0,1.0,0.0,2.0,10602.0,1744.0,1930,286.123853,5311,1,0,674,30,499000.0


In [5]:
# Separate features and target by column NAME rather than by position.
# A later cell appends a "predicted price" column to forsale_df_ml; with
# positional slicing, re-running this cell afterwards would silently use
# `price` as a feature and the prediction as the target. On a fresh run
# (no "predicted price" column yet) the result is the same as before.
Xlin = forsale_df_ml.drop(columns=["price", "predicted price"], errors="ignore")
ylin = forsale_df_ml["price"]

### Modeling and scores

In [6]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split reproducible.
(Xlin_train, Xlin_test,
 ylin_train, ylin_test) = train_test_split(Xlin, ylin, test_size=0.4, random_state=0)

In [8]:
# Least-squares fit on the training split, with no intercept term
# (fit_intercept=False).
linear_model = LinearRegression(fit_intercept=False).fit(Xlin_train, ylin_train)

# Predictions for both splits so train and test scores can be compared.
ylin_train_pred = linear_model.predict(Xlin_train)
ylin_pred = linear_model.predict(Xlin_test)
ylin_pred

array([490249.53460114, 454127.70725045, 465049.62229142, ...,
       449256.89499095, 389369.52612379, 474113.11117391])

In [10]:
# Report R² (coefficient of determination) for the training and test splits.
print("r2 score with LinearRegression model")
for split_label, y_true, y_hat in (
    ("Accuracy on training data: ", ylin_train, ylin_train_pred),
    ("Accuracy on test data: ", ylin_test, ylin_pred),
):
    print(split_label, r2_score(y_true, y_hat))

r2 score with LinearRegression model
Accuracy on training data:  0.6617220026766473
Accuracy on test data:  0.6094611581118379


## Lasso model

In [20]:
# List the available columns before choosing the Lasso features.
print(list(forsale_df.columns))

['price', 'beds', 'baths', 'garage', 'stories', 'house_type', 'lot_sqft', 'sqft', 'year_built', 'address', 'state', 'city', 'county', 'price_per_sqft']


In [21]:
# Work on a copy of the listings; the feature frame excludes the target
# `price` and the county / address / state columns.
forsale_cp_df_ml = forsale_df.copy()
featurs = forsale_cp_df_ml.drop(columns=["price", "county", "address", "state"])

In [22]:
#create dummies and combine 
combine_dummies = pd.get_dummies(featurs)
result = combine_dummies.values

In [23]:
# Standardise every column of the feature matrix.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so statistics of the test rows influence the scaling. Consider fitting on
# the training split only.
scaler = StandardScaler().fit(result)
result = scaler.transform(result)

In [24]:
# Feature matrix (limited to the number of listing rows) and the price target.
Xlas = result[:len(forsale_cp_df_ml)]
ylas = forsale_cp_df_ml["price"].to_numpy()

### Modeling and scores

In [25]:
# 60/40 train/test split with a fixed seed for reproducibility.
(Xlas_train, Xlas_test,
 ylas_train, ylas_test) = train_test_split(Xlas, ylas, test_size=0.4, random_state=42)

In [26]:
# Lasso regression with the estimator's default settings, fitted on the training split.
lasso_model = Lasso().fit(Xlas_train, ylas_train)

# Predictions for both splits so train and test scores can be compared.
ylas_train_pred = lasso_model.predict(Xlas_train)
ylas_pred = lasso_model.predict(Xlas_test)
ylas_pred

array([472359.82134921, 457345.80064354, 486275.30934641, ...,
       452823.54245317, 460401.27899587, 501478.43210016])

In [27]:
# Report R² (coefficient of determination) for the training and test splits.
print("r2 score with Lasso model")
for split_label, y_true, y_hat in (
    ("Accuracy on training data: ", ylas_train, ylas_train_pred),
    ("Accuracy on test data: ", ylas_test, ylas_pred),
):
    print(split_label, r2_score(y_true, y_hat))

r2 score with Lasso model
Accuracy on training data:  0.8063358708944527
Accuracy on test data:  0.5964010552344168


## BayesianRidge Model

In [None]:
# List the available columns before choosing the BayesianRidge features.
print(list(forsale_df.columns))

In [28]:
# Fresh copy of the listings for the BayesianRidge section, with the same
# feature selection as the Lasso section.
# NOTE(review): `featurs` is not referenced again in the visible cells; the
# following cell takes its matrix from `result` instead.
forsale_cp2_df_ml = forsale_df.copy()
featurs = forsale_cp2_df_ml.drop(columns=["price", "county", "address", "state"])

In [30]:
# `result` is not rebuilt in this section: it is the scaled one-hot matrix
# produced in the Lasso section, so those cells must have run first.
Xbay = result[:len(forsale_cp2_df_ml)]
ybay = forsale_cp2_df_ml["price"].to_numpy()

### Modeling and scores

In [34]:
# 60/40 train/test split with a fixed seed for reproducibility.
(Xbay_train, Xbay_test,
 ybay_train, ybay_test) = train_test_split(Xbay, ybay, test_size=0.4, random_state=42)

In [35]:
# Bayesian ridge regression with default settings, fitted on the training split.
bayesian = BayesianRidge().fit(Xbay_train, ybay_train)

# Predictions for both splits so train and test scores can be compared.
ybay_train_pred = bayesian.predict(Xbay_train)
ybay_pred = bayesian.predict(Xbay_test)

In [37]:
# Report R² (coefficient of determination) for the training and test splits.
print("r2 score with BayesianRidge model")
for split_label, y_true, y_hat in (
    ("Accuracy on training data: ", ybay_train, ybay_train_pred),
    ("Accuracy on test data: ", ybay_test, ybay_pred),
):
    print(split_label, r2_score(y_true, y_hat))

r2 score with BayesianRidge model
Accuracy on training data:  0.7993297959962544
Accuracy on test data:  0.5481081631462773


### Final predictions (Lasso model)

In [63]:
# Predict a price for every listing (training and test rows alike) with the
# fitted Lasso model and wrap the predictions in a one-column frame.
final_labels = lasso_model.predict(Xlas)
final_result = pd.Series(final_labels, name='price').to_frame()

In [64]:
# Attach the Lasso predictions to the modelling frame (assignment aligns on the index).
# NOTE(review): this mutates forsale_df_ml in place, so `price` is no longer
# its last column afterwards. Keep that in mind when re-running earlier cells.
forsale_df_ml["predicted price"] = final_result['price']

In [65]:
# Compare the mean listed price with the mean predicted price.
price_avg = forsale_df_ml["price"].mean()
pred_price_avg = forsale_df_ml["predicted price"].mean()

for caption, avg in (
    ("True prices average:", price_avg),
    ("Predicted prices average:", pred_price_avg),
):
    print(caption, round(avg, 2))

True prices average: 512999.26
Predicted prices average: 511457.79
