In [142]:
import pandas as pd
import scipy as sc
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import scale
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import r2_score

#!pip install plotly
#!pip install -U kaleido

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import seaborn as sns
import matplotlib as mpl
from matplotlib import pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")


In [143]:
forsale_df=pd.read_csv('RealEstateNewYork_Clean.csv',sep=',',low_memory=False)
forsale_df.drop(forsale_df.filter(regex="Unname"),axis=1, inplace=True)

In [144]:
forsale_df

Unnamed: 0,price,beds,baths,garage,stories,house_type,lot_sqft,sqft,year_built,address,state,city,county
0,139900.0,3.0,2.0,0.0,2.0,single_family,12632.0,1190.0,1920,154 Maple Ave,NY,Delanson,Schenectady
1,395000.0,4.0,3.0,2.0,2.0,single_family,30056.0,2987.0,1982,1169 Hidden Valley Trl,NY,Webster,Monroe
2,185000.0,4.0,2.0,1.0,1.0,single_family,7501.0,1863.0,1965,7869 Oneida Trl,NY,Bridgeport,Onondaga
3,440000.0,4.0,3.0,2.0,2.0,single_family,17860.0,1940.0,1965,16 Brookland Farms Rd,NY,Poughkeepsie,Dutchess
4,975700.0,5.0,6.0,3.0,2.0,single_family,25544.0,5660.0,1999,7534 Plum Hollow Cir,NY,Liverpool,Onondaga
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7993,1049000.0,6.0,2.0,1.0,2.0,multi_family,2896.0,1725.0,1920,74-15 88th Ave,NY,Woodhaven,Queens
7994,1495000.0,1.0,1.0,0.0,16.0,coop,10454.0,1725.0,1925,45 5th Ave Apt 17C,NY,New York City,New York
7995,1395000.0,3.0,1.0,0.0,5.0,condos,10454.0,1725.0,1890,705 Carroll St Apt 4R,NY,New York City,Kings
7996,4500000.0,6.0,4.0,0.0,4.0,townhomes,10454.0,1725.0,1958,608 3rd St,NY,New York City,Kings


In [145]:
##dropping unwanted cols for the ml 
forsale_df_ml = forsale_df.copy()
forsale_df_ml.drop(["address","house_type","state","city","county"], axis=1, inplace=True)
repc=forsale_df_ml.pop('price')
forsale_df_ml['price']=repc
forsale_df_ml

Unnamed: 0,beds,baths,garage,stories,lot_sqft,sqft,year_built,price
0,3.0,2.0,0.0,2.0,12632.0,1190.0,1920,139900.0
1,4.0,3.0,2.0,2.0,30056.0,2987.0,1982,395000.0
2,4.0,2.0,1.0,1.0,7501.0,1863.0,1965,185000.0
3,4.0,3.0,2.0,2.0,17860.0,1940.0,1965,440000.0
4,5.0,6.0,3.0,2.0,25544.0,5660.0,1999,975700.0
...,...,...,...,...,...,...,...,...
7993,6.0,2.0,1.0,2.0,2896.0,1725.0,1920,1049000.0
7994,1.0,1.0,0.0,16.0,10454.0,1725.0,1925,1495000.0
7995,3.0,1.0,0.0,5.0,10454.0,1725.0,1890,1395000.0
7996,6.0,4.0,0.0,4.0,10454.0,1725.0,1958,4500000.0


In [146]:
repc

0        139900.0
1        395000.0
2        185000.0
3        440000.0
4        975700.0
          ...    
7993    1049000.0
7994    1495000.0
7995    1395000.0
7996    4500000.0
7997     570000.0
Name: price, Length: 7998, dtype: float64

In [147]:
X =forsale_df_ml.drop(['price'],axis=1)
y =forsale_df_ml['price']

In [148]:
X

Unnamed: 0,beds,baths,garage,stories,lot_sqft,sqft,year_built
0,3.0,2.0,0.0,2.0,12632.0,1190.0,1920
1,4.0,3.0,2.0,2.0,30056.0,2987.0,1982
2,4.0,2.0,1.0,1.0,7501.0,1863.0,1965
3,4.0,3.0,2.0,2.0,17860.0,1940.0,1965
4,5.0,6.0,3.0,2.0,25544.0,5660.0,1999
...,...,...,...,...,...,...,...
7993,6.0,2.0,1.0,2.0,2896.0,1725.0,1920
7994,1.0,1.0,0.0,16.0,10454.0,1725.0,1925
7995,3.0,1.0,0.0,5.0,10454.0,1725.0,1890
7996,6.0,4.0,0.0,4.0,10454.0,1725.0,1958


In [149]:
y

0        139900.0
1        395000.0
2        185000.0
3        440000.0
4        975700.0
          ...    
7993    1049000.0
7994    1495000.0
7995    1395000.0
7996    4500000.0
7997     570000.0
Name: price, Length: 7998, dtype: float64

In [150]:
X = X.apply(LabelEncoder().fit_transform)

In [151]:
cols=X.columns.tolist()
for i in range(0, len(cols)):
    X[cols[i]] = scale(X[cols[i]])

In [152]:
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42, test_size=0.4)

In [153]:
linear_model = LinearRegression()

linear_model.fit(X_train,y_train)

y_pred = linear_model.predict(X_test)
y_train_pred = linear_model.predict(X_train)

In [154]:
print("r2 score with LinearRegression model")
print("Accuracy on training data: " , r2_score(y_train, y_train_pred))
print("Accuracy on test data: ", r2_score(y_test, y_pred))

r2 score with LinearRegression model
Accuracy on training data:  0.27193085391651683
Accuracy on test data:  0.22952849396635933


In [155]:
print (forsale_df.columns.tolist())

['price', 'beds', 'baths', 'garage', 'stories', 'house_type', 'lot_sqft', 'sqft', 'year_built', 'address', 'state', 'city', 'county']


In [156]:
forsale_cp_df_ml=forsale_df.copy()
featurs =forsale_cp_df_ml.drop(["price","county","address","baths","stories"],axis=1)


In [157]:
##create dummies and combine 
combine_dummies = pd.get_dummies(featurs)
result = combine_dummies.values

In [158]:
scaler = StandardScaler()
result = scaler.fit_transform(result)

In [159]:
X= result[:forsale_cp_df_ml.shape[0]]
y=forsale_cp_df_ml['price'].values

In [160]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.4)

In [161]:
lasso_model= Lasso()

lasso_model.fit(X_train, y_train)

y_pred = lasso_model.predict(X_test)
y_train_pred = lasso_model.predict(X_train)

In [162]:
print("r2 score with Lasso model")
print("Accuracy on training data: " , r2_score(y_train, y_train_pred))
print("Accuracy on test data: ", r2_score(y_test, y_pred))

r2 score with Lasso model
Accuracy on training data:  0.6069217567996845
Accuracy on test data:  0.32367151133199235


In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = {
    'bootstrap': [True],
    'max_depth': [60, 70, 80, 90],
    'max_features': ['auto'],
    'min_samples_leaf': [2, 3, 4],
    'min_samples_split': [2, 4, 6],
    'n_estimators': [100, 144, 300, 600]
    }
rf = RandomForestRegressor()
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)
grid_search.fit(X_train,y_train)

Fitting 3 folds for each of 144 candidates, totalling 432 fits


In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [None]:
#rf_model=grid_search.best_estimator_
rf_model=RandomForestRegressor(bootstrap= True,max_depth=15,max_features=16,min_samples_leaf=2,min_samples_split= 2,n_estimators=144)
rf_model.fit(X_train,y_train)

y_pred = rf_model.predict(X_test)
y_train_pred = rf_model.predict(X_train)

print("r2 score with RendomForestReg model")
print("Accuracy on training data: " , r2_score(y_train, y_train_pred))
print("Accuracy on test data: ", r2_score(y_test, y_pred))