In [12]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.preprocessing import PolynomialFeatures
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import LinearSVC

In [13]:
### Preprocessing
# I'm reading the 
df1 = pd.read_csv("F20 P1-Table 1.csv", na_values=['nan'])

### Deleting yellow columns
df1 = df1.drop(['Status', 'For Tax Year', 'Gross Taxes', 'Original Price', 'List Price', 'GST Incl'], axis=1)

### NaN's replacing with 0's
df1.replace('nan', np.nan, inplace=True)
df1.fillna(0, inplace=True)

df1.head()

Unnamed: 0,Address,distance,acessibility,S/A,Price,Sold Date,Days On Market,Age,Area,Total Bedrooms,...,Zoning,Parking Places - Covered,# Rms,No. Floor Levels,Frontage - Feet,Depth,Type,Unnamed: 30,Unnamed: 31,Public Remarks
0,12821 114 AVENUE,0.0,0.0,F21,"$610,000",9/12/20,40,999,North Surrey,3,...,RF,0.0,6,2,50.0,100.0,House/Single Family,0.0,0.0,Investor's alert. 3 bedroom tenanted home with...
1,12860 114A AVENUE,0.0,0.0,F21,"$675,000",11/15/20,18,999,North Surrey,1,...,RS1,0.0,5,1,50.0,99.9,House/Single Family,0.0,0.0,"WHY RENT? Apartment size, 1 bedroom, modern, e..."
2,12646 113 AVENUE,0.0,0.0,F21,"$690,000",7/3/20,14,64,North Surrey,2,...,SFR,1.0,6,1,0.0,0.0,House/Single Family,0.0,0.0,INVESTORS and FIRST TIME HOME BUYERS ALERT! 2 ...
3,10669 160 STREET,0.0,0.0,F23,"$700,000",8/14/20,71,62,North Surrey,3,...,RA,0.0,11,2,61.22,134.81,House/Single Family,0.0,0.0,**LARGE 8255 sqft LOT****PERFECT FOR INVESTORS...
4,11457 125A STREET,0.0,0.0,F21,"$700,000",11/4/20,28,70,North Surrey,2,...,RF,0.0,6,1,61.5,120.0,House/Single Family,0.0,0.0,Tastefully renovated 2 bed 1bath house with de...


In [14]:
y = df1['Price']
df1 = df1.drop(['Price', 'Unnamed: 30', 'Unnamed: 31', 'Public Remarks', 'Sold Date', 'distance', 'acessibility', 'Area'], axis=1)

y = y.str.replace('$','')
y = y.str.replace(',','')
y = y.astype(float)

In [15]:
X1 = pd.get_dummies(df1['Address'])
X2 = pd.get_dummies(df1['S/A'])
X3 = pd.get_dummies(df1['Driveway Finish'])
X4 = pd.get_dummies(df1['Foundation'])
X5 = pd.get_dummies(df1['Zoning'])
X6 = pd.get_dummies(df1['Type'])

X_train_1 = np.concatenate((X1, X2, X3, X4, X5, X6), axis = 1)

df = df1.drop(['Address', 'S/A', 'Driveway Finish', 'Foundation', 'Zoning', 'Type', 'Depth', 'Frontage - Feet'], axis=1) # Depth and Frontage feet ???

df['Lot Sz (Sq.Ft.)'] = df['Lot Sz (Sq.Ft.)'].str.replace(',','').astype(float)
df['Floor Area -Grand Total'] = df['Floor Area -Grand Total'].str.replace(',','').astype(float)
df['Floor Area - Unfinished'] = df['Floor Area - Unfinished'].str.replace(',','').astype(float)
df['Floor Area Fin - Basement'] = df['Floor Area Fin - Basement'].str.replace(',','').astype(float)

df

poly = PolynomialFeatures(2)
X_train_2 = poly.fit_transform(df)

In [16]:
X_train = np.concatenate((X_train_1, X_train_2), axis = 1)

X_train.shape

(558, 700)

In [17]:
### Feature selection
lsvc = LinearSVC(penalty="l1", max_iter=5000, dual=False).fit(X_train, y)
model = SelectFromModel(lsvc, prefit=True, threshold=1e-27, max_features=500)
X_train = model.transform(X_train)



In [18]:
### Model
estimators = [
    ('lr1', RidgeCV()),
    ('lr2', RandomForestRegressor(n_estimators=10, random_state=42)),
    ('lr3', xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, gamma=0, subsample=0.75, colsample_bytree=1, max_depth=7)),
    ('lr4', DecisionTreeRegressor(random_state=0))
    ]
xgb_r = StackingRegressor(
    estimators=estimators,
    final_estimator=Ridge(311)
)

In [19]:
print('started fitting')
xgb_r.fit(X_train, y)

started fitting


StackingRegressor(estimators=[('lr1',
                               RidgeCV(alphas=array([ 0.1,  1. , 10. ]))),
                              ('lr2',
                               RandomForestRegressor(n_estimators=10,
                                                     random_state=42)),
                              ('lr3',
                               XGBRegressor(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=1, gamma=0,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=0.1,
                                            max_delta_..., max_depth=7,
                                            min_child_weight=None, missing=nan,
                

In [20]:
print('model fitted')
y_train_pred = xgb_r.predict(X_train)

from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y, y_train_pred))

model fitted




52237.24340753602

In [21]:
y_train = np.array(y)
y_final = np.abs((y_train_pred - y_train)/y_train_pred * 100)

final_matrix = [0, 0, 0, 0]
for i in y_final:
    if i < 2:
        final_matrix[0]+=1
    elif i < 3:
        final_matrix[1]+=1
    elif i < 5:
        final_matrix[2]+=1
    else:
        final_matrix[3]+=1
        
for i in range(4):
    final_matrix[i] = final_matrix[i]/len(y_final)

In [22]:
# results for 2-nd order polynomial, 50000 iterations
print(final_matrix)

np.sum(final_matrix[:3])

[0.25627240143369173, 0.17921146953405018, 0.3727598566308244, 0.1917562724014337]


0.8082437275985663

In [13]:
# results for 2-nd order polynomial, 50000 iterations, 13mins

print(final_matrix)

np.sum(final_matrix[:3])

[0.4121863799283154, 0.1935483870967742, 0.2132616487455197, 0.18100358422939067]


0.8189964157706093

In [13]:
# results for 2-nd order polynomial, 35000 iterations, 10mins? 

print(final_matrix)

np.sum(final_matrix[:3])

[0.023297491039426525, 0.008960573476702509, 0.021505376344086023, 0.946236559139785]


0.053763440860215055

In [14]:
# results for 2-nd order polynomial, 20000 iterations, 6mins

print(final_matrix)

np.sum(final_matrix[:3])

[0.26344086021505375, 0.18996415770609318, 0.26344086021505375, 0.2831541218637993]


0.7168458781362007

In [21]:
# results for 2-nd order polynomial, 1000 iterations, 20sec

print(final_matrix)

np.sum(final_matrix[:3])

[0.023297491039426525, 0.008960573476702509, 0.021505376344086023, 0.946236559139785]


0.053763440860215055