In [1]:
#sklearn libraries
from sklearn.preprocessing import OneHotEncoder, StandardScaler, PolynomialFeatures
from sklearn.model_selection import cross_val_score, train_test_split, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

#models
from lightgbm import LGBMRegressor

#pandas
import pandas as pd
import numpy as np

#optimization
import optuna

#plotting lib
import plotly.express as px

#python libraries
import pickle
import json
import glob

In [2]:
class CONFIG:
    """Column-name groups that together form the model input.

    The two lists are concatenated (numeric first, then categorical) when the
    feature frame is sliced out of the test data.
    """

    # Columns grouped as numeric features.
    numeric_features = [
        "max_floor_lvl",
        "floor_area_sqm",
        "sale_year",
        "remaining_lease_months",
        "latitude",
        "longitude",
    ]

    # Columns grouped as categorical features.
    categorical_features = [
        'town',
        'flat_type',
        'flat_model',
        'storey_range',
        'residential',
        'commercial',
        'market_hawker',
        'miscellaneous',
        'multistorey_carpark',
        'precinct_pavilion',
        "sale_month",
    ]

In [3]:
# Collect the pickled per-fold model pipelines.
# glob.glob returns matches in arbitrary, OS-dependent order, so the paths are
# sorted to keep the fold order stable between runs and machines.
# Note: the sort is lexicographic, so "fold_10" would sort before "fold_2".
model_fps = sorted(glob.glob("../artifacts/model_pipeline*.pkl"))
model_fps

['../artifacts\\model_pipeline_fold_0.pkl',
 '../artifacts\\model_pipeline_fold_1.pkl',
 '../artifacts\\model_pipeline_fold_2.pkl',
 '../artifacts\\model_pipeline_fold_3.pkl',
 '../artifacts\\model_pipeline_fold_4.pkl',
 '../artifacts\\model_pipeline_fold_5.pkl',
 '../artifacts\\model_pipeline_fold_6.pkl',
 '../artifacts\\model_pipeline_fold_7.pkl',
 '../artifacts\\model_pipeline_fold_8.pkl',
 '../artifacts\\model_pipeline_fold_9.pkl']

In [4]:
# Load the test set; the first CSV column becomes the DataFrame index.
test_data = pd.read_csv(
    "../data/unique_block_test_df.csv",
    index_col=0,
)
# Select only the model input columns: numeric features first, then categorical.
test_features = test_data.loc[:, CONFIG.numeric_features + CONFIG.categorical_features]
test_features

Unnamed: 0,max_floor_lvl,floor_area_sqm,sale_year,remaining_lease_months,latitude,longitude,town,flat_type,flat_model,storey_range,residential,commercial,market_hawker,miscellaneous,multistorey_carpark,precinct_pavilion,sale_month
0,16.0,64.500000,2022,675,1.303489,103.864529,KALLANG/WHAMPOA,3 ROOM,Improved,01 TO 03,Y,Y,N,N,N,N,6
1,16.0,71.600000,2022,675,1.303489,103.864529,KALLANG/WHAMPOA,3 ROOM,Improved,04 TO 06,Y,Y,N,N,N,N,6
2,16.0,68.923077,2022,675,1.303489,103.864529,KALLANG/WHAMPOA,3 ROOM,Improved,07 TO 09,Y,Y,N,N,N,N,6
3,16.0,68.000000,2022,675,1.303489,103.864529,KALLANG/WHAMPOA,3 ROOM,Improved,10 TO 12,Y,Y,N,N,N,N,6
4,16.0,68.000000,2022,675,1.303489,103.864529,KALLANG/WHAMPOA,3 ROOM,Improved,13 TO 15,Y,Y,N,N,N,N,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59925,40.0,112.000000,2022,1111,1.286639,103.829010,BUKIT MERAH,5 ROOM,Improved,04 TO 06,Y,N,N,Y,N,N,6
59926,40.0,92.000000,2022,1111,1.286639,103.829010,BUKIT MERAH,4 ROOM,Model A,07 TO 09,Y,N,N,Y,N,N,6
59927,40.0,112.000000,2022,1111,1.286639,103.829010,BUKIT MERAH,5 ROOM,Improved,07 TO 09,Y,N,N,Y,N,N,6
59928,40.0,112.000000,2022,1111,1.286639,103.829010,BUKIT MERAH,5 ROOM,Improved,10 TO 12,Y,N,N,Y,N,N,6


In [8]:
def get_prediction(model_fp, features):
    """Load a pickled model from disk and return its predictions.

    Parameters
    ----------
    model_fp : path to a pickle file; the unpickled object must expose a
        ``predict`` method.
    features : input passed unchanged to ``predict``.

    Returns
    -------
    Whatever ``predict(features)`` returns.

    NOTE(review): ``pickle.load`` can execute arbitrary code while
    unpickling, so only point this at trusted artifact files.
    """
    # The file only needs to stay open while unpickling.
    with open(model_fp, "rb") as handle:
        fitted_pipeline = pickle.load(handle)

    return fitted_pipeline.predict(features)

In [10]:
# Run every saved model on the same test features and keep one result per
# model file, in the order the files appear in model_fps.
all_preds = []
for num_fold, model_fp in enumerate(model_fps):
    print("Predicting for fold: ", num_fold)
    fold_preds = get_prediction(model_fp, test_features)
    all_preds.append(fold_preds)

Predicting for fold:  0
Predicting for fold:  1
Predicting for fold:  2
Predicting for fold:  3
Predicting for fold:  4
Predicting for fold:  5
Predicting for fold:  6
Predicting for fold:  7
Predicting for fold:  8
Predicting for fold:  9


In [16]:
# Fail loudly when no fold produced predictions: np.mean over an empty list
# returns NaN (emitting only a RuntimeWarning), which would otherwise be
# stored as an all-NaN prediction column.
if len(all_preds) == 0:
    raise ValueError("all_preds is empty: no model pipelines were found or run")

# Average the per-fold predictions element-wise (axis 0 runs over the folds).
mean_pred = np.mean(all_preds, axis=0)
test_data['predicted_resale_price'] = mean_pred

In [17]:
# Write the test data, including the added prediction column, to disk.
# index=False leaves the DataFrame index out of the written file.
test_data.to_csv(
    "../data/predicted_resale_price.csv",
    index=False,
)