## As we want to use sklearn for an end-to-end process, we will create our own custom transformers for preprocessing

In [47]:
#sklearn libraries
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin, BaseEstimator

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

import pickle

In [2]:
# Load the raw dataset; the path is relative to the notebook's working directory.
raw_df = pd.read_csv("../data/raw_df.csv")
# Preview the first three rows.
raw_df.head(3)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,...,year_completed,residential,commercial,market_hawker,miscellaneous,multistorey_carpark,precinct_pavilion,total_dwelling_units,latitude,longitude
0,2015-01,ANG MO KIO,3 ROOM,174,ANG MO KIO AVE 4,07 TO 09,60.0,Improved,1986,70,...,1980.0,Y,N,N,N,N,N,198.0,1.375298,103.837357
1,2015-01,ANG MO KIO,3 ROOM,541,ANG MO KIO AVE 10,01 TO 03,68.0,New Generation,1981,65,...,1979.0,Y,N,N,Y,N,N,191.0,1.37412,103.855498
2,2015-01,ANG MO KIO,3 ROOM,163,ANG MO KIO AVE 4,01 TO 03,69.0,New Generation,1980,64,...,1981.0,Y,Y,N,N,N,N,84.0,1.373846,103.838589


In [3]:
# Count how many rows carry each flat_model value.
raw_df['flat_model'].value_counts()

Model A                   62910
Improved                  51721
New Generation            30668
Premium Apartment         21036
Simplified                 9442
Apartment                  8144
Standard                   6554
Maisonette                 5915
Model A2                   2717
DBSS                       2438
Model A-Maisonette          349
Adjoined flat               348
Type S1                     342
Type S2                     173
Terrace                     128
Multi Generation             84
Premium Apartment Loft       77
Improved-Maisonette          28
Premium Maisonette           17
2-room                       14
Name: flat_model, dtype: int64

In [4]:
# Summarize column names, non-null counts and dtypes of the raw frame.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 203105 entries, 0 to 203104
Data columns (total 25 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   month                 203105 non-null  object 
 1   town                  203105 non-null  object 
 2   flat_type             203105 non-null  object 
 3   block                 203105 non-null  object 
 4   street_name           203105 non-null  object 
 5   storey_range          203105 non-null  object 
 6   floor_area_sqm        203105 non-null  float64
 7   flat_model            203105 non-null  object 
 8   lease_commence_date   203105 non-null  int64  
 9   remaining_lease       203105 non-null  object 
 10  resale_price          203105 non-null  float64
 11  address               203105 non-null  object 
 12  blk_no                203105 non-null  object 
 13  street                203105 non-null  object 
 14  max_floor_lvl         203105 non-null  float64
 15  

## Flat_type category

In [5]:
# List the distinct flat_type values present in the raw data.
display(raw_df['flat_type'].unique())
class FlatTypeToCat(TransformerMixin, BaseEstimator):
    """Row filter that keeps only the flat types of interest.

    Parameters
    ----------
    flat_type_col : str, default 'flat_type'
        Name of the column holding the flat type label.
    """

    def __init__(self, flat_type_col='flat_type'):
        self.flat_type_col = flat_type_col
        self.interested_flat_types = ['3 ROOM', '4 ROOM', '5 ROOM','EXECUTIVE']

    def fit(self, df, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, df, y=None):
        """Return the rows of ``df`` whose flat type is in ``interested_flat_types``."""
        keep_mask = df[self.flat_type_col].isin(self.interested_flat_types)
        return df[keep_mask]

array(['3 ROOM', '4 ROOM', '5 ROOM', '2 ROOM', 'EXECUTIVE', '1 ROOM',
       'MULTI-GENERATION'], dtype=object)

## Changing sale_date to sale_year, sale_month

In [6]:
class SaleDate2Yearmonth(TransformerMixin, BaseEstimator):
    """Replace the sale-date column with numeric ``sale_month`` and ``sale_year`` columns.

    Parameters
    ----------
    sale_date_col : str, default 'month'
        Name of the column holding the sale date; it is parsed with
        ``pd.to_datetime`` and removed from the output.
    """

    def __init__(self, sale_date_col='month'):
        self.sale_date_col = sale_date_col

    def fit(self, df, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, df, y=None):
        """Return a new frame with the date split into month and year parts."""
        parsed_dates = pd.to_datetime(df[self.sale_date_col])
        # ``assign`` works on a copy, so the caller's frame is left untouched.
        with_date_parts = df.assign(
            sale_month=parsed_dates.dt.month,
            sale_year=parsed_dates.dt.year,
        )
        return with_date_parts.drop(columns=self.sale_date_col)

## Categorize storey_range

In [7]:
# Show the distinct storey_range labels, then how many rows fall under each label.
display(raw_df['storey_range'].unique())
raw_df['storey_range'].value_counts()

array(['07 TO 09', '01 TO 03', '13 TO 15', '10 TO 12', '04 TO 06',
       '19 TO 21', '16 TO 18', '22 TO 24', '25 TO 27', '28 TO 30',
       '34 TO 36', '46 TO 48', '31 TO 33', '37 TO 39', '43 TO 45',
       '40 TO 42', '49 TO 51', '06 TO 10', '01 TO 05', '11 TO 15',
       '16 TO 20', '21 TO 25', '26 TO 30', '36 TO 40', '31 TO 35'],
      dtype=object)

04 TO 06    46754
07 TO 09    41883
01 TO 03    37021
10 TO 12    36614
13 TO 15    17344
16 TO 18     7668
19 TO 21     3284
01 TO 05     2699
06 TO 10     2474
22 TO 24     2325
25 TO 27     1270
11 TO 15     1259
28 TO 30      800
31 TO 33      364
34 TO 36      355
37 TO 39      334
16 TO 20      265
40 TO 42      166
21 TO 25       92
43 TO 45       39
26 TO 30       39
46 TO 48       35
49 TO 51       12
36 TO 40        7
31 TO 35        2
Name: storey_range, dtype: int64

In [8]:
class StoreyRangeCat(TransformerMixin, BaseEstimator):
    """Row filter that keeps only the storey ranges of interest.

    Parameters
    ----------
    storey_range_col : str, default 'storey_range'
        Name of the column holding the storey range label (e.g. "01 TO 03").
    """

    def __init__(self, storey_range_col='storey_range'):
        self.interested_stories=["01 TO 03", "04 TO 06", "07 TO 09", "10 TO 12", "13 TO 15","16 TO 18"]
        self.storey_range_col = storey_range_col

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, df, y=None):
        """Return the rows of ``df`` whose storey range is in ``interested_stories``."""
        keep_mask = df[self.storey_range_col].isin(self.interested_stories)
        return df[keep_mask]

## remaining lease to remaining lease months

In [9]:
class RemainingLease(TransformerMixin, BaseEstimator):
    """Convert a textual remaining-lease column into an integer number of months.

    Accepted cell formats are a bare number of years ("70") or a phrase built
    from "<n> year(s)" and/or "<n> month(s)" ("61 years 04 months").

    Parameters
    ----------
    remaining_lease_col : str, default 'remaining_lease'
        Name of the column to convert. It is removed from the output and
        replaced by ``remaining_lease_months``.
    """

    def __init__(self, remaining_lease_col='remaining_lease'):
        self.remaining_lease_col = remaining_lease_col

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def __years_months_to_months(self, year_month_list):
        """Turn the tokens of one split lease string into a month count.

        Returns ``np.nan`` when the tokens cannot be interpreted, so the caller
        can detect and report bad values in one place.
        """
        try:
            # The capture-group split leaves None / '' placeholders behind.
            tokens = [item for item in year_month_list if item]
        except TypeError:
            # Not iterable (e.g. a float NaN): nothing to parse.
            return np.nan

        if len(tokens) == 1:
            # A bare number is a count of whole years.
            try:
                return int(tokens[0]) * 12
            except (TypeError, ValueError):
                return np.nan

        total_months = 0
        unit_table = ((("years", "year"), 12), (("months", "month"), 1))
        for unit_names, months_per_unit in unit_table:
            unit_positions = [i for i, token in enumerate(tokens) if token in unit_names]
            if not unit_positions:
                continue
            # The amount is the token immediately before the unit word; looking
            # it up by position tolerates trailing text after the unit.
            amount_position = unit_positions[0] - 1
            if amount_position < 0:
                return np.nan
            try:
                total_months += int(tokens[amount_position]) * months_per_unit
            except (TypeError, ValueError):
                return np.nan
        return total_months

    def transform(self, df, y=None):
        """Return a copy of ``df`` with the lease column replaced by month counts.

        Raises
        ------
        ValueError
            If any value in the lease column cannot be parsed.
        """
        # Work on a copy: the incoming frame is usually a filtered slice and
        # must not gain a column as a side effect.
        df = df.copy()
        lease_tokens = df[self.remaining_lease_col].astype(str).str.split(r"(years*)|(months*)")
        lease_months = lease_tokens.map(self.__years_months_to_months)

        unparsed = lease_months.isna()
        if unparsed.any():
            examples = list(df.loc[unparsed, self.remaining_lease_col].unique()[:5])
            raise ValueError(
                f"Could not parse {int(unparsed.sum())} value(s) in column "
                f"'{self.remaining_lease_col}', e.g. {examples}"
            )

        df['remaining_lease_months'] = lease_months.astype(int)
        # Drop the configured column rather than a hard-coded name.
        return df.drop(self.remaining_lease_col, axis=1)
        

## Columns to drop

In [10]:
class DropColumns(TransformerMixin, BaseEstimator):
    """Remove a fixed set of columns that are not used as training features.

    Parameters
    ----------
    drop_columns : str or sequence of str
        Column label(s) to remove. The default is an immutable tuple so that
        instances never share (and cannot accidentally mutate) one default list.
    """

    def __init__(self, drop_columns=('block', "street_name", "lease_commence_date", "address",
                                     "blk_no", "street", "year_completed", "total_dwelling_units")):
        self.drop_columns = drop_columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, df, y=None):
        """Return ``df`` without the configured columns.

        Raises ``KeyError`` (from pandas) if a configured column is absent.
        """
        columns = self.drop_columns
        # A lone string is one label, not a sequence of characters; a tuple must
        # become a list because pandas treats a tuple as a single label.
        columns = [columns] if isinstance(columns, str) else list(columns)
        return df.drop(columns=columns)

## Full_preprocessing_pipeline

In [11]:
# Chain the custom transformers in the order they are applied to the raw frame.
preprocessing_steps = [
    ('cvt_flat_type', FlatTypeToCat(flat_type_col="flat_type")),
    ('cvt_sale_date', SaleDate2Yearmonth(sale_date_col="month")),
    ('cvt_storey_range', StoreyRangeCat(storey_range_col='storey_range')),
    ("cvt_remaining_lease", RemainingLease(remaining_lease_col='remaining_lease')),
    ("drop_columns", DropColumns()),
]
preprocessing_pipeline = Pipeline(steps=preprocessing_steps)

processed_df = preprocessing_pipeline.fit_transform(raw_df)

# removing noisy data for training purposes: keep rows whose remaining lease
# lies strictly between 5 and 1187 months
lease_months = processed_df["remaining_lease_months"]
processed_df = processed_df[(lease_months > 5) & (lease_months < 1187)]
processed_df

Unnamed: 0,town,flat_type,storey_range,floor_area_sqm,flat_model,resale_price,max_floor_lvl,residential,commercial,market_hawker,miscellaneous,multistorey_carpark,precinct_pavilion,latitude,longitude,sale_month,sale_year,remaining_lease_months
0,ANG MO KIO,3 ROOM,07 TO 09,60.0,Improved,255000.0,11.0,Y,N,N,N,N,N,1.375298,103.837357,1,2015,840
1,ANG MO KIO,3 ROOM,01 TO 03,68.0,New Generation,275000.0,8.0,Y,N,N,Y,N,N,1.374120,103.855498,1,2015,780
2,ANG MO KIO,3 ROOM,01 TO 03,69.0,New Generation,285000.0,4.0,Y,Y,N,N,N,N,1.373846,103.838589,1,2015,768
3,ANG MO KIO,3 ROOM,01 TO 03,68.0,New Generation,290000.0,4.0,Y,Y,N,N,N,N,1.367762,103.855301,1,2015,756
4,ANG MO KIO,3 ROOM,07 TO 09,68.0,New Generation,290000.0,13.0,Y,N,N,Y,N,N,1.371618,103.857786,1,2015,768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203100,YISHUN,5 ROOM,10 TO 12,122.0,Improved,580000.0,12.0,Y,N,N,N,N,N,1.411716,103.833275,12,2014,876
203101,YISHUN,EXECUTIVE,10 TO 12,146.0,Maisonette,540000.0,13.0,Y,N,N,Y,N,N,1.429345,103.842030,12,2014,876
203102,YISHUN,EXECUTIVE,07 TO 09,164.0,Apartment,738000.0,13.0,Y,Y,N,N,N,N,1.418769,103.835642,12,2014,924
203103,YISHUN,EXECUTIVE,07 TO 09,152.0,Maisonette,592000.0,11.0,Y,N,N,Y,N,N,1.437902,103.836960,12,2014,840


## Exporting data for training

In [84]:
# Persist the training-ready frame; with the default arguments the DataFrame
# index is written as the first CSV column.
processed_df.to_csv("../data/processed_df.csv")

## Exporting preprocessing artifacts

Reloading to check if the artifacts are saved correctly

In [85]:
# Serialize the fitted preprocessing pipeline using the newest pickle protocol.
# NOTE(review): the custom transformer classes are defined in this notebook and
# pickle stores classes by reference, so whatever loads this file presumably
# needs the same class definitions available — confirm against the consumer.
with open("../artifacts/preprocessing_pipeline.pkl", "wb") as pklfile:
    pickle.dump(preprocessing_pipeline,pklfile,protocol=pickle.HIGHEST_PROTOCOL)

In [87]:
# Reload the pickled pipeline and run it on a fresh copy of the raw data to
# confirm the artifact round-trips. Only unpickle files produced by this
# project: loading a pickle can execute arbitrary code.
with open("../artifacts/preprocessing_pipeline.pkl", "rb") as pklfile:
    preprocessing_pipeline = pickle.load(pklfile)

raw_df = pd.read_csv("../data/raw_df.csv")

# Keep the transformed result under its own name so that ``raw_df`` still holds
# the raw columns (e.g. 'month', 'address') that later cells read.
reloaded_processed_df = preprocessing_pipeline.transform(raw_df)
reloaded_processed_df

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,town,flat_type,storey_range,floor_area_sqm,flat_model,resale_price,max_floor_lvl,residential,commercial,market_hawker,miscellaneous,multistorey_carpark,precinct_pavilion,latitude,longitude,sale_month,sale_year,remaining_lease_months
0,ANG MO KIO,3 ROOM,07 TO 09,60.0,Improved,255000.0,11.0,Y,N,N,N,N,N,1.375298,103.837357,1,2015,840
1,ANG MO KIO,3 ROOM,01 TO 03,68.0,New Generation,275000.0,8.0,Y,N,N,Y,N,N,1.374120,103.855498,1,2015,780
2,ANG MO KIO,3 ROOM,01 TO 03,69.0,New Generation,285000.0,4.0,Y,Y,N,N,N,N,1.373846,103.838589,1,2015,768
3,ANG MO KIO,3 ROOM,01 TO 03,68.0,New Generation,290000.0,4.0,Y,Y,N,N,N,N,1.367762,103.855301,1,2015,756
4,ANG MO KIO,3 ROOM,07 TO 09,68.0,New Generation,290000.0,13.0,Y,N,N,Y,N,N,1.371618,103.857786,1,2015,768
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203100,YISHUN,5 ROOM,10 TO 12,122.0,Improved,580000.0,12.0,Y,N,N,N,N,N,1.411716,103.833275,12,2014,876
203101,YISHUN,EXECUTIVE,10 TO 12,146.0,Maisonette,540000.0,13.0,Y,N,N,Y,N,N,1.429345,103.842030,12,2014,876
203102,YISHUN,EXECUTIVE,07 TO 09,164.0,Apartment,738000.0,13.0,Y,Y,N,N,N,N,1.418769,103.835642,12,2014,924
203103,YISHUN,EXECUTIVE,07 TO 09,152.0,Maisonette,592000.0,11.0,Y,N,N,Y,N,N,1.437902,103.836960,12,2014,840


## Generating prediction data 
For the final app, we will generate unique entries

In [122]:
# We retain more columns for the test data (address, blk_no, street) so that the
# UI can show more information; the target column resale_price is dropped instead.
class DropColumnsForTest(TransformerMixin, BaseEstimator):
    """Remove the columns that are not needed in the prediction (test) data.

    Parameters
    ----------
    drop_columns : str or sequence of str
        Column label(s) to remove. The default is an immutable tuple so that
        instances never share (and cannot accidentally mutate) one default list.
    """

    def __init__(self, drop_columns=('block', "street_name", "lease_commence_date", "year_completed",
                                     "total_dwelling_units", "resale_price")):
        self.drop_columns = drop_columns

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, df, y=None):
        """Return ``df`` without the configured columns.

        Raises ``KeyError`` (from pandas) if a configured column is absent.
        """
        columns = self.drop_columns
        # A lone string is one label, not a sequence of characters; a tuple must
        # become a list because pandas treats a tuple as a single label.
        columns = [columns] if isinstance(columns, str) else list(columns)
        return df.drop(columns=columns)

In [131]:
# Same steps as the training pipeline, except that the final step keeps the
# address columns and drops the target instead.
preprocessing_for_test_pipeline = Pipeline(steps=[
    ('cvt_flat_type',FlatTypeToCat(flat_type_col="flat_type")),
    ('cvt_sale_date',SaleDate2Yearmonth(sale_date_col="month")),
    ('cvt_storey_range',StoreyRangeCat(storey_range_col='storey_range')),
    ("cvt_remaining_lease", RemainingLease(remaining_lease_col='remaining_lease')),
    ("drop_columns", DropColumnsForTest())
])

# Transform the frame that was just loaded. The cell must not depend on
# whatever ``raw_df`` happens to hold at this point in the kernel session.
test_df = pd.read_csv("../data/raw_df.csv")
test_df = preprocessing_for_test_pipeline.fit_transform(test_df)

# removing noisy lease values, using the same bounds as for the training data
lease_months = test_df["remaining_lease_months"]
# ``.copy()`` makes the result an independent frame, since later cells assign
# columns on ``test_df`` in place.
test_df = test_df[(lease_months > 5) & (lease_months < 1187)].copy()
test_df.head()

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,town,flat_type,storey_range,floor_area_sqm,flat_model,address,blk_no,street,max_floor_lvl,residential,commercial,market_hawker,miscellaneous,multistorey_carpark,precinct_pavilion,latitude,longitude,sale_month,sale_year,remaining_lease_months
0,ANG MO KIO,3 ROOM,07 TO 09,60.0,Improved,174 ANG MO KIO AVE 4,174,ANG MO KIO AVE 4,11.0,Y,N,N,N,N,N,1.375298,103.837357,1,2015,840
1,ANG MO KIO,3 ROOM,01 TO 03,68.0,New Generation,541 ANG MO KIO AVE 10,541,ANG MO KIO AVE 10,8.0,Y,N,N,Y,N,N,1.37412,103.855498,1,2015,780
2,ANG MO KIO,3 ROOM,01 TO 03,69.0,New Generation,163 ANG MO KIO AVE 4,163,ANG MO KIO AVE 4,4.0,Y,Y,N,N,N,N,1.373846,103.838589,1,2015,768
3,ANG MO KIO,3 ROOM,01 TO 03,68.0,New Generation,446 ANG MO KIO AVE 10,446,ANG MO KIO AVE 10,4.0,Y,Y,N,N,N,N,1.367762,103.855301,1,2015,756
4,ANG MO KIO,3 ROOM,07 TO 09,68.0,New Generation,557 ANG MO KIO AVE 10,557,ANG MO KIO AVE 10,13.0,Y,N,N,Y,N,N,1.371618,103.857786,1,2015,768


In [132]:
# Summarize column names, non-null counts and dtypes of the prediction frame.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 184654 entries, 0 to 203104
Data columns (total 20 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   town                    184654 non-null  object 
 1   flat_type               184654 non-null  object 
 2   storey_range            184654 non-null  object 
 3   floor_area_sqm          184654 non-null  float64
 4   flat_model              184654 non-null  object 
 5   address                 184654 non-null  object 
 6   blk_no                  184654 non-null  object 
 7   street                  184654 non-null  object 
 8   max_floor_lvl           184654 non-null  float64
 9   residential             184654 non-null  object 
 10  commercial              184654 non-null  object 
 11  market_hawker           184654 non-null  object 
 12  miscellaneous           184654 non-null  object 
 13  multistorey_carpark     184654 non-null  object 
 14  precinct_pavilion   

## Adjusting remaining lease to prediction date

In [133]:
# Date for which predictions will be generated.
prediction_sale_year = 2022
prediction_sale_month = 6

# Months between each recorded sale and the prediction date; the remaining
# lease shrinks by exactly that amount.
months_since_sale = (
    (prediction_sale_year - test_df['sale_year']) * 12
    + (prediction_sale_month - test_df['sale_month'])
)
test_df['remaining_lease_months'] = test_df['remaining_lease_months'] - months_since_sale

# Every row now describes a hypothetical sale on the prediction date.
test_df['sale_year'] = prediction_sale_year
test_df['sale_month'] = prediction_sale_month

## Standardizing the remaining lease per address

In [142]:
def mode(df):
    """Return the most frequent ``remaining_lease_months`` value in ``df``."""
    lease_modes = df['remaining_lease_months'].mode()
    # ``Series.mode`` returns its values sorted, so label 0 is the smallest
    # value when several are tied.
    return lease_modes[0]
# Most frequent remaining_lease_months per address, as a Series indexed by address.
remaining_leasing_data = test_df[['address','remaining_lease_months']].groupby('address').apply(mode)

In [147]:
# Overwrite each row's remaining lease with the modal value of its address.
# Mapping the address column through the per-address Series does this in one
# vectorized pass instead of building a full-length boolean mask per address.
address_lease_months = test_df['address'].map(remaining_leasing_data)
# Rows whose address has no entry keep their current value, and the column
# keeps its original dtype.
test_df['remaining_lease_months'] = (
    address_lease_months
    .fillna(test_df['remaining_lease_months'])
    .astype(test_df['remaining_lease_months'].dtype)
)

In [148]:
# Display the adjusted prediction frame.
test_df

Unnamed: 0,town,flat_type,storey_range,floor_area_sqm,flat_model,address,blk_no,street,max_floor_lvl,residential,commercial,market_hawker,miscellaneous,multistorey_carpark,precinct_pavilion,latitude,longitude,sale_month,sale_year,remaining_lease_months
0,ANG MO KIO,3 ROOM,07 TO 09,60.0,Improved,174 ANG MO KIO AVE 4,174,ANG MO KIO AVE 4,11.0,Y,N,N,N,N,N,1.375298,103.837357,6,2022,751
1,ANG MO KIO,3 ROOM,01 TO 03,68.0,New Generation,541 ANG MO KIO AVE 10,541,ANG MO KIO AVE 10,8.0,Y,N,N,Y,N,N,1.374120,103.855498,6,2022,691
2,ANG MO KIO,3 ROOM,01 TO 03,69.0,New Generation,163 ANG MO KIO AVE 4,163,ANG MO KIO AVE 4,4.0,Y,Y,N,N,N,N,1.373846,103.838589,6,2022,688
3,ANG MO KIO,3 ROOM,01 TO 03,68.0,New Generation,446 ANG MO KIO AVE 10,446,ANG MO KIO AVE 10,4.0,Y,Y,N,N,N,N,1.367762,103.855301,6,2022,674
4,ANG MO KIO,3 ROOM,07 TO 09,68.0,New Generation,557 ANG MO KIO AVE 10,557,ANG MO KIO AVE 10,13.0,Y,N,N,Y,N,N,1.371618,103.857786,6,2022,679
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203100,YISHUN,5 ROOM,10 TO 12,122.0,Improved,816 YISHUN ST 81,816,YISHUN ST 81,12.0,Y,N,N,N,N,N,1.411716,103.833275,6,2022,781
203101,YISHUN,EXECUTIVE,10 TO 12,146.0,Maisonette,325 YISHUN CTRL,325,YISHUN CTRL,13.0,Y,N,N,Y,N,N,1.429345,103.842030,6,2022,785
203102,YISHUN,EXECUTIVE,07 TO 09,164.0,Apartment,618 YISHUN RING RD,618,YISHUN RING RD,13.0,Y,Y,N,N,N,N,1.418769,103.835642,6,2022,828
203103,YISHUN,EXECUTIVE,07 TO 09,152.0,Maisonette,277 YISHUN ST 22,277,YISHUN ST 22,11.0,Y,N,N,Y,N,N,1.437902,103.836960,6,2022,746


In [156]:
def aggreate_unique_block(df):
    """Collapse the rows of one group into a single representative record.

    Categorical / discrete columns are summarized by their mode and continuous
    columns by their mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows of one group; must contain every column listed in ``mode_cols``
        and ``mean_cols`` below.

    Returns
    -------
    pandas.DataFrame
        A one-row frame holding the mode columns followed by the mean columns.
    """
    mode_cols = ['town','blk_no','street','max_floor_lvl','residential','commercial','market_hawker',
                 'miscellaneous', 'multistorey_carpark','precinct_pavilion','remaining_lease_months','sale_month','sale_year']
    mean_cols = ["floor_area_sqm", "latitude","longitude"]

    # ``DataFrame.mode`` returns one row per tied mode and pads the shorter
    # columns with NaN. Keep only the first row so that a tie does not produce
    # extra, partially empty records.
    modes_row = df[mode_cols].mode().head(1)
    means_row = df[mean_cols].mean().to_frame().T
    return pd.concat([modes_row, means_row], axis=1)
    

In [157]:
# Aggregate the rows of every (address, storey_range, flat_type, flat_model)
# combination with aggreate_unique_block.
unique_test_df = test_df.groupby(['address', 'storey_range','flat_type','flat_model']).apply(aggreate_unique_block)

In [182]:
# Turn the group keys back into columns, discard rows containing missing values
# and remove the leftover inner index level, then write the result to disk.
unique_test_df = (
    unique_test_df
    .reset_index()
    .dropna()
    .drop('level_4', axis=1)
)
unique_test_df.to_csv("../data/unique_block_test_df.csv")

In [183]:
# Display the aggregated prediction frame.
unique_test_df

Unnamed: 0,address,storey_range,flat_type,flat_model,town,blk_no,street,max_floor_lvl,residential,commercial,market_hawker,miscellaneous,multistorey_carpark,precinct_pavilion,remaining_lease_months,sale_month,sale_year,floor_area_sqm,latitude,longitude
0,1 BEACH RD,01 TO 03,3 ROOM,Improved,KALLANG/WHAMPOA,1,BEACH RD,16.0,Y,Y,N,N,N,N,675,6,2022,64.500000,1.303489,103.864529
1,1 BEACH RD,04 TO 06,3 ROOM,Improved,KALLANG/WHAMPOA,1,BEACH RD,16.0,Y,Y,N,N,N,N,675,6,2022,71.600000,1.303489,103.864529
2,1 BEACH RD,07 TO 09,3 ROOM,Improved,KALLANG/WHAMPOA,1,BEACH RD,16.0,Y,Y,N,N,N,N,675,6,2022,68.923077,1.303489,103.864529
3,1 BEACH RD,10 TO 12,3 ROOM,Improved,KALLANG/WHAMPOA,1,BEACH RD,16.0,Y,Y,N,N,N,N,675,6,2022,68.000000,1.303489,103.864529
4,1 BEACH RD,13 TO 15,3 ROOM,Improved,KALLANG/WHAMPOA,1,BEACH RD,16.0,Y,Y,N,N,N,N,675,6,2022,68.000000,1.303489,103.864529
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59925,9B BOON TIONG RD,04 TO 06,5 ROOM,Improved,BUKIT MERAH,9B,BOON TIONG RD,40.0,Y,N,N,Y,N,N,1111,6,2022,112.000000,1.286639,103.829010
59926,9B BOON TIONG RD,07 TO 09,4 ROOM,Model A,BUKIT MERAH,9B,BOON TIONG RD,40.0,Y,N,N,Y,N,N,1111,6,2022,92.000000,1.286639,103.829010
59927,9B BOON TIONG RD,07 TO 09,5 ROOM,Improved,BUKIT MERAH,9B,BOON TIONG RD,40.0,Y,N,N,Y,N,N,1111,6,2022,112.000000,1.286639,103.829010
59928,9B BOON TIONG RD,10 TO 12,5 ROOM,Improved,BUKIT MERAH,9B,BOON TIONG RD,40.0,Y,N,N,Y,N,N,1111,6,2022,112.000000,1.286639,103.829010


## Generating the unique block information

In [187]:
# One row per address, holding the block-level attributes and coordinates.
block_columns = ['address','town','blk_no','street','max_floor_lvl','latitude','longitude']
block_information = unique_test_df[block_columns].drop_duplicates("address")
block_information.to_csv("../data/block_information_w_coordinates.csv")

## Generating recent sale information

In [213]:


new_col_name =['address','sale_date','storey_range','flat_type','flat_model','floor_area_sqm','resale_price','remaining_lease']

# Read the raw data afresh: an earlier cell rebinds ``raw_df`` to pipeline
# output that no longer has the 'month' and 'address' columns selected below.
recent_sale_source_df = pd.read_csv("../data/raw_df.csv")

# ``.copy()`` so that renaming the columns acts on an independent frame rather
# than on a slice of the source.
recent_sale_df = recent_sale_source_df[['address','month','storey_range','flat_type','flat_model','floor_area_sqm','resale_price','remaining_lease']].copy()
recent_sale_df.columns = new_col_name

# Newest sale first within each address.
recent_sale_df = recent_sale_df.sort_values(['address',"sale_date", "storey_range","flat_type","flat_model"],
                                            ascending=[True,False,True,True,True]).reset_index(drop=True)

# A remaining lease made only of digits gets an explicit " years" suffix; the
# mask is computed once and reused for both the selection and the assignment.
digits_only = recent_sale_df['remaining_lease'].astype(str).str.isdigit()
recent_sale_df.loc[digits_only, "remaining_lease"] = recent_sale_df.loc[digits_only, 'remaining_lease'].astype(str) + " years"
recent_sale_df.to_csv("../data/recent_sale_df.csv")