In [1]:
import numpy as np
import pandas as pd

In [29]:
# loading the data set
train = pd.read_csv('train.csv')

# Feature engineering

## flat_type

In [30]:
train.loc[train['flat_type'] == "1-room", 'flat_type'] = "1 room"
train.loc[train['flat_type'] == "2-room", 'flat_type'] = "2 room"
train.loc[train['flat_type'] == "3-room", 'flat_type'] = "3 room"
train.loc[train['flat_type'] == "4-room", 'flat_type'] = "4 room"
train.loc[train['flat_type'] == "5-room", 'flat_type'] = "5 room"

# checking
flat_type = np.unique(train.flat_type) 
flat_type

array(['1 room', '2 room', '3 room', '4 room', '5 room', 'executive',
       'multi generation'], dtype=object)

## block 

In [31]:
# converting the block column to 1 if it has the number 4
# converting the block column to 0 if it does not have the number 4
train.loc[train['block'].str.contains('4'),'block'] = 1
train.loc[train['block'].str.contains('4') == False, 'block'] = 0

np.unique(train.block)

array([0, 1], dtype=object)

# storey_range

In [32]:
# convert to 01 to 06, 06 to 10, 10 to 15, 16 to 21, 21 to 25, 25 to 30, 
# 31 to 36, 36 to 40, 40 to 45, 46 to 51
# data is messy as it has lots of overlaps, so the partioning is to make
# it more systematic
# 01 to 06
train.loc[train['storey_range'] == "01 to 03", 'storey_range'] = "01 to 06"
train.loc[train['storey_range'] == "01 to 05", 'storey_range'] = "01 to 06"
train.loc[train['storey_range'] == "04 to 06", 'storey_range'] = "01 to 06"
# 06 to 10
train.loc[train['storey_range'] == "07 to 09", 'storey_range'] = "06 to 10"
# 10 to 15
train.loc[train['storey_range'] == "10 to 12", 'storey_range'] = "10 to 15"
train.loc[train['storey_range'] == "11 to 15", 'storey_range'] = "10 to 15"
train.loc[train['storey_range'] == "13 to 15", 'storey_range'] = "10 to 15"
# 16 to 21
train.loc[train['storey_range'] == "16 to 18", 'storey_range'] = "16 to 21"
train.loc[train['storey_range'] == "16 to 20", 'storey_range'] = "16 to 21"
train.loc[train['storey_range'] == "19 to 21", 'storey_range'] = "16 to 21"
# 21 to 25
train.loc[train['storey_range'] == "22 to 24", 'storey_range'] = "21 to 25"
# 25 to 30
train.loc[train['storey_range'] == "25 to 27", 'storey_range'] = "25 to 30"
train.loc[train['storey_range'] == "26 to 30", 'storey_range'] = "25 to 30"
train.loc[train['storey_range'] == "28 to 30", 'storey_range'] = "25 to 30"
# 31 to 36
train.loc[train['storey_range'] == "31 to 33", 'storey_range'] = "31 to 36"
train.loc[train['storey_range'] == "31 to 35", 'storey_range'] = "31 to 36"
train.loc[train['storey_range'] == "34 to 36", 'storey_range'] = "31 to 36"
# 36 to 40
train.loc[train['storey_range'] == "37 to 39", 'storey_range'] = "36 to 40"
# 40 to 45
train.loc[train['storey_range'] == "40 to 42", 'storey_range'] = "40 to 45"
train.loc[train['storey_range'] == "43 to 45", 'storey_range'] = "40 to 45"
# 46 to 51
train.loc[train['storey_range'] == "46 to 48", 'storey_range'] = "46 to 51"
train.loc[train['storey_range'] == "49 to 51", 'storey_range'] = "46 to 51"

# checking
np.unique(train.storey_range)

array(['01 to 06', '06 to 10', '10 to 15', '16 to 21', '21 to 25',
       '25 to 30', '31 to 36', '36 to 40', '40 to 45', '46 to 51'],
      dtype=object)

## Auxiliary- demographic

In [33]:
population_demo = pd.read_csv('auxiliary-data/sg-population-demographics.csv')

# population count across age in a particular subzone
dicts = {}
for area in np.unique(population_demo.subzone):
    area_count = population_demo[population_demo['subzone'] == area]['count'].sum()
    dicts[area] = area_count
train['popcount_subzone'] = train['subzone'].map(dicts)

# 490 was derived from central subzone in the population demographics
# dataset. However, there is no such subzone in the main dataset. After
# verifying it, central subzone is inferred to be 'city hall' in main 
# data set (beach road area)
train.loc[train['subzone'] == "city hall", 'popcount_subzone'] = 490

## street_name, subzone, flat_model, region, planning_area, & town

street_name, subzone and region remain as it is (raw).

Similiarly, I think no preprocessing is necessary for flat_model as these different models might have some impact on the resale price. For example maisonette is typically rare and so it might fetch a high price etc. 

The variable "town" is like a subset of the variable "planning_area". Two elements from the "town" set could be represented by either the planning area variable or other variables. More sepcifically, "Kallang/Whampoa" from the town set is essentially the same as "Kallang" from the planning area set. Also, "central area" from the town set could be captured by the variable "region". Thus, given these reasons, we have decided to drop "town" and just use planning_area.

The location variables can be seen as a hierarchy. It could be arranged from the bigger to smaller set - 1) region, 2) planning_area, 3) subzone, and finally 4) street_name. "region" covers a large portion of non-overlapping areas of Singapore, while street_name is on a smaller scale that is specific to a particular place. This hierachy could aid in the prediction as bigger umbrella sets (region & planning_area) tend to be more global and coarse while smaller specific sets (subzone & street_name) tend to be more local and have finer distinctions between locations. 

# One-hot encoding on the categorical columns before analysis

In [34]:
# implement one-hot encoding on categorical columns
# do note that pd.get_dummies drop the original variable column by
# default. Also note that "block" does not have to be one-hot encoded
# because it is a binary variable already.
# Also 'month' as Fiona mentioned, will be broken down to month and year
# so the month portion of 'month' should be categorical also. I have not
# included in, so please do so. 
categorical_cols = ['flat_type', 'street_name', 'storey_range', 
                    'flat_model', 'subzone','planning_area', 'region']
train_dummies = pd.get_dummies(train, columns = categorical_cols)

# selecting the data (note that 'month' is dropped as Fiona is doing)
train_y = train['resale_price']
train_final = train_dummies.drop(columns = ['town', 'eco_category', 'month',
                                            'elevation', 'resale_price','lease_commence_date'])

In [35]:
print('There are', len(train_final.columns), 'columns after prerpocessing.')
print('The total number of observations is', train_final.shape[0])

There are 1337 columns after prerpocessing.
The total number of observations is 431732


In [36]:
# saving the column names to csv
columns_gerard = train_final.columns
columns_gerard = pd.DataFrame(columns_gerard)
columns_gerard.to_csv('columns_gerard.csv',index=False, header=False)