In [1]:
import pandas as pd
import numpy as np
import os, re, random
import warnings, json
warnings.filterwarnings('ignore')
from collections import defaultdict

# PLEASE change your data folder here manually if you need
current_folder = os.path.abspath(os.curdir)
root_folder = os.path.dirname(current_folder) 
data_folder = os.path.join(root_folder,'data')

%matplotlib inline

def set_seeds(seed):
    # for reproducibility
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    
set_seeds(1234)


In [2]:
train_df = pd.read_csv(os.path.join(data_folder,'train.csv'))
# test_df = pd.read_csv(os.path.join(data_folder,'test.csv'))
train_df.columns

Index(['month', 'town', 'flat_type', 'block', 'street_name', 'storey_range',
       'floor_area_sqm', 'flat_model', 'eco_category', 'lease_commence_date',
       'latitude', 'longitude', 'elevation', 'subzone', 'planning_area',
       'region', 'resale_price'],
      dtype='object')

In [3]:
auxiliary_paths = {
    'commercial':'sg-commerical-centres.csv',
    'hawker': 'sg-gov-markets-hawker-centres.csv',
    'demographics': 'sg-population-demographics.csv',
    'prisch': 'sg-primary-schools.csv',
    'secsch': 'sg-secondary-schools.csv',
    'malls': 'sg-shopping-malls.csv',
    'station': 'sg-train-stations.csv'
}

In [4]:
aux = 'demographics'
print(f'Opening auxiliary data bout "{aux}"...')
aux_df = pd.read_csv(os.path.join(os.path.join(data_folder, 'auxiliary-data'), auxiliary_paths[aux]))
aux_df.head()

Opening auxiliary data bout "demographics"...


Unnamed: 0,plannin_area,subzone,age_group,sex,count
0,ang mo kio,ang mo kio town centre,0-4,m,130
1,ang mo kio,cheng san,0-4,m,670
2,ang mo kio,chong boon,0-4,m,460
3,ang mo kio,kebun bahru,0-4,m,380
4,ang mo kio,sembawang hills,0-4,m,90


In [5]:
conv_dict = {
    'kids': ['0-4', '5-9', '10-14'],                      # dependents
    'youth': ['15-19', '20-24'],                          # students/ part-timers
    'youngads': ['25-29', '30-34', '35-39'],              # young families
    'middle': ['40-44', '45-49', '50-54'],                # older families
    'older': ['55-59', '60-64'],                          # retirees
    'elderly': ['65-69', '70-74','75-79', '80-84', '85+'] # older group
}

In [6]:
rev_dict = {}

for k,v in conv_dict.items():
    for i in v:
        rev_dict[i] = k
        
rev_dict

{'0-4': 'kids',
 '5-9': 'kids',
 '10-14': 'kids',
 '15-19': 'youth',
 '20-24': 'youth',
 '25-29': 'youngads',
 '30-34': 'youngads',
 '35-39': 'youngads',
 '40-44': 'middle',
 '45-49': 'middle',
 '50-54': 'middle',
 '55-59': 'older',
 '60-64': 'older',
 '65-69': 'elderly',
 '70-74': 'elderly',
 '75-79': 'elderly',
 '80-84': 'elderly',
 '85+': 'elderly'}

In [7]:
aux_df['age_grp'] = aux_df['age_group'].apply(lambda x: rev_dict[x])
aux_df.head()

Unnamed: 0,plannin_area,subzone,age_group,sex,count,age_grp
0,ang mo kio,ang mo kio town centre,0-4,m,130,kids
1,ang mo kio,cheng san,0-4,m,670,kids
2,ang mo kio,chong boon,0-4,m,460,kids
3,ang mo kio,kebun bahru,0-4,m,380,kids
4,ang mo kio,sembawang hills,0-4,m,90,kids


In [8]:
_aux_df = aux_df.groupby(['subzone', 'age_grp'])['count'].sum().unstack('age_grp').reset_index()
_aux_df

age_grp,subzone,elderly,kids,middle,older,youngads,youth
0,admiralty,1130.0,3120.0,3500.0,1390.0,3790.0,1480.0
1,alexandra hill,2990.0,2010.0,3440.0,2450.0,2950.0,1810.0
2,alexandra north,60.0,190.0,280.0,90.0,280.0,100.0
3,aljunied,6430.0,5320.0,10160.0,6240.0,8970.0,4580.0
4,anak bukit,3000.0,3230.0,5130.0,3350.0,4020.0,3280.0
...,...,...,...,...,...,...,...
226,yishun south,4200.0,5170.0,9340.0,5940.0,8730.0,5460.0
227,yishun west,6440.0,7790.0,14360.0,8840.0,13010.0,7450.0
228,yuhua east,3740.0,3540.0,6270.0,4440.0,6170.0,3320.0
229,yuhua west,2580.0,2870.0,4870.0,3310.0,4520.0,2760.0


In [9]:
sum(_aux_df.duplicated(subset='subzone'))

0

In [10]:
df_x_aux = pd.merge(train_df, _aux_df, how='left', on='subzone').iloc[:,-6:]
df_x_aux.columns = [aux+'_'+i for i in df_x_aux.columns]

In [24]:
train_df[df_x_aux.isnull().any(axis=1)]['subzone'].unique()

array(['gali batu', 'city hall'], dtype=object)

In [25]:
aux_df['subzone'].unique()

array(['ang mo kio town centre', 'cheng san', 'chong boon', 'kebun bahru',
       'sembawang hills', 'shangri-la', 'tagore', 'townsville',
       'yio chu kang east', 'yio chu kang west', 'bayshore',
       'bedok north', 'bedok reservoir', 'bedok south', 'frankel',
       'kaki bukit', 'kembangan', 'siglap', 'bishan east', 'marymount',
       'upper thomson', 'bukit batok central', 'bukit batok east',
       'bukit batok south', 'bukit batok west', 'gombak', 'guilin',
       'hillview', 'hong kah north', 'alexandra hill', 'alexandra north',
       'bukit ho swee', 'bukit merah', 'depot road', 'everton park',
       'henderson hill', 'kampong tiong bahru', 'maritime square',
       'redhill', 'telok blangah drive', 'telok blangah rise',
       'telok blangah way', 'tiong bahru', 'tiong bahru station',
       'bangkit', 'dairy farm', 'fajar', 'jelebu', 'nature reserve',
       'saujana', 'senja', 'anak bukit', 'coronation road',
       'farrer court', 'hillcrest', 'holland road', 'leedo

In [11]:
print(df_x_aux.columns[df_x_aux.isnull().any()])

Index(['demographics_elderly', 'demographics_kids', 'demographics_middle',
       'demographics_older', 'demographics_youngads', 'demographics_youth'],
      dtype='object')


In [15]:
train_df

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price
0,2001-08,pasir ris,4 room,440,pasir ris drive 4,01 to 03,118.0,model a,uncategorized,1989,1.369008,103.958697,0.0,pasir ris drive,pasir ris,east region,209700.0
1,2014-10,punggol,5-room,196B,punggol field,10 to 12,110.0,improved,uncategorized,2003,1.399007,103.906991,0.0,punggol field,punggol,north-east region,402300.0
2,2020-09,sengkang,5 room,404A,fernvale lane,01 to 03,112.0,premium apartment,uncategorized,2004,1.388348,103.873815,0.0,fernvale,sengkang,north-east region,351000.0
3,2000-10,clementi,3 room,375,clementi avenue 4,07 to 09,67.0,new generation,uncategorized,1980,1.318493,103.766702,0.0,clementi north,clementi,west region,151200.0
4,2013-01,bukit batok,3-room,163,bukit batok street 11,07 to 09,73.0,model a,uncategorized,1985,1.348149,103.742658,0.0,bukit batok west,bukit batok,west region,318600.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431727,2005-03,woodlands,4 room,537,Woodlands Drive 16,01 to 03,101.0,model a,uncategorized,2000,1.429658,103.792583,0.0,woodlands south,woodlands,north region,238500.0
431728,2016-04,sengkang,4 room,410A,fernvale road,13 to 15,95.0,premium apartment,uncategorized,2012,1.390053,103.875941,0.0,fernvale,sengkang,north-east region,376200.0
431729,2011-01,tampines,3-room,829,tampines street 81,01 to 03,67.0,new generation,uncategorized,1986,1.349224,103.934913,0.0,tampines west,tampines,east region,255600.0
431730,2013-05,sengkang,5-room,233,compassvale walk,16 to 18,123.0,improved,uncategorized,1999,1.389941,103.900721,0.0,sengkang town centre,sengkang,north-east region,508500.0


In [14]:
train_df[df_x_aux.isna()]

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,eco_category,lease_commence_date,latitude,longitude,elevation,subzone,planning_area,region,resale_price
0,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
431727,,,,,,,,,,,,,,,,,
431728,,,,,,,,,,,,,,,,,
431729,,,,,,,,,,,,,,,,,
431730,,,,,,,,,,,,,,,,,
