In [1]:
import pandas as pd
import numpy as np
import os, re, random
import warnings, json
warnings.filterwarnings('ignore')
from collections import defaultdict

# The data is expected in a `data` directory that sits beside the directory
# this notebook is run from. Edit `data_folder` by hand if yours lives elsewhere.
current_folder = os.path.abspath(os.curdir)
root_folder = os.path.split(current_folder)[0]  # parent of the working directory
data_folder = os.path.join(root_folder, 'data')

%matplotlib inline

def set_seeds(seed):
    """Seed the random sources used in this notebook so reruns are repeatable.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both the stdlib ``random`` module and NumPy's legacy global RNG.

    Args:
        seed: value handed unchanged to ``random.seed`` and ``np.random.seed``.
    """
    # NOTE: per the Python docs, PYTHONHASHSEED is read at interpreter start-up,
    # so this assignment only reaches processes spawned from this kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_rng in (random.seed, np.random.seed):
        seed_rng(seed)


set_seeds(1234)


In [2]:
# Read the training split, then show its columns as the cell output.
train_csv = os.path.join(data_folder, 'train.csv')
train_df = pd.read_csv(train_csv)
# The test split is not loaded in this notebook:
# test_df = pd.read_csv(os.path.join(data_folder,'test.csv'))
train_df.columns

Index(['month', 'town', 'flat_type', 'block', 'street_name', 'storey_range',
       'floor_area_sqm', 'flat_model', 'eco_category', 'lease_commence_date',
       'latitude', 'longitude', 'elevation', 'subzone', 'planning_area',
       'region', 'resale_price'],
      dtype='object')

In [3]:
# Short dataset key -> CSV file name, joined with the 'auxiliary-data'
# directory when a dataset is loaded. File names are kept exactly as spelled here.
auxiliary_paths = dict(
    commercial='sg-commerical-centres.csv',
    hawker='sg-gov-markets-hawker-centres.csv',
    demographics='sg-population-demographics.csv',
    prisch='sg-primary-schools.csv',
    secsch='sg-secondary-schools.csv',
    malls='sg-shopping-malls.csv',
    station='sg-train-stations.csv',
)

In [4]:
# Choose which auxiliary dataset to work with and load its CSV from
# <data_folder>/auxiliary-data/.
aux = 'demographics'
print(f'Opening auxiliary data bout "{aux}"...')
aux_csv = os.path.join(data_folder, 'auxiliary-data', auxiliary_paths[aux])
aux_df = pd.read_csv(aux_csv)
aux_df.head()

Opening auxiliary data bout "demographics"...


Unnamed: 0,plannin_area,subzone,age_group,sex,count
0,ang mo kio,ang mo kio town centre,0-4,m,130
1,ang mo kio,cheng san,0-4,m,670
2,ang mo kio,chong boon,0-4,m,460
3,ang mo kio,kebun bahru,0-4,m,380
4,ang mo kio,sembawang hills,0-4,m,90


In [11]:
def _age_bands(start, stop):
    """Return the 5-year labels 'lo-hi' for lo = start, start+5, ... below stop."""
    return [f'{lo}-{lo + 4}' for lo in range(start, stop, 5)]


# Coarse age bucket -> the 5-year `age_group` labels it absorbs.
conv_dict = {
    'kids': _age_bands(0, 15),                # 0-14: dependents
    'youth': _age_bands(15, 25),              # 15-24: students / part-timers
    'youngads': _age_bands(25, 40),           # 25-39: young families
    'middle': _age_bands(40, 55),             # 40-54: older families
    'older': _age_bands(55, 65),              # 55-64: retirees
    'elderly': _age_bands(65, 85) + ['85+'],  # 65 and above: older group
}

In [12]:
# Invert conv_dict: each fine-grained age label points at its coarse bucket.
rev_dict = {
    age_label: bucket
    for bucket, age_labels in conv_dict.items()
    for age_label in age_labels
}

rev_dict

{'0-4': 'kids',
 '5-9': 'kids',
 '10-14': 'kids',
 '15-19': 'youth',
 '20-24': 'youth',
 '25-29': 'youngads',
 '30-34': 'youngads',
 '35-39': 'youngads',
 '40-44': 'middle',
 '45-49': 'middle',
 '50-54': 'middle',
 '55-59': 'older',
 '60-64': 'older',
 '65-69': 'elderly',
 '70-74': 'elderly',
 '75-79': 'elderly',
 '80-84': 'elderly',
 '85+': 'elderly'}

In [13]:
# Tag every row with its coarse age bucket. The lookup goes through
# rev_dict[...] so an unexpected age_group label raises KeyError.
aux_df['age_grp'] = aux_df['age_group'].map(rev_dict.__getitem__)
aux_df.head()

Unnamed: 0,plannin_area,subzone,age_group,sex,count,age_grp
0,ang mo kio,ang mo kio town centre,0-4,m,130,kids
1,ang mo kio,cheng san,0-4,m,670,kids
2,ang mo kio,chong boon,0-4,m,460,kids
3,ang mo kio,kebun bahru,0-4,m,380,kids
4,ang mo kio,sembawang hills,0-4,m,90,kids


In [22]:
# Total head-count per (subzone, age bucket), then pivot the buckets into
# columns so that each subzone becomes a single row.
bucket_totals = aux_df.groupby(['subzone', 'age_grp'])['count'].sum()
bucket_table = bucket_totals.unstack('age_grp')
_aux_df = bucket_table.reset_index()
_aux_df

age_grp,subzone,elderly,kids,middle,older,youngads,youth
0,admiralty,1130.0,3120.0,3500.0,1390.0,3790.0,1480.0
1,alexandra hill,2990.0,2010.0,3440.0,2450.0,2950.0,1810.0
2,alexandra north,60.0,190.0,280.0,90.0,280.0,100.0
3,aljunied,6430.0,5320.0,10160.0,6240.0,8970.0,4580.0
4,anak bukit,3000.0,3230.0,5130.0,3350.0,4020.0,3280.0
...,...,...,...,...,...,...,...
226,yishun south,4200.0,5170.0,9340.0,5940.0,8730.0,5460.0
227,yishun west,6440.0,7790.0,14360.0,8840.0,13010.0,7450.0
228,yuhua east,3740.0,3540.0,6270.0,4440.0,6170.0,3320.0
229,yuhua west,2580.0,2870.0,4870.0,3310.0,4520.0,2760.0


In [25]:
# Number of rows whose subzone repeats an earlier row (0 means subzone is a unique key).
repeated_subzone = _aux_df.duplicated(subset='subzone')
int(repeated_subzone.sum())

0

In [28]:
# Attach the per-subzone demographic counts to every training row.
#
# The feature columns are selected by name (every column of `_aux_df` except
# the join key) rather than "the last six columns" of the merged frame, so the
# result stays correct if the set of age buckets changes.
demo_cols = [col for col in _aux_df.columns if col != 'subzone']
df_x_aux = pd.merge(
    train_df[['subzone']],     # only the join key is needed from the left side
    _aux_df,
    how='left',                # one output row per training row, in the same order
    on='subzone',
    validate='many_to_one',    # raise if a subzone occurs more than once in _aux_df
)[demo_cols]
# Training rows whose subzone has no match in _aux_df get NaN in every feature column.
# Prefix with the dataset key, e.g. 'demographics_kids'.
df_x_aux.columns = [f'{aux}_{col}' for col in demo_cols]

Index(['elderly', 'kids', 'middle', 'older', 'youngads', 'youth'], dtype='object')

['demographics_elderly',
 'demographics_kids',
 'demographics_middle',
 'demographics_older',
 'demographics_youngads',
 'demographics_youth']