## Packages and Reading Dataset

In [None]:
import pandas as pd
import numpy as np
import xlrd

In [None]:
def _closest_census_year(year):
    """Return whichever of 2015 / 2020 is nearer to `year`."""
    return 2015 if abs(year - 2015) < abs(year - 2020) else 2020


# Load the cleaned resale table, derive the calendar year from `month`, and tag
# each row with the nearer of the two reference years (2015 or 2020).
df = pd.read_csv('../data/cleaned/resale_price_cleaned_3.csv')
df = df.assign(year=pd.to_datetime(df['month']).dt.year)
df = df.assign(closest_year=df['year'].map(_closest_census_year))

## Religion

In [None]:
# Load the 2015 and 2020 religion tables, turn the per-religion counts into
# shares of `Total`, and attach them to the resale rows by town.
df_religion_2015 = pd.read_csv('../data/raw/religion_2015.csv')
df_religion_2020 = pd.read_csv('../data/raw/religion_2020.csv')

df_religion_2015['year'] = 2015
df_religion_2020['year'] = 2020

# The column used as the merge key is named differently in each file.
df_religion_2015 = df_religion_2015.rename(columns={'Thousands': 'town'})
df_religion_2020 = df_religion_2020.rename(columns={'Number': 'town'})

# 2020: the Sikhism column contains '-' placeholders, so it is text. Replace
# them with 0 and cast every count column to int BEFORE dividing; casting
# after the division would truncate every proportion to 0.
df_religion_2020['Sikhism'] = df_religion_2020['Sikhism'].str.replace('-', '0', regex=False)
columns_to_int = df_religion_2020.columns.difference(['town'])
df_religion_2020[columns_to_int] = df_religion_2020[columns_to_int].astype(int)

# Convert counts to proportions of the row total.
columns_to_convert = df_religion_2015.columns.difference(['town', 'Total', 'year'])
df_religion_2015[columns_to_convert] = df_religion_2015[columns_to_convert].div(df_religion_2015['Total'], axis=0)

columns_to_convert = df_religion_2020.columns.difference(['town', 'Total', 'year'])
df_religion_2020[columns_to_convert] = df_religion_2020[columns_to_convert].div(df_religion_2020['Total'], axis=0)

# Upper-case the town names before using them as the merge key.
df_religion_2015['town'] = df_religion_2015['town'].str.upper()
df_religion_2020['town'] = df_religion_2020['town'].str.upper()

# Each resale row gets the religion table of its closest reference year.
df_merged_2015 = df[df['closest_year'] == 2015].merge(df_religion_2015, on='town', suffixes=('', '_religion'), how='left')
df_merged_2020 = df[df['closest_year'] == 2020].merge(df_religion_2020, on='town', suffixes=('', '_religion'), how='left')

# `drop(..., inplace=True)` returns None, so use the non-mutating form.
# Only the religion-side helpers are dropped: the resale `year` is still
# needed by the unemployment and household-income joins further down.
df_final = pd.concat([df_merged_2015, df_merged_2020], ignore_index=True).drop(columns=['Total', 'year_religion'])
df_final.head()

## Ethnicity

In [None]:
# Load sheet T8 of the 2015 ethnicity workbook and keep only the rows that
# have a planning area and the columns used below.
df_ethnicity_2015 = pd.read_excel('../data/raw/ethnicity_2015.xls', sheet_name='T8', skiprows=5).dropna(subset=["Planning Area"])
df_ethnicity_2015 = df_ethnicity_2015[['Planning Area', 'Chinese', 'Malays', 'Indians', 'Others', 'Total']]

# Remove rows whose area cell is the literal header text 'Planning Area'.
df_ethnicity_2015 = df_ethnicity_2015[df_ethnicity_2015['Planning Area'] != 'Planning Area']

# '-' placeholders become NaN and those rows are dropped.
# `replace(..., inplace=True)` returns None, so the calls are chained in their
# non-mutating form instead.
df_ethnicity_2015 = df_ethnicity_2015.replace('-', np.nan).dropna()

df_ethnicity_2015['year'] = 2015
df_ethnicity_2015 = df_ethnicity_2015.rename(columns={'Planning Area': 'town'})

# Convert counts to proportions of the row total.
columns_to_convert_ethnicity = df_ethnicity_2015.columns.difference(['town', 'Total', 'year'])
df_ethnicity_2015[columns_to_convert_ethnicity] = df_ethnicity_2015[columns_to_convert_ethnicity].div(df_ethnicity_2015['Total'], axis=0)

# Upper-case the town names before using them as the merge key.
df_ethnicity_2015['town'] = df_ethnicity_2015['town'].str.upper()
df_merged_2015_eth = df_final[df_final['closest_year'] == 2015].merge(df_ethnicity_2015, on='town', suffixes=('', '_ethnicity'), how='left')

In [None]:
# Load the 2020 ethnicity table and normalise it to the same column layout as
# the 2015 table (town, Chinese, Malays, Indians, Others, Total, year).
df_ethnicity_2020 = pd.read_csv('../data/raw/ethnicity_2020.csv')
df_ethnicity_2020['year'] = 2020
df_ethnicity_2020 = df_ethnicity_2020.rename(
    columns={
        'Number': 'town',
        'Chinese_Total': 'Chinese',
        'Malays_Total': 'Malays',
        'Indians_Total': 'Indians',
        'Others_Total': 'Others',
        'Total_Total': 'Total',
    }
)
df_ethnicity_2020 = df_ethnicity_2020[['town', 'Chinese', 'Malays', 'Indians', 'Others', 'Total', 'year']]
columns_to_convert = df_ethnicity_2020.columns.difference(['town', 'Total', 'year'])
columns_to_int = df_ethnicity_2020.columns.difference(['town'])

# '-' placeholders become NaN and those rows are dropped.
# `replace(..., inplace=True)` returns None, so the calls are chained in their
# non-mutating form instead.
df_ethnicity_2020 = df_ethnicity_2020.replace('-', np.nan).dropna()

# Keep only the rows whose label ends in '- Total'; `.copy()` so the column
# assignments below act on an independent frame rather than a slice.
df_ethnicity_2020 = df_ethnicity_2020[df_ethnicity_2020['town'].str.endswith('- Total')].copy()

# Strip the suffix and any whitespace left around it, so the key can match the
# resale town names exactly.
df_ethnicity_2020['town'] = (
    df_ethnicity_2020['town']
    .str.replace('- Total', '', regex=False)
    .str.strip()
    .str.upper()
)

# Cast counts to int first, then convert them to proportions of the row total.
df_ethnicity_2020[columns_to_int] = df_ethnicity_2020[columns_to_int].astype(int)
df_ethnicity_2020[columns_to_convert] = df_ethnicity_2020[columns_to_convert].div(df_ethnicity_2020['Total'], axis=0)
df_ethnicity_2020 = df_ethnicity_2020[['Chinese', 'Malays', 'Indians', 'Others', 'town', 'year']]
df_merged_2020_eth = df_final[df_final['closest_year'] == 2020].merge(df_ethnicity_2020, on='town', suffixes=('', '_ethnicity'), how='left')

In [None]:
# Stack the 2015- and 2020-matched rows back into a single frame.
df_final = pd.concat([df_merged_2015_eth, df_merged_2020_eth], ignore_index=True)

# Drop only the helper columns. `errors='ignore'` because which of them exist
# depends on the column collisions in the merges above.
df_final = df_final.drop(columns=['Total', 'closest_year', 'year_ethnicity'], errors='ignore')

# The unemployment and household-income joins below key on the transaction
# year, so `year` must survive this cell. Rebuild it from `month` so that it is
# the resale year even if a merge above left a reference-table year under
# that name.
df_final['year'] = pd.to_datetime(df_final['month']).dt.year
df_final.head()

## Residential Unemployment

In [None]:
# Choose relevant rows: positional rows 9-11 of the raw file, where the first
# of them supplies the column labels.
unemployment_raw = pd.read_csv("../data/raw/unemployment.csv")
unemployment_rows = unemployment_raw.iloc[9:12]
unemployment_rows.columns = unemployment_rows.iloc[0]

# Pivot the columns: drop the label row, go long (one row per series/year),
# then wide again with one row per year.
unemployment_long = unemployment_rows[1:].melt(
    id_vars=["Data Series"],
    var_name="year",
    value_name="unemployment_rate",
)
unemployment_long["year"] = unemployment_long["year"].astype(int)
df_unemployment = unemployment_long.pivot(
    index="year",
    columns="Data Series",
    values="unemployment_rate",
).reset_index()
# NOTE(review): this assigns exactly two names, so it presumably expects a
# single data series after the pivot - confirm against the raw file.
df_unemployment.columns = ["year", "resident_unemployment_rate"]

# Join with resale dataframe
df_final = df_final.assign(year=df_final["year"].astype(int)).merge(
    df_unemployment,
    how="left",
    on="year",
)

## Average Household Income

In [None]:
# Load the 2015 and 2020 household-income tables and build an upper-case
# `town` key in each.
income_2015 = pd.read_csv("../data/raw/Household_income_2015.csv")
income_2020 = pd.read_csv("../data/raw/Household_income_2020.csv")

# The column used as the town key is named differently in each file.
income_2015 = income_2015.rename(columns={'Thousands': 'town'})
income_2020 = income_2020.rename(columns={'Number': 'town'})

income_2015['town'] = income_2015['town'].str.upper()
# Use the 2020 frame's own towns here: copying the 2015 column across would
# pair 2020 figures with 2015 row labels.
income_2020['town'] = income_2020['town'].str.upper()

Note that there are some discrepancies between towns in resale data and household income data. To be specific,

- `TANGLIN`, `NOVENA` and `OTHERS` (in income data) are not in resale data. We will drop them.
- `CENTRAL AREA` (in resale data) is missing from income data (but we are mapping `OUTRAM` to it).
- `KALLANG/WHAMPOA` (in resale data) is missing from income data (but we are mapping `KALLANG` to it).


In [None]:
# Income-table town names that correspond to a differently named resale town.
region_mapping = {
    'KALLANG': 'KALLANG/WHAMPOA',
    'OUTRAM': 'CENTRAL AREA'
}

income_2015['town'] = income_2015['town'].replace(region_mapping)
income_2020['town'] = income_2020['town'].replace(region_mapping)

# Keep only towns that occur in the resale data. `df` is the resale frame
# loaded at the top of the notebook; no `df_resale` is defined anywhere in it.
# `.copy()` because new columns are assigned onto these filtered frames later.
resale_regions = list(df['town'].unique())
income_2015 = income_2015[income_2015['town'].isin(resale_regions)].copy()
income_2020 = income_2020[income_2020['town'].isin(resale_regions)].copy()

# Representative income value used for each income-bracket column.
midpoints = {
    'Below_1_000': 500,
    '1_000_1_499': 1250,
    '1_500_1_999': 1750,
    '2_000_2_499': 2250,
    '2_500_2_999': 2750,
    '3_000_3_999': 3500,
    '4_000_4_999': 4500,
    '5_000_5_999': 5500,
    '6_000_6_999': 6500,
    '7_000_7_999': 7500,
    '8_000_8_999': 8500,
    '9_000_9_999': 9500,
    '10_000_10_999': 10500,
    '11_000_11_999': 11500,
    '12_000andOver': 13000
}


def estimate_average_income(row):
    """Estimate the mean income of one row from its bracket counts.

    Each bracket count is weighted by the bracket's value in `midpoints`, and
    the weighted sum is divided by the row's `Total`.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must provide `row['Total']` and `.get(bracket, default)`. Brackets
        missing from the row count as 0; keys not in `midpoints` are ignored.

    Returns
    -------
    The weighted average. When the division raises ZeroDivisionError (a
    plain-Python zero `Total`), NaN is returned instead of raising.
    """
    total_pop = row['Total']
    # Builtin sum() over a generator; the accumulator no longer shadows `sum`.
    weighted_total = sum(
        row.get(bracket, 0) * midpoint for bracket, midpoint in midpoints.items()
    )
    try:
        return weighted_total / total_pop
    except ZeroDivisionError:
        return float('nan')

# Estimate the average household income per town for each year and reduce each
# frame to (town, estimate). `.assign` returns a new frame, so nothing is
# written onto a filtered slice.
income_2015 = income_2015.assign(
    household_income_2015=income_2015.apply(estimate_average_income, axis=1)
)[['town', 'household_income_2015']]
income_2020 = income_2020.assign(
    household_income_2020=income_2020.apply(estimate_average_income, axis=1)
)[['town', 'household_income_2020']]

# Build the long (town, year, avg_household_income) table by stacking the two
# years. A left merge of 2015 onto 2020 followed by a melt would silently lose
# any town that appears only in the 2020 table.
df_household_income = pd.concat(
    [
        income_2015.rename(columns={'household_income_2015': 'avg_household_income'}).assign(year=2015),
        income_2020.rename(columns={'household_income_2020': 'avg_household_income'}).assign(year=2020),
    ],
    ignore_index=True,
)[['town', 'year', 'avg_household_income']]

Join `df_household_income` with the resale data (`df_final`) using town and year. We can map

- 2015-2019 in resale price → 2015 in household income
- 2020 and beyond in resale price → 2020 in household income

In [None]:
# Resale years up to 2019 use the 2015 income figures; later years use 2020.
df_final['income_year'] = df_final['year'].le(2019).map({True: 2015, False: 2020})

# Both frames have a `year` column. Merging with left_on/right_on would yield
# `year_x` / `year_y`, and a following `.drop(columns=['year'])` would raise
# KeyError. Renaming the income-side year to the join key avoids the collision:
# the resale `year` is kept as is and no duplicate year column is created.
df_final = df_final.merge(
    df_household_income.rename(columns={'year': 'income_year'}),
    on=['town', 'income_year'],
    how='left',
)

In [None]:
# Write the enriched resale table to the cleaned-data folder, without the index.
cleaned_output_path = '../data/cleaned/resale_price_cleaned_4.csv'
df_final.to_csv(cleaned_output_path, index=False)