# Part 1 - Data Cleaning

In [None]:
# import libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup as bs
import json
import requests
%matplotlib inline

In [None]:
# load the raw training data and preview the first few rows
TRAIN_CSV_PATH = "../datasets/train.csv"
housing_df = pd.read_csv(TRAIN_CSV_PATH)
housing_df.head()

Column 42 (`postal`) contains mixed data types. We standardise it by converting every value in this column to a string.

In [None]:
housing_df.columns[42]

In [None]:
# cast every postal value to str so the column holds a single Python type
housing_df['postal'] = housing_df['postal'].map(str)

In [None]:
housing_df.shape

In [None]:
housing_df.info()

In [None]:
# normalise column names: trim surrounding whitespace, then lower-case them
cleaned_column_names = housing_df.columns.str.strip().str.lower()
housing_df.columns = cleaned_column_names

In [None]:
# rename the 'affiliation' column to 'sec_sch_affiliation'
column_renames = {'affiliation': 'sec_sch_affiliation'}
housing_df.rename(columns=column_renames, inplace=True)

In [None]:
# count nulls per column, then display only the columns that contain at least one
null_counts = housing_df.isnull().sum()
null_counts[null_counts > 0]

Out of all 76 columns in dataset, we have 7 columns containing nulls as shown above.

### Accounting for null values in hawker quantity columns

As there are no missing values for hawker nearest distance, a possibility might be that the hawker nearest distance in specific rows has exceeded 500m/1km/2km, hence resulting in hawker null values. This can be easily validated by finding the minimum distance for specific rows containing nulls, as shown below.

In [None]:
# for the rows whose hawker count is missing, report the smallest
# nearest-hawker distance among them, one line per radius
hawker_radius_columns = (
    ('500m', 'hawker_within_500m'),
    ('1km', 'hawker_within_1km'),
    ('2km', 'hawker_within_2km'),
)
for radius_label, count_column in hawker_radius_columns:
    rows_missing_count = housing_df[housing_df[count_column].isnull()]
    closest_hawker = rows_missing_count['hawker_nearest_distance'].min()
    print(f"Minimium distance greater than {radius_label}: {round(closest_hawker, 3)}m")

For the rows with a missing hawker count, the smallest nearest-hawker distance is 500.209m, 1000.074m and 2000.872m for the 500m, 1km and 2km columns respectively. In each case the nearest hawker centre lies outside the stated radius, so no hawker centre falls within it. This implies that the nulls in each of these 3 columns can be set to 0.

In [None]:
# replace every missing hawker count with 0, one radius column at a time
for hawker_column in ('hawker_within_500m', 'hawker_within_1km', 'hawker_within_2km'):
    is_missing = housing_df[hawker_column].isnull()
    housing_df.loc[is_missing, hawker_column] = 0

### Accounting for null values in mall quantity columns

As we have 829 rows with missing mall nearest distance, firstly we need to address the nulls in this column before filling in the missing mall quantity values. The procedure to find the nearest mall distance in these rows is shown below.

1. Retrieve all malls in Singapore, this can be done by webscraping from Wikipedia.
2. Use OneMap API to locate the latitude/longitude of the malls.
3. For each housing unit, calculate distance between itself and each mall using Haversine formula. Locate the minimum mall distance and replace the null value.

An important point to note is that the rows with missing mall distance are identified through their row index in the dataset.

In [None]:
# Retrieve all malls in Singapore from Wikipedia
url='https://en.wikipedia.org/wiki/List_of_shopping_malls_in_Singapore'
# time out instead of hanging the notebook if the site cannot be reached
response = requests.get(url, timeout=30)
print('Status Code: ',response.status_code)
# stop on an HTTP error rather than parsing an error page into an empty mall list
response.raise_for_status()
html = response.content
soup = bs(html, 'lxml')

# mall names are the <li> items inside the elements with class "div-col"
malls = soup.find_all("div",{'class':'div-col'})
list_of_malls = [item.text for column_block in malls for item in column_block.find_all("li")]
# keep only the text before the first "[" (drops bracketed suffixes such as "[1]")
list_of_malls = [name.partition("[")[0] for name in list_of_malls]
list_of_malls

In [None]:
# Obtaining Mall Coordinates in Singapore from OneMAP API
ONEMAP_SEARCH_URL = 'https://developers.onemap.sg/commonapi/search'

# the three lists are parallel: entry k of each describes the same mall.
# They are reset here so re-running this cell never duplicates entries.
mall_name = []
mall_lat = []
mall_long = []

for query_address in list_of_malls:
    # pass the search term via `params` so requests URL-encodes it; a hand-built
    # query string breaks for names containing characters such as "&" or "#"
    resp = requests.get(
        ONEMAP_SEARCH_URL,
        params={'searchVal': str(query_address), 'returnGeom': 'Y', 'getAddrDetails': 'Y'},
        timeout=30,
    )
    # fail loudly on an HTTP error instead of trying to parse an error body
    resp.raise_for_status()
    data_mall = resp.json()

    results = data_mall.get('results') or []
    if data_mall.get('found', 0) != 0 and results:
        # use the first result returned for this search term
        first_result = results[0]
        mall_name.append(query_address)
        mall_lat.append(float(first_result["LATITUDE"]))
        mall_long.append(float(first_result["LONGITUDE"]))

        print (f"{query_address}, Latitude: {first_result['LATITUDE']} Longitude: {first_result['LONGITUDE']}")

    else:
        print (f"No result for {query_address}")

For 9 malls, the latitude and longitude could not be retrieved automatically. Their coordinates are added manually, either by looking them up online or by refining the search term used with the OneMAP API.

In [None]:
#add in latitude and longitude manually as search criteria not exactly matching between Wikipedia and OneMAP API
missing_malls={
    'Mall': ['Clarke Quay Central','City Gate Mall','Holland Village Shopping Mall','Mustafa Shopping Centre','PoMo','Shaw House and Centre','KINEX','Paya Lebar Quarter (PLQ)','OD Mall'],
    'Latitude': [1.2882413,1.30231590504573,1.31027747574118,1.31011213190394,1.300058,1.305512,1.31479,1.317369,1.3380],
    'Longitude': [103.846430401652,103.862331661034,103.795371163103,103.855290873926,103.849079,103.831755,103.89464,103.893266,103.7934]
}

# Append only malls that are not already in the list. A plain `+=` would add the
# same 9 malls again every time this cell is re-run, and each duplicate entry
# would be counted as an extra mall when malls within a radius are counted.
for name, lat, lon in zip(missing_malls['Mall'], missing_malls['Latitude'], missing_malls['Longitude']):
    if name not in mall_name:
        mall_name.append(name)
        mall_lat.append(lat)
        mall_long.append(lon)

In [None]:
#calculate distance between 2 points using Haversine formula

from math import radians, cos, sin, asin, sqrt
def dist(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in metres between two points.

    Args:
        lat1, lon1: latitude and longitude of the first point, in degrees.
        lat2, lon2: latitude and longitude of the second point, in degrees.

    Returns:
        The Haversine distance in metres, using an Earth radius of 6371 km.
    """
    # The math module contains a function named radians which converts from degrees to radians.
    lon1 = radians(lon1)
    lon2 = radians(lon2)
    lat1 = radians(lat1)
    lat2 = radians(lat2)

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2

    # Floating-point rounding can leave `a` marginally above 1 for (near-)antipodal
    # points, which would make asin raise "math domain error"; cap it at 1.
    a = min(1.0, a)
    c = 2 * asin(sqrt(a))

    # Radius of earth in kilometres (use 3956 for miles)
    r = 6371

    # c * r is in kilometres; multiply by 1000 to return metres
    return c * r * 1000

In [None]:
# Rows that still lack a nearest-mall distance. Columns at positions 43 and 44 are
# read as latitude and longitude (presumably the 'latitude'/'longitude' columns —
# TODO confirm against housing_df.columns).
rows_missing_mall_distance = housing_df[housing_df['mall_nearest_distance'].isnull()]
missing_lat = rows_missing_mall_distance.iloc[:, 43].tolist()
missing_long = rows_missing_mall_distance.iloc[:, 44].tolist()

In [None]:
# index labels of the rows whose nearest-mall distance is null
row_index = housing_df.index[housing_df['mall_nearest_distance'].isnull()]

In [None]:
# show the index labels of the rows where mall nearest distance is null
print(row_index.tolist())

In [None]:
def mall_nearest_dist(latitude,longitude):
    """Return the distance from each coordinate pair to its closest mall.

    Mall coordinates are read from the module-level parallel lists `mall_lat`
    and `mall_long`; distances are computed with `dist`.

    Args:
        latitude: sequence of latitudes, one per location.
        longitude: sequence of longitudes, in the same order as `latitude`.

    Returns:
        A list with one entry per location: the smallest `dist(...)` value over
        all malls.

    Raises:
        ValueError: if there are locations to process but the mall lists are empty.
    """
    # pair the mall coordinates once instead of indexing both lists per location
    mall_coords = list(zip(mall_lat, mall_long))
    # a plain min() replaces building a DataFrame per location just to take a minimum
    return [
        min(dist(lat, lon, m_lat, m_lon) for m_lat, m_lon in mall_coords)
        for lat, lon in zip(latitude, longitude)
    ]

# one nearest-mall distance per affected row, labelled with that row's index in housing_df
distance = pd.DataFrame(
    {'mall_nearest_distance': mall_nearest_dist(missing_lat, missing_long)},
    index=row_index,
)

def mall_dist(x):
    """Return the nearest-mall distance that row `x` of `housing_df` should hold.

    Args:
        x: index label of a row in `housing_df` (and, when that row's value is
            null, also a label present in the `distance` frame).

    Returns:
        The value from `distance['mall_nearest_distance']` when the row's current
        value is null; otherwise the row's existing value.
    """
    current_value = housing_df['mall_nearest_distance'][x]
    if pd.isna(current_value):
        return distance['mall_nearest_distance'][x]
    # Already populated: hand back the existing value. Falling through and returning
    # None here would make a caller that assigns the result overwrite a filled
    # value with null (e.g. when the fill cell is run a second time).
    return current_value

In [None]:
distance

In [None]:
# If a null is detected in mall nearest distance, replace it with the value from the
# `distance` dataframe that has the same index label. Series.fillna aligns on the
# index, so only null rows with a matching label are touched. This replaces a Python
# loop over every row (with a membership scan per row) and gives the same result
# when the cell is run more than once.
housing_df['mall_nearest_distance'] = housing_df['mall_nearest_distance'].fillna(
    distance['mall_nearest_distance']
)

In [None]:
housing_df.loc[row_index,['mall_nearest_distance','mall_within_500m','mall_within_1km','mall_within_2km']].isnull().sum()

These 829 rows now have a nearest mall distance but, as expected, they still have no mall count within 500m/1km/2km. We proceed to calculate these 3 values for each row.

In [None]:
def mall_500m(latitude,longitude):
    """Count, for each coordinate pair, the malls whose distance is at most 500.

    Mall coordinates are read from the module-level parallel lists `mall_lat`
    and `mall_long`; distances are computed with `dist` and compared against
    500 in whatever unit `dist` returns.

    Args:
        latitude: sequence of latitudes, one per location.
        longitude: sequence of longitudes, in the same order as `latitude`.

    Returns:
        A list of integer counts, one per location (0 when no mall qualifies,
        including when the mall lists are empty).
    """
    # pair the mall coordinates once instead of indexing both lists per location
    mall_coords = list(zip(mall_lat, mall_long))
    # count directly rather than building a DataFrame per location just to count rows
    return [
        sum(1 for m_lat, m_lon in mall_coords if dist(lat, lon, m_lat, m_lon) <= 500)
        for lat, lon in zip(latitude, longitude)
    ]

# One count per affected row, labelled with that row's index in housing_df.
# The labels come straight from row_index instead of a relabelling dict sized
# from the unrelated `distance` frame.
n_malls_500m = pd.DataFrame(
    {'mall_within_500m': mall_500m(missing_lat, missing_long)},
    index=row_index,
)

In [None]:
n_malls_500m

In [None]:
# Write the number of malls within 500m into every row that previously had a null
# mall_nearest_distance. A single label-based assignment replaces the loop over all
# rows (with a membership scan per row) and does not assume the index is 0..n-1.
housing_df.loc[row_index, 'mall_within_500m'] = n_malls_500m.loc[row_index, 'mall_within_500m'].values

In [None]:
def mall_1km(latitude,longitude):
    """Count, for each coordinate pair, the malls whose distance is at most 1000.

    Mall coordinates are read from the module-level parallel lists `mall_lat`
    and `mall_long`; distances are computed with `dist` and compared against
    1000 in whatever unit `dist` returns.

    Args:
        latitude: sequence of latitudes, one per location.
        longitude: sequence of longitudes, in the same order as `latitude`.

    Returns:
        A list of integer counts, one per location (0 when no mall qualifies,
        including when the mall lists are empty).
    """
    # pair the mall coordinates once instead of indexing both lists per location
    mall_coords = list(zip(mall_lat, mall_long))
    # count directly rather than building a DataFrame per location just to count rows
    return [
        sum(1 for m_lat, m_lon in mall_coords if dist(lat, lon, m_lat, m_lon) <= 1000)
        for lat, lon in zip(latitude, longitude)
    ]

# One count per affected row, labelled with that row's index in housing_df.
# The labels come straight from row_index instead of a relabelling dict sized
# from the unrelated `distance` frame.
n_malls_1km = pd.DataFrame(
    {'mall_within_1km': mall_1km(missing_lat, missing_long)},
    index=row_index,
)

In [None]:
n_malls_1km

In [None]:
# Write the number of malls within 1km into every row listed in row_index.
# A single label-based assignment replaces the loop over all rows (with a
# membership scan per row) and does not assume the index is 0..n-1.
housing_df.loc[row_index, 'mall_within_1km'] = n_malls_1km.loc[row_index, 'mall_within_1km'].values

In [None]:
def mall_2km(latitude,longitude):
    """Count, for each coordinate pair, the malls whose distance is at most 2000.

    Mall coordinates are read from the module-level parallel lists `mall_lat`
    and `mall_long`; distances are computed with `dist` and compared against
    2000 in whatever unit `dist` returns.

    Args:
        latitude: sequence of latitudes, one per location.
        longitude: sequence of longitudes, in the same order as `latitude`.

    Returns:
        A list of integer counts, one per location (0 when no mall qualifies,
        including when the mall lists are empty).
    """
    # pair the mall coordinates once instead of indexing both lists per location
    mall_coords = list(zip(mall_lat, mall_long))
    # count directly rather than building a DataFrame per location just to count rows
    return [
        sum(1 for m_lat, m_lon in mall_coords if dist(lat, lon, m_lat, m_lon) <= 2000)
        for lat, lon in zip(latitude, longitude)
    ]

# One count per affected row, labelled with that row's index in housing_df.
# The labels come straight from row_index instead of a relabelling dict sized
# from the unrelated `distance` frame.
n_malls_2km = pd.DataFrame(
    {'mall_within_2km': mall_2km(missing_lat, missing_long)},
    index=row_index,
)

In [None]:
n_malls_2km

In [None]:
# Write the number of malls within 2km into every row listed in row_index.
# A single label-based assignment replaces the loop over all rows (with a
# membership scan per row) and does not assume the index is 0..n-1.
housing_df.loc[row_index, 'mall_within_2km'] = n_malls_2km.loc[row_index, 'mall_within_2km'].values

In [None]:
housing_df.loc[:,['mall_nearest_distance','mall_within_500m','mall_within_1km','mall_within_2km']].isnull().sum()

As there are no missing values for mall nearest distance, it is highly likely that the mall nearest distance in specific rows has exceeded 500m/1km/2km, hence resulting in mall null values. This can be easily validated by finding the minimum distance for specific rows containing nulls, as shown below.

In [None]:
# for the rows whose mall count is missing, report the smallest
# nearest-mall distance among them, one line per radius
mall_radius_columns = (
    ('500m', 'mall_within_500m'),
    ('1km', 'mall_within_1km'),
    ('2km', 'mall_within_2km'),
)
for radius_label, count_column in mall_radius_columns:
    rows_missing_count = housing_df[housing_df[count_column].isnull()]
    closest_mall = rows_missing_count['mall_nearest_distance'].min()
    print(f"Minimium mall distance greater than {radius_label}: {round(closest_mall, 3)}m")

For the rows with a missing mall count, the smallest nearest-mall distance is 500.056m, 1000.023m and 2000.176m for the 500m, 1km and 2km columns respectively. In each case the nearest mall lies outside the stated radius, so no mall falls within it. This implies that the nulls in each of these 3 columns can be set to 0.

In [None]:
# replace every remaining missing mall count with 0, one radius column at a time
for mall_column in ('mall_within_500m', 'mall_within_1km', 'mall_within_2km'):
    is_missing = housing_df[mall_column].isnull()
    housing_df.loc[is_missing, mall_column] = 0

Lastly, we convert the values in specific columns to appropriate data types. This is done at the very end as there were nulls in some of the columns and datatype conversion was not possible.

In [None]:
# change specific columns in dataset to appropriate data types:
# the id becomes a string, and each mall/hawker count becomes an integer
housing_df['id'] = housing_df['id'].map(str)

count_columns = (
    'mall_within_500m',
    'mall_within_1km',
    'mall_within_2km',
    'hawker_within_500m',
    'hawker_within_1km',
    'hawker_within_2km',
)
for count_column in count_columns:
    housing_df[count_column] = housing_df[count_column].map(int)

In [None]:
# to verify if there are any missing values in each column
housing_df.isnull().sum().sum()

In [None]:
# to check if there are any identical rows
housing_df.duplicated().sum()

We have verified that the housing dataset does not contain any nulls and no duplicated rows are seen as well.

In [None]:
# save and export the cleaned data (without the index column) for further analysis
CLEANED_CSV_PATH = "../datasets/cleaned_housing_data.csv"
housing_df.to_csv(CLEANED_CSV_PATH, index=False)