In [1]:
import numpy as np 
import pandas as pd
import datetime
from geopy.distance import distance

In [2]:
# Locate the CSV files: the q3_* paths are the inputs read below, the
# unprefixed *_csv paths point at the q4 files.

data_directory = 'data/'

q3_township_csv = f'{data_directory}q3_townships.csv'
q3_transaction_csv = f'{data_directory}q3_transactions.csv'
q3_poi_csv = f'{data_directory}q3_pois.csv'

township_csv = f'{data_directory}q4_townships.csv'
transaction_csv = f'{data_directory}q4_transactions.csv'
poi_csv = f'{data_directory}q4_pois.csv'

# All three q3 files are read the same way: the literal string 'None' is
# treated as missing and the first CSV column becomes the index.
townships, transactions, pois = (
    pd.read_csv(csv_path, na_values=['None'], index_col=0)
    for csv_path in (q3_township_csv, q3_transaction_csv, q3_poi_csv)
)


In [3]:
# Preview the loaded townships table (project location plus median_psf / median_price)
townships

Unnamed: 0,project_id,latitude,longitude,project_name,state,area,street_name,median_psf,median_price
0,10113,3.200030,101.642998,Kepong Baru,Kuala Lumpur,Kepong,JALAN HELANG,388,775000
1,16814,3.087248,101.721283,Bandar Tun Razak (Kampung Konggo),Kuala Lumpur,Bandar Tun Razak,JALAN JUJUR 3,294,396667
2,16319,3.181050,101.672996,Sri Putramas I,Kuala Lumpur,Dutamas,JALAN PUTRAMAS 1,394,432667
3,17003,3.149142,101.624763,Taman Tun Dr Ismail (TTDI),Kuala Lumpur,Taman Tun Dr Ismail,LENGKOK AMINUDDIN BAKI,814,1541667
4,25133,3.192106,101.639056,Bandar Menjalara (Desa Seri Mahkota),Kuala Lumpur,Bandar Menjalara,JALAN 6/62,512,635000
...,...,...,...,...,...,...,...,...,...
127,13600,3.060640,101.585999,USJ 2,Selangor,USJ,JALAN USJ 2/4E,432,675000
128,13603,3.024680,101.584999,USJ 22,Selangor,USJ,JALAN USJ 22/2B,343,850000
129,13606,3.056050,101.571999,USJ 3,Selangor,USJ,JALAN USJ 3/4B,619,740000
130,13632,3.080259,101.774374,Venice Hill,Selangor,Batu 9th Cheras,PERSIARAN PUTERI 1,78,120000


In [4]:
# Preview the points-of-interest table: name, category, latitude, longitude
pois

Unnamed: 0,name,category,latitude,longitude
0,Sekolah Menengah Kebangsaan Kepong Baru,education,3.197770,101.647000
1,Sekolah Kebangsaan Taman Kepong,education,3.198950,101.648000
2,Sekolah Kebangsaan Kepong Baru,education,3.205899,101.645528
3,Sekolah Kebangsaan Menjalara,education,3.193969,101.637578
4,Sekolah Menengah Kebangsaan Taman Bukit Maluri,education,3.202450,101.634000
...,...,...,...,...
5358,Sekolah Kebangsaan Seksyen 27 (2),education,3.016785,101.570449
5359,Sekolah Menengah Kebangsaan Seksyen 27,education,3.021596,101.566105
5361,Sekolah Kebangsaan Seksyen 27(1),education,3.020541,101.565389
5362,Sekolah Menengah Kebangsaan Alam Megah,education,3.012403,101.568793


In [5]:
# Preview the transactions table as loaded, before any township data is merged in
transactions

Unnamed: 0,project_id,date,property_type,area_sqft,non_landed,bedrooms,street_name,psf,price,planning_region
0,10113,1576454400,Terrace House,1970,0,3.0,JALAN HELANG,310,610000,Kepong
1,10113,1576195200,Terrace House,1539,0,3.0,LORONG LANG HITAM 6,468,720000,Kepong
2,10113,1576108800,Terrace House,1539,0,3.0,JALAN LANG KEPALA PUTIH,435,670000,Kepong
3,10113,1575936000,Terrace House,3272,0,3.0,JALAN CHIAK RAYA,336,1100000,Kepong
4,16814,1575936000,Cluster House,990,0,2.0,JALAN JUJUR 3,283,280000,Bandar Tun Razak
...,...,...,...,...,...,...,...,...,...,...
161,13592,1575936000,Terrace House,3337,0,3.0,JALAN USJ 11/7,330,1100000,USJ
162,13600,1575244800,Terrace House,1561,0,3.0,JALAN USJ 2/4E,432,675000,USJ
163,13603,1575763200,Terrace House,2476,0,3.0,JALAN USJ 22/2B,343,850000,USJ
164,13606,1575849600,Terrace House,1195,0,3.0,JALAN USJ 3/4B,619,740000,USJ


## Integration

In [6]:
# Tally the POIs per category to see which categories are present
pois['category'].value_counts()

education         961
transportation    193
healthcare        101
Name: category, dtype: int64

## Add new columns `nearby_poi_education`, `nearby_poi_transportation` and `nearby_poi_healthcare` to the township dataset


In [7]:
# Add a 'coordinates' column to pois: a (latitude, longitude) tuple per POI,
# with both values rounded to 4 decimal places
poi_latitudes = pois['latitude'].round(4)
poi_longitudes = pois['longitude'].round(4)
pois['coordinates'] = list(zip(poi_latitudes, poi_longitudes))

# distance between two points, in metres
def calc_distance(source, target):
    """Return the distance from ``source`` to ``target`` in metres.

    Both arguments are handed unchanged to geopy's ``distance``; the callers
    in this notebook pass ``(latitude, longitude)`` tuples.
    """
    separation = distance(source, target)
    return separation.meters


# count the POIs of each category around a coordinate
def nearby_poi(target_coordinate, radius_m=3000,
               categories=('education', 'transportation', 'healthcare')):
    """Count the POIs of each category that lie within a radius of a point.

    Reads the module-level ``pois`` frame (columns ``coordinates``,
    ``category`` and ``name``) and leaves it unmodified.

    Parameters
    ----------
    target_coordinate : tuple
        ``(latitude, longitude)`` of the point to search around.
    radius_m : float, default 3000
        Search radius in metres. A POI exactly ``radius_m`` away is counted.
    categories : sequence of str, default education/transportation/healthcare
        Values of ``pois['category']`` to count, in output order.

    Returns
    -------
    tuple
        One count per entry of ``categories``, in the same order. Each count
        is the number of matching POIs whose ``name`` is not missing.
    """
    # Keep the distances in a local Series instead of adding and dropping a
    # temporary 'distance' column on the shared pois frame for every call.
    distances = pois['coordinates'].apply(
        lambda poi_coordinate: calc_distance(poi_coordinate, target_coordinate)
    )

    # The radius test does not depend on the category, so evaluate it once.
    within_radius = distances <= radius_m

    nearby_pois_count = tuple(
        pois.loc[within_radius & (pois['category'] == category), 'name'].count()
        for category in categories
    )

    # one dot per processed coordinate serves as a progress indicator
    print('.', end='')

    return nearby_pois_count




In [8]:
# Add a 'coordinates' column to townships: a (latitude, longitude) tuple per township
townships['coordinates'] = list(zip(townships['latitude'], townships['longitude']))

# Record the nearby POI counts for every township in a new 'nearby_pois' column.
# Only the coordinates are needed, so the function is applied to that column
# directly instead of walking the whole frame row by row.
print('Start', datetime.datetime.now())
townships['nearby_pois'] = townships['coordinates'].apply(nearby_poi)
print()
print('Completed', datetime.datetime.now())

Start 2020-05-22 23:50:48.379903
..............................................................................................................................
Completed 2020-05-22 23:51:11.229702


In [9]:
# Expand the tuple returned by the function into three new columns:
# nearby_poi_education, nearby_poi_transportation and nearby_poi_healthcare

poi_count_columns = [
    'nearby_poi_education', 'nearby_poi_transportation', 'nearby_poi_healthcare'
]
poi_counts = pd.DataFrame(townships['nearby_pois'].tolist(), index=townships.index)
townships[poi_count_columns] = poi_counts

# The helper columns are no longer needed once the counts are split out
townships.drop(columns=['coordinates', 'nearby_pois'], inplace=True)
townships


Unnamed: 0,project_id,latitude,longitude,project_name,state,area,street_name,median_psf,median_price,nearby_poi_education,nearby_poi_transportation,nearby_poi_healthcare
0,10113,3.200030,101.642998,Kepong Baru,Kuala Lumpur,Kepong,JALAN HELANG,388,775000,37,10,3
1,16814,3.087248,101.721283,Bandar Tun Razak (Kampung Konggo),Kuala Lumpur,Bandar Tun Razak,JALAN JUJUR 3,294,396667,42,11,5
2,16319,3.181050,101.672996,Sri Putramas I,Kuala Lumpur,Dutamas,JALAN PUTRAMAS 1,394,432667,58,16,10
3,17003,3.149142,101.624763,Taman Tun Dr Ismail (TTDI),Kuala Lumpur,Taman Tun Dr Ismail,LENGKOK AMINUDDIN BAKI,814,1541667,35,7,3
4,25133,3.192106,101.639056,Bandar Menjalara (Desa Seri Mahkota),Kuala Lumpur,Bandar Menjalara,JALAN 6/62,512,635000,35,9,3
...,...,...,...,...,...,...,...,...,...,...,...,...
127,13600,3.060640,101.585999,USJ 2,Selangor,USJ,JALAN USJ 2/4E,432,675000,42,12,3
128,13603,3.024680,101.584999,USJ 22,Selangor,USJ,JALAN USJ 22/2B,343,850000,26,5,2
129,13606,3.056050,101.571999,USJ 3,Selangor,USJ,JALAN USJ 3/4B,619,740000,33,7,1
130,13632,3.080259,101.774374,Venice Hill,Selangor,Batu 9th Cheras,PERSIARAN PUTERI 1,78,120000,14,2,0


In [10]:
# Write the townships table, including the nearby_poi_* count columns, to the q4 townships CSV
townships.to_csv(township_csv)

## Merge Township dataset into Transaction dataset


In [11]:
# Inner-join each transaction to its township on project_id; transactions
# whose project_id has no township row are dropped. For column names present
# in both frames, the township copy gets a '_township' suffix.

transactions = pd.merge(
    transactions,
    townships,
    how='inner',
    on='project_id',
    suffixes=('', '_township'),
)

In [12]:
# Inspect the merged frame: each transaction row now also carries its township's columns
transactions

Unnamed: 0,project_id,date,property_type,area_sqft,non_landed,bedrooms,street_name,psf,price,planning_region,...,longitude,project_name,state,area,street_name_township,median_psf,median_price,nearby_poi_education,nearby_poi_transportation,nearby_poi_healthcare
0,10113,1576454400,Terrace House,1970,0,3.0,JALAN HELANG,310,610000,Kepong,...,101.642998,Kepong Baru,Kuala Lumpur,Kepong,JALAN HELANG,388,775000,37,10,3
1,10113,1576195200,Terrace House,1539,0,3.0,LORONG LANG HITAM 6,468,720000,Kepong,...,101.642998,Kepong Baru,Kuala Lumpur,Kepong,JALAN HELANG,388,775000,37,10,3
2,10113,1576108800,Terrace House,1539,0,3.0,JALAN LANG KEPALA PUTIH,435,670000,Kepong,...,101.642998,Kepong Baru,Kuala Lumpur,Kepong,JALAN HELANG,388,775000,37,10,3
3,10113,1575936000,Terrace House,3272,0,3.0,JALAN CHIAK RAYA,336,1100000,Kepong,...,101.642998,Kepong Baru,Kuala Lumpur,Kepong,JALAN HELANG,388,775000,37,10,3
4,16814,1575936000,Cluster House,990,0,2.0,JALAN JUJUR 3,283,280000,Bandar Tun Razak,...,101.721283,Bandar Tun Razak (Kampung Konggo),Kuala Lumpur,Bandar Tun Razak,JALAN JUJUR 3,294,396667,42,11,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,13592,1575936000,Terrace House,3337,0,3.0,JALAN USJ 11/7,330,1100000,USJ,...,101.580002,USJ 11,Selangor,USJ,JALAN USJ 11/7,330,1100000,35,8,1
145,13600,1575244800,Terrace House,1561,0,3.0,JALAN USJ 2/4E,432,675000,USJ,...,101.585999,USJ 2,Selangor,USJ,JALAN USJ 2/4E,432,675000,42,12,3
146,13603,1575763200,Terrace House,2476,0,3.0,JALAN USJ 22/2B,343,850000,USJ,...,101.584999,USJ 22,Selangor,USJ,JALAN USJ 22/2B,343,850000,26,5,2
147,13606,1575849600,Terrace House,1195,0,3.0,JALAN USJ 3/4B,619,740000,USJ,...,101.571999,USJ 3,Selangor,USJ,JALAN USJ 3/4B,619,740000,33,7,1


In [13]:
# Compare planning_region (from the transactions) with area (from the townships).
# Only one row disagrees, so the planning_region column can be dropped.

region_differs = transactions['area'].ne(transactions['planning_region'])
transactions[region_differs]

Unnamed: 0,project_id,date,property_type,area_sqft,non_landed,bedrooms,street_name,psf,price,planning_region,...,longitude,project_name,state,area,street_name_township,median_psf,median_price,nearby_poi_education,nearby_poi_transportation,nearby_poi_healthcare
35,9113,1575244800,Terrace House,3466,0,4.0,JALAN DESA MELAWATI 1A,551,1910000,Wangsa Maju,...,101.739,Desa Melawati,Kuala Lumpur,Taman Melawati,JALAN DESA MELAWATI 1A,551,1910000,39,4,2


In [14]:
# Review dtypes and non-null counts of the merged frame
transactions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 149 entries, 0 to 148
Data columns (total 21 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   project_id                 149 non-null    int64  
 1   date                       149 non-null    int64  
 2   property_type              149 non-null    object 
 3   area_sqft                  149 non-null    int64  
 4   non_landed                 149 non-null    int64  
 5   bedrooms                   149 non-null    float64
 6   street_name                148 non-null    object 
 7   psf                        149 non-null    int64  
 8   price                      149 non-null    int64  
 9   planning_region            149 non-null    object 
 10  latitude                   149 non-null    float64
 11  longitude                  149 non-null    float64
 12  project_name               149 non-null    object 
 13  state                      149 non-null    object 

In [15]:
# Remove the columns that are no longer useful
columns_to_drop = [
    'date', 'project_id', 'latitude', 'longitude', 'street_name', 'project_name',
    'psf', 'street_name_township', 'planning_region',
]
transactions.drop(columns=columns_to_drop, inplace=True)

In [16]:
# Show the transactions table after the unused columns were dropped
transactions

Unnamed: 0,property_type,area_sqft,non_landed,bedrooms,price,state,area,median_psf,median_price,nearby_poi_education,nearby_poi_transportation,nearby_poi_healthcare
0,Terrace House,1970,0,3.0,610000,Kuala Lumpur,Kepong,388,775000,37,10,3
1,Terrace House,1539,0,3.0,720000,Kuala Lumpur,Kepong,388,775000,37,10,3
2,Terrace House,1539,0,3.0,670000,Kuala Lumpur,Kepong,388,775000,37,10,3
3,Terrace House,3272,0,3.0,1100000,Kuala Lumpur,Kepong,388,775000,37,10,3
4,Cluster House,990,0,2.0,280000,Kuala Lumpur,Bandar Tun Razak,294,396667,42,11,5
...,...,...,...,...,...,...,...,...,...,...,...,...
144,Terrace House,3337,0,3.0,1100000,Selangor,USJ,330,1100000,35,8,1
145,Terrace House,1561,0,3.0,675000,Selangor,USJ,432,675000,42,12,3
146,Terrace House,2476,0,3.0,850000,Selangor,USJ,343,850000,26,5,2
147,Terrace House,1195,0,3.0,740000,Selangor,USJ,619,740000,33,7,1


In [17]:
# Write the merged, trimmed transactions table to the q4 transactions CSV
transactions.to_csv(transaction_csv)