In [1]:
# Import relevant libraries

import datetime
import json
import re
from pathlib import Path
from urllib.parse import quote

import requests


In [2]:
# Defining functions to be used by web crawling processes


def format_url(input_string):
    """Percent-encode a value so it can be embedded in a URL query string.

    Surrounding whitespace is removed *before* encoding (the previous
    implementation stripped afterwards, so leading/trailing spaces had already
    become ``%20`` and were never removed). Every reserved character is
    escaped, not only spaces, so a value such as ``'A & B'`` no longer splits
    the query string at the ``&``.

    Args:
        input_string: The raw text to encode, or ``None``.

    Returns:
        The encoded text, or the literal string ``'null'`` when
        ``input_string`` is ``None``.
    """
    if input_string is None:
        return 'null'
    # safe='' also escapes '/', which quote() would otherwise leave untouched.
    return quote(input_string.strip(), safe='')


def format_slug(input_string):
    """Turn free text into a lower-case, dash-separated slug.

    The text is stripped and lower-cased, then every run of non-word
    characters is collapsed into a single ``-``. ``None`` yields the literal
    string ``'null'``.
    """
    if input_string is not None:
        normalised = input_string.strip().lower()
        return re.sub(r'\W+', '-', normalised)
    return 'null'


In [3]:
# Crawl data from edgeprop by url

def crawl_url(url):
    """GET ``url`` with a browser-like user agent and return the decoded JSON body.

    The request uses a 60 second timeout. ``raise_for_status()`` is called on
    the reply, so an HTTP error status raises instead of returning a body.
    """
    browser_headers = {
        'user-agent': (
            'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 '
            'Safari/537.36 '
        ),
    }

    reply = requests.get(url, headers=browser_headers, timeout=60)
    reply.raise_for_status()
    return reply.json()



In [4]:
# Crawl POIs from the GraphQL API. Calling this API is more involved because the request must include a JSON payload carrying the query and its variables.
def crawl_poi(latitude, longitude, category=None):
    """POST a GraphQL ``pois`` query for the given coordinate and return the JSON reply.

    The query asks for up to 100 items (``pageSize``) within ``radius`` 3000 of
    ``"<latitude>,<longitude>"`` (the radius unit is not stated here; presumably
    metres, to be confirmed against the API). ``category`` is passed through as
    the ``PoiCategory`` variable and may be ``None``.

    The request uses a 30 second timeout and raises on an HTTP error status.
    """
    endpoint = 'https://raptor.rea-asia.com/v1/graphql'

    # Headers imitate the request a browser sends from the iproperty.com.my site.
    request_headers = {
        'accept': '*/*',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en-GB',
        'content-type': 'application/json',
        'market': 'MY',
        'origin': 'https://www.iproperty.com.my',
        'sec-fetch-mode': 'cors',
        'sec-fetch-site': 'cross-site',
        'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.130 Safari/537.36',
        'x-market': 'ipropertymy'
    }

    graphql_query = 'query ($lang: AcceptLanguage, $location: String!, $radius: Int, $pageSize: Int, $category: PoiCategory) {\n  pois(location: $location, radius: $radius, pageSize: $pageSize, category: $category, lang: $lang) {\n    items {\n      name\n      subTypeLabel\n      subTypeExtra\n      geometry {\n        location {\n          lat\n          lng\n          __typename\n        }\n        __typename\n      }\n      subType\n      category\n      lineName\n      placeId\n      distance\n      distanceFloat\n      completionYear\n      type\n      city\n      district\n      publicType\n      curriculumOffered\n      __typename\n    }\n    __typename\n  }\n}\n'

    query_variables = {
        'lang': 'enGB',
        'location': ','.join((str(latitude), str(longitude))),
        'radius': 3000,
        'pageSize': 100,
        'category': category
    }

    # Key order is kept as operationName / variables / query so the serialised body is unchanged.
    request_body = json.dumps({
        'operationName': None,
        'variables': query_variables,
        'query': graphql_query
    })

    reply = requests.post(endpoint, headers=request_headers, data=request_body, timeout=30)
    reply.raise_for_status()
    return reply.json()


In [5]:
# Preparing data directory, csv files for crawled data & csv header

# Directory that receives the crawl output; created on demand.
data_directory = 'data/'
Path(data_directory).mkdir(parents=True, exist_ok=True)

# Open the three output files with an explicit UTF-8 encoding. Without it the
# platform default is used, which on non-UTF-8 locales can raise
# UnicodeEncodeError as soon as a crawled name contains a character outside
# that code page. Re-running this cell truncates any existing files.
# NOTE: these handles stay open for the crawl cells below; nothing in this cell closes them.
township_csv = open(data_directory + 'q1_townships.csv', 'w+', encoding='utf-8')
transaction_csv = open(data_directory + 'q1_transactions.csv', 'w+', encoding='utf-8')
poi_csv = open(data_directory + 'q1_pois.csv', 'w+', encoding='utf-8')

# Header rows; flush=True pushes them to disk immediately.
print('project_id,asset_id,latitude,longitude,project_name,state,area,street_name,transaction_count,median_psf,median_price', flush=True, file = township_csv)
print('project_id,project_name,transacted_price,unit_price_psf,date,property_type,tenure,floor,area_sqft,non_landed,bedrooms,street_name,psf,price,state,planning_region', flush=True, file = transaction_csv)
print('name,sub_type_label,sub_type_extra,sub_type,category,line_name,place_id,completion_year,type,city,district,public_type,curriculum_offered,latitude,longitude', flush=True, file = poi_csv)


In [6]:
# Preparing API url template for crawling data from edgeprop

# Common prefix shared by both transaction endpoints.
edgepro_api = 'https://www.edgeprop.my/jwdalice/api/v1/transactions/'

# Township search; placeholders in order: state, date from, date to, page number.
townships_url_template = ''.join((
    edgepro_api,
    'search?&category=RESIDENTIAL&state={}&datefrom={}&dateto={}&page={}&respp=10',
))

# Transaction details; placeholders in order: state, project, date from, date to.
# The page number is fixed at 1 in this template.
transactions_url_template = ''.join((
    edgepro_api,
    'details?&category=RESIDENTIAL&state={}&project={}&datefrom={}&dateto={}&page=1',
))


In [7]:
# Defining function to crawl all townships and transactions
# Fields copied from each API record into the matching CSV, in column order.
_TOWNSHIP_FIELDS = (
    'projectid', 'asset_id', 'lat', 'lon', 'project_name', 'state', 'area', 'street_name',
    'fieldtransactions', 'psf', 'price'
)
_POI_FIELDS = (
    'name', 'subTypeLabel', 'subTypeExtra', 'subType', 'category', 'lineName',
    'placeId', 'completionYear', 'type', 'city', 'district', 'publicType',
    'curriculumOffered', 'latitude', 'longitude'
)
_TRANSACTION_FIELDS = (
    'projectid', 'project_name', 'transacted_price', 'unit_price_psf', 'date',
    'proptype', 'tenure', 'floor', 'area_sqft', 'non_landed', 'bedrooms',
    'street_name', 'psf', 'price', 'state', 'planning_region'
)
_POI_CATEGORIES = ('education', 'healthcare', 'transportation')
_DATE_FORMAT = '%Y-%m-%d'


def _write_csv_row(record, fields, csv_file):
    """Write the selected ``fields`` of ``record`` to ``csv_file`` as one comma-joined line."""
    # Rows are joined with bare commas (no quoting), so commas inside values are dropped.
    row = ','.join(str(record[field]).replace(',', '') for field in fields)
    print(row, flush=True, file=csv_file)


def _crawl_township_pois(township):
    """Fetch the POIs around one township for every category and append them to ``poi_csv``."""
    latitude = township['lat']
    longitude = township['lon']

    for category in _POI_CATEGORIES:
        response = crawl_poi(latitude, longitude, category)

        # Any level of the reply may be missing or null; treat all of those as "no items"
        # instead of raising KeyError/TypeError.
        data = (response or {}).get('data') or {}
        pois = data.get('pois') or {}
        items = pois.get('items') or []

        for poi in items:
            try:
                location = poi['geometry']['location']
                poi['latitude'] = location['lat']
                poi['longitude'] = location['lng']
            except (KeyError, TypeError):
                # Geometry absent or null: keep the row but leave the coordinates blank.
                poi['latitude'] = ''
                poi['longitude'] = ''

            _write_csv_row(poi, _POI_FIELDS, poi_csv)


def _crawl_township_transactions(state, project, date_from, date_to):
    """Fetch the transactions of one project and append them to ``transaction_csv``.

    The details URL always requests page 1, so further results are reached by
    moving the end of the date window back to the date of the last transaction
    returned and asking again.
    NOTE(review): because the new window end is the date of an already-written
    transaction, rows from that day can be fetched and written again; confirm
    whether the output is de-duplicated downstream.
    """
    first_day = datetime.datetime.strptime(date_from, _DATE_FORMAT).date()
    transaction_date_to = date_to

    while True:
        transactions_url = transactions_url_template.format(
            format_url(state), format_url(project), date_from, transaction_date_to
        )

        # One dot per request as a progress indicator.
        print('.', end='')

        transaction_list = crawl_url(transactions_url)

        if transaction_list is None:
            # Retrying the identical URL with `continue` would loop forever; give up on this project.
            break

        transactions = transaction_list['property'] or []
        for transaction in transactions:
            _write_csv_row(transaction, _TRANSACTION_FIELDS, transaction_csv)

        # int() mirrors how the township loop reads 'totalpages' (it may arrive as a string).
        if not transactions or int(transaction_list['totalpages']) == 1:
            break

        window_end = datetime.datetime.strptime(transaction_date_to, _DATE_FORMAT).date()
        # Compare calendar dates only. The raw timestamp carries a time of day, so comparing
        # full datetimes against the midnight window end would almost never match and the
        # same window could be requested forever.
        # NOTE(review): the timestamp is interpreted as UTC, as before — confirm the API's timezone.
        oldest_day = datetime.datetime.fromtimestamp(
            transactions[-1]['date'], datetime.timezone.utc
        ).date()

        if oldest_day >= window_end:
            # The window did not shrink: stop at the start date, otherwise step back one day.
            if window_end <= first_day:
                break
            oldest_day = window_end - datetime.timedelta(days=1)

        transaction_date_to = oldest_day.strftime(_DATE_FORMAT)


def crawl_townships_transactions(state, date_from, date_to):
    """Crawl every township of ``state`` plus its POIs and transactions into the CSV files.

    Township search pages are requested one after another until the page number
    exceeds the reported ``totalpages`` (or the reply is ``None``). For each
    township a row is written to ``township_csv``, its surrounding POIs are
    written to ``poi_csv``, and, when its transaction count is positive, its
    transactions are written to ``transaction_csv``.

    Args:
        state: State name as used by the API (e.g. ``'KUALA LUMPUR'``).
        date_from: Start of the date range, formatted ``YYYY-MM-DD``.
        date_to: End of the date range, formatted ``YYYY-MM-DD``.

    Raises:
        Whatever ``crawl_url`` / ``crawl_poi`` raise, and ``KeyError`` if a
        record lacks one of the exported fields.
    """
    page = 1

    while True:
        townships_url = townships_url_template.format(format_url(state), date_from, date_to, str(page))
        township_list = crawl_url(townships_url)

        if township_list is None:
            break

        if page > int(township_list['totalpages']):
            break

        for township in township_list['property'] or []:
            _write_csv_row(township, _TOWNSHIP_FIELDS, township_csv)
            _crawl_township_pois(township)

            if int(township['fieldtransactions']) > 0:
                _crawl_township_transactions(state, township['project_name'], date_from, date_to)

        page += 1
        

In [8]:
# Main program starts

# Print the start time so the total runtime can be read off the cell output.
print('Program Starts:', datetime.datetime.now())

# Crawl window ('YYYY-MM-DD' strings) and the states to cover.
date_from, date_to = '2019-12-01', '2019-12-31'
states = ['KUALA LUMPUR', 'SELANGOR', 'PUTRAJAYA']

print('Crawling "townships", "pois", "transactions" from', date_from, 'to', date_to, 'for:')
for state in states:
    # The state name opens a line; the crawl then runs for that state and the
    # bare print() afterwards ends the line.
    print(state, end = ' ')
    crawl_townships_transactions(state, date_from, date_to)
    print()

# Print the finish time.
print('Program Ends:', datetime.datetime.now())

Program Starts: 2020-05-22 22:59:18.269819
Crawling "townships", "pois", "transactions" from 2019-12-01 to 2019-12-31 for:
KUALA LUMPUR ..............................................................................
SELANGOR .....................................................
PUTRAJAYA .
Program Ends: 2020-05-22 23:05:44.392090
