In [None]:
!pip install selenium
!apt-get update  # to update ubuntu to correctly run apt install
!apt install chromium-chromedriver
!cp / usr/lib/chromium-browser/chromedriver / usr/bin

!pip install pyspark


Web Crawler


In [13]:
import time
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
import sys
# NOTE(review): sys.path is Python's module search path, so this entry does
# not influence how the chromedriver executable is located; kept unchanged.
sys.path.insert(0, '/usr/lib/chromium-browser/chromedriver')


# Headless Chrome configuration.
chrome_options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    chrome_options.add_argument(chrome_flag)

# Directory Chrome saves downloads into.
# NOTE(review): this is a relative path - confirm it resolves to the same
# place the later cell reads download.zip from.
chrome_prefs = {"download.default_directory": './content/drive'}
chrome_options.add_experimental_option("prefs", chrome_prefs)

# Do not pass the driver path positionally: 'chromedriver' was only ever the
# default value of that parameter, and newer Selenium 4 releases no longer
# accept it, which makes the positional form raise a TypeError there.
driver = webdriver.Chrome(options=chrome_options)
url = "https://plvr.land.moi.gov.tw/DownloadOpenData"


# One-letter file prefixes of the cities to download, in click order:
# A = Taipei, F = New Taipei, H = Taoyuan, B = Taichung, E = Kaohsiung.
city_prefixes = ["A", "F", "H", "B", "E"]

# XPath of every element the crawler clicks, in order.
xpath_list = (
    [
        "//select[@id='historySeason_id']/option[@value='108S2']",  # season
        "//select[@id='fileFormatId']/option[@value='csv']",  # csv format
        "//input[@id='downloadTypeId2']",  # "advanced" download type
    ]
    + ["//input[@value='{}_lvr_land_A']".format(prefix)
       for prefix in city_prefixes]
    + ["//input[@id='downloadBtnId']"]  # download button
)


def clawer():
    """Drive the browser through the download form.

    Loads ``url``, then clicks the element with id ``ui-id-2`` followed by
    every XPath in ``xpath_list``, waiting up to 20 seconds for each element
    to become clickable. Finally prints a confirmation and sleeps 60 seconds.
    """
    timeout_seconds = 20
    driver.get(url)

    # The first click targets an element id; the rest are XPath lookups.
    locators = [(By.ID, 'ui-id-2')]
    locators.extend((By.XPATH, expression) for expression in xpath_list)

    for locator in locators:
        clickable = EC.element_to_be_clickable(locator)
        element = WebDriverWait(driver, timeout_seconds).until(clickable)
        element.click()

    print("Operation successful !")
    # Presumably gives the browser time to finish the download - confirm.
    time.sleep(60)


# Run the crawler. Failures are reported instead of being silently dropped,
# and the browser is shut down on success as well as on failure so the
# headless Chrome process is not left running.
try:
    clawer()
except Exception as exc:
    print("Crawler failed:", repr(exc))
finally:
    driver.quit()


Operation successful !


In [14]:
import pandas as pd
from zipfile import ZipFile
from pyspark.sql import SparkSession, DataFrame
from functools import reduce  # For Python 3.x

# CSV members to load from the downloaded archive, one per city.
csv_names = [
    'A_lvr_land_A.csv',
    'F_lvr_land_A.csv',
    'H_lvr_land_A.csv',
    'B_lvr_land_A.csv',
    'E_lvr_land_A.csv',
]

# Context managers close the archive and each member handle once the
# frames have been read into memory.
with ZipFile('./content/drive/download.zip') as zip_file:
    df_list = []
    for csv_name in csv_names:
        with zip_file.open(csv_name) as csv_file:
            df_list.append(pd.read_csv(csv_file))


Data cleaning and transformation


In [16]:
def covert_num(floor_name):
    """Convert a floor count to an int.

    Accepts ints (returned unchanged), floats (truncated; NaN/inf give 0),
    decimal digit strings such as "12", and Chinese numerals such as "五",
    "十二", "三十三" or "一百零一". A trailing "層" is ignored.

    Args:
        floor_name: The value to convert.

    Returns:
        The parsed integer, or 0 when the value is empty, None or cannot be
        recognised as a number.
    """
    import math

    if isinstance(floor_name, int):
        return floor_name
    if isinstance(floor_name, float):
        # int() raises on NaN and infinity, so treat those as "unknown".
        return int(floor_name) if math.isfinite(floor_name) else 0
    if floor_name is None:
        return 0

    text = str(floor_name).strip()
    if text.endswith('層'):
        text = text[:-1]
    if text.isdecimal():
        return int(text)

    digits = {"零": 0, "一": 1, "二": 2, "三": 3, "四": 4,
              "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
    units = {"十": 10, "百": 100}

    total = 0
    pending = 0  # digit read but not yet multiplied by a unit
    for char in text:
        if char in digits:
            if pending:
                # Two non-zero digits in a row is not a valid numeral.
                return 0
            pending = digits[char]
        elif char in units:
            # A bare unit ("十") means one of that unit.
            total += (pending or 1) * units[char]
            pending = 0
        else:
            return 0
    return total + pending


def convert_western_date(date):
    """Convert a date whose year is offset by 1911 into "YYYY-MM-DD".

    The input is read as year digits followed by a two-digit month and a
    two-digit day, e.g. "1081108" -> "2019-11-08". The year may have any
    number of digits, so "990101" -> "2010-01-01".

    Args:
        date: The date as a digit string (an int is also accepted).

    Returns:
        The converted "YYYY-MM-DD" string, or ``date`` unchanged when it is
        empty, None, too short or not purely digits.
    """
    text = "" if date is None else str(date).strip()
    # Need at least one year digit in front of MMDD.
    if len(text) < 5 or not text.isdecimal():
        return date

    # Slice from the right so only the leading year digits are converted;
    # a substring replace would also rewrite a month/day that happens to
    # repeat the year digits.
    year_part, month, day = text[:-4], text[-4:-2], text[-2:]
    return "{}-{}-{}".format(int(year_part) + 1911, month, day)


def clean_data(data_set, cityName):
    """Clean one city's raw frame and tag it with the city name.

    Steps: drop the row labelled 0 (presumably a secondary header row -
    confirm), replace missing values with 0, cast a fixed set of columns to
    str, convert '交易年月日' with ``convert_western_date``, derive
    'floor_Num' from '總樓層數' with ``covert_num``, and insert a leading
    'city' column.

    Args:
        data_set: Raw pandas DataFrame for one city.
        cityName: Value stored in the new 'city' column.

    Returns:
        A new, cleaned DataFrame; ``data_set`` itself is not modified.
    """
    # Columns that are forced to text.
    text_columns = (
        '主建物面積',
        '附屬建物面積',
        '陽台面積',
        '車位類別',
        '移轉層次',
        '總樓層數',
        '主要用途',
        '主要建材',
        '建築完成年月',
        '備註',
        '單價元平方公尺',
        '都市土地使用分區',
        '非都市土地使用分區',
        '非都市土地使用編定',
    )

    cleaned = data_set.drop([0]).fillna(0)
    cleaned = cleaned.astype({name: str for name in text_columns})

    # Derived / converted columns.
    cleaned['交易年月日'] = cleaned['交易年月日'].apply(convert_western_date)
    cleaned['floor_Num'] = cleaned['總樓層數'].apply(covert_num)
    cleaned.insert(0, 'city', cityName)

    return cleaned


# City label for each entry of df_list, in the same order.
city_names = ["台北市", "新北市", "桃園市", "台中市", "高雄市"]

# Clean every city's frame and stack them into a single frame.
cleaned_frames = [
    clean_data(raw_frame, city_name)
    for raw_frame, city_name in zip(df_list, city_names)
]
df = pd.concat(cleaned_frames, axis=0, join="outer", ignore_index=True)

print(df['floor_Num'])


0        33
1        13
2         0
3         4
4         5
         ..
49637     5
49638     2
49639    14
49640     2
49641     0
Name: floor_Num, Length: 49642, dtype: int64


Convert to Spark DataFrame


In [17]:
# Start (or reuse) a local Spark session for the merge and query steps.
spark = (
    SparkSession.builder
    .master("local[1]")
    .appName("SparkMergeDataSet")
    .getOrCreate()
)

# Clean each city's pandas frame and convert it to a Spark DataFrame.
city_labels = ["台北市", "新北市", "桃園市", "台中市", "高雄市"]
sparkdf_list = [
    spark.createDataFrame(clean_data(raw_frame, city_label))
    for raw_frame, city_label in zip(df_list, city_labels)
]

# Fold the per-city frames into one by pairwise union.
spark_df = reduce(DataFrame.unionAll, sparkdf_list)

print("total data count: ", spark_df.count())


total data count:  49642


- Merge DataFrames with PySpark
- Filter data by conditions
- Generate JSON files


In [18]:
import io
import json
from pyspark.sql.functions import desc

# Keep residential-use records in "住宅大樓" buildings with at least 13
# floors, newest transaction first.
# The helper column is referenced with the exact name it was created under
# ('floor_Num'), so the query does not depend on Spark resolving column
# names case-insensitively.
query_df = (
    spark_df
    .where('`主要用途` == "住家用"')
    .where('`建物型態` like "住宅大樓%"')
    .where('floor_Num >= 13')
    .sort(desc("交易年月日"))
)

# The helper column is only needed for filtering.
query_df = query_df.drop("floor_Num")
pandas_df = query_df.toPandas()
records = pandas_df.values.tolist()

# Group the filtered records as table[city][date] -> list of event dicts.
# Fields are selected by column name so that every JSON key carries the
# column of the same name regardless of the frame's column order.
# NOTE(review): assumes the source data has a '鄉鎮市區' column - confirm.
event_fields = pandas_df[
    ["city", "鄉鎮市區", "交易年月日", "建物型態", "主要用途", "總樓層數"]
]

table = {}
# .values.tolist() yields plain Python values, which json can serialise.
for city, district, date, building_state, purpose, floors in \
        event_fields.values.tolist():
    table.setdefault(city, {}).setdefault(date, []).append({
        "鄉鎮市區": district,
        "建物型態": building_state,
        "主要用途": purpose,
        "總樓層數": floors
    })

# Flatten table[city][date] into a list with one entry per city, each
# holding its dates and events as "time_slots".
result = [
    {
        "city": city_name,
        "time_slots": [
            {"date": slot_date, "events": slot_events}
            for slot_date, slot_events in slots_by_date.items()
        ],
    }
    for city_name, slots_by_date in table.items()
]

# Write the first two cities to part 1 and the rest to part 2.
# Each file holds a single JSON array: writing several objects back-to-back
# with no separator does not produce a parseable JSON document.
# Part 1 is pretty-printed and part 2 is compact, as before.
json_outputs = [
    ('result-part1.json', result[:2], 2),
    ('result-part2.json', result[2:], None),
]
for output_path, output_items, output_indent in json_outputs:
    with io.open(output_path, 'w', encoding='utf-8') as f:
        json.dump(output_items, f, ensure_ascii=False, indent=output_indent)


Save the DataFrame to SQLite3 for use by a RESTful API


In [19]:
import sqlite3

# Open the SQLite database file.
conn = sqlite3.connect('land.db')
cursor = conn.cursor()
conn.commit()

# if_exists='replace' drops any existing table before inserting the rows.
table_name = 'land_txn_log'
df.to_sql(table_name, conn, if_exists='replace', index=False)

# Read the row count back from the table.
count_query = "SELECT count(*) FROM {};".format(table_name)
us_df = pd.read_sql(count_query, conn)
print(us_df)


   count(*)
0     49642
