# 準備工作

In [1]:
# 準備工作

from bs4 import BeautifulSoup
import time
import datetime
import re
import sqlite3
import os
import json
import csv
import pprint

# Input: JSON file loaded further down in this notebook with json.load().
data_source = "../data/21_century.json"
# Output: path the final CSV table is written to at the end of the notebook.
output_file_path = "../data/21_century.csv"

In [2]:
# Load the Taiwan city / district / street lookup table.

addr_source = "../data/全台鄉鎮市區街道列表.csv"

# newline="" lets the csv module handle line endings itself, as its
# documentation requires for file objects passed to csv.reader.
# NOTE(review): no encoding is given, so the platform default is used —
# confirm it matches how the CSV file was saved.
with open(addr_source, newline="") as addr_file:
    # Each row becomes a tuple; the lookup code below indexes columns 0, 1, 2.
    addr_ref = [tuple(row) for row in csv.reader(addr_file)]

In [18]:
# 檢視全台鄉鎮市區街道列表
# pprint.pprint(addr_ref)

# 定義方法

In [4]:
def sect_format(string):
    """Normalise road-section numerals and the character 台 in an address.

    Replaces '1段'..'9段' with the Chinese-numeral form ('一段'..'九段') and
    every '台' with '臺'. Replacements are applied one after another, in the
    order listed, to the whole string.

    Args:
        string: The address text to normalise.

    Returns:
        The string with all replacements applied.
    """
    replacements = (
        ('1段', '一段'),
        ('2段', '二段'),
        ('3段', '三段'),
        ('4段', '四段'),
        ('5段', '五段'),
        ('6段', '六段'),
        ('7段', '七段'),
        ('8段', '八段'),
        ('9段', '九段'),
        ('台', '臺'),
    )

    result = string
    for old, new in replacements:
        result = result.replace(old, new)
    return result

In [5]:
def which_in_list(addr, addr_ref):
    """Find the best reference address contained in a free-form address.

    Each reference row is indexed as (city, district, street, ...). The rows
    are scanned in order:

    * The first row whose ``district + street`` occurs in ``addr`` wins
      immediately; all of its columns joined together are returned.
    * Otherwise the longest ``city + district`` that occurs in ``addr`` is
      returned.

    Rows with fewer than three columns (for example blank CSV lines) are
    skipped instead of raising IndexError, and a row whose district and
    street are both empty is never treated as a street-level match, since
    the empty string is contained in every address.

    Args:
        addr: The address text to look up.
        addr_ref: Iterable of row tuples/lists of strings.

    Returns:
        The matched reference address, or "" when nothing matches.
    """
    final_addr = ""
    for addr_tuple in addr_ref:
        # Blank or truncated rows cannot be indexed as city/district/street.
        if len(addr_tuple) < 3:
            continue

        city_distr = addr_tuple[0] + addr_tuple[1]
        distr_street = addr_tuple[1] + addr_tuple[2]

        # Guard against "" in addr, which is always True.
        if distr_street and distr_street in addr:
            final_addr = "".join(addr_tuple)
            break

        # Keep the longest city+district seen so far as a fallback.
        if city_distr in addr and len(city_distr) > len(final_addr):
            final_addr = city_distr

    return final_addr

In [6]:
def findall_and_glue(regex, string):
    """Concatenate every match of ``regex`` found in ``string``.

    Matches are joined in the order they occur, with no separator.

    Args:
        regex: Regular-expression pattern passed to re.findall.
        string: The text to search.

    Returns:
        The concatenated matches, or "" when there are no matches, the
        pattern is invalid, or the inputs are not usable as text (for
        example ``string`` is None, or the pattern has several groups so
        findall yields tuples).
    """
    try:
        # Search the ``string`` argument; the previous code referenced an
        # undefined name here and therefore always fell into the handler.
        return "".join(re.findall(regex, string))
    except (re.error, TypeError):
        return ""

In [7]:
def clean_address(addr):
    """Build a normalised address from a raw address string.

    The result is the reference city/district/street found by
    which_in_list() (looked up in the module-level ``addr_ref``) followed by
    every lane / alley / neighbourhood / number / floor fragment
    ('<digits>巷', '<digits>弄', '<digits>鄰', '<digits>號', '<digits>樓')
    found in the section-normalised address, in order of appearance.

    Args:
        addr: The raw address text.

    Returns:
        The normalised address string; "" when nothing could be matched.
    """
    addr = sect_format(addr)

    # Raw string: '\d' in a normal literal is an invalid escape sequence.
    detail_regex = r'\d+巷|\d+弄|\d+鄰|\d+號|\d+樓'

    final_addr = which_in_list(addr, addr_ref)
    final_addr += findall_and_glue(detail_regex, addr)

    return final_addr

In [8]:
# Define permits()
# Decides whether a listing allows the item named by item_string (e.g. pets).
# main_data_all is the dictionary holding all data for a single listing.

def permits(main_data_all, item_string):
    """Decide whether a listing allows the thing named by ``item_string``.

    Every key/value pair of ``main_data_all`` that mentions ``item_string``
    (in the key or in the value text) is checked:

    * '禁止…<item>' or '不可…<item>' sets the answer to "N";
    * otherwise '開放…<item>' or '可…<item>' sets it to "Y".

    Pairs are processed in dictionary order and a later matching pair
    overwrites an earlier answer.

    Args:
        main_data_all: Dictionary with all data of one listing. Values that
            can be joined as strings (str, list of str, ...) are joined;
            anything else is converted with str().
        item_string: The literal text to look for, e.g. '寵物'.

    Returns:
        "Y", "N", or 'NULL' when no statement about the item was found.
    """
    permit = 'NULL'

    # item_string is matched literally everywhere else (``in`` tests), so
    # escape it to keep regex metacharacters from changing the pattern.
    item_pattern = re.escape(item_string)
    re_no_criteria = re.compile(
        r'(禁止\w*?' + item_pattern + r'|不可\w*?' + item_pattern + ')')
    re_yes_criteria = re.compile(
        r'(開放\w*?' + item_pattern + r'|可\w*?' + item_pattern + ')')

    for each in main_data_all:
        value = main_data_all[each]
        try:
            desc_value = "".join(value)
        except TypeError:
            # Not an iterable of strings (number, None, mixed list, ...).
            desc_value = str(value)

        if item_string in each or item_string in desc_value:
            text = each + " " + desc_value
            # The prohibition is tested first because '可' also occurs
            # inside '不可'.
            if re_no_criteria.search(text):
                permit = "N"
            elif re_yes_criteria.search(text):
                permit = "Y"

    return permit

In [9]:
# Define has()
# Decides whether a listing includes the item named by item_string (e.g. a parking space).
# main_data_all is the dictionary holding all data for a single listing.

def has(main_data_all, item_string):
    """Decide whether a listing includes the thing named by ``item_string``.

    Every key/value pair of ``main_data_all`` that mentions ``item_string``
    (in the key or in the value text) is checked:

    * '沒有…<item>' or '無…<item>' sets the answer to "N";
    * otherwise '有…<item>' or '含…<item>' sets it to "Y".

    Pairs are processed in dictionary order and a later matching pair
    overwrites an earlier answer.

    Args:
        main_data_all: Dictionary with all data of one listing. Values that
            can be joined as strings (str, list of str, ...) are joined;
            anything else is converted with str().
        item_string: The literal text to look for, e.g. '車位'.

    Returns:
        "Y", "N", or 'NULL' when no statement about the item was found.
    """
    permit = 'NULL'

    # item_string is matched literally everywhere else (``in`` tests), so
    # escape it to keep regex metacharacters from changing the pattern.
    item_pattern = re.escape(item_string)
    re_no_criteria = re.compile(
        r'(沒有\w*?' + item_pattern + r'|無\w*?' + item_pattern + ')')
    re_yes_criteria = re.compile(
        r'(有\w*?' + item_pattern + r'|含\w*?' + item_pattern + ')')

    for each in main_data_all:
        value = main_data_all[each]
        try:
            desc_value = "".join(value)
        except TypeError:
            # Not an iterable of strings (number, None, mixed list, ...).
            desc_value = str(value)

        if item_string in each or item_string in desc_value:
            text = each + " " + desc_value
            # The negative form is tested first because '有' also occurs
            # inside '沒有'.
            if re_no_criteria.search(text):
                permit = "N"
            elif re_yes_criteria.search(text):
                permit = "Y"

    return permit

In [10]:
# get_filtered_table()
# Extracts from the main_data dataset all fields needed for MySQL
# and returns them as a list of rows (header row first), not as a dictionary.

def get_filtered_table(main_data):
    """Flatten the listings in ``main_data`` into a table of rows.

    Args:
        main_data: Mapping of listing id -> dictionary with that listing's
            data. The keys 'url', '爬蟲日期時間', '地點名稱', '地址', '租金',
            '緯度', '經度' and '性別限制' are read without a fallback, so a
            listing missing any of them raises KeyError.

    Returns:
        A list of lists. The first row is the header; each following row
        holds one listing in the same column order. Note the header names
        differ from the local variable names: 'pattern' holds ``格局``,
        'label' holds ``ttype`` and 'rent' holds ``avg_rent``.
    """
    
    # Header row; the order must match the row appended at the end of the loop.
    filtered_table = [[
        'url',
        'title',
        'address',
        'pattern',  
        'floor',
        'label',
        'rent',
        'lat',
        'lng',
        'sex',
        'space',
        'smoke',
        'pet',
        'cook',
        'parking',
        'updateDate'
    ]]

    
    # Loop over every listing id
    
    for each_id in main_data:

        main_data_all = main_data[each_id]

        # url

        url = main_data_all['url']

        
        # updateDate (crawl timestamp, copied as stored)
        
        updateDate = main_data_all['爬蟲日期時間']
        
        
        # title
        
        title = main_data_all['地點名稱']

        
        # address (normalised through clean_address)
        
        address = clean_address(main_data_all['地址'])
        
        
        # pattern (格局), read from the nested '環境介紹' entry; 'NULL' on any failure
        try:
            格局 = main_data_all['環境介紹']['格局']
        except:
            格局 = 'NULL'
        
        
        # floor: the leading digits directly followed by '樓'; 0 when the
        # pattern does not match or '樓層' is missing/unusable
        
        try:
            filtered = re.findall('^\d+(?=樓)',main_data_all['樓層'])
            if len(filtered) == 0:
                floor = 0
            else:
                floor = int(filtered[0])
        except:
            floor = 0
        
        
        # ttype: map the '種類' value to a one-letter code (H / S / O / F).
        # A value found in none of the lists below is kept unchanged.

        try:
            ttype = main_data_all['種類']            
            if ttype in ['公寓','平房','套房','大廈','華廈','別墅','透天厝','車位','其他']:
                ttype = "H"
            elif ttype in ['店面','店辦住','透天店面']:
                ttype = "S"
            elif ttype in ['辦公','辦住']:
                ttype = "O"
            elif ttype in ['廠房','農地','土地','建地']:
                ttype = "F"
        except:
            # Reached e.g. when '種類' is missing; reports the id and stores "NULL".
            print(each_id + " 沒有種類相關資料")
            ttype = "NULL"


        # avg_rent: first run of digits/commas in '租金', commas removed, as int.
        # NOTE(review): not guarded — re.search returns None when '租金'
        # contains no digit or comma, and .group(0) then raises AttributeError.

        avg_rent = main_data_all['租金']
        avg_rent = re.search('(\d|\,)+',avg_rent)
        avg_rent = avg_rent.group(0)
        avg_rent = avg_rent.replace(",","")
        avg_rent = int(avg_rent)


        # lat (converted with float(); not guarded)
        
        lat = float(main_data_all['緯度'])
        
        
        # lng (converted with float(); not guarded)
        
        lng = float(main_data_all['經度'])
        
        
        # sex: M / F / B, otherwise "NULL".
        # `x in [sex_res]` tests against a one-element list, i.e. it is an
        # equality check on the whole value, not a substring search.

        sex_res = main_data_all['性別限制']
        try:
            if '僅男生' in [sex_res]:
                sex = "M"
            elif '僅女生' in [sex_res]:
                sex = "F"
            elif '男女皆可' in [sex_res]:
                sex = "B"
            else:
                sex = "NULL"
        except:
            sex = "NULL"


        # space: leading digits / commas / dots of '坪數', kept as a string
        # (may be "" when the value does not start with one); "NULL" on failure

        try:
            space = main_data_all['坪數']
            space = re.search('^(\d|\,|\.)*',space)
            space = space.group(0)
        except:
            space = "NULL"


        # other info: "Y" / "N" / 'NULL' flags derived from the whole listing

        smoke = permits(main_data_all,'抽煙')
        pet = permits(main_data_all,'寵物')
        cook = permits(main_data_all,'開伙')
        parking = has(main_data_all,'車位')

    
        # insert values (same order as the header row)

        filtered_table.append([
            url,
            title,
            address,
            格局,  
            floor,
            ttype,
            avg_rent,
            lat,
            lng,
            sex,
            space,
            smoke,
            pet,
            cook,
            parking,
            updateDate
        ])
    
    return filtered_table

# 執行程式

In [11]:
# Load the previously saved data.
# UTF-8 is tried first; Big5 is used only when the bytes are not valid UTF-8.
# Other failures (missing file, malformed JSON) are raised as-is instead of
# being hidden behind a second attempt with a different encoding.
try:
    with open(data_source, encoding="utf-8") as data_file:
        main_data = json.load(data_file)
except UnicodeDecodeError:
    with open(data_source, encoding="Big5") as data_file:
        main_data = json.load(data_file)

In [16]:
# 檢視main_data
# pprint.pprint(main_data)

In [13]:
# Build the output table (header row plus one row per listing) from the loaded data.
filtered_table = get_filtered_table(main_data)

15486 沒有種類相關資料


In [17]:
# 檢視get_filtered_table()結果
# pprint.pprint(filtered_table)

In [15]:
# Write the final table (filtered_table) as CSV to output_file_path.

# newline='' lets csv.writer control line endings itself; without it an extra
# '\r' is written on platforms that translate '\n' to '\r\n'.
# NOTE(review): no encoding is given, so the platform default is used —
# confirm it is what the consumer of this CSV expects.
with open(output_file_path, 'wt', newline='') as dest_file:
    output = csv.writer(dest_file, delimiter=',')
    output.writerows(filtered_table)

print("CSV file saved.")

CSV file saved.
