In [None]:
# 準備工作

from bs4 import BeautifulSoup
import time
import datetime
import re
import sqlite3
import os
import json
import pprint

# Path of the SQLite database that the retrieval cell connects to.
db_name = "../data/21_century.sqlite"
# Path of the JSON file that the last cell writes the cleaned result to.
json_name = "../data/21_century.json"

In [None]:
# 從db_name取出資料

# Read every row of the `rental` table into `retrieved_list`.
# sqlite3's connection context manager only commits or rolls back the
# transaction; it does NOT close the connection, so both the cursor and
# the connection are closed explicitly here.
conn_retrieve = sqlite3.connect(db_name)
try:
    c_rt = conn_retrieve.cursor()
    try:
        qryString = "SELECT * from rental;"
        c_rt.execute(qryString)
        retrieved_list = c_rt.fetchall()
    finally:
        c_rt.close()
finally:
    conn_retrieve.close()

# 定義方法

In [None]:
# 定義 recover_quot() 方法 (recover quotation marks)
# 將字串中連續兩個的單引號跟雙引號復原為
# 單一的單引號跟雙引號

def recover_quot(string):
    """Collapse doubled quotation marks back into single ones.

    Every pair of double quotes is replaced by one double quote first,
    then every pair of single quotes is replaced by one single quote.
    Returns the resulting string.
    """
    for quote in ('"', "'"):
        string = string.replace(quote * 2, quote)
    return string

In [None]:
# 定義 remove_dust() 方法
# 清理一個字串中所有的 \r, \n, \t, \xa0 符號
# 並將頭尾的空格去掉

def remove_dust(string):
    """Delete every \\r, \\n, \\t and \\xa0 from ``string`` and trim both ends.

    Returns the cleaned string with leading/trailing whitespace stripped.
    """
    for junk in ("\r", "\n", "\t", "\xa0"):
        string = string.replace(junk, "")
    return string.strip()

In [None]:
# 定義 remove_dirt() 方法
# 用於移除'環境介紹'中的雜物

def remove_dirt(text):
    """Normalise the free text of the 環境介紹 section.

    Applies a fixed, ordered list of substitutions (punctuation
    normalisation, noise removal, and insertion of "\\r" in front of known
    field labels) and strips the text after every single substitution.
    The order matters: later rules operate on the output of earlier ones,
    and the final rule removes every remaining space.
    """
    substitutions = (
        ('：', ':'), ('︰', ':'), (', ', ','),
        ('、 ', '、'), ('\xa0', ''),
        ('ˊ', ''), ('～', ''), ('※', ''),
        ('查詢主頁動態', ''), ('特色說明', '\r特色說明:'),
        ('租金不含這些', '租金不含'), ('租金不含', '\r租金不含:'),
        ('備註:歡迎房客加入line或微信 ID', '\r網路社群ID'),
        ('歡迎房客加入line或微信 ID', '\r網路社群ID'),
        ('位置最大特色', '\r位置最大特色'),
        ('押金為', '押金:'), ('個月', '個月\r解說:,'),
        (' ', ''),
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
        cleaned = cleaned.strip()
    return cleaned

In [None]:
# 定義 remove_multi_space() 方法
# 清理多空格

def remove_multi_space(string):
    """Delete pairs of consecutive spaces until none are left.

    Note that each pair is removed entirely (not collapsed to one space):
    a run with an even number of spaces disappears, a run with an odd
    number of spaces leaves exactly one space behind.
    """
    pair = "  "
    result = string
    while result.find(pair) != -1:
        result = result.replace(pair, "")
    return result

In [None]:
# 定義 tail_pos() 方法
# 從字串中找出子字串的所有的尾端位置
# 如果找不到就傳回空的list

def tail_pos(substring, string):
    """Return the end index of every occurrence of ``substring`` in ``string``.

    ``substring`` is matched literally: it is escaped before being handed
    to the regex engine, so characters such as ``.`` or ``(`` neither act
    as wildcards nor raise a regex error. Occurrences are non-overlapping
    and reported left to right; the index is the position just past the
    last matched character.

    Returns an empty list when there is no occurrence.
    """
    literal = re.escape(substring)
    # match.end() is the true tail; start + len(pattern) is only right
    # when the pattern length equals the match length.
    return [match.end() for match in re.finditer(literal, string)]

In [None]:
# 定義 is_col_before_break() 方法
# 判斷一個字串中第一個冒號是否在第一個換行符號前面
# (沒有換行符號時傳回 True；沒有冒號時傳回 False)

def is_col_before_break(string):
    """Tell whether the first ':' appears before the first newline.

    Returns:
        True  - the string contains ':' and either has no '\\n' at all or
                its first '\\n' comes after the first ':'.
        False - the string has no ':', the first '\\n' precedes the first
                ':', or ``string`` is not a ``str``.
    """
    # Non-string input used to be swallowed by a bare ``except``; keep
    # that lenient result but make it explicit.
    if not isinstance(string, str):
        return False

    first_colon_pos = string.find(':')
    if first_colon_pos == -1:
        return False

    first_break_pos = string.find('\n')
    return first_break_pos == -1 or first_break_pos > first_colon_pos

In [None]:
# 定義 desc_as_dict 方法
# 將'環境介紹'的內容拆成dictionary

def desc_as_dict(desc):
    """Split the content of the 環境介紹 section into a dictionary.

    Args:
        desc: the node holding the description; it must support ``len()``
            and ``select('p')`` returning objects with a ``.text``
            attribute (a BeautifulSoup tag in this notebook), or be None.

    Returns:
        dict mapping each item's key (the text before its first ':') to
        its value (everything after that first ':'). An empty dict is
        returned when ``desc`` is None or empty.
    """
    desc_dict = {}

    # The old guard `type(desc) != None` was always true, so a missing
    # node crashed on len(); test for None directly.
    if desc is None or len(desc) == 0:
        return desc_dict

    # Take the text of every <p>, with "\r" appended to each paragraph.
    list_1 = [each.text + "\r" for each in desc.select('p')]

    # Clean each paragraph, split it into items and drop filler tokens.
    black_list = ['', ' ', '\xa0', '●', '◆']
    list_2 = []
    for paragraph in list_1:
        cleaned = remove_dirt(paragraph)
        for piece in re.split('\n|\r|●|◆', str(cleaned)):
            if piece not in black_list:
                list_2.append(piece)

    # Give every item without a colon a generated key "項目<index>".
    # NOTE(review): because this labels ALL colon-less items, the keyword
    # pass and the merge-into-previous branch below never fire. The
    # original comments suggest only the leading items were meant to be
    # labelled - confirm the intent before changing the output format.
    for i, piece in enumerate(list_2):
        if ':' not in piece:
            list_2[i] = "項目" + str(i) + ":" + piece

    # If an item still has no key, use a known keyword found in it as key.
    keyword_list = [('養寵物', '養寵物'),
                    ('坪', '坪數'),
                    ('交屋', '交屋'),
                    ('開伙', '開伙'),
                    ('短租', '短租'),
                    ('設籍或登記', '設籍或登記')]

    for k in range(len(list_2)):
        for keyword in keyword_list:
            if ":" not in list_2[k] and keyword[0] in list_2[k]:
                list_2[k] = keyword[1] + ":" + list_2[k]

    # Append key-less items to the previous item. A key-less first item
    # starts its own entry instead of indexing an empty list.
    list_3 = []
    for piece in list_2:
        if ":" in piece or not list_3:
            list_3.append(piece)
        else:
            list_3[-1] = list_3[-1] + piece

    # Split on the FIRST colon only, so values that contain colons
    # (times, URLs, ...) are kept whole instead of being truncated.
    for each in list_3:
        key, _, value = each.partition(":")
        desc_dict[key] = value

    return desc_dict

In [None]:
# 定義 get_section_data() 方法
# 從每個章節中抽出資料

def get_section_data(soup):
    """Collect the key/value data of every <h3> section of a page.

    For each <h3>, its parent element is treated as the section. For the
    section titled 環境介紹 the single key '環境介紹' maps to the dict built
    by ``desc_as_dict`` from the section's first ``div > div``. For every
    other section each <h6> text is a key and the text of the element
    that follows it is the value, both cleaned with ``remove_dust``.

    Args:
        soup: parsed page (BeautifulSoup object).

    Returns:
        dict merging the key/value pairs of all sections; when the same
        key appears more than once, the later occurrence wins.
    """
    section_data = {}

    # Walk the heading tags themselves instead of looking each one up
    # again by its text: the text lookup finds nothing for headings with
    # nested markup (IndexError) and always returns the first of several
    # headings that share a title.
    for heading in soup.find_all('h3'):
        section_name = heading.text
        section = heading.find_parent()

        if section_name != "環境介紹":
            section_key_tags = section.select('h6')
            section_keys = [remove_dust(tag.text) for tag in section_key_tags]

            section_values = []
            for tag in section_key_tags:
                value_tag = tag.find_next()
                # find_next() yields None when nothing follows the <h6>.
                if value_tag is None:
                    section_values.append("")
                else:
                    section_values.append(remove_dust(value_tag.text))
        else:
            section_keys = ['環境介紹']
            desc = section.select_one('div > div')
            section_values = [desc_as_dict(desc)]

        section_data.update(zip(section_keys, section_values))

    return section_data

In [None]:
# 定義 get_agent_data() 方法
# 從soup中取得仲介資訊

def get_agent_data(suop):
    """Extract the agents listed on a page.

    Args:
        suop: parsed page (BeautifulSoup object). The misspelled parameter
            name is kept so existing callers keep working.

    Returns:
        dict keyed by agent name, each value being
        ``{'證號': licence, '電話': [phones...], 'email': [emails...]}``;
        or the string "NULL" when anything goes wrong while extracting.
    """
    # The body used to read a global named `soup` because the parameter
    # was misspelled; bind the argument so the function uses its input.
    soup = suop

    try:
        agent_tags = soup.find_all('div', class_=re.compile("column staff"))
        agent_names = [each.select_one('h4').text for each in agent_tags]

        # Licence numbers
        agent_licenses = [
            remove_dust(each.find('div', class_="LicenseNo").text)
            for each in agent_tags
        ]

        # Phones: one list per contact block, since an agent may list
        # more than one number.
        agent_contact_tags = soup.select('div[name="agentContactInfo"]')
        agent_phones = [
            [link.text for link in contact.select('span[class="mobileInfo"] > a')]
            for contact in agent_contact_tags
        ]

        # E-mails: handled the same way as phones.
        agent_emails = [
            [link.text for link in contact.find_all('a', href=re.compile("mailto:"))]
            for contact in agent_contact_tags
        ]

        # Combine per agent; a mismatch between the number of agents and
        # contact blocks raises IndexError and yields "NULL" below.
        agent_data = {}
        for j in range(len(agent_names)):
            agent_data[agent_names[j]] = {
                '證號': agent_licenses[j],
                '電話': agent_phones[j],
                'email': agent_emails[j],
            }

    except Exception:
        # Best effort: pages without agent info map to "NULL".
        # KeyboardInterrupt/SystemExit are no longer swallowed.
        agent_data = "NULL"

    return agent_data

In [None]:
# 定義 get_landlord_data() 方法
# 從soup中取得房東資訊

def get_landlord_data(soup):
    """Extract the landlord block of a page.

    Args:
        soup: parsed page (BeautifulSoup object).

    Returns:
        ``{'name': <h3 text>, 'details': <first span text>}`` taken from
        the first div whose class matches "column owner", or the string
        "NULL" when that block (or one of its parts) is missing.
    """
    try:
        section = soup.find('div', class_=re.compile("column owner"))
        return {
            'name': section.select_one('h3').text,
            'details': section.select_one('span').text,
        }
    except Exception:
        # Best effort for pages without a landlord block. A bare
        # ``except`` with ``return`` in ``finally`` also swallowed
        # KeyboardInterrupt/SystemExit; those now propagate.
        return "NULL"

In [None]:
# 定義 get_branch_data() 方法
# 從soup中取得加盟店資訊

def get_branch_data(soup):
    """Extract the franchise-store block of a page.

    Reads the first div whose class matches "column store": the <h4> is
    the store name and the first four ``ul > li`` items are, in order,
    company name, address, phone and fax. Every value is cleaned with
    ``remove_dust``.

    Args:
        soup: parsed page (BeautifulSoup object).

    Returns:
        dict with keys '名稱', '分行公司', '地址', '電話', '傳真'; or the
        string "NULL" when the block or any of its parts is missing.
    """
    try:
        branch_tags = soup.find('div', class_=re.compile("column store"))
        branch_name = branch_tags.select_one('h4').text
        items = branch_tags.select('ul > li')

        branch_data = {
            '名稱': branch_name,       # was mistakenly filled with the address
            '分行公司': items[0].text,
            '地址': items[1].text,
            '電話': items[2].text,
            '傳真': items[3].text,
        }

        # Clean every value. The old comprehension referenced undefined
        # names, so the function always fell through to "NULL".
        return {key: remove_dust(value) for key, value in branch_data.items()}

    except Exception:
        # Best effort for pages without store info.
        return "NULL"

In [None]:
# 建立 get_data_dict 方法
# 給代表網頁內容的soup物件
# 取得每頁的資料 並以dictionary形式傳回

def get_data_dict(soup):
    """Build the record of one listing page as a dictionary.

    The record holds breadcrumb items 1-3 (stored as '標籤01', '標籤02'
    and '地點名稱'), the agent, landlord and store information, and then
    every key/value pair returned by ``get_section_data``.

    Args:
        soup: parsed page (BeautifulSoup object).

    Returns:
        dict describing the page.

    Raises:
        IndexError: when the breadcrumb has fewer than four items.
    """
    # Breadcrumb entries, e.g. "租屋" / "租屋列表" / "光復南路優質地段B1(租10)"
    crumbs = soup.select('div[id="breadCrumb"] > ul > li')
    tag01, tag02, tag03 = (crumbs[position].text for position in (1, 2, 3))

    record = {
        '標籤01': tag01,
        '標籤02': tag02,
        '地點名稱': tag03,
        '仲介': get_agent_data(soup),        # agents
        '房東': get_landlord_data(soup),     # landlord
        '分行': get_branch_data(soup),       # franchise store
    }

    # Section data is merged last, so it wins on key collisions.
    for key, value in get_section_data(soup).items():
        record[key] = value

    return record

# 執行程序

In [None]:
# 跑迴圈 清洗資料
# 並將結果暫存到一個大的dictionary中

# index_set = range(0,10)    # 測試用
index_set = range(len(retrieved_list))
dict_all = {}

for i in index_set:

    # Column 0 of a row is the page URL, column 1 the stored page source.
    url, page_source = retrieved_list[i][0], retrieved_list[i][1]

    # Restore the doubled quotes, then parse the page.
    page_source = recover_quot(page_source)
    soup = BeautifulSoup(page_source, 'html.parser')

    # posting_id is the last path segment of the URL.
    posting_id = url.rsplit("/", 1)[-1]

    # Clean the page and remember where it came from.
    data_dict = get_data_dict(soup)
    data_dict['url'] = url

    # Keep the record in the big dictionary, keyed by posting id.
    dict_all[posting_id] = data_dict

In [None]:
# 將最後的結果 dict_all 以json格式存到 json_name 路徑裡

# Write only when the target does not exist yet, so an earlier export is
# never overwritten.
if not os.path.isfile(json_name):
    # Serialise BEFORE opening the file: a failed dump must not leave an
    # empty file behind that would make later runs skip the export.
    # The result is kept under its own name; rebinding `json` shadowed the
    # module and broke this cell on re-run.
    json_text = json.dumps(dict_all)
    with open(json_name, "w", encoding="utf-8") as f:
        f.write(json_text)