In [1]:
# 準備工作

from bs4 import BeautifulSoup
import requests
from selenium import webdriver
import time
import datetime
import re
import sqlite3
import os
import pprint

db_name = "../data/MRT_Taipei.sqlite"
db_url_list = "../data/MRT_Taipei_URLs.sqlite"

In [2]:
# 建立紀錄已經看過的內頁url的資料庫
# 如果這個資料庫已經被建立過了 就略過

# url: 內網url中的ID 具primary key性質
# modDate: 檢視內頁的日期跟時間

if os.path.isfile(db_url_list) == False:

    with sqlite3.connect(db_url_list) as conn_id_list:
        
        c = conn_id_list.cursor()
        
        c.execute("""CREATE TABLE urlList(
        url text unique not null,
        modDate datetime 
        )""")

    conn_id_list.close()

In [3]:
# 建立儲存內頁url跟內頁內容用的資料庫
# 如果這個資料庫已經被建立過了 就略過

# url: 內網url 具primary key性質
# soup: 內頁內容
# nosql: 用於紀錄資料紀錄於nosql資料庫的狀態
# rdb: 用於紀錄資料紀錄於sql資料庫的狀態
# getTime: 紀錄內頁的日期跟時間

if os.path.isfile(db_name) == False:

    with sqlite3.connect(db_name) as conn:
        
        c = conn.cursor()
        
        c.execute("""CREATE TABLE mrtList(
        url text unique not null,
        soup text not null,
        nosql text,
        rdb text,
        getTime datetime 
        )""")
        
    conn.close()

In [4]:
# 定義 remove_dust() 方法
# 移除多餘的字串

def remove_dust(string):
    string = re.sub(" |\n|\r|\xa0","",string)
    return string

In [5]:
# 

main_url = "http://www.metro.taipei/ct.asp?xItem=78479152&CtNode=70089&mp=122035"
page_source = requests.get(main_url)
soup = BeautifulSoup(page_source.text,'html.parser')

In [6]:
pprint.pprint(soup)

<?xml version="1.0"  encoding="utf-8" ?>
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<!--<meta http-equiv="X-UA-Compatible" content="IE=EmulateIE7"/>-->
<script language="javascript"> 
//S***解決空白頁的產生***
	DeleteCookie('__utma');
	DeleteCookie('__utmb');
	DeleteCookie('__utmc');
	DeleteCookie('__utmz');
	DeleteCookie('__utmt');
	DeleteCookie('_ga');
	 
function DeleteCookie(name) {
var date=new Date();
var strDomain =window.location.hostname;
strDomain =strDomain.replace('WWW','').replace('www',''); 
strDomain2 =".gov.taipei";
 date.setTime(date.getTime() + (-1 * 24 * 60 * 60 * 1000));
    document.cookie = name + "=;expire=" + date.toUTCString() + ";path=/;domain=" + strDomain + "; secure=;";
	document.cookie = name + "=;expire=" + date.toUTCString() + ";path=/;domain=" + strDomain2 + "; secure=;";
} 
//E***解決空白頁的產生***
</script>
<html lang="zh-TW" xml:lang="zh-TW" xmlns="http://www.w3.org/1

In [7]:
# 取外網資料

# 台北捷運
# 不包括桃園捷運

soup_stns = soup.find_all('option')
list_stn_names = [re.split(" ",each.text)[1] for each in soup_stns]
list_stn_IDs = [re.split(" ",each.text)[0] for each in soup_stns]
list_stn_web_IDs = [each['value'] for each in soup_stns]


In [8]:
# 資料集合

dict_stns = tuple(zip(list_stn_IDs,list_stn_web_IDs,list_stn_names))
dict_stns

(('BR01', 'BR01-019', '動物園'),
 ('BR02', 'BR02-018', '木柵'),
 ('BR03', 'BR03-017', '萬芳社區'),
 ('BR04', 'BR04-016', '萬芳醫院'),
 ('BR05', 'BR05-015', '辛亥'),
 ('BR06', 'BR06-014', '麟光'),
 ('BR07', 'BR07-013', '六張犁'),
 ('BR08', 'BR08-012', '科技大樓'),
 ('BR09', 'BR09-011', '大安'),
 ('BR10', 'BR10-010', '忠孝復興'),
 ('BR11', 'BR11-009', '南京復興'),
 ('BR12', 'BR12-008', '中山國中'),
 ('BR13', 'BR13-007', '松山機場'),
 ('BR14', 'BR14-021', '大直'),
 ('BR15', 'BR15-022', '劍南路'),
 ('BR16', 'BR16-023', '西湖'),
 ('BR17', 'BR17-024', '港墘'),
 ('BR18', 'BR18-025', '文德'),
 ('BR19', 'BR19-026', '內湖'),
 ('BR20', 'BR20-027', '大湖公園'),
 ('BR21', 'BR21-028', '葫洲'),
 ('BR22', 'BR22-029', '東湖'),
 ('BR23', 'BR23-030', '南港軟體園區'),
 ('BR24', 'BR24-031', '南港展覽館'),
 ('R02', 'R02-099', '象山'),
 ('R03', 'R03-100', '台北101/世貿'),
 ('R04', 'R04-101', '信義安和'),
 ('R05', 'R05-011', '大安'),
 ('R06', 'R06-103', '大安森林公園'),
 ('R07', 'R07-134', '東門'),
 ('R08', 'R08-042', '中正紀念堂'),
 ('R09', 'R09-050', '台大醫院'),
 ('R10', 'R10-051', '台北車站'),
 ('R11', 'R11-05

In [93]:
# 內網


stn_web_ID = "R10-051"
stn_url = "http://web.metro.taipei/c/stationdetail2010.asp?ID=" + stn_web_ID
stn_page_source = requests.get(stn_url)
stn_page_source.encoding = 'utf-8'
stn_soup = BeautifulSoup(stn_page_source.text,"html.parser")



In [94]:
# 車站基本資訊
# stn_attrs
# '車站名稱', '車站地址', '無障礙電梯位置', '詢問處位置', 
# '飲水臺位置', '廁所位置', '假日自行車進出'

def get_basic_list(stn_soup):

    basic_attrs = stn_soup.select('th[class="Default"] > img')
    basic_attrs = tuple(remove_dust(each['alt']) for each in basic_attrs)
    basic_attrs_len = len(basic_attrs)

    # 車站基本資訊

    basic_values = stn_soup.select('td[class="Default"]')
    basic_values = tuple(remove_dust(each.text) for each in basic_values)
    basic_values = basic_values[:basic_attrs_len]

    basic_list = [basic_attrs, basic_values]

    return basic_list

In [97]:
# exit_info
# 出口資訊

# def get_exit_list(stn_soup):

# pprint.pprint(stn_soup)

exit_info = stn_soup.select_one('img[alt="出口資訊"]')
exit_info = exit_info.find_parent('table')
# exit_info = exit_info.find_a

# exit_info = exit_info.find_all('td')

pprint.pprint((stn_soup))

# exit_info = exit_info.find_all('tr')

# exit_info_list = []

# # 出口資訊欄位名稱

# exit_info_names = tuple(remove_dust(each.text) for each in exit_info[0].select('th'))
# exit_info_list.append(exit_info_names)

# # 出口資訊每行資料

# for each in exit_info[1::]:
#     each_exit_value = tuple(remove_dust(every.text) for every in each.select('td'))
#     exit_info_list.append(each_exit_value)

# pprint.pprint(exit_info[1::])
# return exit_info_list


<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">

<html lang="zh-TW">
<head>
<meta content="text/html; charset=utf-8" http-equiv="Content-Type"/>
<title>路網圖、各站資訊及時刻表</title>
<!-- <script type="text/javascript" src="../js/metrowebga.js"></script> -->
<script src="js/jquery.min.js"></script>
<script>
  $(document).ready(function(){
    var countforli = ($(".tabs>li").length); 
    var heightoftabs = (countforli % 4) * 60 + 60;
    // alert(heightoftabs);
    $(".tabs").css("height", heightoftabs);
  });
</script>
<style type="text/css">
  .mid{
      vertical-align:middle;
  }
  
  .viewpoint {
    width: 100%;
    margin-top: 5px;
    border: #c0c0c0 1px solid; 
  }

  .viewpoint ul, li {
    margin: 1px;
    padding: 1px;
    list-style: none;
    /*height: 100px;*/
  }

  .viewpoint ul.tabs {
    width: 100%;
    margin-top: 5px;
  }

  .viewpoint ul.tabs li {
    float: left;
    height: 54px;
    line-height: 31px;
    overflow: hidden;
    position: relative;
    ma

In [29]:
# 轉乘停車場資訊
# parking_values
# *** Need to split the rows

def get_parking_list(stn_soup):

    parking = stn_soup.select_one('img[alt="轉乘停車資訊"]')
    parking = parking.find_parent('table')
    parking = parking.find('table')
    parking = parking.find_all('tr')

    parking_list = []

    parking_attrs = tuple(remove_dust(each.text) for each in parking[0].select('th'))
    parking_list.append(parking_attrs)

    for each in parking[1::]:
        parking_values = tuple(remove_dust(every.text) for every in each.select('td'))
        parking_list.append(parking_values)
    
    return parking_list

In [30]:
pprint.pprint(get_basic_list(stn_soup))

[('車站名稱', '車站地址', '無障礙電梯位置', '詢問處位置', '飲水臺位置', '廁所位置', '假日自行車進出'),
 ('台北車站',
  '10041臺北市中正區忠孝西路1段49號',
  '1號電梯：B2層（板南線大廳中央偏北處）>B4層（淡水信義線2號月臺南側）2號電梯：B3層（淡水信義線大廳中央）>B4層（淡水信義線月臺中央）3號電梯：地面層（出口M1及出口M2中間）>B3層（淡水信義線大廳北處）4號電梯：B1層（出口M4及出口M5中間）>B2層（板南線大廳西側）5號電梯：出口M4（忠孝西路靠臺鐵側）>B1層（星巴克）6號電梯：B2層（板南線大廳中央處）>B3層（板南線月臺中央）7號電梯：B1層（誠品書店）>B2層（板南線大廳東側）',
  '近出口M3、M7、M8，近忠孝西路近出口M4、M5、M6，近忠孝西路近出口M1、M2，近市民大道',
  '出口M4',
  '非付費區，近出口M1、M2付費區(5板南線)付費區(2淡水信義線)',
  '未開放')]


In [34]:
pprint.pprint(get_exit_list(stn_soup))

[('無障礙出口', '出口編號', '位置描述', '出入口型式', 'Google地圖', '備註')]


In [33]:
pprint.pprint(get_parking_list(stn_soup))

[('小汽車', '機車', '自行車'), ('0', '0', '0')]


In [None]:
# 定義 insert_urls() 方法

# 將爬下來的ID存到爬過的url紀錄中

# 參數: 
# db_name: 要塞的資料庫名稱(sqlite檔案)
# id_list: 含有內頁ID的list

def insert_urls(db_url_list, id_list):
    
    with sqlite3.connect(db_url_list) as conn_insert_url:
               
        c_iu = conn_insert_url.cursor()

        print(id_list)
        
        for each_id in id_list:
        
            try:
        
                today = datetime.datetime.today()
                today = today.strftime('%Y-%m-%d %H:%M:%S')

                values = "'http://www.century21.com.tw/index/Rental/RentPage/" + str(each_id) + "','" + str(today) + "'"       

                insertString = "INSERT INTO urlList (url, modDate) VALUES(" + values + ");"
                c_iu.execute(insertString)
            
            except Exception as e:
                
                print("例外: " + str(e))
                print(each_id)
            
    c_iu.close()

In [None]:
# 找出所有已經被爬過的內頁url

with sqlite3.connect(db_name) as retr_url:
        
    c = retr_url.cursor()

    c.execute("SELECT url from rental")
    url_completed = c.fetchall()
    
retr_url.close()
# url_completed[0][0][-6:]

In [None]:
# 定義 crawl_links_single() 方法
# 從一個外網頁面找出該頁面裡所有的內頁ID

def crawl_links_single(driver):

    soup = BeautifulSoup(driver.page_source,'html.parser')
    soup_hrefs = soup.select('div[class="main clearfix"] > div > a')
    hrefs = []

    for each in soup_hrefs:
        hrefs.append(each['href'].split('/')[-1])
#         hrefs.append(each['href'])

    return hrefs

In [None]:
# 定義 crawl_n_insert() 方法
# 爬內頁然後儲存內容到資料庫中

def crawl_n_insert(url, db_name):
    
#     driver = webdriver.PhantomJS()
#     driver.get(url)
#     time.sleep(2.5)
#     page_source = str(driver.page_source)
    page_source = requests.get(url)
    page_source = str(page_source.text)
    
    # 處理引號
    
    page_source = page_source.replace("\"","\"\"")
    page_source = page_source.replace('\'',"\'\'")
    
    with sqlite3.connect(db_name) as conn_insert_page:
               
        c_ip = conn_insert_page.cursor()
        
        try:

            today = datetime.datetime.today()
            today = today.strftime('%Y-%m-%d %H:%M:%S')

            values = "\"" + str(url) + "\",'" + page_source + "','','','" + str(today) + "'"

#             print(values)

            insertString = "INSERT INTO rental (url, soup, nosql, rdb, getTime) VALUES(" + values + ");"

#             print(insertString)

            c_ip.execute(insertString)

        except Exception as e:

            print(url)
            print("例外: " + str(e))
        finally:
            c_ip.close()

In [None]:
# 主程序1:
# 從外網把所有的內頁url爬下來

main_url = "http://www.century21.com.tw/index/Rental/Rent"

end_loop = False
driver = webdriver.PhantomJS()
driver.get(main_url)
time.sleep(2.5)

while not end_loop:
    
    try:
    
        print(driver.current_url)
        id_list = crawl_links_single(driver)
        return_count = len(id_list)

        if return_count > 0:
            insert_urls(db_url_list, id_list)
    
        driver.find_element_by_link_text("下一頁").click()
        time.sleep(1)
        
    except Exception as e:
        
        print("例外: " + str(driver.current_url) + " \r; " + str(e))
        end_loop = True
        
print("內頁url搜尋結束")

In [None]:
# 檢查 db_url_list 結果

with sqlite3.connect(db_url_list) as conn_url_list:
    
    c_ul = conn_url_list.cursor()
    qryString = "SELECT distinct count(*) from urlList limit 1;"
    c_ul.execute(qryString)
    results = c_ul.fetchall()

c_ul.close()

print((results))

In [None]:
# 主程序2:
# 從外網把所有的內頁內容爬下來

# 取得所有的url

with sqlite3.connect(db_url_list) as conn_call_list:
    
    c_cl = conn_url_list.cursor()
    qryString = "SELECT url from urlList;"
    c_cl.execute(qryString)
    results = c_cl.fetchall()

c_cl.close()

# 略過已經做過的網址
        
for each in results:
    try:
        print(each[0])
        crawl_n_insert(each[0],db_name)
    except Exception as e:
        print(e)

print("所有的內頁內容都已爬過且儲存到資料庫中")

In [None]:
# 檢查 db_name 結果

with sqlite3.connect(db_name) as conn_db_name:
    
    c_dn = conn_db_name.cursor()
    qryString = "SELECT count(url) from rental;"
    c_dn.execute(qryString)
    results = c_dn.fetchall()

c_dn.close()
    
print((results))