# Python機器學習與深度學習入門 
## Q1-2自由習作：PTT標題爬蟲 with IP城市搜尋
## by R07522717 林温雅

Packages & Functions

In [1]:
import requests
import time
import re
from bs4 import BeautifulSoup
import os

def get_web_page(url, timeout=10):
    """Fetch *url* and return the response body as text.

    The request carries an ``over18=1`` cookie (presumably so that
    age-restricted boards are served without the confirmation page).

    Args:
        url: Absolute URL to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the crawler forever.

    Returns:
        The response text when the server answers with HTTP 200; ``None``
        when the status is anything else or the request itself fails.
    """
    try:
        resp = requests.get(
            url=url,
            cookies={'over18': '1'},
            timeout=timeout,
        )
    except requests.RequestException as err:
        # Network-level failures are reported the same way as a bad status
        # so that callers only have to check for None.
        print('Invalid url:', url, f'({err})')
        return None
    if resp.status_code != 200:
        print('Invalid url:', resp.url)
        return None
    return resp.text

def get_ip(dom):
    """Extract the poster's IPv4 address from an article page.

    Searches *dom* for the first occurrence of ``來自: a.b.c.d`` and returns
    the dotted address part.

    Args:
        dom: Text of the article page (may be ``None`` or empty).

    Returns:
        The address as a string, or ``None`` when *dom* is empty or no
        address is present.
    """
    if not dom:
        return None
    # Raw string: '\d' and '\.' are regex escapes, not string escapes.
    match = re.search(r'來自: (\d+\.\d+\.\d+\.\d+)', dom)
    return match.group(1) if match else None

def get_city(ip, timeout=10):
    """Look up the city name for *ip* through the ipstack HTTP API.

    Uses the module-level ``API_KEY`` as the ``access_key`` query parameter.

    Args:
        ip: IP address string; a falsy value short-circuits to ``None``.
        timeout: Seconds to wait for the API before giving up.

    Returns:
        The value of the ``city`` field of the JSON response, or ``None``
        when *ip* is falsy, the request fails, the response is not a JSON
        object, or the field is missing or empty.
    """
    if not ip:
        return None
    url = 'http://api.ipstack.com/{}?access_key={}'.format(ip, API_KEY)
    try:
        data = requests.get(url, timeout=timeout).json()
    except (requests.RequestException, ValueError):
        # Network failure or a body that is not valid JSON.
        return None
    if not isinstance(data, dict):
        return None
    # Responses without a 'city' key (e.g. an error payload) must not raise.
    return data.get('city') or None

def get_articles(dom, date):
    """Collect the posts on a board index page whose date equals *date*.

    Only the title and href of each post are kept: the title is used later
    for keyword matching and the href to open the article itself.

    Args:
        dom: HTML text of a board index page.
        date: Date string compared against the stripped text of each
            entry's ``<div class="date">``.

    Returns:
        A tuple ``(articles, prev_url)``. ``articles`` is a list of dicts
        with the keys ``'title'`` and ``'href'``; ``prev_url`` is the href
        of the previous-page button.
    """
    soup = BeautifulSoup(dom, 'html5lib')

    # The paging buttons sit inside <div class="btn-group btn-group-paging">;
    # the "previous page" button is the second link in that group.
    paging_links = soup.find('div', 'btn-group btn-group-paging').find_all('a')
    prev_url = paging_links[1]['href']

    articles = []  # collected post records
    for entry in soup.find_all('div', 'r-ent'):
        link = entry.find('a')
        # No hyperlink means the post was deleted.
        if not link:
            continue
        # Keep only posts published on the requested date.
        if entry.find('div', 'date').text.strip() != date:
            continue
        articles.append({
            'title': link.text,
            'href': link['href'],
        })
    return articles, prev_url

Let's go!

In [2]:
PTT_URL = 'https://www.ptt.cc'
Board = input('請輸入看板(ex. Gossiping) ： ')
Target = input('請輸入標題搜尋標的 ： ')

# ipstack access key (https://ipstack.com/), read from the environment so the
# secret is not stored in the notebook.
API_KEY = os.environ.get('IPSTACK_API_KEY')

print('Loading...')
current_page = get_web_page(PTT_URL + '/bbs/' + Board + '/index.html')
if current_page:  # the board URL is valid
    articles = []
    # Today's date with the leading '0' removed to match the format used on
    # the PTT index pages.
    today = time.strftime('%m/%d', time.localtime()).lstrip('0')
    current_articles, prev_url = get_articles(current_page, today)
    # Walk backwards page by page for as long as a page still contains posts
    # dated today.
    while current_articles:
        articles += current_articles
        current_page = get_web_page(PTT_URL + prev_url)
        if not current_page:
            # The previous page could not be fetched; report what we have
            # instead of passing None to the parser.
            break
        current_articles, prev_url = get_articles(current_page, today)
    print(f'截至 {time.strftime("%H:%M:%S 為止，%Y/%m/%d當日", time.localtime())}{Board}板共有{len(articles)}篇文章')

    # Open every matching article and look up the city of the poster's IP.
    city_dict = dict()
    total = 0
    for article in articles:
        if Target not in article['title']:
            continue
        page = get_web_page(PTT_URL + article['href'])
        if not page:
            continue
        # Call the lookup once per article: each call is a paid/rate-limited
        # API request.
        city = get_city(get_ip(page)) or 'Unknown'
        city_dict[city] = city_dict.get(city, 0) + 1
        total += 1
    print(f'其中標題含有「{Target}」的文章共有{total}篇')
    print(f'{total}篇中，發文者IP所屬城市分布為\n')
    k, v, p = 'City', 'Count', 'Percentage'
    print(f'{k:^20}|{v:^15}|{p:^15}')
    print('-------------------------------------------------------')
    # Most frequent city first.
    city_sorted_list = sorted(city_dict.items(), key=lambda d: d[1], reverse=True)
    for city, count in city_sorted_list:
        # f-string alignment:
        # https://medium.com/@NirantK/best-of-python3-6-f-strings-41f9154983e
        # total is at least 1 here because the list is non-empty.
        print(f'{city:^20}|{count:^15}|{count * 100 / total:^15.2f}%')
        print('-------------------------------------------------------')

請輸入看板(ex. Gossiping) ： Gossiping
請輸入標題搜尋標的 ： 韓國瑜
Loading...
截至 22:17:05 為止，2019/03/07當日Gossiping板共有1846篇文章
其中標題含有「韓國瑜」的文章共有89篇
89篇中，發文者IP所屬城市分布為

        City        |     Count     |  Percentage   
-------------------------------------------------------
       Taipei       |      30       |     33.71     %
-------------------------------------------------------
      Unknown       |       8       |     8.99      %
-------------------------------------------------------
      Taichung      |       8       |     8.99      %
-------------------------------------------------------
  Taoyuan District  |       7       |     7.87      %
-------------------------------------------------------
   Kaohsiung City   |       6       |     6.74      %
-------------------------------------------------------
    Tainan City     |       5       |     5.62      %
-------------------------------------------------------
     Singapore      |       3       |     3.37      %
-------------------------------

## Reference
* [Python：網路爬蟲與資料分析入門實戰](https://www.books.com.tw/products/0010800867)
* Hiding Passwords and Secret Keys in Environment Variables ([Mac & Linux](https://www.youtube.com/watch?v=5iWhQWVXosU)) ([Windows](https://www.youtube.com/watch?v=IolxqkL7cD8))