## import packages 

In [None]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
import time

## Create functions 

### Get article URLs for a whole year
找出整年度各個article網址

In [None]:
def yaerly_url_lists(year, soup):
    """Collect the article URLs listed on one year's archive index page.

    The function name keeps its original spelling because callers in this
    file reference it.

    Args:
        year: Archive year; decides which CSS selector is applied to the
            index page.
        soup: Parsed index page (any object exposing BeautifulSoup's
            ``select`` method).

    Returns:
        Dict mapping the text of every link that starts with "LR" to its
        absolute ``https://www.sec.gov`` URL. Empty when no such link exists.

    Raises:
        ValueError: If ``year`` is earlier than 2011, for which no selector
            is defined.
    """
    if year < 2011:
        # Previously this fell through both branches and crashed with an
        # UnboundLocalError on the never-assigned selector result.
        raise ValueError(
            f"Unsupported year {year}: only 2011 and later are handled."
        )

    # Before 2017 the links are taken from anchors inside table rows; from
    # 2017 on they are taken from the table whose id is "mainlist".
    selector = "tr a" if year < 2017 else "table[id=mainlist] a"

    url_dict = {}
    for anchor in soup.select(selector):
        href = anchor.get("href")
        # Anchors without a target cannot be followed; skip them instead of
        # raising KeyError on the missing attribute.
        if href is None:
            continue
        if anchor.text.startswith("LR"):
            url_dict[anchor.text] = "https://www.sec.gov" + href
    return url_dict

### Get the elements of each article
1. 取得各篇articles的元素(標題、發佈號碼、發文時間、額外資訊、內文)
2. 將無法爬取的append到error_articles

In [None]:
def _split_release_line(text):
    """Split a "<release number> / <date>" heading on "/" and trim spaces."""
    # The caller unpacks the result into two names, so a heading without
    # exactly one "/" surfaces there as a ValueError.
    return [part.strip(" ") for part in text.split("/")]


def _parse_article(year, ele_soup):
    """Extract one article's fields from its parsed page.

    Returns ``[title, release_num, release_time, extra_info, content]``.
    Raises IndexError or ValueError when the page lacks the expected tags.
    """
    if 2011 <= year <= 2017:
        title = ele_soup.select("h3")[0].text
        subtitle_data = ele_soup.select("h2")
        release_num, release_time = _split_release_line(subtitle_data[1].text)
        extra_info = ",".join(i.text for i in subtitle_data[1:])
        # The last <p> on the page is dropped from the content.
        content_data = ele_soup.select("p")[:-1]
        content = "\n".join(i.text for i in content_data)
    else:
        title = ele_soup.select("h1.alphaheads")[0].text
        subtitle_data = ele_soup.select("h2.alphaheads")
        # Trimmed the same way as the older layout so both branches produce
        # consistently formatted release numbers and dates.
        release_num, release_time = _split_release_line(subtitle_data[0].text)
        extra_info = ",".join(i.text for i in subtitle_data[1:])
        content_data = ele_soup.select("div[id=main-content]")
        content = "\n".join(i.text for i in content_data[0].select("p"))

    return [title, release_num, release_time, extra_info, content]


def get_elements(year, url_dict, timeout=30):
    """Download every article in ``url_dict`` and extract its fields.

    Reads the module-level ``headers`` dict for the HTTP request and stores
    the list of failed ``(LRN, url)`` pairs in the module-level
    ``whole_error_articles`` dict under ``year``.

    NOTE(review): the article layout switches at 2018 here, while the index
    page selector in ``yaerly_url_lists`` switches at 2017 - confirm that the
    two boundaries are intentionally different.

    Args:
        year: Archive year; decides which article layout is parsed.
        url_dict: Mapping of release label (LRN) to article URL. The label is
            only used to report which article failed.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        List of ``[title, release_num, time, extra_info, content]`` rows, one
        per article that was scraped successfully.

    Raises:
        ValueError: If ``year`` is earlier than 2011, for which no article
            layout is defined.
    """
    if year < 2011:
        # Without this guard neither layout branch would run and the first
        # article would fail on unbound names.
        raise ValueError(
            f"Unsupported year {year}: only 2011 and later are handled."
        )

    error_articles = []
    output_data = []

    for LRN, url in url_dict.items():
        print(LRN)

        try:
            ele_res = requests.get(url, headers=headers, timeout=timeout)
            ele_soup = BeautifulSoup(ele_res.text, features="html.parser")
            # Each row is later turned into one DataFrame record.
            output_data.append(_parse_article(year, ele_soup))

        # Pages that are missing the expected tags, and requests that fail or
        # time out, are recorded and skipped so one bad article does not
        # abort the whole year.
        except (IndexError, ValueError, requests.RequestException):
            error_articles.append((LRN, url))
            print("有問題無法爬取 => ", LRN)

    # Keep this year's failures so they can be reviewed after the run.
    whole_error_articles[year] = error_articles

    return output_data

## Start scraping

<font color="red">注意: 主頁面格式&文章版面依據年份而不同，因此透過年份來判別使用哪個標籤進行爬取，目前越前面年份越容易出錯，仍有很大改善空間!</font>

輸出 => excel檔案，包含各年份的sheet

In [None]:
# Record the start time so the total run time can be reported at the end.
start_time = time.time()

# Articles that could not be scraped, keyed by year; filled in by get_elements.
whole_error_articles = {}

# Request headers sent with every HTTP call. get_elements reads `headers` as a
# module-level name, so it must be defined before the loop calls it. Built
# once because the value does not change between years.
UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
headers = {"User-Agent": UserAgent}

df_lists = []
for year in range(2011, 2021):
    print(year, "---"*20)

    # The archive index URL ends in .shtml before 2020 and in .htm from 2020 on.
    extension = "shtml" if year < 2020 else "htm"
    main_url = f"https://www.sec.gov/litigation/litreleases/litrelarchive/litarchive{year}.{extension}"

    # Fetch and parse the year's index page. The timeout keeps a stalled
    # connection from hanging the whole run.
    res = requests.get(main_url, headers=headers, timeout=30)
    main_page_soup = BeautifulSoup(res.text, features="html.parser")

    # Collect the year's article URLs, then scrape every article.
    yearly_urls = yaerly_url_lists(year, main_page_soup)
    yearly_LR_output = get_elements(year, yearly_urls)

    # Keep one DataFrame per year; they are written to Excel after the loop.
    df = pd.DataFrame(yearly_LR_output, columns=["title", "release_num", "time", "extra_info", "content"])
    df_lists.append((year, df))

### Save in Excel format
# Write every year's DataFrame into a single workbook, one sheet per year,
# with the year as the sheet name.
# The original author noted that the output folder is expected to be created
# automatically when missing, but that this did not seem to work on their
# machine.
# NOTE(review): the file name says 2005~2010 while the loop covers 2011-2020 -
# confirm the intended name.
with pd.ExcelWriter("./test0518(2005~2010).xlsx", mode="w") as writer:
    for sheet_year, year_df in df_lists:
        year_df.to_excel(writer, index=False, sheet_name=f'{sheet_year}')

# Record the end time.
end_time = time.time()

# Report the total elapsed time in seconds.
print("總花費時間: ", end_time - start_time)

### Check the articles that couldn't be scraped

In [None]:
# Print, for each scraped year, how many articles could not be scraped.
for key in whole_error_articles:
    value = whole_error_articles[key]
    print(f'{key}年無法爬取篇數: {len(value)}')