## Import packages

In [4]:
import requests
import re
import pandas as pd
import time
import random
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from fake_useragent import UserAgent

## Create functions 

### Get the article URLs for a whole year
找出整年度各個article網址

In [10]:
def yaerly_url_lists(year, soup):
    """Collect the litigation-release links found on one yearly archive page.

    Args:
        year: Archive year. It only decides which CSS selector is used:
            ``tr a`` for 2003-2016 and ``table[id='mainlist'] a`` from 2017 on.
        soup: Parsed archive page. Only ``soup.select(css)`` is called on it;
            every returned anchor must expose ``.text`` and ``.get("href")``.

    Returns:
        dict mapping the anchor text (only anchors whose text starts with
        "LR") to the absolute URL, i.e. "https://www.sec.gov" + href.
        The dict is empty when no anchor matches.

    Raises:
        ValueError: if ``year`` is earlier than 2003, for which no selector
            is defined.
    """
    if year >= 2017:
        selector = "table[id='mainlist'] a"
    elif year >= 2003:
        selector = "tr a"
    else:
        # Previously this fell through and failed later with an
        # UnboundLocalError; fail early with an explicit message instead.
        raise ValueError(f"no archive selector defined for year {year!r} (< 2003)")

    url_dict = {}
    for anchor in soup.select(selector):
        if not re.match("^LR", anchor.text):
            continue
        href = anchor.get("href")
        if href is None:
            # An <a> without href cannot be followed; skip it rather than crash.
            continue
        url_dict[anchor.text] = "https://www.sec.gov" + href
    return url_dict

### Get the elements of each article
1. 取得各篇articles的元素(標題、發佈號碼、發文時間、額外資訊、內文)
2. 將無法爬取的append到error_articles

In [5]:
# For the new page layout
def get_elements_new(year, url_dict):
    """Download and parse every article in ``url_dict`` (new page layout).

    For each article the title (``h1.alphaheads``), the release number and
    date (first ``h2.alphaheads``, split on "/"), extra info (remaining
    ``h2.alphaheads`` joined with ",") and the body text (all ``p`` inside
    ``div[id=main-content]`` joined with newlines) are extracted.

    Args:
        year: Key under which this run's failures are stored in the
            module-level ``whole_error_articles`` dict.
        url_dict: Mapping of release id (e.g. "LR-25003") to article URL.

    Returns:
        list of ``[title, release_num, date, extra_info, content]`` rows, one
        per article that could be parsed.

    Side effects:
        Prints each release id, sleeps one second after each article, and
        sets ``whole_error_articles[year]`` to the list of ``(LRN, url)``
        pairs that failed. ``whole_error_articles`` must already exist at
        module level.
    """
    error_articles = []
    output_data = []
    ua = UserAgent()
    headers = {"user-agent": ua.google}

    # The release id is carried along so a failure can be traced to its article.
    for LRN, url in url_dict.items():
        print(LRN)

        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            ele_res = requests.get(url, headers=headers, timeout=30)
            ele_soup = BeautifulSoup(ele_res.text, features="html.parser")

            title = ele_soup.select("h1.alphaheads")[0].text
            subtitle_data = ele_soup.select("h2.alphaheads")
            # Raises ValueError unless the text contains exactly one "/".
            release_num, date = subtitle_data[0].text.split("/")
            extra_info = ",".join([i.text for i in subtitle_data[1:]])
            content_data = ele_soup.select('div[id=main-content]')
            content = "\n".join([i.text for i in content_data[0].select("p")])

            # Collected rows are turned into a DataFrame by the caller.
            output_data.append([title, release_num, date, extra_info, content])

        # Missing tags (IndexError), an unexpected subtitle format (ValueError)
        # and network failures are all recorded instead of aborting the year.
        except (IndexError, ValueError, requests.RequestException):
            error_articles.append((LRN, url))
            print("有問題無法爬取 => ", LRN)

        time.sleep(1)

    # Store this year's failures under its year key.
    whole_error_articles[year] = error_articles

    return output_data

In [19]:
# For the old page layout
def get_elements_old(year, url_dict, sample_size=30):
    """Download and parse a random sample of articles (old page layout).

    Old pages keep their metadata in ``h1``-``h3`` tags without a fixed
    structure. The headings are removed from the soup, classified by regex
    into release number/date and extra info, and whatever is left becomes the
    title. The body is the text of all ``p`` tags, cut at the first
    "For further information" or, failing that, "SEC Complaint".

    Args:
        year: Key under which this run's failures are stored in the
            module-level ``whole_error_articles`` dict.
        url_dict: Mapping of release id (e.g. "LR-18280") to article URL.
        sample_size: Number of articles to pick at random. It is capped at
            ``len(url_dict)``; ``None`` processes every article in
            ``url_dict`` order. Defaults to 30.

    Returns:
        list of ``[title, release_num, date, extra_info, content]`` rows, one
        per article that could be parsed.

    Side effects:
        Prints each release id and sets ``whole_error_articles[year]`` to the
        list of ``(LRN, url)`` pairs that failed. ``whole_error_articles``
        must already exist at module level.
    """
    error_articles = []
    output_data = []
    ua = UserAgent()
    headers = {"user-agent": ua.google}

    # random.sample needs a sequence (dict views raise TypeError on
    # Python 3.11+), and the sample may not exceed the population size.
    release_ids = list(url_dict)
    if sample_size is not None:
        release_ids = random.sample(release_ids, min(sample_size, len(release_ids)))

    # The release id is carried along so a failure can be traced to its article.
    for LRN in release_ids:
        print(LRN)
        url = url_dict[LRN]

        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            ele_res = requests.get(url, headers=headers, timeout=30)
            ele_soup = BeautifulSoup(ele_res.text, features="html.parser")

            # Pull every h1/h2/h3 out of the soup so the headings are not
            # picked up again when the body text is collected below.
            raw_data = []
            for level in range(1, 4):
                raw_data.extend(tag.extract() for tag in ele_soup.select(f"h{level}"))

            elements = [tag.text for tag in raw_data]

            # Each "[...][0]" below deliberately raises IndexError when nothing
            # matches; the except clause then records the article as failed.

            # 1. Drop the first heading that mentions the agency name.
            # NOTE(review): this pattern would also match a
            # "Securities and Exchange Commission v. ..." heading if that came
            # first, which would make step 3 fail - confirm against real pages.
            useless = [i for i in elements if re.match(".*Securities and Exchange Commission.*", i, flags=re.I)][0]
            elements.remove(useless)

            # 2. Release number and date ("Litigation ... / <date>").
            result = [i for i in elements if re.match("^Litigation.*", i, flags=re.I)][0]
            elements.remove(result)
            # Raises ValueError unless the text contains exactly one "/".
            release_num, date = result.split("/")

            # 3. Extra info: the case caption.
            result = [i for i in elements if re.match(r"^SEC v\..*|^Securities and Exchange Commission v\..*", i, flags=re.I)][0]
            elements.remove(result)
            extra_info = result

            # 4. Title: whatever headings remain (no fixed pattern; cleaned later).
            title = " ".join(elements)

            # 5. Body: all <p> text joined, truncated at the first marker found.
            articles = " ".join([i.text for i in ele_soup.select("p")])

            if "For further information" in articles:
                content = articles.split(r"For further information")[0]
            elif "SEC Complaint" in articles:
                content = articles.split("SEC Complaint")[0]
            else:
                content = articles

            # Collected rows are turned into a DataFrame by the caller.
            output_data.append([title, release_num, date, extra_info, content])

        # Missing headings (IndexError), an unexpected number/date format
        # (ValueError) and network failures are recorded, not fatal.
        except (IndexError, ValueError, requests.RequestException):
            error_articles.append((LRN, url))
            print("有問題無法爬取 => ", LRN)

    # Store this year's failures under its year key.
    whole_error_articles[year] = error_articles

    return output_data

## Start scraping

<font color="red">注意: 主頁面格式&文章版面依據年份而不同，因此透過年份來判別使用哪個標籤進行爬取，目前越前面年份越容易出錯，仍有很大改善空間!</font>

輸出 => excel檔案，包含各年份的sheet

In [20]:
# Start time
start_time = time.time()

# Per-year record of the articles that could not be scraped;
# filled in by get_elements_new / get_elements_old.
whole_error_articles = {}

# (year, DataFrame) pairs, one per scraped year
df_lists = list()
for year in range(2003, 2015):
    print(year, "---"*20)

    # The archive URL differs by year: 2020 onward -> .htm, before 2020 -> .shtml
    if year < 2020:
        main_url = f"https://www.sec.gov/litigation/litreleases/litrelarchive/litarchive{year}.shtml"
    else:
        main_url = f"https://www.sec.gov/litigation/litreleases/litrelarchive/litarchive{year}.htm"

    # Build the user-agent identity (headers is not used by the Selenium session below)
    ua = UserAgent()
    headers = {"user-agent" : ua.google}

    # Headless mode => note: the installed Chrome version may differ from the driver's
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    # The user agent must be passed as a --user-agent switch; a bare string
    # is not interpreted by Chrome as a user agent.
    options.add_argument(f"--user-agent={ua.google}")

    # Start the browser and always shut it down again, so that a Chrome
    # process is not left running for every year of the loop.
    driver = webdriver.Chrome(options=options, executable_path="../chromedriver")
    try:
        driver.get(main_url)
        time.sleep(2)

        # Grab the rendered HTML of the archive page
        main_page_soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        driver.quit()

    # Collect this year's release URLs, then fetch every article's data
    yearly_urls = yaerly_url_lists(year, main_page_soup)

    # Pick the parser that matches the page layout of the year
    if year >= 2018: # new
        yearly_LR_output = get_elements_new(year, yearly_urls)
    else: # old(until 2003)
        yearly_LR_output = get_elements_old(year, yearly_urls)
    
    # Turn the rows into a DataFrame and keep it for the Excel export below
    df = pd.DataFrame(yearly_LR_output, columns=["title", "release_num", "time", "extra_info", "content"])
    df_lists.append((year, df))
    
### Save in Excel format
# Write every year's DataFrame into one workbook, one sheet per year named
# after the year. The target folder is expected to be created automatically
# if missing, but that did not seem to work on this machine.
with pd.ExcelWriter("./test0905.xlsx", mode="w") as writer:
    for d in df_lists:
        d[1].to_excel(writer, index=False, sheet_name=f'{d[0]}')

# End time
end_time = time.time()

# Total elapsed time
print("總花費時間: ", end_time - start_time)

2003 ------------------------------------------------------------
LR-18280
LR-18322
有問題無法爬取 =>  LR-18322
LR-18220
LR-18167
LR-18193
LR-18487
LR-18519
有問題無法爬取 =>  LR-18519
LR-18438
LR-18131
有問題無法爬取 =>  LR-18131
LR-17969
LR-18398
有問題無法爬取 =>  LR-18398
LR-18214
LR-17942
LR-18015
LR-17984
有問題無法爬取 =>  LR-17984
LR-17940
有問題無法爬取 =>  LR-17940
LR-18148
LR-18135
有問題無法爬取 =>  LR-18135
LR-18479
LR-18406
LR-18399
LR-18007
有問題無法爬取 =>  LR-18007
LR-18304
LR-18149
LR-17920
有問題無法爬取 =>  LR-17920
LR-18428
LR-18053
有問題無法爬取 =>  LR-18053
LR-18093
有問題無法爬取 =>  LR-18093
LR-18414
LR-18192
2004 ------------------------------------------------------------
LR-18657
LR-18846
LR-18587
有問題無法爬取 =>  LR-18587
LR-18919
LR-18709
LR-18853
LR-18885
LR-18990
LR-18851
LR-19001
LR-18916
LR-18826
LR-18857
LR-18834A
LR-18807
LR-18537
LR-19009
LR-18792
LR-18830
LR-18641
LR-18981
LR-18633
LR-18768
LR-18538
有問題無法爬取 =>  LR-18538
LR-18733A
LR-18822
LR-18556
LR-18991
LR-18955
LR-18683
2005 ------------------------------------------------

### Check the articles that could not be scraped

In [23]:
# Report, per year, how many articles failed and the failure rate.
# The rate is computed against a fixed denominator of 30 articles.
for yr, failed in whole_error_articles.items():
    n_failed = len(failed)
    failure_pct = round(n_failed/30*100, 2)
    print(f'{yr}年無法爬取篇數&失敗率: {n_failed}, {failure_pct}%')

2003年無法爬取篇數&失敗率: 11, 36.67%
2004年無法爬取篇數&失敗率: 2, 6.67%
2005年無法爬取篇數&失敗率: 6, 20.0%
2006年無法爬取篇數&失敗率: 4, 13.33%
2007年無法爬取篇數&失敗率: 2, 6.67%
2008年無法爬取篇數&失敗率: 3, 10.0%
2009年無法爬取篇數&失敗率: 0, 0.0%
2010年無法爬取篇數&失敗率: 1, 3.33%
2011年無法爬取篇數&失敗率: 0, 0.0%
2012年無法爬取篇數&失敗率: 3, 10.0%
2013年無法爬取篇數&失敗率: 0, 0.0%
2014年無法爬取篇數&失敗率: 0, 0.0%


### 測試使用

In [25]:
## Using selenium
main_url = "https://www.sec.gov/litigation/litreleases/litrelarchive/litarchive2020.htm"

# Headless mode => note: the installed Chrome version may differ from the driver's
options = webdriver.ChromeOptions()
options.add_argument("--headless")
ua = UserAgent()
# The user agent must be passed as a --user-agent switch; a bare string
# is not interpreted by Chrome as a user agent.
options.add_argument(f"--user-agent={ua.google}")

# Start the browser and always shut it down again so no Chrome process is leaked
driver = webdriver.Chrome(options=options, executable_path="../chromedriver")
try:
    driver.get(main_url)
    time.sleep(2)

    # Grab the rendered HTML of the archive page
    main_page_soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    driver.quit()

# Collect the 2020 release URLs, then fetch every article's data
yearly_urls = yaerly_url_lists(2020, main_page_soup)
yearly_LR_output = get_elements_new(2020, yearly_urls)

# Turn the rows into a DataFrame for inspection
df = pd.DataFrame(yearly_LR_output, columns=["title", "release_num", "date", "extra_info", "content"])
#df_lists.append((2020, df))
df.head()

LR-25003
LR-25002
LR-25001
LR-25000
LR-24999
LR-24998
LR-24997
LR-24996
LR-24995
LR-24994
LR-24993
LR-24992
LR-24991
LR-24990
LR-24989
LR-24988
LR-24987
LR-24986
LR-24985
LR-24984
LR-24983
LR-24982
LR-24981
LR-24980
LR-24979
LR-24978
LR-24977
LR-24976
有問題無法爬取 =>  LR-24976
LR-24975
LR-24974
LR-24973


KeyboardInterrupt: 