## Import packages

In [1]:
import requests
import re
import pandas as pd
import time
import random
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from fake_useragent import UserAgent

## Create functions 

### Get article URLs for a whole year
找出整年度各個article網址

In [7]:
def yaerly_url_lists(year, soup):
    """Collect the litigation-release links found on one year's archive page.

    Args:
        year: Archive year; decides which CSS selector is used.
        soup: Parsed archive index page. Only its ``select`` method is used,
            and each returned anchor must expose ``.text`` and ``.get("href")``
            (as BeautifulSoup tags do).

    Returns:
        Dict mapping the link text of every anchor whose text starts with
        "LR" to ``"https://www.sec.gov"`` + its ``href``.

    Raises:
        ValueError: If ``year`` is earlier than 2000; no selector is defined
            for those pages.
    """
    if year < 2000:
        raise ValueError(
            f"no archive selector defined for year {year}; expected 2000 or later"
        )

    # 2017 and later are queried through the table with id "mainlist";
    # 2000-2016 through plain table rows.
    selector = "table[id='mainlist'] a" if year >= 2017 else "tr a"

    url_dict = {}
    for anchor in soup.select(selector):
        href = anchor.get("href")
        # Skip anchors that have no target and links that are not releases.
        if href and anchor.text.startswith("LR"):
            url_dict[anchor.text] = "https://www.sec.gov" + href
    return url_dict

### Get elements of each article
1. 取得各篇articles的元素(標題、發佈號碼、發文時間、額外資訊、內文)
2. 將無法爬取的append到error_articles

In [3]:
# For 新版
def get_elements_new(year, url_dict):
    """Scrape every release in ``url_dict`` with the selectors for the newer page layout.

    Args:
        year: Key under which this call's failures are stored in the
            module-level ``whole_error_articles`` dict.
        url_dict: Mapping of release number (LRN) to article URL.

    Returns:
        List of ``[title, release_num, date, extra_info, content]`` rows, one
        per article that was fetched and parsed successfully.

    Side effects:
        Prints each LRN as it is processed, sleeps one second after every
        article, and sets ``whole_error_articles[year]`` to the list of
        ``(LRN, url)`` pairs that could not be scraped. The caller must have
        created ``whole_error_articles`` beforehand.
    """
    error_articles = []
    output_data = []
    headers = {"user-agent": UserAgent().google}

    # The LRN is carried along so a failure can be traced back to its article.
    for LRN, url in url_dict.items():
        print(LRN)

        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            ele_res = requests.get(url, headers=headers, timeout=30)
            # Treat HTTP error pages as failures instead of parsing them.
            ele_res.raise_for_status()
            ele_soup = BeautifulSoup(ele_res.text, features="html.parser")

            title = ele_soup.select("h1.alphaheads")[0].text
            subtitle_data = ele_soup.select("h2.alphaheads")
            # First subtitle is "<release number>/<date>"; any other number of
            # "/"-separated parts raises ValueError and is recorded as a failure.
            release_num, date = subtitle_data[0].text.split("/")
            extra_info = ",".join([i.text for i in subtitle_data[1:]])
            content_data = ele_soup.select('div[id=main-content]')
            content = "\n".join([i.text for i in content_data[0].select("p")])

            # Row for the DataFrame built by the caller.
            output_data.append([title, release_num, date, extra_info, content])

        # Missing elements, malformed subtitles and network/HTTP errors all
        # mark the article as failed without aborting the remaining ones.
        except (IndexError, ValueError, requests.RequestException):
            error_articles.append((LRN, url))
            print("有問題無法爬取 => ", LRN)

        time.sleep(1)

    # Store this year's failures in the shared per-year error dict.
    whole_error_articles[year] = error_articles

    return output_data

In [8]:
# For 舊版
def get_elements_old(year, url_dict):
    """Scrape a random sample of releases using the heuristics for the older page layout.

    Args:
        year: Key under which this call's failures are stored in the
            module-level ``whole_error_articles`` dict.
        url_dict: Mapping of release number (LRN) to article URL.

    Returns:
        List of ``[title, release_num, date, extra_info, content]`` rows, one
        per sampled article that was fetched and parsed successfully.

    Side effects:
        Prints each sampled LRN and sets ``whole_error_articles[year]`` to the
        list of ``(LRN, url)`` pairs that could not be scraped. The caller
        must have created ``whole_error_articles`` beforehand.
    """
    error_articles = []
    output_data = []
    headers = {"user-agent": UserAgent().google}

    ###### Test mode: only a random sample of articles is scraped.
    n = 50
    # random.sample needs a real sequence (dict views are rejected on
    # Python 3.11+) and a sample size no larger than the population.
    url_random = random.sample(list(url_dict), min(n, len(url_dict)))

    # The LRN is carried along so a failure can be traced back to its article.
    for LRN in url_random:
        print(LRN)
        url = url_dict[LRN]

        try:
            # A timeout keeps one stalled connection from hanging the whole run.
            ele_res = requests.get(url, headers=headers, timeout=30)
            # Treat HTTP error pages as failures instead of parsing them.
            ele_res.raise_for_status()
            ele_soup = BeautifulSoup(ele_res.text, features="html.parser")

            # The metadata sits in h1-h3 tags (original author's note: true
            # for pages up to 2011). Extracting the tags removes them from the
            # soup so they are not picked up again with the body text.
            raw_data = []
            for level in range(1, 4):
                for heading in ele_soup.select(f"h{level}"):
                    raw_data.append(heading.extract())

            elements = [tag.text for tag in raw_data]

            # Each required piece is taken out of ``elements`` in turn; whatever
            # is left at the end becomes the title. A missing piece raises
            # IndexError and the article is recorded as failed.

            # 1. Drop the first heading mentioning the agency name.
            useless = [i for i in elements if re.match(r".*Securities and Exchange Commission.*", i, flags=re.I)][0]
            elements.remove(useless)

            # 2. Release number and date: "Litigation ... / <date>".
            result = [i for i in elements if re.match(r"^Litigation.*", i, flags=re.I)][0]
            elements.remove(result)
            release_num, date = result.split("/")

            # 3. Extra info: the "SEC v. ..." case caption.
            result = [i for i in elements if re.match(r"^SEC v\..*|^Securities and Exchange Commission v\..*", i, flags=re.I)][0]
            elements.remove(result)
            extra_info = result

            # 4. Title: the remaining headings (no fixed pattern; cleaned up later).
            title = " ".join(elements)

            # 5. Body: join all <p> texts, then cut at the first known trailer phrase.
            articles = " ".join([i.text for i in ele_soup.select("p")])

            if "For further information" in articles:
                content = articles.split("For further information")[0]
            elif "SEC Complaint" in articles:
                content = articles.split("SEC Complaint")[0]
            else:
                content = articles

            # Row for the DataFrame built by the caller.
            output_data.append([title, release_num, date, extra_info, content])

        # Missing headings, malformed release lines and network/HTTP errors all
        # mark the article as failed without aborting the remaining ones.
        except (IndexError, ValueError, requests.RequestException):
            error_articles.append((LRN, url))

    # Store this year's failures in the shared per-year error dict.
    whole_error_articles[year] = error_articles

    return output_data

## Start scraping

<font color="red">注意: 主頁面格式&文章版面依據年份而不同，因此透過年份來判別使用哪個標籤進行爬取，目前越前面年份越容易出錯，仍有很大改善空間!</font>

輸出 => excel檔案，包含各年份的sheet

In [13]:
# Start time of the whole run
start_time = time.time()

# Articles that could not be scraped, keyed by year; written by get_elements_new / get_elements_old
whole_error_articles = {}

# Collects (year, DataFrame) pairs
df_lists = list()
for year in range(2000, 2017):
    print(year, "---"*20)

    # Archive index URL differs by year: .htm from 2020 on, .shtml before 2020
    if year < 2020:
        main_url = f"https://www.sec.gov/litigation/litreleases/litrelarchive/litarchive{year}.shtml"
    else:
        main_url = f"https://www.sec.gov/litigation/litreleases/litrelarchive/litarchive{year}.htm"

    ua = UserAgent()

    # Headless Chrome => NOTE: the chromedriver must match the installed Chrome version
    options = webdriver.ChromeOptions()
    options.add_argument("--headless")
    # The user agent has to be passed through Chrome's --user-agent switch;
    # a bare string is not interpreted as a user agent.
    options.add_argument(f"--user-agent={ua.google}")

    driver = webdriver.Chrome(options=options, executable_path="../chromedriver")
    try:
        driver.get(main_url)
        time.sleep(2)

        # Grab the rendered HTML of the index page
        main_page_soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # One browser is started per year; always shut it down so Chrome
        # processes do not pile up, even when loading the page fails.
        driver.quit()

    # Collect this year's release URLs, then scrape every article
    yearly_urls = yaerly_url_lists(year, main_page_soup)

    # Pick the scraper matching the article layout of the year
    if year >= 2018: # new
        yearly_LR_output = get_elements_new(year, yearly_urls)
    else: # old(until 2003)
        yearly_LR_output = get_elements_old(year, yearly_urls)

    # Turn the rows into a DataFrame and keep it for the Excel export below
    df = pd.DataFrame(yearly_LR_output, columns=["title", "release_num", "time", "extra_info", "content"])
    df_lists.append((year, df))

### Save as Excel
# Each year's DataFrame goes to its own sheet, named after the year.
# Original author's note: the target folder should be created automatically
# if it does not exist, but that did not seem to work on their machine.
with pd.ExcelWriter("./test0905.xlsx", mode="w") as writer:
    for sheet_year, year_df in df_lists:
        year_df.to_excel(writer, index=False, sheet_name=f'{sheet_year}')

# End time
end_time = time.time()

# Total elapsed time
print("總花費時間: {} sec".format(round(end_time - start_time, 2)))

2000 ------------------------------------------------------------
LR-16719
LR-16488
LR-16694
LR-16716
LR-16815
LR-16638
LR-16763
LR-16686
LR-16720
LR-16589
LR-16601
LR-16680
LR-16682
LR-16413
LR-16778A
LR-16825
LR-16518
LR-16629
LR-16454
LR-16481
LR-16645
LR-16641
LR-16462
LR-16830
LR-16588
LR-16813
LR-16519
LR-16698
LR-16436
LR-16424
LR-16668
LR-16489
LR-16399
LR-16750
LR-16692
LR-16814
LR-16407
LR-16840
LR-16471
LR-16423
LR-16622
LR-16564
LR-16712
LR-16544
LR-16510
LR-16477
LR-16663
LR-16721
LR-16833
LR-16729
2001 ------------------------------------------------------------
LR-16881
LR-17154
LR-16939
LR-17234
LR-17001
LR-17010
LR-16969
LR-17100
LR-16983
LR-17134
LR-16932
LR-17017
LR-17009
LR-16885
LR-17092
LR-17021
LR-17237
LR-16944
LR-17192
LR-17153
LR-17219
LR-16906
LR-17275
LR-16938
LR-17070
LR-16945
LR-17027
LR-17156
LR-17152
LR-17069
LR-17211
LR-17193
LR-17249
LR-17206
LR-17099
LR-16916
LR-17045
LR-17273
LR-17175
LR-17230
LR-17110
LR-16904
LR-17177
LR-17145
LR-16900
LR-17200
LR-

## 隨機測試結果 => 每年隨機選取50篇（2000~2016）

In [15]:
# Per-year failure summary: one row per year recorded in whole_error_articles.
overall_result = pd.DataFrame(columns=["Total Fail", "Fail Rate(%)"])

for scraped_year, failed_articles in whole_error_articles.items():
    fail_count = len(failed_articles)
    # The rate is relative to the 50 articles sampled per year.
    fail_rate = round(fail_count / 50 * 100, 1)
    overall_result.loc[scraped_year] = {"Total Fail": fail_count, "Fail Rate(%)": fail_rate}

print(overall_result)

      Total Fail  Fail Rate(%)
2000        50.0         100.0
2001        46.0          92.0
2002        38.0          76.0
2003        19.0          38.0
2004         5.0          10.0
2005         7.0          14.0
2006         4.0           8.0
2007         1.0           2.0
2008         5.0          10.0
2009         1.0           2.0
2010         6.0          12.0
2011         1.0           2.0
2012         3.0           6.0
2013         0.0           0.0
2014         0.0           0.0
2015         0.0           0.0
2016         3.0           6.0


### 測試使用

In [25]:
## Single-year trial run using selenium
main_url = "https://www.sec.gov/litigation/litreleases/litrelarchive/litarchive2020.htm"

# Headless Chrome => NOTE: the chromedriver must match the installed Chrome version
options = webdriver.ChromeOptions()
options.add_argument("--headless")
ua = UserAgent()
# The user agent has to be passed through Chrome's --user-agent switch;
# a bare string is not interpreted as a user agent.
options.add_argument(f"--user-agent={ua.google}")

driver = webdriver.Chrome(options=options, executable_path="../chromedriver")
try:
    driver.get(main_url)
    time.sleep(2)

    # Grab the rendered HTML of the index page
    main_page_soup = BeautifulSoup(driver.page_source, "html.parser")
finally:
    # Always shut the browser down so no Chrome process is left behind.
    driver.quit()

# Collect the 2020 release URLs, then scrape every article
yearly_urls = yaerly_url_lists(2020, main_page_soup)
yearly_LR_output = get_elements_new(2020, yearly_urls)

# Turn the rows into a DataFrame and preview the first few
df = pd.DataFrame(yearly_LR_output, columns=["title", "release_num", "date", "extra_info", "content"])
df.head()

LR-25003
LR-25002
LR-25001
LR-25000
LR-24999
LR-24998
LR-24997
LR-24996
LR-24995
LR-24994
LR-24993
LR-24992
LR-24991
LR-24990
LR-24989
LR-24988
LR-24987
LR-24986
LR-24985
LR-24984
LR-24983
LR-24982
LR-24981
LR-24980
LR-24979
LR-24978
LR-24977
LR-24976
有問題無法爬取 =>  LR-24976
LR-24975
LR-24974
LR-24973


KeyboardInterrupt: 