## import packages 

In [1]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
import time

## Create functions 

### Get the article URLs for a whole year
找出整年度各個article網址

In [10]:
def yaerly_url_lists(year, soup):
    """Collect the litigation-release links found on one yearly archive page.

    Args:
        year: Archive year. It selects which CSS selector is used, because
            the code treats 2011-2016 pages and 2017-and-later pages
            differently.
        soup: Parsed archive page; must provide a ``select`` method (for
            example a ``BeautifulSoup`` object).

    Returns:
        dict mapping each link text that starts with "LR" to its absolute
        URL on https://www.sec.gov. Empty when no matching link is found.

    Raises:
        ValueError: If ``year`` is earlier than 2011, for which no selector
            is defined.
    """
    # Keep only the <a> tags; the container holding them depends on the year.
    if year >= 2017:
        selector = "table[id=mainlist] a"
    elif year >= 2011:
        selector = "tr a"
    else:
        # Without this branch the function failed later with an
        # UnboundLocalError that did not say what was actually wrong.
        raise ValueError(f"unsupported year {year}: only 2011 and later are handled")

    # Build the {"LR-xxxxx": full url} dictionary. Anchors without an href
    # are skipped so one malformed link cannot abort the whole year, and a
    # dict is always returned so callers can iterate it unconditionally.
    url_dict = {}
    for anchor in soup.select(selector):
        href = anchor.get("href")
        if href is not None and anchor.text.startswith("LR"):
            url_dict[anchor.text] = "https://www.sec.gov" + href
    return url_dict

### Get the elements of each article
1. 取得各篇articles的元素(標題、發佈號碼、發文時間、額外資訊、內文)
2. 將無法爬取的append到error_articles

In [11]:
def get_elements(year, url_dict):
    """Download every release in ``url_dict`` and extract its fields.

    Relies on two module-level names that must exist before the call:
    ``headers`` (request headers passed to ``requests.get``) and
    ``whole_error_articles`` (a dict that receives this year's failures).

    Args:
        year: Archive year; it selects which set of page selectors is used
            (2011-2017 versus 2018 and later).
        url_dict: Mapping of release label (LRN) to article URL.

    Returns:
        list of ``[title, release_num, time, extra_info, content]`` rows,
        one per article that was fetched and parsed successfully.

    Raises:
        ValueError: If ``year`` is earlier than 2011, for which no page
            selectors are defined.

    Side effects:
        Prints each LRN as it is processed, and stores the list of
        ``(LRN, url)`` pairs that failed in ``whole_error_articles[year]``.
    """
    if year < 2011:
        # No selectors exist for these years; fail early and clearly instead
        # of hitting an UnboundLocalError after the first download.
        raise ValueError(f"unsupported year {year}: only 2011 and later are handled")

    error_articles = []
    output_data = []

    # The LRN is kept alongside the URL so a failure can be traced back to
    # the specific release that could not be scraped.
    for LRN, url in url_dict.items():
        print(LRN)

        # Fetch the article. A network failure is recorded like a parse
        # failure so one bad request does not discard the whole year.
        try:
            ele_res = requests.get(url, headers=headers, timeout=30)
        except requests.RequestException:
            error_articles.append((LRN, url))
            print("有問題無法爬取 => ", LRN)
            continue
        ele_soup = BeautifulSoup(ele_res.text, features="html.parser")

        # Extract title, release number, date, extra info and body text.
        # Pages that do not match the expected layout raise IndexError or
        # ValueError (wrong number of "/"-separated parts) and are recorded.
        try:
            if year <= 2017:
                title = ele_soup.select("h3")[0].text
                subtitle_data = ele_soup.select("h2")
                release_line = subtitle_data[1].text
                extra_info = ",".join([i.text for i in subtitle_data[1:]])
                # The last <p> is dropped, as in the original scraper.
                paragraphs = ele_soup.select('p')[:-1]
            else:
                title = ele_soup.select("h1.alphaheads")[0].text
                subtitle_data = ele_soup.select("h2.alphaheads")
                release_line = subtitle_data[0].text
                extra_info = ",".join([i.text for i in subtitle_data[1:]])
                main_content = ele_soup.select('div[id=main-content]')
                paragraphs = main_content[0].select("p")

            # "<release number> / <date>": both halves are trimmed the same
            # way for every year. The date is kept in ``release_time`` so the
            # imported ``time`` module is not shadowed.
            release_num, release_time = [part.strip(" ") for part in release_line.split("/")]
            content = "\n".join([i.text for i in paragraphs])

        except (IndexError, ValueError):
            error_articles.append((LRN, url))
            print("有問題無法爬取 => ", LRN)
        else:
            # Collect the row; the caller turns the rows into a DataFrame.
            output_data.append([title, release_num, release_time, extra_info, content])

    # Store this year's failures under the year key.
    whole_error_articles[year] = error_articles

    return output_data

## Start scraping

<font color="red">注意: 主頁面格式&文章版面依據年份而不同，因此透過年份來判別使用哪個標籤進行爬取，目前越前面年份越容易出錯，仍有很大改善空間!</font>

輸出 => excel檔案，包含各年份的sheet

In [12]:
# Record the start time so the total run time can be reported at the end.
start_time = time.time()

# Per-year lists of the articles that could not be scraped (filled by get_elements).
whole_error_articles = {}

# Request headers identifying the client; identical for every request.
UserAgent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
headers = {"User-Agent" : UserAgent}

df_lists = []
for year in range(2017, 2021):
    print(year, "---"*20)

    # The archive index URL ends in .htm from 2020 onward and in .shtml before that.
    if year >= 2020:
        main_url = f"https://www.sec.gov/litigation/litreleases/litrelarchive/litarchive{year}.htm"
    else:
        main_url = f"https://www.sec.gov/litigation/litreleases/litrelarchive/litarchive{year}.shtml"

    # Fetch and parse the yearly index page.
    res = requests.get(main_url, headers=headers)
    main_page_soup = BeautifulSoup(res.text, features="html.parser")

    # Collect the year's release URLs, then scrape every article behind them.
    yearly_urls = yaerly_url_lists(year, main_page_soup)
    yearly_LR_output = get_elements(year, yearly_urls)

    # Turn the scraped rows into a DataFrame and keep it with its year.
    df = pd.DataFrame(yearly_LR_output, columns=["title", "release_num", "time", "extra_info", "content"])
    df_lists.append((year, df))

### Save in Excel format
# Write every year's DataFrame into one workbook, one sheet per year named
# after the year. NOTE (from the original author): a missing output folder was
# expected to be created automatically, but that did not seem to work on
# their machine.
with pd.ExcelWriter("./test0517.xlsx", mode="w") as writer:
    for sheet_year, sheet_df in df_lists:
        sheet_df.to_excel(writer, index=False, sheet_name=f'{sheet_year}')

# Record the end time.
end_time = time.time()

# Report the total elapsed time in seconds.
print("總花費時間: ", end_time - start_time)

2017 ------------------------------------------------------------
LR-24022
LR-24021
LR-24020
有問題無法爬取 =>  LR-24020
LR-24019
有問題無法爬取 =>  LR-24019
LR-24018
LR-24017
LR-24016
LR-24015
LR-24014
LR-24013
LR-24012
LR-24011
LR-24010
LR-24009
LR-24008
LR-24007
LR-24006
LR-24005
LR-24004
LR-24003
LR-24002
LR-24001
LR-24000
LR-23999
LR-23998
LR-23997
LR-23996
LR-23995
LR-23994
LR-23993
LR-23992
LR-23991
LR-23990
LR-23989
LR-23988
LR-23987
LR-23986
LR-23985
LR-23984
LR-23983
LR-23982
LR-23981
LR-23980
LR-23979
LR-23978
LR-23977
LR-23976
有問題無法爬取 =>  LR-23976
LR-23975
LR-23974
LR-23973
LR-23972
LR-23971
LR-23970
LR-23969
LR-23968
LR-23967
LR-23966
LR-23965
LR-23964
LR-23963
LR-23962
LR-23961
LR-23960
LR-23959
LR-23958
LR-23957
LR-23956
LR-23955
LR-23954
LR-23953
LR-23952
LR-23951
LR-23950
LR-23949
LR-23948
LR-23947
LR-23946
LR-23945
LR-23944
LR-23943
LR-23942
LR-23941
LR-23940
LR-23939
LR-23938
LR-23937
LR-23936
LR-23935
LR-23934
LR-23933
LR-23932
LR-23931
LR-23930
LR-23929
LR-23928
LR-23927
有問題無法爬取

LR-24545
LR-24544
LR-24543
LR-24542
LR-24541
LR-24540
LR-24539
LR-24538
LR-24537
LR-24536
LR-24535
LR-24534
LR-24533
LR-24532
LR-24531
LR-24530
LR-24529
LR-24528
LR-24527
LR-24526
LR-24525
LR-24524
LR-24523
LR-24522
LR-24521
LR-24520
LR-24519
LR-24518
LR-24517
LR-24516
LR-24515
LR-24514
LR-24513
LR-24512
LR-24511
LR-24510
LR-24509
LR-24508
LR-24507
有問題無法爬取 =>  LR-24507
LR-24506
LR-24505
LR-24504
LR-24503
LR-24502
LR-24501
LR-24500
LR-24499
LR-24498
LR-24497
LR-24496
LR-24495
LR-24494
LR-24493
LR-24492
LR-24491
LR-24490
LR-24489
LR-24488
LR-24487
LR-24486
LR-24485
LR-24484
LR-24483
LR-24482
LR-24481
LR-24480
LR-24479
LR-24478
LR-24477
LR-24476
LR-24475
LR-24474
LR-24473
LR-24472
LR-24471
LR-24470
LR-24469
LR-24468
有問題無法爬取 =>  LR-24468
LR-24467
LR-24466
LR-24465
LR-24464
LR-24463
LR-24462
LR-24461
LR-24460
LR-24459
LR-24458
LR-24457
LR-24456
LR-24455
LR-24454
LR-24453
LR-24452
LR-24451
LR-24450
LR-24449
LR-24448
LR-24447
LR-24446
LR-24445
LR-24444
LR-24443
LR-24442
LR-24441
LR-24440
LR-2

### Check the articles that could not be scraped

In [9]:
# Print how many articles failed to scrape for each recorded year.
for failed_articles in whole_error_articles.values():
    print(len(failed_articles))

79
177
44
27
30
33


In [13]:
# Print how many articles failed to scrape for each recorded year.
for failed_articles in whole_error_articles.values():
    print(len(failed_articles))

12
14
6
5
