In [1]:
import requests
import re
import pandas as pd
from bs4 import BeautifulSoup
from time import sleep
from selenium import webdriver
from selenium.webdriver.support.ui import Select
from fake_useragent import UserAgent

In [None]:
# For the new-style release pages
def get_elements_new(year, url_dict):
    """Scrape every new-layout litigation-release page listed in ``url_dict``.

    For each page the title is the text of the first ``h1.alphaheads``; the
    first ``h2.alphaheads`` is split on "/" into release number and date; the
    texts of any further ``h2.alphaheads`` are comma-joined as extra info; and
    the body is the newline-joined text of the ``<p>`` tags inside the first
    ``div[id=main-content]``.

    Pages that cannot be fetched (any ``requests.RequestException``, including
    a timeout) or whose layout does not match (``IndexError`` / ``ValueError``)
    are reported on stdout and skipped, so one bad page no longer aborts the
    whole crawl.

    Args:
        year: Not used by this function; kept so the signature matches
            ``get_elements_old``.
        url_dict: Mapping of LRN -> page URL. The LRN is printed so a failing
            article can be identified.

    Returns:
        list: One ``[title, release_num, time, extra_info, content]`` list per
        successfully parsed page, in ``url_dict`` iteration order.
    """
    error_articles = []
    output_data = []
    headers = {"user-agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"}

    for LRN, url in url_dict.items():
        print(LRN)

        try:
            # Fetch inside the try block and with a timeout: requests waits
            # forever by default, and a single dropped connection should only
            # cost this one article.
            ele_res = requests.get(url, headers=headers, timeout=30)
            ele_soup = BeautifulSoup(ele_res.text, features="html.parser")

            title = ele_soup.select("h1.alphaheads")[0].text
            subtitle_data = ele_soup.select("h2.alphaheads")
            # Exactly one "/" is expected; anything else raises ValueError.
            release_num, release_date = subtitle_data[0].text.split("/")
            extra_info = ",".join(tag.text for tag in subtitle_data[1:])
            content_data = ele_soup.select('div[id=main-content]')
            content = "\n".join(p.text for p in content_data[0].select("p"))

            # Collect the row; the caller turns the rows into a DataFrame.
            output_data.append([title, release_num, release_date, extra_info, content])

        except (IndexError, ValueError, requests.RequestException):
            error_articles.append((LRN, url))
            print("有問題無法爬取 => ", LRN)

        # Pause between requests, whether or not this article succeeded.
        sleep(1)

    # NOTE: error_articles is collected but neither returned nor stored here;
    # failures are only visible through the messages printed above.
    return output_data

In [None]:
# For the old-style release pages
def get_elements_old(year, url_dict):
    """Scrape every old-layout litigation-release page listed in ``url_dict``.

    On these pages the metadata lives in ``h1``-``h3`` tags. The headings are
    extracted from the soup (so they are not picked up again with the body)
    and sorted into fields by regex; whatever heading text is left over
    becomes the title. The body is the space-joined text of all ``<p>`` tags,
    cut at the first "For further information" or, failing that,
    "SEC Complaint". If neither phrase occurs the full text is kept.

    Pages that cannot be fetched (any ``requests.RequestException``, including
    a timeout) or whose headings do not match the expected pattern
    (``IndexError`` / ``ValueError``) are reported on stdout, recorded, and
    skipped.

    Side effect: the list of failed ``(LRN, url)`` pairs is stored in the
    module-level dict ``whole_error_articles`` under ``year``. That dict is
    not created here and must already exist.

    Args:
        year: Key under which this call's failures are stored.
        url_dict: Mapping of LRN -> page URL. The LRN is printed so a failing
            article can be identified.

    Returns:
        list: One ``[title, release_num, date, extra_info, content]`` list per
        successfully parsed page, in ``url_dict`` iteration order.
    """
    error_articles = []
    output_data = []
    ua = UserAgent()
    headers = {"user-agent": ua.google}  # intent: a different user-agent for each year's crawl

    for LRN, url in url_dict.items():
        print(LRN)

        try:
            # Fetch inside the try block and with a timeout so a single
            # network failure only costs this one article.
            ele_res = requests.get(url, headers=headers, timeout=30)
            ele_soup = BeautifulSoup(ele_res.text, features="html.parser")

            # Pull h1..h3 out of the tree, in that order.
            raw_data = []
            for level in range(1, 4):
                for tag in ele_soup.select(f"h{level}"):
                    raw_data.append(tag.extract())

            elements = [tag.text for tag in raw_data]

            # Workflow: take each known field out of the list; whatever
            # remains at the end forms the title.
            # 1. Drop the first heading that mentions the agency name.
            useless = [i for i in elements if re.match(".*Securities and Exchange Commission.*", i, flags=re.I)][0]
            elements.remove(useless)

            # 2. Release number and date ("Litigation ... / <date>").
            result = [i for i in elements if re.match("^Litigation.*", i, flags=re.I)][0]
            elements.remove(result)
            release_num, date = result.split("/")

            # 3. Extra info: the case caption ("SEC v. ..." or the long form).
            result = [i for i in elements if re.match(r"^SEC v\..*|^Securities and Exchange Commission v\..*", i, flags=re.I)][0]
            elements.remove(result)
            extra_info = result

            # 4. Title: everything that is left.
            title = " ".join(elements)

            # 5. Body: all <p> text joined, cut at the first known trailer.
            articles = " ".join([i.text for i in ele_soup.select("p")])

            if "For further information" in articles:
                content = articles.split("For further information")[0]
            elif "SEC Complaint" in articles:
                content = articles.split("SEC Complaint")[0]
            else:
                # No trailer phrase: keep the whole text. Previously `content`
                # was left unassigned here, which either raised an uncaught
                # UnboundLocalError or silently reused the previous article's
                # body.
                content = articles

            # Collect the row; the caller turns the rows into a DataFrame.
            output_data.append([title, release_num, date, extra_info, content])

        except (IndexError, ValueError, requests.RequestException):
            error_articles.append((LRN, url))
            print("有問題無法爬取 => ", LRN)

    # Store this year's failures in the shared, module-level registry.
    whole_error_articles[year] = error_articles

    return output_data

In [568]:
# Old-style pages: scrape the elements of a single page
def single_page_element(url):
    """Scrape one old-layout litigation-release page.

    The ``h1``-``h3`` headings are extracted from the soup and sorted into
    fields by regex; the leftover heading text becomes the title. The body is
    the space-joined text of all ``<p>`` tags, cut at the first
    "For further information" or, failing that, "SEC Complaint". If neither
    phrase occurs the full text is kept.

    Args:
        url: Address of the release page.

    Returns:
        list: ``[[title, release_num, date, extra_info, content]]`` on
        success, or an empty list when parsing fails (the failure is printed).

    Raises:
        requests.RequestException: If the page cannot be fetched; the request
            is made outside the parsing ``try`` block.
    """
    output_data = []

    ua = UserAgent()
    headers = {"user-agent": ua.google}
    # requests waits forever by default; bound the wait.
    ele_res = requests.get(url, headers=headers, timeout=30)
    ele_soup = BeautifulSoup(ele_res.text, features="html.parser")

    try:
        # Pull h1..h3 out of the tree, in that order.
        raw_data = []
        for level in range(1, 4):
            for tag in ele_soup.select(f"h{level}"):
                raw_data.append(tag.extract())

        elements = [tag.text for tag in raw_data]

        # Workflow: take each known field out of the list; whatever remains
        # at the end forms the title.
        # 1. Drop the first heading that mentions the agency name.
        useless = [i for i in elements if re.match(".*Securities and Exchange Commission.*", i, flags=re.I)][0]
        elements.remove(useless)

        # 2. Release number and date ("Litigation ... / <date>").
        result = [i for i in elements if re.match("^Litigation.*", i, flags=re.I)][0]
        elements.remove(result)
        release_num, date = result.split("/")

        # 3. Extra info: the case caption ("SEC v. ..." or the long form).
        result = [i for i in elements if re.match(r"^SEC v\..*|^Securities and Exchange Commission v\..*", i, flags=re.I)][0]
        elements.remove(result)
        extra_info = result

        # 4. Title: everything that is left.
        title = " ".join(elements)

        # 5. Body: all <p> text joined, cut at the first known trailer.
        articles = " ".join([i.text for i in ele_soup.select("p")])

        if "For further information" in articles:
            content = articles.split("For further information")[0]
        elif "SEC Complaint" in articles:
            content = articles.split("SEC Complaint")[0]
        else:
            # No trailer phrase: keep the whole text. Previously `content`
            # was left unassigned here and the resulting error was hidden by
            # the bare except below.
            content = articles

        # Collect the row; the caller turns it into a DataFrame.
        output_data.append([title, release_num, date, extra_info, content])

    except Exception as exc:
        # Still best-effort, but say what went wrong and let
        # KeyboardInterrupt / SystemExit propagate.
        print(f"error happened: {type(exc).__name__}: {exc}")

    return output_data

In [564]:
# Smoke test: scrape one old-style release page and show the result as a DataFrame.
url = "https://www.sec.gov/litigation/litreleases/lr18136.htm" #"https://www.sec.gov/litigation/litreleases/2011/lr22213.htm"
final_result  = single_page_element(url)

# single_page_element returns rows of [title, release_num, date, extra_info, content];
# the third field is labelled "time" here.
df = pd.DataFrame(final_result, columns=["title", "release_num", "time", "extra_info", "content"])
df

error happened


Unnamed: 0,title,release_num,time,extra_info,content


In [567]:
#### Test: step-by-step parse of a single old-style page
# Known issue (see the recorded output of this cell): on this page a single
# heading element holds the release line, the title and the case caption
# together. Step 2 removes that whole element, so step 3 finds nothing and
# raises IndexError.
url = "https://www.sec.gov/litigation/litreleases/lr18322.htm" #"https://www.sec.gov/litigation/litreleases/2017/lr23994.htm"

headers = {"user-agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"}
ele_res = requests.get(url, headers=headers)
ele_soup = BeautifulSoup(ele_res.text, features="html.parser")

# Basic info: it is almost always stored in heading tags, so collect h1..h3
# (the tags are left in the soup here, not extracted)
raw_data = []
for i in range(1, 4):
    data = ele_soup.select(f"h{i}")
    if data:
        raw_data.extend(data)

# Take the text out of each tag
elements = list(map(lambda x:x.text, raw_data))

# Workflow: take each required field out of the list and remove it; whatever
# remains at the end forms the title (no fixed pattern; cleaned up later)
# 1. Find the unneeded agency-name heading and remove it
useless = [i for i in elements if re.match(".*Securities and Exchange Commission.*", i, flags=re.I)][0]
elements.remove(useless)
print(elements)

# 2. Release number and date
result = [i for i in elements if re.match("^Litigation.*", i, flags=re.I)][0]
elements.remove(result)
number, date = result.split("/")

# 3. Extra info (raw string so that `\.` is not an invalid string escape)
result = [i for i in elements if re.match(r"^SEC v\..*|^Securities and Exchange Commission v\..*", i, flags=re.I)][0]
elements.remove(result)
extra_info = result

# 4. Title
title = " ".join(elements)

# Body: it all lives in <p> tags; join everything into one big string, then
# split on a specific phrase or character
articles = " ".join([i.text for i in ele_soup.select("p")])

if "For further information" in articles:
    contents = articles.split(r"For further information")[0]
elif "SEC Complaint" in articles:
    contents = articles.split("SEC Complaint")[0]
else:
    contents = articles.split("\n")[0]

print(contents)


["LITIGATION RELEASE NO. 18322 / SEPTEMBER 4, 2003\n\nSEC BRINGS ENFORCEMENT ACTIONS AGAINST THREE INDIVIDUALS, GOLDMAN SACHS, AND MASSACHUSETTS FINANCIAL SERVICES COMPANY RELATED TO TRADING BASED ON NON-PUBLIC INFORMATION ABOUT THE TREASURY'S DECISION TO CEASE ISSUANCE OF THE 30-YEAR BOND \n\nSECURITIES AND EXCHANGE COMMISSION v. PETER J. DAVIS, JR., JOHN M. YOUNGDAHL and STEVEN E. NOTHERN (United States District Court for the Southern District of New York, Civil Action No. 03-CV6672(NRB))"]


IndexError: list index out of range

### 必要な情報を取る流れ：
- 適用年份：2017~2003（再往前需要使用string方法擷取！）
- 將匹配完成的組成list，並刪除不需要的元素
- 使用re進行擷取
- 問題：在取得內文時，不知為何也會一併取到之前的元素（號碼、標題等）
  * ■ 應對方法：抓取基本情報同時，使用extract()將資料萃取出來，如此一來後面內文就不會再抓到它們

<br>

- 待解決問題：
  * 抓取元素後得到一大串string -> Error
  * extra_info的地方，抓到不只一個元素時的解決方法

#### String方法 => 效率太差，不好使用，暫時以list為主

In [219]:
# String-based approach: join all heading texts into one newline-separated string
elements_str = "\n".join(elements)
print(elements_str)

Litigation Release No. 23747 / February 10, 2017
Securities and Exchange Commission v. Shaohua (Michael) Yin, et al., Civil Action No. 17-CV-972 (S.D.N.Y., filed February 10, 2017)
SEC Charges Chinese Citizens Who Reaped Massive Profits from Insider Trading on Comcast-DreamWorks Acquisition
Obtains Court Order Freezing More Than $29 Million in U.S. Accounts


In [140]:
# Release number and date: take the first line that starts with "Litigation"
# and split it on " / "
litigation_number, date = re.findall("^Litigation.*", elements_str, flags=re.M)[0].split(" / ")


In [141]:
# Extra info: every line that starts with "SEC v." or
# "Securities and Exchange Commission v." (there may be more than one).
# Raw string so that `\.` is not an invalid string escape.
extra_info = re.findall(r"^SEC v\..*|^Securities and Exchange Commission v\..*", elements_str, flags=re.M)
extra_info

['SEC v. Magyar Telekom Plc. and Deutsche Telekom AG, Case No. 11 civ 9646 (S.D.N.Y.)',
 'SEC v. Straub, et al., Case No. 11 civ 9645 (S.D.N.Y.)']

In [None]:
# Title (placeholder: left as an empty string in this string-based experiment)
title = ""

In [124]:
# Check that the multi-line pattern picks up every case-caption line, whether
# it starts with "SEC v." or "Securities and Exchange Commission v.".
test_str = """
Securities and Exchange Commission v. One or More Unknown Purchasers of,
SEC v. Magyar Telekom Plc. and Deutsche Telekom AG, Case No. 11 civ 9646 (S.D.N.Y.)
SEC v. Straub, et al., Case No. 11 civ 9645 (S.D.N.Y.)
"""

re.findall(r"^SEC v\..*|^Securities and Exchange Commission v\..*", test_str, flags=re.M) 

['Securities and Exchange Commission v. One or More Unknown Purchasers of,',
 'SEC v. Magyar Telekom Plc. and Deutsche Telekom AG, Case No. 11 civ 9646 (S.D.N.Y.)',
 'SEC v. Straub, et al., Case No. 11 civ 9645 (S.D.N.Y.)']

#### List方法 => 主要方法

In [548]:
#### Test: fetch one release page and parse it into a soup for the list-based approach
url = "https://www.sec.gov/litigation/litreleases/2016/lr23711.htm" #"https://www.sec.gov/litigation/litreleases/2017/lr23994.htm"

headers = {"user-agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36"}
ele_res = requests.get(url, headers=headers)
ele_soup = BeautifulSoup(ele_res.text, features="html.parser")


In [549]:
# Basic info: it is almost always stored in heading tags, so loop over h1..h3
# and collect them. extract() also removes each tag from ele_soup, so
# re-running this cell on the same soup finds no headings.
raw_data = []
for i in range(1, 4):

    data = ele_soup.select(f"h{i}")
    if data:
        for d in data:
            raw_data.append(d.extract())

# Take the text out of each tag
elements = list(map(lambda x:x.text, raw_data))
elements

[<h2>U.S. SECURITIES AND EXCHANGE COMMISSION</h2>, <h2>Litigation Release No. 23711 / December 27, 2016</h2>, <h2><i>Securities and Exchange Commission v. Iat Hong, et al.</i>, Civil Action No. 16 cv 9947 (S.D.N.Y., filed Dec. 27, 2016)</h2>, <h3>Chinese Traders Charged with Trading On Hacked Nonpublic Information Stolen from Two New York-Based Law Firms</h3>, <h3><i>Marks First Time SEC Charges Hacking into Law Firm Computer Networks</i></h3>]


['U.S. SECURITIES AND EXCHANGE COMMISSION',
 'Litigation Release No. 23711 / December 27, 2016',
 'Securities and Exchange Commission v. Iat Hong, et al., Civil Action No. 16 cv 9947 (S.D.N.Y., filed Dec. 27, 2016)',
 'Chinese Traders Charged with Trading On Hacked Nonpublic Information Stolen from Two New York-Based Law Firms',
 'Marks First Time SEC Charges Hacking into Law Firm Computer Networks']

In [550]:
# List-based approach: work on a copy so that `elements` itself stays intact
elements_list = elements.copy()

In [551]:
# Show the working copy before any field is removed from it
elements_list

['U.S. SECURITIES AND EXCHANGE COMMISSION',
 'Litigation Release No. 23711 / December 27, 2016',
 'Securities and Exchange Commission v. Iat Hong, et al., Civil Action No. 16 cv 9947 (S.D.N.Y., filed Dec. 27, 2016)',
 'Chinese Traders Charged with Trading On Hacked Nonpublic Information Stolen from Two New York-Based Law Firms',
 'Marks First Time SEC Charges Hacking into Law Firm Computer Networks']

In [552]:
# Find the first heading that mentions the agency name and drop it from the
# working copy. The search runs over `elements`, the removal on `elements_list`.
useless = [i for i in elements if re.match(".*Securities and Exchange Commission.*", i, flags=re.I)][0]
elements_list.remove(useless)


In [553]:
# Release number and date: take the first element starting with "Litigation",
# remove it from the working copy and split it on "/" (the surrounding spaces
# are kept, as the output below shows)
result = [i for i in elements_list if re.match("^Litigation.*", i, flags=re.I)][0]
elements_list.remove(result)
number, date = result.split("/")
number, date

('Litigation Release No. 23711 ', ' December 27, 2016')

In [554]:
# Extra info: the first element that starts with "SEC v." or
# "Securities and Exchange Commission v.".
# TODO: handle pages where two such captions appear as separate elements;
# only the first one is taken here.
# Raw string so that `\.` is not an invalid string escape.
result = [i for i in elements_list if re.match(r"^SEC v\..*|^Securities and Exchange Commission v\..*", i, flags=re.I)][0]
elements_list.remove(result)
extra_info = result
extra_info

'Securities and Exchange Commission v. Iat Hong, et al., Civil Action No. 16 cv 9947 (S.D.N.Y., filed Dec. 27, 2016)'

In [555]:
# Title: whatever is still in the working list, joined with single spaces
title = " ".join(elements_list)
title

'Chinese Traders Charged with Trading On Hacked Nonpublic Information Stolen from Two New York-Based Law Firms Marks First Time SEC Charges Hacking into Law Firm Computer Networks'

In [557]:
# Content: join the text of every <p> tag, then cut it at the first known
# trailer phrase
articles = " ".join([i.text for i in ele_soup.select("p")])

if "For further information" in articles:
    contents = articles.split("For further information")[0]
elif "SEC Complaint" in articles:
    contents = articles.split("SEC Complaint")[0]
else:
    # No known trailer phrase: keep the whole text instead of leaving
    # `contents` undefined (NameError) or stale from an earlier cell run.
    contents = articles

print(contents)

The Securities and Exchange Commission today charged three Chinese traders with fraudulently trading on hacked nonpublic market moving information stolen from two prominent New York-based law firms, racking up almost $3 million in illegal profits.  The SEC is also seeking an asset freeze that prevents the traders from cashing in on their illicit gains.  Today's action marks the first time the SEC has charged hacking into a law firm's computer network. The SEC's complaint alleges that Iat Hong, Bo Zheng, and Hung Chin executed a deceptive scheme to hack into the networks of two law firms and steal confidential information pertaining to firm clients that were considering mergers or acquisitions. According to the SEC's complaint, the alleged hacking incidents involved installing malware on the law firms' networks, compromising accounts that enabled access to all email accounts at the firms, and copying and transmitting dozens of gigabytes of emails to remote internet locations.  Defendant