# 9장. 데이터 수집하기 (Getting Data) - 웹 스크래핑

In [1]:
!pip install beautifulsoup4 requests html5lib

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.9.3-py3-none-any.whl (115 kB)
Collecting html5lib
  Downloading html5lib-1.1-py2.py3-none-any.whl (112 kB)
Collecting soupsieve>1.2
  Downloading soupsieve-2.2.1-py3-none-any.whl (33 kB)
Collecting webencodings
  Using cached webencodings-0.5.1-py2.py3-none-any.whl (11 kB)
Installing collected packages: webencodings, soupsieve, html5lib, beautifulsoup4
Successfully installed beautifulsoup4-4.9.3 html5lib-1.1 soupsieve-2.2.1 webencodings-0.5.1


## 1. HTML 문서 가져오기

In [2]:
from bs4 import BeautifulSoup
import requests

# The sample HTML page used throughout this section is hosted on GitHub.
url = "https://raw.githubusercontent.com/joelgrus/data/master/getting-data.html"

# Bound the wait, and fail loudly on an HTTP error instead of silently
# parsing an error page as if it were the sample document.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html5lib')
print(soup)

<!DOCTYPE html>
<html lang="en-US"><head>
    <title>Getting Data</title>
    <meta charset="utf-8"/>
</head>
<body>
    <h1>Getting Data</h1>
    <div class="explanation">
        This is an explanation.
    </div>
    <div class="comment">
        This is a comment.
    </div>
    <div class="content">
        <p id="p1">This is the first paragraph.</p>
        <p class="important">This is the second paragraph.</p>
    </div>
    <div class="signature">
        <span id="name">Joel</span>
        <span id="twitter">@joelgrus</span>
        <span id="email">joelgrus-at-gmail</span>
    </div>


</body></html>


### 1.1 첫번째 Paragraph

In [3]:
# soup.find('p') (or simply soup.p) yields the first <p> tag of the document.
first_paragraph = soup.find('p')
print(first_paragraph)

expected_markup = '<p id="p1">This is the first paragraph.</p>'
assert str(first_paragraph) == expected_markup

<p id="p1">This is the first paragraph.</p>


### 1.2 단어로 쪼개기

In [4]:
# Take the text of the first <p> and break it into whitespace-separated words.
first_paragraph_text = soup.p.text
first_paragraph_words = first_paragraph_text.split()
print(first_paragraph_words)

expected_words = ['This', 'is', 'the', 'first', 'paragraph.']
assert first_paragraph_words == expected_words

['This', 'is', 'the', 'first', 'paragraph.']


### 1.3 딕셔너리처럼 사용하기

In [5]:
# A tag's attributes can be read like dictionary entries.
first_tag = soup.p
first_paragraph_id = first_tag['id']       # raises KeyError if no 'id'
first_paragraph_id2 = first_tag.get('id')  # returns None if no 'id'

assert first_paragraph_id == 'p1'
assert first_paragraph_id2 == 'p1'

### 1.4 여러 태그 불러오기

In [6]:
# Calling soup('p') is shorthand for soup.find_all('p').
all_paragraphs = soup.find_all('p')

# Keep only the paragraphs that carry a (non-empty) id attribute.
paragraphs_with_ids = []
for paragraph in soup('p'):
    if paragraph.get('id'):
        paragraphs_with_ids.append(paragraph)

assert len(all_paragraphs) == 2
assert len(paragraphs_with_ids) == 1

### 1.5  특정 클래스 태그 불러오기

In [7]:
# Three ways of selecting the <p> tags whose class is "important":
# an attribute dict, the class name as second argument, and a manual filter.
important_paragraphs = soup('p', {'class' : 'important'})
important_paragraphs2 = soup('p', 'important')

important_paragraphs3 = []
for paragraph in soup('p'):
    if 'important' in paragraph.get('class', []):
        important_paragraphs3.append(paragraph)

assert important_paragraphs == important_paragraphs2
assert important_paragraphs2 == important_paragraphs3
assert len(important_paragraphs) == 1

### 1.6  div에 포함된 span 가져오기

In [8]:
# Walk every <div> on the page and gather the <span> elements found inside it.
spans_inside_divs = []
for div in soup('div'):
    spans_inside_divs.extend(div('span'))

assert len(spans_inside_divs) == 3

## 2. 예시 : 의회 감시하기

In [9]:
from bs4 import BeautifulSoup
import requests

url = "https://www.house.gov/representatives"

# Bound the wait, and stop on an HTTP error rather than scraping an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
text = response.text
soup = BeautifulSoup(text, "html5lib")

# Every link target on the page; anchors without an href are skipped.
all_urls = [a['href']
            for a in soup('a')
            if a.has_attr('href')]

print(len(all_urls))  # way too many: this is every link on the page

967


### 2.1 정규식을 이용해서 의원의 URL만 필터링

In [10]:
import re

# Must start with http:// or https://
# The host must end with .house.gov, optionally followed by a single /
# [^/?#]* keeps the match inside the host part of the URL: with a plain .*
# a link to any other site whose path or query merely ends in ".house.gov"
# would be accepted as well.
regex = r"^https?://[^/?#]*\.house\.gov/?$"

# Let's write some tests!
assert re.match(regex, "http://joel.house.gov")
assert re.match(regex, "https://joel.house.gov")
assert re.match(regex, "http://joel.house.gov/")
assert re.match(regex, "https://joel.house.gov/")
assert not re.match(regex, "joel.house.gov")
assert not re.match(regex, "http://joel.house.com")
assert not re.match(regex, "https://joel.house.gov/biography")
# URLs on other hosts that only end in ".house.gov" must be rejected.
assert not re.match(regex, "https://example.com/x.house.gov")
assert not re.match(regex, "https://example.com?next=.house.gov")

In [11]:
# Keep only the links whose URL matches the representative-site pattern.
matches_member_site = re.compile(regex).match
good_urls = list(filter(matches_member_site, all_urls))

print(len(good_urls))  # still too many: the list contains duplicates

870


### 2.2 중복 제거

In [12]:
num_original_good_urls = len(good_urls)

# dict.fromkeys drops duplicates while keeping first-seen order, so the list
# comes out in the same order on every run; list(set(...)) would reorder the
# URLs arbitrarily between kernel restarts.
good_urls = list(dict.fromkeys(good_urls))

print(len(good_urls))  # roughly half as many once duplicates are removed
assert len(good_urls) < num_original_good_urls

435


### 2.3  jayapal 홈페이지에서 PR 링크 찾기

In [13]:
# Bound the wait, and stop on an HTTP error rather than scraping an error page.
response = requests.get('https://jayapal.house.gov', timeout=30)
response.raise_for_status()
html = response.text
soup = BeautifulSoup(html, 'html5lib')

# Use a set because the links might appear multiple times.
# Anchors without an href are skipped so that a['href'] cannot raise KeyError.
links = {a['href']
         for a in soup('a')
         if a.has_attr('href') and 'press releases' in a.text.lower()}

print(links)  # the link targets whose anchor text mentions "press releases"

{'https://jayapal.house.gov/category/news/', 'https://jayapal.house.gov/category/press-releases/'}


### 2.4  모든 의원의 홈페이지에서 PR 링크 찾기

In [14]:
from typing import Dict, Set

# Maps each representative's site URL to the set of press-release link
# targets found on its home page (an empty set when none were found).
press_releases: Dict[str, Set[str]] = {}

for house_url in good_urls:
    try:
        # A timeout keeps one unresponsive site from stalling the whole crawl.
        html = requests.get(house_url, timeout=30).text
    except requests.RequestException as exc:
        # Report the failure and move on instead of aborting the entire loop.
        print(f"{house_url}: request failed ({exc})")
        press_releases[house_url] = set()
        continue

    soup = BeautifulSoup(html, 'html5lib')
    # Anchors without an href are skipped so that a['href'] cannot raise KeyError.
    pr_links = {a['href']
                for a in soup('a')
                if a.has_attr('href') and 'press releases' in a.text.lower()}
    print(f"{house_url}: {pr_links}")
    press_releases[house_url] = pr_links

https://kilmer.house.gov: {'https://kilmer.house.gov/news/press-releases'}
https://crenshaw.house.gov/: {'/press-releases'}
https://arrington.house.gov: set()
https://eshoo.house.gov/: {'/media/press-releases'}
https://biggs.house.gov: {'/media/press-releases'}
https://loudermilk.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://morelle.house.gov: {'/media/press-releases'}
https://sarajacobs.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://cartwright.house.gov: {'/news/documentquery.aspx?DocumentTypeID=2442'}
https://comer.house.gov/: {'/press-release'}
https://panetta.house.gov: {'/media/press-releases'}
https://trahan.house.gov: set()
https://budd.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://mccollum.house.gov: {'/media/press-releases'}
https://napolitano.house.gov/: {'/media/press-releases'}
https://franklin.house.gov: {'/media/press-releases'}
https://stevens.house.gov/: {'/media/press-releases'}
https://cline.house.gov: {'/medi

https://sablan.house.gov/: set()
https://gibbs.house.gov/: {'/media-center/press-releases'}
https://mast.house.gov: {'/press-releases'}
https://moulton.house.gov/: set()
https://castro.house.gov: {'https://castro.house.gov/media-center/press-releases'}
https://schakowsky.house.gov: {'/media/press-releases'}
https://delgado.house.gov: {'/media/press-releases'}
https://bluntrochester.house.gov: set()
https://kustoff.house.gov: {'/media/press-releases'}
https://johnjoyce.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://pascrell.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://stanton.house.gov/: {'/media/press-releases'}
https://harshbarger.house.gov: {'/media/press-releases'}
https://bush.house.gov: {'/media/press-releases'}
https://scalise.house.gov/: {'/media/press-releases'}
https://larsen.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://hinson.house.gov: {'/media/press-releases'}
https://dean.house.gov: {'/press-releases'}
https://

https://keller.house.gov: {'/media/press-releases'}
https://carter.house.gov/: {'/news/press-releases'}
https://langevin.house.gov: {'/press-releases'}
https://scanlon.house.gov/: set()
https://escobar.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://schweikert.house.gov/: {'/media-center/press-releases'}
https://demings.house.gov: {'/media/press-releases'}
https://takano.house.gov: {'https://takano.house.gov/newsroom/press-releases'}
https://zeldin.house.gov/: {'/media-center/press-releases'}
https://bilirakis.house.gov/: {'/media/press-releases'}
https://rutherford.house.gov: {'/media/press-releases'}
https://meijer.house.gov: {'/media/press-releases'}
https://donyoung.house.gov/: {'/News/'}
https://gonzales.house.gov: {'/media/press-releases'}
https://gooden.house.gov: {'/media/press-releases'}
https://thompson.house.gov: {'/media-center/press-releases'}
https://smucker.house.gov/: {'/media/press-releases'}
https://billjohnson.house.gov/: {'/News/DocumentQuery.aspx?

https://massie.house.gov: set()
https://soto.house.gov: {'/media/press-releases'}
https://mullin.house.gov: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://spartz.house.gov: {'/media/press-releases'}
https://kaptur.house.gov/: {'/media-center/press-releases'}
https://kirkpatrick.house.gov/: set()
https://mcbath.house.gov: {'/press-releases'}
https://cammack.house.gov: {'/media/press-releases'}
https://gallagher.house.gov: {'/media/press-releases'}
https://hill.house.gov/: {'/news/documentquery.aspx?DocumentTypeID=27'}
https://jacobs.house.gov: {'/media/press-releases'}
https://hayes.house.gov: {'/media/press-releases'}
https://guest.house.gov: {'/media/press-releases'}
https://joyce.house.gov: {'/press-releases'}
https://bentz.house.gov: {'/media/press-releases'}
https://lawrence.house.gov/: {'/media-center/press-releases'}
https://welch.house.gov/: {'/media-center/press-releases'}
https://pappas.house.gov: {'/media/press-releases'}
https://vargas.house.gov: {'/media-center/pres

### 2.5  Paragraph에 keyword가 존재하는지 확인

In [15]:
def paragraph_mentions(text: str, keyword: str) -> bool:
    """
    Report whether any <p> element in the HTML `text` contains `keyword`.

    The comparison is case-insensitive; text outside <p> elements is ignored.
    """
    needle = keyword.lower()
    document = BeautifulSoup(text, 'html5lib')

    for p_tag in document('p'):
        if needle in p_tag.get_text().lower():
            return True
    return False

In [16]:
# "Twitter" sits inside the <p>; "Facebook" appears only in the <h1>.
text = """<body><h1>Facebook</h1><p>Twitter</p>"""

found_in_paragraph = paragraph_mentions(text, "twitter")
found_outside_paragraph = paragraph_mentions(text, "facebook")

assert found_in_paragraph
assert not found_outside_paragraph

### 2.6  어떤 의원의 보도자료에 data를 언급했는지 찾기

In [17]:
from urllib.parse import urljoin

# For each representative, print the site URL as soon as one of its
# press-release pages mentions "data" inside a paragraph.
for house_url, pr_links in press_releases.items():
    for pr_link in pr_links:
        # pr_link may be site-relative ('/media/press-releases') or already a
        # full URL; urljoin resolves both against house_url and avoids the
        # doubled slash that plain concatenation gives when house_url ends in '/'.
        url = urljoin(house_url, pr_link)
        try:
            # A timeout keeps one unresponsive page from stalling the scan.
            text = requests.get(url, timeout=30).text
        except requests.RequestException as exc:
            # Report the failure and try this representative's next link.
            print(f"{url}: request failed ({exc})")
            continue

        if paragraph_mentions(text, 'data'):
            print(f"{house_url}")
            break  # done with this house_url

https://panetta.house.gov
https://banks.house.gov
https://reed.house.gov/
https://gibbs.house.gov/
https://johnjoyce.house.gov/
https://bush.house.gov
https://allred.house.gov/
https://lieu.house.gov/
https://davidscott.house.gov/
https://mcclain.house.gov
https://higgins.house.gov
https://donyoung.house.gov/
https://anthonygonzalez.house.gov
https://gwenmoore.house.gov
https://kelly.house.gov
https://gomez.house.gov/
https://ohalleran.house.gov
https://boebert.house.gov


## 3. GitHub API 사용하기

In [18]:
!pip install python-dateutil



### 3.1 JSON 객체 파싱

In [19]:
import requests, json

github_user = "joelgrus"
endpoint = f"https://api.github.com/users/{github_user}/repos"

# Stop on an HTTP error (for example when the API refuses the request):
# otherwise the error payload would be parsed and stored as `repos`, and the
# cells that expect a list of repository records would fail confusingly.
response = requests.get(endpoint, timeout=30)
response.raise_for_status()
repos = json.loads(response.text)

### 3.2 월별/요일별 저장소 생성 통계

In [20]:
from collections import Counter
from dateutil.parser import parse

# Parse each repository's creation timestamp, then tally by month and weekday.
dates = []
for repo in repos:
    dates.append(parse(repo["created_at"]))

month_counts = Counter([created.month for created in dates])
weekday_counts = Counter([created.weekday() for created in dates])

### 3.3 가장 최근에 푸시된 저장소 5개에 사용된 언어

In [21]:
# Order the repositories by their "pushed_at" value, largest first,
# and keep the first five.
repos_by_push_desc = sorted(repos, key=lambda r: r["pushed_at"], reverse=True)
last_5_repositories = repos_by_push_desc[:5]

# The "language" field of each of those five repositories.
last_5_languages = []
for repo in last_5_repositories:
    last_5_languages.append(repo["language"])

In [22]:
# Show the five selected repository records in full (the output is long).
print(last_5_repositories)

[{'id': 26382146, 'node_id': 'MDEwOlJlcG9zaXRvcnkyNjM4MjE0Ng==', 'name': 'data-science-from-scratch', 'full_name': 'joelgrus/data-science-from-scratch', 'private': False, 'owner': {'login': 'joelgrus', 'id': 1308313, 'node_id': 'MDQ6VXNlcjEzMDgzMTM=', 'avatar_url': 'https://avatars.githubusercontent.com/u/1308313?v=4', 'gravatar_id': '', 'url': 'https://api.github.com/users/joelgrus', 'html_url': 'https://github.com/joelgrus', 'followers_url': 'https://api.github.com/users/joelgrus/followers', 'following_url': 'https://api.github.com/users/joelgrus/following{/other_user}', 'gists_url': 'https://api.github.com/users/joelgrus/gists{/gist_id}', 'starred_url': 'https://api.github.com/users/joelgrus/starred{/owner}{/repo}', 'subscriptions_url': 'https://api.github.com/users/joelgrus/subscriptions', 'organizations_url': 'https://api.github.com/users/joelgrus/orgs', 'repos_url': 'https://api.github.com/users/joelgrus/repos', 'events_url': 'https://api.github.com/users/joelgrus/events{/privacy

In [23]:
# Show the "language" field of each of the five selected repositories.
print(last_5_languages)

['Python', 'JavaScript', 'Python', 'Python', 'Python']


## 4. 트위터 API 사용하기

In [24]:
!pip install twython

Collecting twython
  Using cached twython-3.8.2-py3-none-any.whl (33 kB)
Collecting requests-oauthlib>=0.4.0
  Using cached requests_oauthlib-1.3.0-py2.py3-none-any.whl (23 kB)
Collecting oauthlib>=3.0.0
  Using cached oauthlib-3.1.0-py2.py3-none-any.whl (147 kB)
Installing collected packages: oauthlib, requests-oauthlib, twython
Successfully installed oauthlib-3.1.0 requests-oauthlib-1.3.0 twython-3.8.2


### 4.1 API Key와 Secret Key

In [25]:
import os

# Feel free to plug your key and secret in directly
CONSUMER_KEY = os.environ.get("TWITTER_CONSUMER_KEY")
CONSUMER_SECRET = os.environ.get("TWITTER_CONSUMER_SECRET")

### 4.2 클라이언트 인스턴스 만들기

In [26]:
import webbrowser
from twython import Twython

# Get a temporary client to retrieve an authentication url
temp_client = Twython(CONSUMER_KEY, CONSUMER_SECRET)
temp_creds = temp_client.get_authentication_tokens()
url = temp_creds['auth_url']

# Now visit that URL to authorize the application and get a PIN
print(f"go visit {url} and get the PIN code and paste it below")
webbrowser.open(url)
PIN_CODE = input("please enter the PIN code: ")

# Now we use that PIN_CODE to get the actual tokens
auth_client = Twython(CONSUMER_KEY,
                        CONSUMER_SECRET,
                        temp_creds['oauth_token'],
                        temp_creds['oauth_token_secret'])
final_step = auth_client.get_authorized_tokens(PIN_CODE)
ACCESS_TOKEN = final_step['oauth_token']
ACCESS_TOKEN_SECRET = final_step['oauth_token_secret']

# And get a new Twython instance using them.
twitter = Twython(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

go visit https://api.twitter.com/oauth/authenticate?oauth_token=387PgAAAAAABNC5QAAABecK_9Do and get the PIN code and paste it below
please enter the PIN code: 6600070


### 4.3 몇몇 트윗 받기

In [27]:
# Search for the quoted phrase "data science" and print each result's
# author screen name followed by the tweet text.
search_results = twitter.search(q='"data science"')
for status in search_results["statuses"]:
    user, text = status["user"]["screen_name"], status["text"]
    print(f"{user}: {text}\n")

Topschoolworkh1: Term papers and classes help

Math
Accounting
Microeconomics
Business
CALC,,
Political science
Health
Thesis,,
Law… https://t.co/1dkXfHZjuv

Topschoolworkh1: RT @Topschoolworkh1: Term papers and classes help

Math
Accounting
Microeconomics
Business
CALC,,
Political science
Health
Thesis
Sociology…

Christo48746457: RT @freeCodeCamp: If you want to practice your machine learning skills, try building an end-to-end ML project.

It's especially fun with a…

BotForEquality: RT @OsoroSatComs: Kindly dm if you’re/know any #lady who can tutor data science/machine learning in Nairobi sometimes in late June/ Early J…

OsoroSatComs: Kindly dm if you’re/know any #lady who can tutor data science/machine learning in Nairobi sometimes in late June/ E… https://t.co/3dHC0QEtnj

Deep__AI: Level up your data science vocabulary: Beta Distribution https://t.co/LGNDdWdZRv #Probability #BetaDistribution

LatinoLdnOnt: Data Science Is a High-Paying, Fast-Growing Field—Here’s What You Need to 

### 4.4 스트림으로 대량의 트윗 받기

In [28]:
from twython import TwythonStreamer

# Appending data to a global variable is pretty poor form
# but it makes the example much simpler
tweets = []

class MyStreamer(TwythonStreamer):
    """
    Streamer that stores English-language tweets in the module-level
    `tweets` list and disconnects once that list holds at least 100 items,
    or as soon as an error is reported.
    """

    def on_success(self, data):
        """
        Handle data sent by Twitter.
        `data` is a Python dict representing a tweet.
        """
        # Only English-language tweets are collected.
        is_english = data.get('lang') == 'en'
        if is_english:
            tweets.append(data)
            print(f"received tweet #{len(tweets)}")

        # Checked on every message, whether or not it was stored.
        collected_enough = len(tweets) >= 100
        if collected_enough:
            self.disconnect()

    def on_error(self, status_code, data):
        """Print the reported status code and data, then disconnect."""
        print(status_code, data)
        self.disconnect()

### 4.5 Data가 포함된 트윗 다운로드

In [29]:
# Build the streamer from the consumer credentials and the user's access tokens.
stream = MyStreamer(CONSUMER_KEY, CONSUMER_SECRET, ACCESS_TOKEN, ACCESS_TOKEN_SECRET)

# Start consuming public statuses that contain the keyword 'data'.
# (To consume a sample of *all* public statuses instead, one would call
# stream.statuses.sample() here.)
stream.statuses.filter(track='data')

received tweet #1
received tweet #2
received tweet #3
received tweet #4
received tweet #5
received tweet #6
received tweet #7
received tweet #8
received tweet #9
received tweet #10
received tweet #11
received tweet #12
received tweet #13
received tweet #14
received tweet #15
received tweet #16
received tweet #17
received tweet #18
received tweet #19
received tweet #20
received tweet #21
received tweet #22
received tweet #23
received tweet #24
received tweet #25
received tweet #26
received tweet #27
received tweet #28
received tweet #29
received tweet #30
received tweet #31
received tweet #32
received tweet #33
received tweet #34
received tweet #35
received tweet #36
received tweet #37
received tweet #38
received tweet #39
received tweet #40
received tweet #41
received tweet #42
received tweet #43
received tweet #44
received tweet #45
received tweet #46
received tweet #47
received tweet #48
received tweet #49
received tweet #50
received tweet #51
received tweet #52
received tweet #53
re

In [30]:
# Show the first collected tweet in full to see what a raw tweet dict contains.
print(tweets[0])

{'created_at': 'Mon May 31 14:07:19 +0000 2021', 'id': 1399366886102536193, 'id_str': '1399366886102536193', 'text': '@EmperorBTC 7 years working on projects related to startups, specializing in competitor analysis, data management,… https://t.co/4wjmTJKhg9', 'display_text_range': [12, 140], 'source': '<a href="https://mobile.twitter.com" rel="nofollow">Twitter Web App</a>', 'truncated': True, 'in_reply_to_status_id': 1399365824641568768, 'in_reply_to_status_id_str': '1399365824641568768', 'in_reply_to_user_id': 183951857, 'in_reply_to_user_id_str': '183951857', 'in_reply_to_screen_name': 'EmperorBTC', 'user': {'id': 1325623166, 'id_str': '1325623166', 'name': 'Fox', 'screen_name': '9Jzs', 'location': 'Barcelona', 'url': None, 'description': None, 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 125, 'friends_count': 344, 'listed_count': 1, 'favourites_count': 6871, 'statuses_count': 8379, 'created_at': 'Wed Apr 03 22:59:04 +0000 2013', 'utc_offset':

### 4.6 가장 많이 나오는 해시태그 찾기

In [31]:
from collections import Counter

# Count the lower-cased hashtag texts across all collected tweets.
top_hashtags = Counter()
for tweet in tweets:
    tags_in_tweet = tweet['entities']['hashtags']
    top_hashtags.update(hashtag['text'].lower() for hashtag in tags_in_tweet)

print(top_hashtags.most_common(5))

[('ai', 8), ('besmartlikebts', 4), ('judicialdeeppockets', 1), ('pot', 1), ('fbo', 1)]
