# Web Scraping with Python
> GDSC NTNU<br>
> 講者：Hugo

In [None]:
import requests as rq
from bs4 import BeautifulSoup as soup
from selenium import webdriver
import time

## Requests
### 發送 GET 請求

In [None]:
# Issue a plain GET request and report the HTTP status code of the reply.
# (`r.content` is another attribute worth printing here.)
target = "https://www.google.com"
r = rq.get(target)
print(r.status_code)

In [None]:
# Fetch the page and print its body only when the status code equals
# `rq.codes.ok`.
r = rq.get("https://www.google.com")
succeeded = r.status_code == rq.codes.ok
if succeeded:
    print(r.text)

In [None]:
# Send a GET request whose query-string parameters are supplied through
# the `params` argument, then print the response body.
payload = dict(key1='value1', key2='value2')
endpoint = "http://httpbin.org/get"
html = rq.get(endpoint, params=payload)
print(html.text)

### 發送 POST 請求

In [None]:
# Send a POST request carrying the dictionary as the `data` argument,
# then print the response body.
payload = dict(key1='value1', key2='value2')
endpoint = "http://httpbin.org/post"
r = rq.post(endpoint, data=payload)
print(r.text)

### Lab01: 刷點閱次數 OuO

In [None]:
# Lab01: request the same page a user-chosen number of times, pausing
# between requests.
PAUSE_SECONDS = 2

url = 'https://www.aa.ntnu.edu.tw/zh_tw/News/%E6%AD%A1%E8%BF%8E%E5%8F%83%E5%8A%A011%E6%9C%8816%E6%97%A5%E8%87%AA%E4%B8%BB%E5%AD%B8%E7%BF%92%E5%B0%88%E9%A1%8C%E6%8E%A2%E7%A9%B6%E7%B6%93%E9%A9%97%E5%88%86%E4%BA%AB%E6%9C%83-81001976'

# The prompt asks how many requests to send; non-numeric input raises ValueError.
times = int(input('我要新增點閱次數？次'))

for _ in range(times):
    rq.get(url)
    time.sleep(PAUSE_SECONDS)
print('Done!')

In [None]:
# Download an image with a streamed GET request and save it to `path`.
from pathlib import Path

url = "https://www.w3schools.com/images/w3lynx_200.png"
path = "./Ch05/fchart05.png"

# Using the response as a context manager releases the streamed
# connection even when the body is never read (the non-200 branch).
with rq.get(url, stream=True) as response:
    if response.status_code == 200:
        # open() does not create missing directories, so make sure the
        # target folder exists before writing.
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        with open(path, 'wb') as fp:
            # Iterating the response object directly yields 128-byte
            # pieces; ask for larger chunks to avoid many tiny writes.
            for chunk in response.iter_content(chunk_size=8192):
                fp.write(chunk)
        print("圖檔已經下載")
    else:
        print("錯誤! HTTP請求失敗...")

## Beautiful Soup

常用屬性

In [None]:
# Fetch an article, parse it with the lxml parser, and show a few
# attribute-style lookups on the parsed document.
url = 'https://www.nationalgeographic.com/premium/article/remove-carbon-emissions'
html = rq.get(url)
html.encoding = 'UTF-8'  # set the encoding used when decoding `html.text`
page_text = html.text
sp = soup(page_text, 'lxml')

title_tag = sp.title
print(title_tag)        # the <title> tag itself
print(title_tag.text)   # only the text inside <title>
print(sp.h1)
print(sp.p)

`find()` & `find_all()`

In [None]:
# Demonstrate `find()` (one match) and `find_all()` (every match) on a
# small inline document.
html = '''
<html>
    <head>
        <meta charset="UTF-8">
        <title> Website title </title>
    </head>
    <body>
        <p id="p1"> Section 1 </p>
        <p id="p2" class='red'> Section 2 </p>
    </body>
</html>
'''
sp = soup(html, 'lxml')

first_paragraph = sp.find('p')
print(first_paragraph)

all_paragraphs = sp.find_all('p')
print(all_paragraphs)

# Two equivalent ways to filter by attributes: an attrs dictionary, or
# keyword arguments (`class_` because `class` is a Python keyword).
wanted_attrs = {'id': 'p2', 'class': 'red'}
print(sp.find('p', attrs=wanted_attrs))
print(sp.find('p', id='p2', class_='red'))

`select()`

In [None]:
# Demonstrate `select()` with CSS selectors on a small inline document.
html = '''
<html>
    <head>
        <title> Website title </title>
    </head>
    <body>
        <p id="p1"> Section 1 </p>
        <p id="p2" class='red'> Section 2 </p>
    </body>
</html>
'''
sp = soup(html, 'lxml')

# Tag name, tag name, id shortcut (#), class shortcut (.)
selectors = ('title', 'p', '#p2', '.red')
for css in selectors:
    print(sp.select(css))

In [None]:
# Read tag attributes two ways: `.get(name)` and dictionary-style
# indexing `tag[name]`.
html = '''
<html>
    <head>
        <title> Website title </title>
    </head>
    <body>
        <img src="https://upload.wikimedia.org/wikipedia/zh/thumb/c/c3/National_Taiwan_Normal_University_logo.svg/300px-National_Taiwan_Normal_University_logo.svg.png">
        <a href="https://www.ntnu.edu.tw/"> NTNU(click me) </a>
    </body>
</html>
'''
sp = soup(html, 'lxml')

first_img = sp.select('img')[0]
print(first_img.get('src'))
first_link = sp.select('a')[0]
print(first_link.get('href'))

# Same attributes again, this time with indexing syntax.
print(sp.select('img')[0]['src'])
print(sp.select('a')[0]['href'])

In [None]:
# A combined tour of attribute access, find/find_all, and CSS selectors
# on one inline document.
html = """
<html>
    <head>
        <title> Website title </title>
    </head>
    <h1> h1 title </h1>
    <div class="content">
    <div class="item1">
        <a href="http://example.com/one" class="red" id="link1"> First </a>
        <a href="http://example.com/two" class="red" id="link2"> Second </a>
    </div>
    <a href="http://example.com/three" class="blue" id="link3">
        <img src="http://example.com/three.jpg">Third
    </a>
</div>
"""

sp = soup(html, 'lxml')

# Attribute-style access and find() each give a single tag.
print(sp.title)
print(sp.find('h1'))

# Every anchor, then only the anchors whose class is "red".
every_anchor = sp.find_all('a')
print(every_anchor)
red_anchors = sp.find_all("a", attrs={"class": "red"})
print(red_anchors)

# Look an anchor up by its href and print its text (the "First" link).
data1 = sp.find("a", attrs={"href": "http://example.com/one"})
print(data1.text)

# select() returns a list, so take element 0 before reading from it.
data2 = sp.select("#link1")
link1 = data2[0]
print(link1.text)
print(link1.get("href"))
print(link1["href"])

# A list of tag names matches any of them.
print(sp.find_all(['title', 'h1']))

# Descendant selector: an <img> anywhere inside a <div>.
print(sp.select('div img')[0]['src'])

# Child selector: <a> tags directly under div.item1.
print(sp.select('div.item1 > a'))

### Lab02: 獲取新聞文本

In [None]:
# Lab02: download a news page and print its title, summary paragraph,
# sub-headings, and every paragraph on the page.
url = 'https://news.pts.org.tw/curation/120'

html = rq.get(url)
sp = soup(html.text, 'lxml')

# Title: the <h1> carrying class "text-center".
title_tag = sp.find('h1', class_='text-center')
title = title_tag.text
print('# 標題\n', title)

# Summary: first <p> inside the summary <div>.
summary_box = sp.find('div', class_='topic-summery position-relative')
summary = summary_box.find('p').text
print('\n## 引言\n', summary)

# Sub-headings: every <h2> with class "title-col".
subtitle = sp.find_all('h2', class_='title-col')
print('\n## 副標題')
for heading in subtitle:
    print(heading.text)

# Body: every <p> in the document.
content = sp.find_all('p')
print('\n## 內文')
for paragraph in content:
    print(paragraph.text)

## Selenium

### Lab03: 爬取 MLB.com stats

In [None]:
# See the companion notebook: mlb_stats_crawler.ipynb

## It's your turn🫵
### 如何快速存取表格資料

In [None]:
# Load a page in headless Chrome, grab one rendered table as HTML, and
# save it to CSV through pandas.
from io import StringIO

import pandas as pd
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

chrome_options = Options()
chrome_options.add_argument('--headless')

url = 'https://www.aa.ntnu.edu.tw/zh_tw/News?category%5B%5D=6331b9713817848ea26d27b5&category%5B%5D=6331b9713817848ea26d27b6&category%5B%5D=6331b9713817848ea26d27b7&category%5B%5D=6331b9713817848ea26d27b8&category%5B%5D=6331ed7b381784be12bdd622&tags%5B%5D=all'

# Selenium 4 takes the chromedriver location through a Service object;
# passing the path positionally to webdriver.Chrome no longer works in
# recent releases.
service = Service('/programing/swiftx/chromedriver-win64/chromedriver.exe')
driver = webdriver.Chrome(service=service, options=chrome_options)
try:
    # driver.get() returns None, so its result is not kept.
    driver.get(url)
    sp = soup(driver.page_source, 'lxml')

    table = driver.find_element(By.XPATH, '//*[@id="DataTables_Table_0"]').get_attribute('outerHTML')
finally:
    # Always shut the browser down so no headless Chrome process lingers.
    driver.quit()

# read_html expects a file-like object; passing a raw HTML string is
# deprecated in recent pandas versions.
df = pd.read_html(StringIO(table))
df[0].to_csv("bulletin board.csv", index=False)

### 爬取 NBA.com stats

In [None]:
# your code