## Webページ内のデータを自動抽出
## （Beautiful Soup編）
- Webページの情報を取得
- Beautiful Soupで構造解析
- Beautiful Soupでデータ取得

In [None]:
!pip install requests

In [None]:
!pip install BeautifulSoup4

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Download the practice page that the following cells parse.
url = 'https://scraping-for-beginner.herokuapp.com/udemy'
# requests applies no timeout by default; cap the wait (10 s is an arbitrary
# but generous limit) so a stalled server cannot hang the kernel forever.
res = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
res.raise_for_status()

In [None]:
# Build a parse tree from the response body with Python's built-in HTML parser.
soup = BeautifulSoup(markup=res.text, features='html.parser')

In [None]:
# Render the parsed document as an indented string and print it for inspection.
pretty_html = soup.prettify()
print(pretty_html)

### find_all():「要素名」に続いて「属性」を引数で指定

In [None]:
# Collect every <p class="subscribers"> element on the page ...
subscribers = soup.find_all(name='p', class_='subscribers')
# ... and show the text content of the first match.
subscribers[0].get_text()

In [None]:
# Collect every <p class="reviews"> element and show the text of the first one.
reviews = soup.find_all(name='p', class_='reviews')
reviews[0].get_text()

### select_one() / select(): 「CSSセレクタ」で指定

In [None]:
# Same lookup via a CSS selector: first element carrying the "subscribers" class.
soup.select_one(selector='.subscribers').get_text()

In [None]:
# Same lookup via a CSS selector: first element carrying the "reviews" class.
soup.select_one(selector='.reviews').get_text()

## ランキングサイト掲載情報を自動取得
- Webページの情報を取得
- １つの観光地情報を取得
- すべての観光地情報を取得
- csvファイルに出力

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Download the ranking page that the following cells parse.
url='https://scraping-for-beginner.herokuapp.com/ranking/'
# requests applies no timeout by default; cap the wait (10 s is an arbitrary
# but generous limit) so a stalled server cannot hang the kernel forever.
res = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
res.raise_for_status()

In [None]:
# Parse the ranking page's HTML into a searchable tree.
soup = BeautifulSoup(markup=res.text, features='html.parser')

## １つの観光地情報を取得する

In [None]:
# Every <div class="u_areaListRankingBox"> is collected; the cells below treat
# each of these elements as one spot.
spots = soup.find_all(name='div', class_='u_areaListRankingBox')

In [None]:
# Work on the first spot only while figuring out the extraction steps.
spot = spots[0]
# The title <div>; at this point spot_name is still a Tag, not a string.
spot_name = spot.find(name='div', class_='u_title')

In [None]:
# Detach the badge <span> from the title so that only the name text remains.
# Look the badge up first: after one run it is gone from the tree, and calling
# .extract() on the None that find() then returns would raise AttributeError.
badge = spot_name.find('span',attrs={'class':'badge'})
if badge is not None:
    badge.extract()
# Display the removed tag (the same object extract() returns), if there was one.
badge

In [None]:
# Take the title's text and drop the newlines around it.
# NOTE: spot_name is rebound here from the Tag to a plain string.
raw_title = spot_name.get_text()
spot_name = raw_title.replace('\n', '')
spot_name

In [None]:
# Locate the <div class="u_rankBox"> and convert its text, newlines removed,
# to a float.
spot_rank = spot.find(name='div', class_='u_rankBox')
rank_text = spot_rank.get_text().replace('\n', '')
eval_num = float(rank_text)
eval_num

In [None]:
# Container <div class="u_categoryTipsItem"> that holds the per-category entries.
categoryItems = spot.find(name='div', class_='u_categoryTipsItem')

In [None]:
# Narrow the container down to its <dl> entries.
# NOTE: categoryItems is rebound from the container Tag to the list of <dl> tags.
categoryItems = categoryItems.find_all(name='dl')
categoryItems

In [None]:
# Inspect one entry (index 3): print the text of its <dt>, then of its <span>.
categoryItem = categoryItems[3]
for part in (categoryItem.dt, categoryItem.span):
    print(part.get_text())

In [None]:
# Map each entry's <dt> text to the float value parsed from its <span> text.
details = {
    item.dt.get_text(): float(item.span.get_text())
    for item in categoryItems
}
details

In [None]:
# Assemble the record for this spot.
# NOTE: datum is the same dict object as details (no copy is made), so the
# two keys added here also appear in details.
datum = details
datum.update({'観光地名': spot_name, '評価': eval_num})

In [None]:
# Display the assembled record for the first spot.
datum

In [None]:
# Re-parse the page before looping: an earlier cell calls extract() on the
# first spot's badge, which modifies the previously parsed tree in place.
soup = BeautifulSoup(res.text,'html.parser')
spots = soup.find_all('div',attrs={'class':'u_areaListRankingBox'})

# One dict per spot. The variable name is kept as-is because the DataFrame
# cell further down reads it.
tourlistSpots=[]
for spot in spots:
    datum = {}

    # Name: remove the badge <span> nested in the title, then strip newlines.
    spot_name = spot.find('div',attrs={'class':'u_title'})
    badge = spot_name.find('span',attrs={'class':'badge'})
    if badge is not None:  # a title without a badge should not abort the loop
        badge.extract()
    spot_name = spot_name.text.replace('\n','')
    datum['観光地名']=spot_name

    # Overall rating: text of the rank box, newlines removed, parsed as float.
    spot_rank = spot.find('div',attrs={'class':'u_rankBox'})
    eval_num = float(spot_rank.text.replace('\n',''))
    datum['評価']=eval_num

    # Per-category values: each <dl> contributes its <dt> text as the key and
    # its <span> text, parsed as float, as the value.
    categoryItems = spot.find('div',attrs={'class':'u_categoryTipsItem'})
    categoryItems = categoryItems.find_all('dl')
    for categoryItem in categoryItems:
        datum[categoryItem.dt.text]=float(categoryItem.span.text)

    tourlistSpots.append(datum)
tourlistSpots

In [None]:
import pandas as pd

# Turn the list of per-spot dicts into a table: one row per spot, one column
# per dict key.
df = pd.DataFrame(data=tourlistSpots)
df

In [None]:
# Write the table to a CSV file in the working directory; index=False leaves
# out the DataFrame's row index column.
df.to_csv(path_or_buf='観光地情報.csv', index=False)

## Webページ上の画像を自動で取得
- １枚の画像を取得＆保存
- 複数の画像を取得＆保存

In [101]:
from bs4 import BeautifulSoup
import requests

In [104]:
# Download the page whose <img> tags the following cells read.
url = 'https://scraping-for-beginner.herokuapp.com/image'
# requests applies no timeout by default; cap the wait (10 s is an arbitrary
# but generous limit) so a stalled server cannot hang the kernel forever.
res = requests.get(url, timeout=10)
# Fail loudly on 4xx/5xx instead of parsing an error page as if it were data.
res.raise_for_status()
res

<Response [200]>

In [108]:
# Parse the image page's HTML into a searchable tree.
soup = BeautifulSoup(markup=res.text, features='html.parser')

In [111]:
# find() returns only the first <img> tag in the document.
img_tag = soup.find(name='img')
img_tag

<img class="materialbox responsive-img card" src="/static/assets/img/img1.JPG"/>

In [112]:
# Read the tag's src attribute; as the output shows, it is a root-relative path.
img_tag.attrs['src']

'/static/assets/img/img1.JPG'

In [114]:
# The src is root-relative, so prefix it with the site root (no trailing
# slash) to obtain an absolute URL.
root_url = 'https://scraping-for-beginner.herokuapp.com'
img_url = f"{root_url}{img_tag['src']}"

In [115]:
from PIL import Image
import io

In [118]:
import os

# Download the image bytes. Cap the wait and reject HTTP errors so PIL is
# never handed an error page instead of image data.
img_res = requests.get(img_url, timeout=10)
img_res.raise_for_status()
img = Image.open(io.BytesIO(img_res.content))

# Image.save() does not create missing directories, so make sure img/ exists.
os.makedirs('img', exist_ok=True)
img.save('img/sample.jpg')

In [124]:
from PIL import Image
import io
import os
from urllib.parse import urljoin

root_url = 'https://scraping-for-beginner.herokuapp.com'
soup = BeautifulSoup(res.text,'html.parser')
img_tags = soup.find_all('img')

# Image.save() does not create missing directories, so make sure img/ exists
# once before the loop.
os.makedirs('img', exist_ok=True)

for i,img_tag in enumerate(img_tags):
    # urljoin yields the same URL as plain concatenation for root-relative
    # paths, and also copes with relative or already-absolute src values.
    img_url = urljoin(root_url, img_tag['src'])

    # Cap the wait and reject HTTP errors so PIL is never handed an error page.
    img_res = requests.get(img_url, timeout=10)
    img_res.raise_for_status()

    img = Image.open(io.BytesIO(img_res.content))
    # Files are numbered from 1: sample1.jpg, sample2.jpg, ...
    img.save(f'img/sample{i+1}.jpg')