#### BeautifulSoup 객체 함수들
|함수|의미|
|---|----|
|soup.prettify() | 들여쓰기 상태 보기
|list(soup.children) | 한 단계 아래 더 보기
|list(soup.children)[2] | 한 단계 아래 항목 중 세 번째 요소 보기 (children은 이터레이터이므로 list로 바꾼 뒤 인덱싱)
|soup.body | body 태그 보기
|soup.find_all('p') | 모든 p 태그 찾기
|soup.find_all('p', class_='outer-text') | class가 outer-text인 p 태그만 찾기
|soup.head.next_sibling | head와 같은 단계에서 바로 다음에 오는 노드 보기

In [77]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

In [78]:
# Fetch the Naver Finance market-index page and parse it.
url = 'https://finance.naver.com/marketindex/'
# The response is only needed while BeautifulSoup reads it, so close the
# connection as soon as parsing is done instead of leaving it open.
with urlopen(url) as page:
    soup = BeautifulSoup(page, 'html.parser')

In [79]:
# Text of the first <span class="value"> element on the page.
soup.find_all('span', class_='value')[0].string

'1,230.50'

---

In [80]:
from urllib.request import Request
import ssl

# NOTE(review): ssl._create_unverified_context() is a private helper that
# disables TLS certificate verification. It is kept here so the request keeps
# working on machines without a usable CA store; prefer
# ssl.create_default_context() wherever certificate validation succeeds.
context = ssl._create_unverified_context()


url_base = "https://www.chicagomag.com"
url_sub = "/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/"
url = url_base + url_sub

# A browser-like User-Agent header is sent with the request.
req = Request(url, headers={'User-Agent': 'Mozilla/5.0'})
# Close the HTTP response once the document has been parsed.
with urlopen(req, context=context) as html:
    soup = BeautifulSoup(html, "html.parser")

In [81]:
# Number of <div class="sammy"> entries found on the ranking page.
len(soup.find_all('div', class_='sammy'))

50

In [82]:
# Show the raw markup of the first ranking entry.
print(soup.find_all('div', class_='sammy')[0])

<div class="sammy" style="position: relative;">
<div class="sammyRank">1</div>
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br/>
Old Oak Tap<br/>
<em>Read more</em> </a></div>
</div>


In [83]:
# Each element returned by find_all is a bs4 Tag, so it can be searched again.
type(soup.find_all('div', class_='sammy')[0])

bs4.element.Tag

In [84]:
# Nested element that holds the rank inside the first entry.
soup.find_all('div', class_='sammy')[0].find(class_='sammyRank')

<div class="sammyRank">1</div>

In [85]:
# Rank of the first entry as plain text.
soup.find_all('div', class_='sammy')[0].find(class_='sammyRank').get_text()

'1'

In [86]:
# Keep the first ranking entry around for the following exploration cells.
tmp_one = soup.find_all('div', class_='sammy')[0]
# Relative link to the detail page of that entry (first <a> inside it).
tmp_one.a['href']

'/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

In [87]:
import re

tmp_string = tmp_one.find(class_='sammyListing').get_text()

# Split the listing text into its lines (menu, cafe, "Read more").
# '\r\n' has to be listed before '\n': regex alternation takes the first
# branch that matches, so with '\n' first a CRLF break would only be cut at
# the '\n' and leave a stray '\r' on the preceding piece.
print(re.split(r'\r\n|\n', tmp_string))

['BLT', 'Old Oak Tap', 'Read more ']


In [88]:
import urllib.parse
# Parallel lists, one element per ranking entry.
rank, main_menu, cafe_name, url_add = [], [], [], []

list_soup = soup.find_all('div', 'sammy')

for item in list_soup:
    rank.append(item.find(class_='sammyRank').get_text())

    tmp_string = item.find(class_='sammyListing').get_text()

    # Split once and reuse the pieces: line 0 is the menu, line 1 the cafe.
    # '\r\n' is tried before '\n' so a CRLF break does not leave a trailing
    # '\r' on the extracted name.
    listing_lines = re.split(r'\r\n|\n', tmp_string)
    main_menu.append(listing_lines[0])
    cafe_name.append(listing_lines[1])

    # hrefs on the page are site-relative; resolve them against the site root.
    url_add.append(urllib.parse.urljoin(url_base, item.find('a')['href']))

In [89]:
# Preview the first five scraped menu names.
main_menu[:5]

['BLT', 'Fried Bologna', 'Woodland Mushroom', 'Roast Beef', 'PB&L']

In [90]:
import pandas as pd
# Assemble the scraped lists into one table; column order follows the
# keyword order below.
data = dict(Rank=rank, Cafe=cafe_name, Menu=main_menu, URL=url_add)
df = pd.DataFrame(data)
df.tail()

Unnamed: 0,Rank,Cafe,Menu,URL
45,46,Chickpea,Kufta,https://www.chicagomag.com/Chicago-Magazine/No...
46,47,The Goddess and Grocer,Debbie’s Egg Salad,https://www.chicagomag.com/Chicago-Magazine/No...
47,48,Zenwich,Beef Curry,https://www.chicagomag.com/Chicago-Magazine/No...
48,49,Toni Patisserie,Le Végétarien,https://www.chicagomag.com/Chicago-Magazine/No...
49,50,Phoebe’s Bakery,The Gatsby,https://www.chicagomag.com/Chicago-Magazine/No...


In [91]:
# Detail-page URL of the first row (index label 0).
df.loc[0, 'URL']

'https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

### headers로 크롬 브라우저에서 요청한 것처럼 인식시키기
### context로 HTTPS(SSL 인증서 검증) 문제 해결

In [92]:
import ssl

# Request headers and SSL context; the scraping loop further down reuses both.
headers = {'User-Agent': 'Chrome/66.0.3359.181'}
req = urllib.request.Request(df['URL'][0], headers=headers)
# NOTE(review): private ssl helper that skips certificate verification.
context = ssl._create_unverified_context()
# Read the body inside the with-block so the connection is closed afterwards.
with urlopen(req, context=context) as html:
    soup_tmp = BeautifulSoup(html.read(), 'html.parser')

In [93]:
# Text of the <p class="addy"> element, which carries price and address.
price_tmp = soup_tmp.find('p', class_='addy').get_text()
price_tmp

'\n$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'

In [94]:
# Price: first whitespace-separated token with its last character removed
# (for the sample text that drops the '.' of '$10.').
price_tmp.split()[0][:-1]

'$10'

In [95]:
# Address: every token after the first and before the last two, re-joined
# with single spaces. For the sample text the result keeps its trailing comma.
' '.join(price_tmp.split()[1:-2])

'2109 W. Chicago Ave.,'

In [96]:
# `tqdm.tqdm_notebook` is deprecated in favour of `tqdm.notebook.tqdm`.
# The supported class is bound to the old name so that other cells calling
# `tqdm_notebook(...)` keep working and no longer emit the warning.
from tqdm.notebook import tqdm as tqdm_notebook

# One price / address string per row of df, in index order.
price, address = [], []

for n in tqdm_notebook(df.index):
    req = urllib.request.Request(df['URL'][n], headers=headers)
    # Close each response after parsing; this loop issues one request per row.
    # 'html.parser' matches the parser used when the extraction was prototyped
    # on the first page and avoids depending on lxml being installed.
    with urlopen(req, context=context) as html:
        soup_tmp = BeautifulSoup(html, 'html.parser')

    gettings = soup_tmp.find('p', 'addy').get_text()

    # Tokenise once: first token is the price (minus its last character),
    # the tokens before the final two form the address.
    tokens = gettings.split()
    price.append(tokens[0][:-1])
    address.append(' '.join(tokens[1:-2]))

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for n in tqdm_notebook(df.index):


  0%|          | 0/50 [00:00<?, ?it/s]

In [97]:
conda install -c conda-forge tqdm

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [98]:
# Sanity check: number of addresses collected by the scraping loop.
len(address)

50

In [99]:
# Add the scraped columns, keep only the columns needed from here on, and
# index by rank. Built as one chained expression that returns a new frame,
# so nothing is modified in place along the way.
df = (
    df.assign(Price=price, Address=address)
      .loc[:, ['Rank', 'Cafe', 'Menu', 'Price', 'Address']]
      .set_index('Rank')
)
df.tail()

Unnamed: 0_level_0,Cafe,Menu,Price,Address
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
46,Chickpea,Kufta,$8,"2018 W. Chicago Ave.,"
47,The Goddess and Grocer,Debbie’s Egg Salad,$6.50,"25 E. Delaware Pl.,"
48,Zenwich,Beef Curry,$7.50,"416 N. York St., Elmhurst,"
49,Toni Patisserie,Le Végétarien,$8.75,"65 E. Washington St.,"
50,Phoebe’s Bakery,The Gatsby,$6.85,"3351 N. Broadway,"


In [100]:
import folium, googlemaps
import numpy as np

In [101]:
import os

# The API key is read from the GOOGLE_MAPS_API_KEY environment variable so
# that no credential is stored in the notebook. A missing variable raises
# KeyError here, before any request is made.
gmaps = googlemaps.Client(key=os.environ['GOOGLE_MAPS_API_KEY'])

In [102]:
# `tqdm.tqdm_notebook` is deprecated; use the supported notebook progress bar.
from tqdm.notebook import tqdm

# Latitude / longitude per row of df, NaN where no single location exists.
lat, lng = [], []

for address_text in tqdm(df['Address']):
    # Rows parsed as 'Multiple' have no single address to geocode.
    if address_text == 'Multiple':
        lat.append(np.nan)
        lng.append(np.nan)
        continue

    # The scraped address ends with a comma; strip it so the query does not
    # contain ",,". The city name was previously misspelled as 'Cicago'.
    target_name = address_text.rstrip(', ') + ', ' + 'Chicago'
    gmaps_output = gmaps.geocode(target_name)

    # geocode() returns a list of candidates; it can be empty when nothing
    # matches, in which case the row gets NaN instead of raising IndexError.
    if not gmaps_output:
        lat.append(np.nan)
        lng.append(np.nan)
        continue

    location_output = gmaps_output[0].get('geometry')
    lat.append(location_output['location']['lat'])
    lng.append(location_output['location']['lng'])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for n in tqdm_notebook(df.index):


  0%|          | 0/50 [00:00<?, ?it/s]

In [103]:
# Show the last rows of the frame again.
df.tail()

Unnamed: 0_level_0,Cafe,Menu,Price,Address
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
46,Chickpea,Kufta,$8,"2018 W. Chicago Ave.,"
47,The Goddess and Grocer,Debbie’s Egg Salad,$6.50,"25 E. Delaware Pl.,"
48,Zenwich,Beef Curry,$7.50,"416 N. York St., Elmhurst,"
49,Toni Patisserie,Le Végétarien,$8.75,"65 E. Washington St.,"
50,Phoebe’s Bakery,The Gatsby,$6.85,"3351 N. Broadway,"


In [104]:
# Attach the geocoded coordinates to the table.
df['lat'] = lat
df['lng'] = lng

# Centre the map on the average coordinate (Series.mean skips NaN by default)
# and drop a marker on that centre point.
center = [df['lat'].mean(), df['lng'].mean()]
mapping = folium.Map(location=center, zoom_start=11)
folium.Marker(center, popup='center').add_to(mapping)
mapping

In [105]:
# One marker per cafe that has coordinates. Filtering on the coordinates
# themselves (rather than on the 'Multiple' address string) also skips any
# other row whose lat/lng is NaN, which folium.Marker cannot place.
for row in df.dropna(subset=['lat', 'lng']).itertuples():
    folium.Marker([row.lat, row.lng],
                  popup=row.Cafe).add_to(mapping)

mapping

---

In [106]:
# Naver movie ranking page for a fixed date (query string below).
url_base = 'http://movie.naver.com/'
url_syb = 'movie/sdb/rank/rmovie.nhn?sel=cur&date=20220301'

# Close the HTTP response once BeautifulSoup has consumed it.
with urlopen(url_base+url_syb) as page:
    soup = BeautifulSoup(page, 'html.parser')

In [107]:
# First <div class="tit5"> element, which wraps a movie title link.
soup.find_all('div', class_='tit5')[0]

<div class="tit5">
<a href="/movie/bi/mi/basic.naver?code=213624" title="전투왕">전투왕</a>
</div>