# 시카고 맛집 데이터 분석

- https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/ 

In [215]:
# 모듈 임포트

import re  
import pandas as pd 
import numpy as np
import googlemaps
import folium

from bs4 import BeautifulSoup
from urllib.request import Request, urlopen 
from fake_useragent import UserAgent
from urllib.parse import urljoin 

from tqdm import tqdm 

---

## 1. 웹 스크래핑

### 1.1 url 분석

In [31]:
# url
url_base = "https://www.chicagomag.com/"
url_sub = "Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/"
url = url_base + url_sub  
url

'https://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-Chicago/'

### 1.2 HTML 접근
- HTTP Response 얻기 : urlopen(URL) or request.get(URL).content   
- HTML 소스 얻기 : BeautifulSoup (HTTP Response, 'html.parser')

In [33]:
# 403에러 (서버에서 봇으로 인식하고 차단)
response = urlopen(url)
response

HTTPError: HTTP Error 403: Forbidden

In [35]:
# HTTP Error 403: Forbidden 세가지 해결방법

# 첫번째 방법
# req = Request(url, headers={"user-agent": "Chrome"})
# res = urlopen(req)
# html = BeautifulSoup(res, "html.parser")
# print(html.prettify())                                                 

# 두번째 방법
# req = Request(url, headers={"user-agent": 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.5005.61 Safari/537.36'})
# res = urlopen(req)
# html = BeautifulSoup(res, "html.parser")
# print(html.prettify())

# 세번째 방법
ua = UserAgent()
req = Request(url, headers={"user-agent": ua.ie})
res = urlopen(req)
html = BeautifulSoup(res, "html.parser")
print(html.prettify())

<!DOCTYPE html>
<html lang="en-US">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible">
   <link href="https://gmpg.org/xfn/11" rel="profile"/>
   <title>
    The 50 Best Sandwiches in Chicago – Chicago Magazine
   </title>
   <style type="text/css">
    .heateor_sss_button_instagram span.heateor_sss_svg,a.heateor_sss_instagram span.heateor_sss_svg{background:radial-gradient(circle at 30% 107%,#fdf497 0,#fdf497 5%,#fd5949 45%,#d6249f 60%,#285aeb 90%)}
						div.heateor_sss_horizontal_sharing a.heateor_sss_button_instagram span{background:#000!important;}div.heateor_sss_standard_follow_icons_container a.heateor_sss_button_instagram span{background:#000;}
										.heateor_sss_horizontal_sharing .heateor_sss_svg,.heateor_sss_standard_follow_icons_container .heateor_sss_svg{
							background-color: #000!important;
				background: #000!important;
							color: #fff;
						border-width: 0px;
			border-style: solid;
			border-color: transparent

### 1.3 HTML Tag
- HTML Tag 꺼내기 : find or find_all / select_one or select
- Tag에서 꺼내기 : Tag.get_text() or Tag.attrs

In [76]:
# 1 : html.find_all('Tag이름',{'Attr이름':'Attr 값'})
# 2 : html.find_all('Tag이름', Attr이름_='Attr값')
# 3 : html.find_all('Tag이름','Attr값')
# 4 : html.find_all(Attr이름_='Attr값')
# 5 : html.select(".sammy")

# 1. html.find_all("div", {'class':"sammy"}), len(html.find_all("div", {"class":"sammy"}))
# 2. html.find_all("div", class_="sammy"), len(html.find_all("div", class_="sammy"))
# 3. html.find_all("div", "sammy"), len(html.find_all("div", "sammy"))
# 4. html.find_all(class_="sammy"), len(html.find_all(class_="sammy"))
# 5. html.select('.sammy'), len(html.select('.sammy'))

html.find_all("div", "sammy"), len(html.find_all("div", "sammy"))

([<div class="sammy" style="position: relative;">
  <div class="sammyRank">1</div>
  <div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br/>
  Old Oak Tap<br/>
  <em>Read more</em> </a></div>
  </div>,
  <div class="sammy" style="position: relative;">
  <div class="sammyRank">2</div>
  <div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Au-Cheval-Fried-Bologna/"><b>Fried Bologna</b><br/>
  Au Cheval<br/>
  <em>Read more</em> </a></div>
  </div>,
  <div class="sammy" style="position: relative;">
  <div class="sammyRank">3</div>
  <div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Xoco-Woodland-Mushroom/"><b>Woodland Mushroom</b><br/>
  Xoco<br/>
  <em>Read more</em> </a></div>
  </div>,
  <div class="sammy" style="position: relative;">
  <div class="sammyRank">4</div>
  <div class="sammyListing"><a href="/Chicago-Magazine/November-2

In [77]:
# 위의 50개의 ResultSet(리스트와 유사) 중 하나 테스트
tmp_one = soup.find_all("div", "sammy")[0]
tmp_one

<div class="sammy" style="position: relative;">
<div class="sammyRank">1</div>
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br/>
Old Oak Tap<br/>
<em>Read more</em> </a></div>
</div>

In [78]:
type(tmp_one)

bs4.element.Tag

In [79]:
# 1.Tag.text
# 2.Tag.string
# 3.Tag.get_text() 

tmp_one = soup.find_all("div", "sammy")[0]
tmp_one.text

'\n1\nBLT\nOld Oak Tap\nRead more \n'

In [85]:
tmp_one

<div class="sammy" style="position: relative;">
<div class="sammyRank">1</div>
<div class="sammyListing"><a href="/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/"><b>BLT</b><br/>
Old Oak Tap<br/>
<em>Read more</em> </a></div>
</div>

In [89]:
print(tmp.find("div", {"class":"sammyRank"}).get_text()) # 랭크
print(tmp.find("div", {"class":"sammyListing"}).get_text())

1
BLT
Old Oak Tap
Read more 


In [101]:
tmp_one.find("div", {"class":"sammyListing"}).get_text()

'BLT\nOld Oak Tap\nRead more '

In [104]:
import re 

tmp_string = tmp_one.find(class_="sammyListing").get_text()

# \n 또는 \r\n 기준으로 나누어서 리스트로 만들어라
re.split(("\n|\r\n"), tmp_string)

['BLT', 'Old Oak Tap', 'Read more ']

In [None]:
print(re.split(("\n|\r\n"), tmp_string)[0]) # 메인메뉴
print(re.split(("\n|\r\n"), tmp_string)[1]) # 카페이름

BLT
Old Oak Tap


In [97]:
# 링크
tmp_one.find('a')['href']

'/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

---

## 2. 시카고 맛집 데이터프레임 생성

In [114]:
# 50개 맛집의 정보를 반복문을 통해 리스트로 모은다
from urllib.parse import urljoin 
url_base = "http://www.chicagomag.com"

rank = [] 
main_menu = [] 
cafe_name = [] 
url_add = [] 

list_html = html.find_all("div", "sammy") # html.select(".sammy")

for item in list_html: 
    rank.append(item.find(class_="sammyRank").get_text()) # 랭크
    tmp_string = item.find(class_="sammyListing").get_text() 
    main_menu.append(re.split(("\n|\r\n"), tmp_string)[0]) # 메인메뉴
    cafe_name.append(re.split(("\n|\r\n"), tmp_string)[1]) # 카페이름
    url_add.append(urljoin(url_base, item.find("a")["href"])) # 링크

In [115]:
# 50개의 정보가 모두 잘 불러와졌는지 확인
len(rank), len(main_menu), len(cafe_name), len(url_add)

(50, 50, 50, 50)

In [117]:
# 데이터 프레임으로 만든다
import pandas as pd 

data = {
    "Rank": rank, 
    "Menu": main_menu,
    "Cafe": cafe_name,
    "URL": url_add, 
}

df = pd.DataFrame(data)

In [118]:
df.tail(3)

Unnamed: 0,Rank,Menu,Cafe,URL
47,48,Beef Curry,Zenwich,https://www.chicagomag.com/Chicago-Magazine/No...
48,49,Le Végétarien,Toni Patisserie,https://www.chicagomag.com/Chicago-Magazine/No...
49,50,The Gatsby,Phoebe’s Bakery,https://www.chicagomag.com/Chicago-Magazine/No...


In [121]:
# 컬럼 순서 변경
# 1. df = df[['d','c','a','b']]
# 2. df = pd.DataFrame(data, columns=['d','c','a','b'])
df = df[["Rank", "Cafe", "Menu", "URL"]]
df.tail(3)

Unnamed: 0,Rank,Cafe,Menu,URL
47,48,Zenwich,Beef Curry,https://www.chicagomag.com/Chicago-Magazine/No...
48,49,Toni Patisserie,Le Végétarien,https://www.chicagomag.com/Chicago-Magazine/No...
49,50,Phoebe’s Bakery,The Gatsby,https://www.chicagomag.com/Chicago-Magazine/No...


In [122]:
# 데이터 저장
df.to_csv(
    "../data/best_sandwiches_list_chicago.csv", sep=",", encoding="utf-8"
)

---

## 3. 시카고 맛집 데이터프레임 전처리

- 가격, 주소 정보 스크래핑

In [126]:
df = pd.read_csv("../data/best_sandwiches_list_chicago.csv", index_col=0)
df.tail()

Unnamed: 0,Rank,Cafe,Menu,URL
45,46,Chickpea,Kufta,https://www.chicagomag.com/Chicago-Magazine/No...
46,47,The Goddess and Grocer,Debbie’s Egg Salad,https://www.chicagomag.com/Chicago-Magazine/No...
47,48,Zenwich,Beef Curry,https://www.chicagomag.com/Chicago-Magazine/No...
48,49,Toni Patisserie,Le Végétarien,https://www.chicagomag.com/Chicago-Magazine/No...
49,50,Phoebe’s Bakery,The Gatsby,https://www.chicagomag.com/Chicago-Magazine/No...


In [128]:
# 첫번째 URL 주소
df["URL"][0]

'http://www.chicagomag.com/Chicago-Magazine/November-2012/Best-Sandwiches-in-Chicago-Old-Oak-Tap-BLT/'

In [167]:
# 첫번째 URL 주소에서 가격, 주소 정보를 가져온다
req = Request(df["URL"][0], headers={"user-agent":ua.ie})
res = urlopen(req).read()
html = BeautifulSoup(res, "html.parser")
html.find("p", "addy")

<p class="addy">
<em>$10. 2109 W. Chicago Ave., 773-772-0406, <a href="http://www.theoldoaktap.com/">theoldoaktap.com</a></em></p>

In [148]:
# 필요한 텍스트를 뽑아낸 후 작업을 시작한다
source = html.find("p", "addy").text
source

'\n$10. 2109 W. Chicago Ave., 773-772-0406, theoldoaktap.com'

In [149]:
# source에서 필요한 부분만 뽑아낸다 
import re 
re.split(".,", source)

['\n$10. 2109 W. Chicago Ave', ' 773-772-040', ' theoldoaktap.com']

In [151]:
price_address = re.split(".,", source)[0]
price_address

'\n$10. 2109 W. Chicago Ave'

In [155]:
# 정규표현식을 활용하여 가격 정보만 추출
price = re.search("\$\d+\.(\d+)?", price_address).group()
price

'$10.'

In [156]:
len(price)

4

In [161]:
# 슬라이싱으로 주소 정보만 추출
address = price_address[len(price) + 2:]
address

'2109 W. Chicago Ave'

In [177]:
# 테스트 1
price_list = [] 
address_list = [] 

for i in df.index[:3]:
    req = Request(df["URL"][i], headers={"user-agent":ua.ie})
    res = urlopen(req).read()
    html = BeautifulSoup(res, "html.parser")
    source = html.find("p", "addy").get_text()
    price_address = re.split(".,", source)[0]
    price = re.search("\$\d+\.(\d+)?", price_address).group()
    address = price_address[len(price) + 2:]
    price_list.append(price)
    address_list.append(address)
    
    print(i)

0
1
2


In [178]:
price_list

['$10.', '$9.', '$9.50']

In [179]:
address_list

['2109 W. Chicago Ave', '800 W. Randolph St', ' 445 N. Clark St']

In [185]:
# 테스트 2 (iterrows 활용)
price_list = [] 
address_list = [] 

for idx, row in df[:3].iterrows():
    req = Request(row["URL"], headers={"user-agent":ua.ie})
    res = urlopen(req).read()
    
    html = BeautifulSoup(res, "html.parser")
    
    source = html.find("p", "addy").get_text()
    price_address = re.split(".,", source)[0]
    price = re.search("\$\d+\.(\d+)?", price_address).group()
    address = price_address[len(price) + 2:]
    price_list.append(price)
    address_list.append(address)
    
    print(idx)

0
1
2


In [186]:
price_list

['$10.', '$9.', '$9.50']

In [187]:
address_list

['2109 W. Chicago Ave', '800 W. Randolph St', ' 445 N. Clark St']

In [190]:
# 테스트 3 (tqdm 활용)
# 설치가 안되어 있으면 (!pip install tqdm)
from tqdm import tqdm 

price = [] 
address = [] 

for idx, row in tqdm(df[:3].iterrows()):
    req = Request(row["URL"], headers={"user-agent":ua.ie})
    html = urlopen(req).read()
    soup_tmp = BeautifulSoup(html, "html.parser")
    gettings = soup_tmp.find("p", "addy").get_text()
    price_tmp = re.split(".,", gettings)[0]
    tmp = re.search("\$\d+\.(\d+)?", price_tmp).group()
    price.append(tmp)
    address.append(price_tmp[len(tmp)+2:])
    print(idx)

1it [00:03,  3.45s/it]

0


2it [00:05,  2.42s/it]

1


3it [00:06,  2.28s/it]

2





In [191]:
price_list

['$10.', '$9.', '$9.50']

In [192]:
address_list

['2109 W. Chicago Ave', '800 W. Randolph St', ' 445 N. Clark St']

In [193]:
# 테스트가 모두 무사히 마무리 되었으므로 전체 데이터 스크래핑

price = [] 
address = [] 

for idx, row in tqdm(df.iterrows()):
    req = Request(row["URL"], headers={"user-agent":ua.ie})
    html = urlopen(req).read()
    soup_tmp = BeautifulSoup(html, "html.parser")
    gettings = soup_tmp.find("p", "addy").get_text()
    price_tmp = re.split(".,", gettings)[0]
    tmp = re.search("\$\d+\.(\d+)?", price_tmp).group()
    price.append(tmp)
    address.append(price_tmp[len(tmp)+2:])
    print(idx)

1it [00:01,  1.65s/it]

0


2it [00:03,  1.66s/it]

1


3it [00:04,  1.66s/it]

2


4it [00:06,  1.67s/it]

3


5it [00:08,  1.66s/it]

4


6it [00:09,  1.50s/it]

5


7it [00:11,  1.54s/it]

6


8it [00:13,  1.70s/it]

7


9it [00:14,  1.68s/it]

8


10it [00:16,  1.67s/it]

9


11it [00:17,  1.58s/it]

10


12it [00:19,  1.48s/it]

11


13it [00:20,  1.50s/it]

12


14it [00:22,  1.53s/it]

13


15it [00:23,  1.41s/it]

14


16it [00:24,  1.47s/it]

15


17it [00:26,  1.48s/it]

16


18it [00:28,  1.52s/it]

17


19it [00:29,  1.42s/it]

18


20it [00:30,  1.35s/it]

19


21it [00:31,  1.30s/it]

20


22it [00:33,  1.35s/it]

21


23it [00:34,  1.30s/it]

22


24it [00:35,  1.26s/it]

23


25it [00:37,  1.48s/it]

24


26it [00:38,  1.41s/it]

25


27it [00:40,  1.49s/it]

26


28it [00:45,  2.47s/it]

27


29it [00:46,  2.07s/it]

28


30it [00:47,  1.92s/it]

29


31it [00:48,  1.69s/it]

30


32it [00:50,  1.65s/it]

31


33it [00:51,  1.50s/it]

32


34it [00:52,  1.40s/it]

33


35it [00:54,  1.43s/it]

34


36it [00:55,  1.44s/it]

35


37it [00:56,  1.37s/it]

36


38it [00:58,  1.29s/it]

37


39it [00:59,  1.35s/it]

38


40it [01:01,  1.43s/it]

39


41it [01:02,  1.48s/it]

40


42it [01:04,  1.42s/it]

41


43it [01:05,  1.34s/it]

42


44it [01:06,  1.28s/it]

43


45it [01:07,  1.26s/it]

44


46it [01:08,  1.22s/it]

45


47it [01:09,  1.19s/it]

46


48it [01:10,  1.17s/it]

47


49it [01:12,  1.17s/it]

48


50it [01:13,  1.47s/it]

49





In [195]:
# 50개 모두 정상적으로 완료
len(price), len(address)

(50, 50)

In [196]:
price[:5]

['$10.', '$9.', '$9.50', '$9.40', '$10.']

In [197]:
address[:5]

['2109 W. Chicago Ave',
 '800 W. Randolph St',
 ' 445 N. Clark St',
 ' 914 Noyes St',
 '825 W. Fulton Mkt']

In [None]:
# 기존 데이터에 새로운 컬럼을 생성하고 해당 데이터를 넣을 것
df.tail(2)

Unnamed: 0,Rank,Cafe,Menu,URL
48,49,Toni Patisserie,Le Végétarien,https://www.chicagomag.com/Chicago-Magazine/No...
49,50,Phoebe’s Bakery,The Gatsby,https://www.chicagomag.com/Chicago-Magazine/No...


In [198]:
df["Price"] = price 
df["Address"] = address
df = df.loc[:, ["Rank", "Cafe", "Menu", "Price", "Address"]]
df.set_index("Rank", inplace=True)
df.head()

Unnamed: 0_level_0,Cafe,Menu,Price,Address
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Old Oak Tap,BLT,$10.,2109 W. Chicago Ave
2,Au Cheval,Fried Bologna,$9.,800 W. Randolph St
3,Xoco,Woodland Mushroom,$9.50,445 N. Clark St
4,Al’s Deli,Roast Beef,$9.40,914 Noyes St
5,Publican Quality Meats,PB&L,$10.,825 W. Fulton Mkt


In [199]:
# csv 파일로 저장
df.to_csv(
    "../data/best_sandwiches_list_chicago2.csv", sep=",", encoding="UTF-8"
)

In [202]:
pd.read_csv("../data/best_sandwiches_list_chicago2.csv",index_col=0)

Unnamed: 0_level_0,Cafe,Menu,Price,Address
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,Old Oak Tap,BLT,$10.,2109 W. Chicago Ave
2,Au Cheval,Fried Bologna,$9.,800 W. Randolph St
3,Xoco,Woodland Mushroom,$9.50,445 N. Clark St
4,Al’s Deli,Roast Beef,$9.40,914 Noyes St
5,Publican Quality Meats,PB&L,$10.,825 W. Fulton Mkt
6,Hendrickx Belgian Bread Crafter,Belgian Chicken Curry Salad,$7.25,100 E. Walton St
7,Acadia,Lobster Roll,$16.,1639 S. Wabash Ave
8,Birchwood Kitchen,Smoked Salmon Salad,$10.,2211 W. North Ave
9,Cemitas Puebla,Atomica Cemitas,$9.,3619 W. North Ave
10,Nana,Grilled Laughing Bird Shrimp and Fried Po’ Boy,$17.,3267 S. Halsted St


---

## 4. 시카고 맛집 데이터 지도 시각화

In [203]:
# requirements 
import pandas as pd 
import numpy as np
import googlemaps
import folium

In [None]:
df = pd.read_csv("../data/best_sandwiches_list_chicago2.csv", index_col=0)
df.tail(10)

Unnamed: 0_level_0,Cafe,Menu,Price,Address
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
41,Z&H MarketCafe,The Marty,$7.25,1323 E. 57th St
42,Market House on the Square,Whitefish,$11.,655 Forest Ave
43,Elaine’s Coffee Call,"Oat Bread, Pecan Butter, and Fruit Jam",$6.,Hotel Lincol
44,Marion Street Cheese Market,Cauliflower Melt,$9.,100 S. Marion St
45,Cafecito,Cubana,$5.49,26 E. Congress Pkwy
46,Chickpea,Kufta,$8.,2018 W. Chicago Ave
47,The Goddess and Grocer,Debbie’s Egg Salad,$6.50,25 E. Delaware Pl
48,Zenwich,Beef Curry,$7.50,416 N. York St
49,Toni Patisserie,Le Végétarien,$8.75,65 E. Washington St
50,Phoebe’s Bakery,The Gatsby,$6.85,3351 N. Broadwa


In [206]:
# 구글맵스를 활용해 위도, 경도 데이터를 가져올 것
gmaps_key = "AIzaSyBvIJqxEM5TJDyTk1mGdR0GkmnXddWMIzM"
gmaps = googlemaps.Client(key=gmaps_key)

In [207]:
lat = [] 
lng = [] 


for idx, row in tqdm(df.iterrows()):
    if not row["Address"] == "Multiple location":
        target_name = row["Address"] + ", " + "Chicago"
        gmaps_output = gmaps.geocode(target_name)
        geo_data = gmaps_output[0].get("geometry")
        lat.append(geo_data["location"]["lat"])
        lng.append(geo_data["location"]["lng"])

    # 주소가 Multipe location인 경우 NaN값 할당
    else:
        lat.append(np.nan)
        lng.append(np.nan)

50it [00:10,  4.68it/s]


In [209]:
# 조회된 정보 확인
len(lat), len(lng)

(50, 50)

In [210]:
# 위도, 경도 컬럼 생성
df["lat"] = lat 
df["lng"] = lng 
df.tail()

Unnamed: 0_level_0,Cafe,Menu,Price,Address,lat,lng
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
46,Chickpea,Kufta,$8.,2018 W. Chicago Ave,41.896113,-87.677857
47,The Goddess and Grocer,Debbie’s Egg Salad,$6.50,25 E. Delaware Pl,41.898979,-87.627393
48,Zenwich,Beef Curry,$7.50,416 N. York St,41.910583,-87.940488
49,Toni Patisserie,Le Végétarien,$8.75,65 E. Washington St,41.883106,-87.625438
50,Phoebe’s Bakery,The Gatsby,$6.85,3351 N. Broadwa,41.942725,-87.644287


In [218]:
# 포리움을 활용하여 지도 시각화

mapping = folium.Map(location=[41.8781136, -87.6297982], zoom_start=11)
mapping

In [231]:
mapping = folium.Map(location=[41.8781136, -87.6297982], zoom_start=11)
for idx, row in df.iterrows():
    if not row["Address"] == "Multiple location":
        folium.Marker(
            location=[row["lat"], row["lng"]],
            popup='['+row["Cafe"]+'] : '+row["Menu"],
            icon=folium.Icon(
                icon="coffee",
                prefix="fa"
            )
        ).add_to(mapping)

mapping