-
Notifications
You must be signed in to change notification settings - Fork 0
/
step2_getDetails.py
77 lines (62 loc) · 2.59 KB
/
step2_getDetails.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
from selenium import webdriver
import time
import os
from bs4 import BeautifulSoup
import pandas as pd
import json
import step1_getData
def _extract_neighbor(soup):
    """Return the stripped neighborhood text of a parsed page, or None if absent."""
    tag = soup.find("p", attrs={"class": "font-weight-semi-bold text-muted mb-md-0"})
    if tag is None:
        return None
    return tag.text.strip()


def _extract_description(content):
    """Return the description paragraphs of a raw HTML page joined by spaces."""
    # Keep only the markup that precedes the 'mb-5 small' element, then the
    # tail of that markup after its last '</div></div>'; the <p> tags left
    # in that fragment are treated as the description.
    before_marker = content.split('class="mb-5 small">')[0]
    fragment = before_marker.split('</div></div>')[-1]
    fragment_soup = BeautifulSoup(fragment, "html.parser")
    return " ".join(p.text.strip() for p in fragment_soup.find_all("p"))


def _extract_features(content):
    """Return {row_id: {'section', 'feature', 'value'}} parsed from raw HTML."""
    features = {}
    for sec in content.split('<span class="col">'):
        # The text up to the first closing </span> is used as the section name.
        sec_name = sec.split('</span>')[0]
        rows = sec.split('<p class="mb-0 py-2 row border-top border-light">')
        # rows[0] is the markup before the first feature row; a section
        # without any feature row yields an empty slice and is skipped.
        for spanrow in rows[1:]:
            rowsoup = BeautifulSoup(spanrow, "html.parser")
            fspan = rowsoup.find(
                "span", attrs={"class": "font-weight-light col-9 col-sm-5 col-md-8"})
            vspan = rowsoup.find(
                "span", attrs={"class": "col-3 col-sm-7 col-md-4"})
            if fspan is None or vspan is None:
                continue
            # Keys are consecutive integers starting at 0, in document order.
            features[len(features)] = {
                'section': sec_name,
                'feature': fspan.text.strip(),
                'value': vspan.text.strip(),
            }
    return features


def getDetails():
    """Crawl every URL listed in Rindex.txt and save the scraped details.

    Calls ``step1_getData.getData()`` first (presumably this is what produces
    ``Rindex.txt`` -- confirm against step1), then reads the pipe-separated
    ``Rindex.txt``, which must contain a ``url`` column. For each row the page
    is loaded in Chrome and:

    * the neighborhood and description are stored in the new ``neighbor`` and
      ``des`` columns (both default to "unknown");
    * the feature table is written to ``./Details/<row index>.txt``.

    The augmented index is finally written to ``Rindex_w_des.txt``. Both
    outputs are pipe-separated UTF-8 without the DataFrame index.

    The browser is always closed, even if the crawl fails part-way. In that
    case the exception propagates and ``Rindex_w_des.txt`` is not written.
    """
    step1_getData.getData()
    dfindex = pd.read_csv("Rindex.txt", sep="|", encoding='utf-8')
    dfindex['neighbor'] = "unknown"
    dfindex['des'] = "unknown"

    # create a directory to save details
    os.makedirs("./Details/", exist_ok=True)

    driverLocation = '/usr/local/bin/chromedriver'
    driver = webdriver.Chrome(driverLocation)
    try:
        for i, row in dfindex.iterrows():
            if i % 100 == 0:
                print("Crawling %d th restaurants..." % i)

            driver.get(row['url'])
            # Wait before taking the snapshot so the pause can also cover
            # late-rendered content; the per-request delay is unchanged.
            time.sleep(5)
            content = driver.page_source
            soup = BeautifulSoup(content, "html.parser")

            # get neighbor (keep the "unknown" default when the tag is missing)
            neighbor = _extract_neighbor(soup)
            if neighbor is not None:
                dfindex.at[i, 'neighbor'] = neighbor

            # get description
            dfindex.at[i, 'des'] = _extract_description(content)

            # get capacities etc.
            tdf = pd.DataFrame.from_dict(_extract_features(content), orient='index')
            tdf.to_csv("./Details/" + str(i) + ".txt",
                       sep="|", encoding='utf-8', index=False)
    finally:
        # Release the browser/chromedriver processes no matter how we exit.
        driver.quit()

    dfindex.to_csv("Rindex_w_des.txt", sep="|", encoding='utf-8', index=False)
# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported by another module.
if __name__ == '__main__':
    getDetails()