# Crawling

In [1]:
from bs4 import BeautifulSoup

### HTML 파일 열기

In [2]:
# Open the sample HTML file and parse it with the built-in 'html.parser'.
# The encoding is passed explicitly: the document declares charset utf-8, and
# the default encoding used by open() depends on the platform locale.
with open('00_Example.html', encoding='utf8') as fp:
    soup = BeautifulSoup(fp, 'html.parser')
    print(soup)

<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0" name="viewport"/>
<title>Document</title>
</head>
<body>
<div>
<p>a</p>
<p>b</p>
<p>c</p>
</div>
<div class="ex_class sample">
<p>1</p>
<p>2</p>
<p>3</p>
</div>
<div id="ex_id">
<p>X</p>
<p>Y</p>
<p>Z</p>
</div>
<h1>This is a heading.</h1>
<p>This is a paragraph.</p>
<a class="a sample" href="www.naver.com">Naver</a>
</body>
</html>


### urllib를 통해서 웹에 있는 소스 가져오기

In [3]:
import urllib.request
import urllib.parse

In [4]:
# Put the URL of the page you want to fetch in web_url.
# web_url='https://naver.com'
web_url='http://200.1.220.217:3000/bbs/list/1'
# The timeout (seconds) keeps the cell from blocking indefinitely when the
# host does not respond.
with urllib.request.urlopen(web_url, timeout=10) as response:
    html = response.read()
# The body is fully read at this point, so parsing does not need the open
# connection.
soup = BeautifulSoup(html, 'html.parser')
print(soup)

<!DOCTYPE html>

<html lang="ko">
<head>
<title>My BBS</title>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<link href="/bootstrap/css/bootstrap.min.css" rel="stylesheet"/>
<link href="/fontawesome/css/all.min.css" rel="stylesheet"/>
<script src="/jquery/jquery.min.js"></script>
<script src="/popper/popper.min.js"></script>
<script src="/bootstrap/js/bootstrap.min.js"></script>
</head>
<body>
<nav class="navbar navbar-expand-lg bg-dark navbar-dark fixed-top">
<a class="navbar-brand" href="#">
<img alt="호서직업능력개발원" src="/img/hoseo.png" style="height: 40px; margin-left: 50px; margin-right: 80px;"/>
</a>
<ul class="nav mr-auto">
<li class="nav-item nav-light">
<a class="nav-link" href="/"><i class="fas fa-home"></i>홈</a>
</li>
<li class="nav-item">
<a class="nav-link" href="/bbs/write"><i class="far fa-edit"></i>글쓰기</a>
</li>
<li class="nav-item">
<a class="nav-link" href="/user/dispatch"><i class="far fa-user"></i>사용자</a>
</li>
<li class="n

## Beautiful Soup 사용법

In [5]:
# Re-open the local sample file as UTF-8 and parse it, rebinding `soup` to the
# sample document (it was last bound to the fetched web page).
with open('00_Example.html', encoding='utf8') as html:
    soup = BeautifulSoup(html, 'html.parser')

### 태그를 이용해서 찾기

- soup.find()
- soup.find_all()

In [6]:
# Find a single tag: find() returns the first <div> in the document.
first_div = soup.find('div')
print(first_div)

<div>
<p>a</p>
<p>b</p>
<p>c</p>
</div>


In [7]:
# Find every tag: find_all() returns a list-like collection of all <div> elements.
all_divs = soup.find_all('div')
print(all_divs)

[<div>
<p>a</p>
<p>b</p>
<p>c</p>
</div>, <div class="ex_class sample">
<p>1</p>
<p>2</p>
<p>3</p>
</div>, <div id="ex_id">
<p>X</p>
<p>Y</p>
<p>Z</p>
</div>]


In [8]:
# Every <p> element in the whole document, in document order.
all_ps = soup.find_all('p')
print(all_ps)

[<p>a</p>, <p>b</p>, <p>c</p>, <p>1</p>, <p>2</p>, <p>3</p>, <p>X</p>, <p>Y</p>, <p>Z</p>, <p>This is a paragraph.</p>]


In [9]:
# Calling find_all() on a tag limits the search to that tag's contents:
# only the <p> elements inside first_div are returned.
some_ps = first_div.find_all('p')
print(some_ps)

[<p>a</p>, <p>b</p>, <p>c</p>]


### 태그와 속성을 이용해서 찾기

- find('태그명', {'속성명1':'값1', ...})
- find_all('태그명', {'속성명1':'값1', ...})

In [10]:
# Filter by attribute with a dict: the <div> whose id is 'ex_id'.
ex_id = soup.find('div', {'id':'ex_id'})
print(ex_id)

<div id="ex_id">
<p>X</p>
<p>Y</p>
<p>Z</p>
</div>


In [11]:
# Same lookup, written with a keyword argument instead of a dict.
ex_id = soup.find('div', id='ex_id')
ex_id

<div id="ex_id">
<p>X</p>
<p>Y</p>
<p>Z</p>
</div>

In [12]:
# Filter by class with a dict. The value matches an element that has
# 'ex_class' among its classes (the match here is class="ex_class sample").
ex_class = soup.find('div', {'class':'ex_class'})
print(ex_class)

<div class="ex_class sample">
<p>1</p>
<p>2</p>
<p>3</p>
</div>


In [13]:
# A plain string as the second argument is treated as a class filter,
# giving the same result as the dict form.
ex_class = soup.find('div', 'ex_class')
print(ex_class)

<div class="ex_class sample">
<p>1</p>
<p>2</p>
<p>3</p>
</div>


In [14]:
# Keyword form: `class_` has a trailing underscore because `class` is a Python
# keyword. No tag name is given, so the first element of any tag type that has
# the class is returned.
ex_class = soup.find(class_ = 'ex_class')
print(ex_class)

<div class="ex_class sample">
<p>1</p>
<p>2</p>
<p>3</p>
</div>


### CSS Selector

- select_one('#idName') -> 객체 하나
- select('.className') -> 객체 리스트

In [15]:
# select() takes a CSS selector and returns a list, even for a single match.
# Note that ex_id is rebound here from a tag to a one-element list.
ex_id = soup.select('#ex_id')
print(ex_id)

[<div id="ex_id">
<p>X</p>
<p>Y</p>
<p>Z</p>
</div>]


In [16]:
# Every element carrying the 'sample' class, regardless of tag type
# (a <div> and an <a> in the sample document).
soup.select('.sample')

[<div class="ex_class sample">
 <p>1</p>
 <p>2</p>
 <p>3</p>
 </div>,
 <a class="a sample" href="www.naver.com">Naver</a>]

In [17]:
# select_one() returns only the first element matching the selector.
soup.select_one('.sample')

<div class="ex_class sample">
<p>1</p>
<p>2</p>
<p>3</p>
</div>

### 결과 가져오기

In [18]:
# Show ex_id: the one-element list produced by select('#ex_id').
ex_id

[<div id="ex_id">
 <p>X</p>
 <p>Y</p>
 <p>Z</p>
 </div>]

In [19]:
# Take the first <p> inside the selected div and read its text with get_text().
first_p = ex_id[0].find('p')
first_p.get_text()

'X'

In [20]:
# .string yields the same text here, since this <p> contains a single text node.
first_p.string

'X'

In [21]:
# Show ex_class: the <div class="ex_class sample"> found earlier.
ex_class

<div class="ex_class sample">
<p>1</p>
<p>2</p>
<p>3</p>
</div>

In [22]:
# Print the text of every <p> inside ex_class, one per line.
class_ps = ex_class.find_all('p')
for one_p in class_ps:
    print(one_p.string)

1
2
3


### 속성값 가져오기

In [23]:
# '.a.sample' selects an element that has both classes 'a' and 'sample'
# (the <a> link in the sample document); get_text() returns the link text.
a_tag = soup.select_one('.a.sample')
a_tag.get_text()

'Naver'

In [24]:
# Tag attributes are available as a dict through .attrs; read the link target.
a_tag.attrs['href']

'www.naver.com'

### BBS 사이트 크롤링

In [25]:
# Put the URL of the page you want to fetch in web_url.
# web_url='https://naver.com'
web_url='http://200.1.220.217:3000/bbs/list/1'
# The timeout (seconds) keeps the cell from blocking indefinitely when the
# host does not respond.
with urllib.request.urlopen(web_url, timeout=10) as response:
    html = response.read()
# The body is fully read at this point, so parsing does not need the open
# connection.
soup = BeautifulSoup(html, 'html.parser')

In [26]:
# Take the first <table>, then the elements inside it that have class 'd-flex'
# (the table rows in this page).
table = soup.find('table')
rows = table.select('.d-flex')
# rows[1] is the first post row; rows[0] is presumably the header row - verify
# against the page markup.
first_row = rows[1]
first_row

<tr class="d-flex">
<td class="col-1" style="text-align: center;">1013</td>
<td class="col-6"><a href="/bbs/bid/1013/inc/1"><strong>리눅스 글쓰기</strong></a></td>
<td class="col-2" style="text-align: center;">홍길동</td>
<td class="col-2" style="text-align: center;">2020-10-27</td>
<td class="col-1" style="text-align: center;">3</td>
</tr>

In [27]:
# All <td> cells of that row.
tds = first_row.find_all('td')
tds

[<td class="col-1" style="text-align: center;">1013</td>,
 <td class="col-6"><a href="/bbs/bid/1013/inc/1"><strong>리눅스 글쓰기</strong></a></td>,
 <td class="col-2" style="text-align: center;">홍길동</td>,
 <td class="col-2" style="text-align: center;">2020-10-27</td>,
 <td class="col-1" style="text-align: center;">3</td>]

In [28]:
# Print the text of each cell. .string also works for the title cell, whose
# text is nested inside <a><strong>...</strong></a>.
for td in tds:
    print(td.string)

1013
리눅스 글쓰기
홍길동
2020-10-27
3


In [29]:
# CSS-selector version: the first <td> of a <tr> that is the 2nd child of its
# parent, inside a table.
# NOTE(review): `table` is rebound to a single <td> here, not a table element.
table = soup.select_one('table tr:nth-child(2) td')
print(table)

<td class="col-1" style="text-align: center;">1013</td>


In [30]:
# select() returns every <td> matching the same selector; print each cell's text.
# NOTE(review): `table` holds a list of <td> elements here.
table = soup.select('table tr:nth-child(2) td')
for td in table:
    print(td.string)

1013
리눅스 글쓰기
홍길동
2020-10-27
3


### 데이터 프레임으로 만들기

In [31]:
import re

# Collect one list per column from the post rows of the page held in `soup`.
#
# The title cell may carry a bracketed number such as "[3]", which this cell
# records as the reply count. The pattern accepts digits only, so other
# bracketed text in a title is neither removed nor mistaken for a count (the
# former greedy r"\[(.*)\]" matched from the first "[" to the last "]").
# It is compiled once instead of once per row.
reply_pattern = re.compile(r"\[\s*(\d+)\s*\]")

trs = soup.find_all('tr')
bids = []; titles = []; names = []
times = []; view_counts = []; reply_counts = []
for tr in trs[1:]:  # the first <tr> is skipped (treated as the header row)
    tds = tr.find_all('td')
    if len(tds) < 5:
        # Not a post row (fewer cells than the five columns read below).
        continue

    title_text = tds[1].get_text()
    matches = reply_pattern.search(title_text)
    if matches is None:
        reply_count = 0
    else:
        # Remove exactly the span that was counted, and store the count as an
        # int so the column has a single type (the fallback is the int 0).
        title_text = title_text[:matches.start()] + title_text[matches.end():]
        reply_count = int(matches.group(1))

    bids.append(tds[0].get_text())
    # strip() drops the whitespace left behind where the marker was removed.
    titles.append(title_text.strip())
    names.append(tds[2].get_text())
    times.append(tds[3].get_text())
    view_counts.append(tds[4].get_text())
    reply_counts.append(reply_count)

In [32]:
import pandas as pd

# Assemble the collected column lists into one table; the keyword order below
# determines the column order.
bbs = pd.DataFrame(dict(
    bid=bids,
    title=titles,
    name=names,
    time=times,
    view_count=view_counts,
    reply_count=reply_counts,
))
bbs

Unnamed: 0,bid,title,name,time,view_count,reply_count
0,1013,리눅스 글쓰기,홍길동,2020-10-27,3,0
1,1012,마이크로소프트 엣지에서의 동작,김은숙,2020-10-23,14,2
2,1011,역적 - 백성을 훔친 도적,홍길동,2020-10-23,4,1
3,1010,홍길동전,홍길동,2020-10-23,3,0
4,1009,대조영,대조영,2020-10-23,9,2
5,1008,Linux server를 원격 Windows에서 접속하여 글쓰기,관리자,2020-10-23,7,3
6,1007,ubuntu에서 글쓰기,관리자,2020-10-22,5,1
7,1006,슬기로운 의사생활,이우정,2020-10-22,12,3
8,1005,파리의 연인,김은숙,2020-10-22,9,4
9,1004,시크릿 가든,김은숙,2020-10-22,4,0


### 페이지 넘버 크롤링

In [62]:
# Put the URL of the page you want to fetch in web_url.
# web_url='https://naver.com'
# The list URL is split into a fixed base and a page number so that other
# pages can be requested by changing only page_url.
base_url = 'http://200.1.220.217:3000/bbs/list/'
page_url = '1'
web_url = base_url + page_url
# The timeout (seconds) keeps the cell from blocking indefinitely when the
# host does not respond.
with urllib.request.urlopen(web_url, timeout=10) as response:
    html = response.read()
# The body is fully read at this point, so parsing does not need the open
# connection.
soup = BeautifulSoup(html, 'html.parser')

In [87]:
# Read the number of pages from the pagination bar of the page in `soup`.
# The second-to-last item is used as the last page number (the final item is
# presumably a "next" control - verify against the page markup). A literal
# "(current)" marker is removed from the label before converting it to int.
# The pattern is a raw string so that "\(" is not an invalid string escape.
li = soup.select('.pagination li')
page_num = re.sub(pattern=r'\(current\)', repl='', string=li[-2].get_text())
page = int(page_num)

# A bracketed number in the title cell, e.g. "[3]", is recorded as the reply
# count. The pattern accepts digits only, so other bracketed text in a title
# is neither removed nor mistaken for a count. It is compiled once, outside
# the loops.
reply_pattern = re.compile(r"\[\s*(\d+)\s*\]")

bids = []; titles = []; names = []
times = []; view_counts = []; reply_counts = []

for i in range(page):

    # Pages are numbered from 1.
    page_url = str(i + 1)
    web_url = base_url + page_url

    # The timeout (seconds) keeps one unresponsive request from stalling the
    # whole crawl; the connection is closed before parsing starts.
    with urllib.request.urlopen(web_url, timeout=10) as response:
        html = response.read()
    soup = BeautifulSoup(html, 'html.parser')

    trs = soup.find_all('tr')

    for tr in trs[1:]:  # the first <tr> is skipped (treated as the header row)
        tds = tr.find_all('td')
        if len(tds) < 5:
            # Not a post row (fewer cells than the five columns read below).
            continue

        title = tds[1].get_text()
        matches = reply_pattern.search(title)
        if matches is None:
            reply_count = 0
        else:
            # Remove exactly the span that was counted, and store the count as
            # an int so the column has a single type.
            title = title[:matches.start()] + title[matches.end():]
            reply_count = int(matches.group(1))

        bids.append(tds[0].get_text())
        # strip() drops the whitespace left behind where the marker was removed.
        titles.append(title.strip())
        names.append(tds[2].get_text())
        times.append(tds[3].get_text())
        view_counts.append(tds[4].get_text())
        reply_counts.append(reply_count)

In [88]:
import pandas as pd

# Build the table from the columns gathered across all pages and use the post
# id ('bid') as the row index.
bbs = pd.DataFrame(dict(
    bid=bids,
    title=titles,
    name=names,
    time=times,
    view_count=view_counts,
    reply_count=reply_counts,
)).set_index('bid')
bbs

Unnamed: 0_level_0,title,name,time,view_count,reply_count
bid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1013,리눅스 글쓰기,홍길동,2020-10-27,3,0
1012,마이크로소프트 엣지에서의 동작,김은숙,2020-10-23,14,2
1011,역적 - 백성을 훔친 도적,홍길동,2020-10-23,4,1
1010,홍길동전,홍길동,2020-10-23,3,0
1009,대조영,대조영,2020-10-23,9,2
1008,Linux server를 원격 Windows에서 접속하여 글쓰기,관리자,2020-10-23,7,3
1007,ubuntu에서 글쓰기,관리자,2020-10-22,5,1
1006,슬기로운 의사생활,이우정,2020-10-22,12,3
1005,파리의 연인,김은숙,2020-10-22,9,4
1004,시크릿 가든,김은숙,2020-10-22,4,0
