# F28. Web and Crawling
## 1. Internet and Protocol

In [1]:
# Socket programming example: send a raw HTTP request over a TCP socket.
import socket
def main():
    """Send a minimal HTTP/1.1 GET to google.com:80 and print the reply.

    Only the first 4096 bytes received are decoded and printed.
    """
    # The context manager closes the socket even if connect/send/recv raises.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.connect(('google.com', 80))
        # HTTP header lines end with CRLF and HTTP/1.1 requires a Host header;
        # "Connection: close" asks the server not to keep the connection open.
        request = (
            'GET / HTTP/1.1\r\n'
            'Host: google.com\r\n'
            'Connection: close\r\n'
            '\r\n'
        ).encode()
        # sendall() keeps writing until the whole request is sent; send() may
        # transmit only part of the buffer.
        s.sendall(request)
        # errors='replace' avoids a crash if the 4096-byte cut splits a character.
        print(s.recv(4096).decode(errors='replace'))

main()

HTTP/1.1 301 Moved Permanently
Location: http://www.google.com/
Content-Type: text/html; charset=UTF-8
Date: Fri, 27 Aug 2021 00:46:53 GMT
Expires: Sun, 26 Sep 2021 00:46:53 GMT
Cache-Control: public, max-age=2592000
Server: gws
Content-Length: 219
X-XSS-Protection: 0
X-Frame-Options: SAMEORIGIN

<HTML><HEAD><meta http-equiv="content-type" content="text/html;charset=utf-8">
<TITLE>301 Moved</TITLE></HEAD><BODY>
<H1>301 Moved</H1>
The document has moved
<A HREF="http://www.google.com/">here</A>.
</BODY></HTML>



## 2. API 이용하기

In [11]:
import requests
import os
from urllib.parse import urlencode

# Read the OpenWeatherMap API key from the environment instead of committing it
# to the notebook. Set OPENWEATHER_API_KEY before running this cell.
myapi = os.environ.get("OPENWEATHER_API_KEY", "")

def makeurl(lat, lng):
    """Build the One Call request URL for a coordinate pair.

    Args:
        lat: Latitude.
        lng: Longitude (sent as the ``lon`` query parameter).

    Returns:
        The request URL as a string, with ``myapi`` as the ``appid`` value.
    """
    # urlencode() percent-escapes the values so they cannot break the query string.
    query = urlencode({"lat": lat, "lon": lng, "appid": myapi})
    return "https://api.openweathermap.org/data/2.5/onecall?" + query

# Request the weather for lat=37, lon=127 and decode the JSON body.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error status into an exception here instead
# of letting an error payload be displayed as if it were weather data.
response = requests.get(makeurl(37,127), timeout=10)
response.raise_for_status()
weather = response.json()
weather

{'lat': 37,
 'lon': 127,
 'timezone': 'Asia/Seoul',
 'timezone_offset': 32400,
 'current': {'dt': 1630029857,
  'sunrise': 1630011519,
  'sunset': 1630058940,
  'temp': 294.73,
  'feels_like': 295.32,
  'pressure': 1014,
  'humidity': 91,
  'dew_point': 293.2,
  'uvi': 0.73,
  'clouds': 90,
  'visibility': 6437,
  'wind_speed': 0,
  'wind_deg': 0,
  'weather': [{'id': 500,
    'main': 'Rain',
    'description': 'light rain',
    'icon': '10d'},
   {'id': 701, 'main': 'Mist', 'description': 'mist', 'icon': '50d'}],
  'rain': {'1h': 0.31}},
 'minutely': [{'dt': 1630029900, 'precipitation': 0.316},
  {'dt': 1630029960, 'precipitation': 0.316},
  {'dt': 1630030020, 'precipitation': 0.316},
  {'dt': 1630030080, 'precipitation': 0.316},
  {'dt': 1630030140, 'precipitation': 0.316},
  {'dt': 1630030200, 'precipitation': 0.316},
  {'dt': 1630030260, 'precipitation': 0.316},
  {'dt': 1630030320, 'precipitation': 0.316},
  {'dt': 1630030380, 'precipitation': 0.316},
  {'dt': 1630030440, 'precipi

## 3. Web Crawling 만들기

In [12]:
import urllib.request  # `import urllib` alone does not load the `request` submodule

def download(url):
    """Open ``url`` and return the response object produced by ``urlopen``."""
    return urllib.request.urlopen(url)

In [13]:
# 오류 코드 및 예외처리 코드
from urllib.error import URLError, HTTPError, ContentTooShortError

def download(url):
    """Open ``url`` with ``urllib.request.urlopen``.

    Returns the opened response, or ``None`` after printing the failure
    reason when one of the handled urllib errors is raised.
    """
    handled = (URLError, HTTPError, ContentTooShortError)
    try:
        return urllib.request.urlopen(url)
    except handled as err:
        print('Download error', err.reason)
    return None

# Try the helper on a live page; the cell displays the value download() returns.
download('https://www.google.com')

<http.client.HTTPResponse at 0x7f2b972f6b10>

In [14]:
# read() method - fetch the page content
def download(url):
    """Download ``url`` and return the raw response body.

    Returns:
        The body as ``bytes``, or ``None`` if the request failed (the reason
        is printed in that case).
    """
    try:
        # The context manager closes the response once the body has been read.
        with urllib.request.urlopen(url) as response:
            html = response.read()
    except (URLError, HTTPError, ContentTooShortError) as e:
        print('Download error', e.reason)
        html = None
    return html

In [15]:
# Fetch the HTML
download('https://www.google.com')

b'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en"><head><meta content="Search the world\'s information, including webpages, images, videos and more. Google has many special features to help you find exactly what you\'re looking for." name="description"><meta content="noodp" name="robots"><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/logos/doodles/2021/doodle-champion-island-games-august-26-6753651837109002-l.png" itemprop="image"><meta content="Doodle Champion Island Games!" property="twitter:title"><meta content="Feline up for a challenge? Help Lucky claim victory in the Doodle Champion Island Games! #GoogleDoodle " property="twitter:description"><meta content="Feline up for a challenge? Help Lucky claim victory in the Doodle Champion Island Games! #GoogleDoodle " property="og:description"><meta content="summary_large_image" property="twitter:card"><meta content="@GoogleDoodles" property="twitter:site"><meta content

In [16]:
# Download a web page
import requests
url = 'http://www.google.com'
# Without a timeout, requests waits indefinitely if the server stops responding.
response = requests.get(url, timeout=10)
response

<Response [200]>

In [17]:
def download2(url):
    """Fetch ``url`` with requests and return the decoded body text.

    Returns:
        ``response.text``, or ``None`` if the connection failed or timed out
        (a message is printed in that case).
    """
    try:
        # The timeout stops the call from blocking forever on a stalled server.
        response = requests.get(url, timeout=10)
        html = response.text
    except (requests.ConnectionError, requests.Timeout):
        # A timeout is reported the same way as a failed connection.
        print('Connection error')
        html = None
    return html

download2('https://www.google.com')

'<!doctype html><html itemscope="" itemtype="http://schema.org/WebPage" lang="en"><head><meta content="Search the world\'s information, including webpages, images, videos and more. Google has many special features to help you find exactly what you\'re looking for." name="description"><meta content="noodp" name="robots"><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"><meta content="/logos/doodles/2021/doodle-champion-island-games-august-26-6753651837109002-l.png" itemprop="image"><meta content="Doodle Champion Island Games!" property="twitter:title"><meta content="Feline up for a challenge? Help Lucky claim victory in the Doodle Champion Island Games! #GoogleDoodle " property="twitter:description"><meta content="Feline up for a challenge? Help Lucky claim victory in the Doodle Champion Island Games! #GoogleDoodle " property="og:description"><meta content="summary_large_image" property="twitter:card"><meta content="@GoogleDoodles" property="twitter:site"><meta content=

In [18]:
# Parse a web page
import requests
from bs4 import BeautifulSoup
# NOTE: despite its name, `html` holds the Response object; its .text is the markup.
# The timeout keeps the request from hanging forever on a stalled server.
html = requests.get('http://www.google.com', timeout=10)
soup = BeautifulSoup(html.text, 'html.parser')

In [19]:
# Walk the parse tree by tag name: the <body> element inside <html>.
soup.html.body

<body bgcolor="#fff"><script nonce="cZsezmtv/q2ONyzD4C6MaQ==">(function(){var src='/images/nav_logo229.png';var iesg=false;document.body.onload = function(){window.n && window.n();if (document.images){new Image().src=src;}
if (!iesg){document.f&&document.f.q.focus();document.gbqf&&document.gbqf.q.focus();}
}
})();</script><div id="mngb"><div id="gbar"><nobr><b class="gb1">Search</b> <a class="gb1" href="http://www.google.com/imghp?hl=en&amp;tab=wi">Images</a> <a class="gb1" href="http://maps.google.com/maps?hl=en&amp;tab=wl">Maps</a> <a class="gb1" href="https://play.google.com/?hl=en&amp;tab=w8">Play</a> <a class="gb1" href="http://www.youtube.com/?gl=US&amp;tab=w1">YouTube</a> <a class="gb1" href="https://news.google.com/?tab=wn">News</a> <a class="gb1" href="https://mail.google.com/mail/?tab=wm">Gmail</a> <a class="gb1" href="https://drive.google.com/?tab=wo">Drive</a> <a class="gb1" href="https://www.google.com/intl/en/about/products?tab=wh" style="text-decoration:none"><u>More</u>

In [20]:
# Parse an HTML snippet passed directly as a string, then access its <span> tag.
soup = BeautifulSoup("<span>Wow it's so good!!</span>", 'html.parser')
soup.span

<span>Wow it's so good!!</span>

In [21]:
# ''' and """ are used to write strings that span multiple lines
html='''<title>Fundamental</title> 
         <body>
          <p id='programming'>python</p> 
          <p id='programming'>java</p> 
          <p id='algorithm'>algorithm</p> 
          <p id='fundamental'>math</p> 
          <p id='programming'>C++</p> 
          </body>'''
soup = BeautifulSoup(html, 'html.parser')
# find_all() is the current method name; findAll() is a deprecated alias.
# A plain tag name is enough to match every <p> element.
soup.find_all('p')

[<p id="programming">python</p>,
 <p id="programming">java</p>,
 <p id="algorithm">algorithm</p>,
 <p id="fundamental">math</p>,
 <p id="programming">C++</p>]

In [23]:
# Keep only the <p> tags whose id attribute is 'programming'.
# find_all() is the current method name; findAll() is a deprecated alias.
soup.find_all('p', id='programming')

[<p id="programming">python</p>,
 <p id="programming">java</p>,
 <p id="programming">C++</p>]

In [32]:
import os

from selenium.webdriver.common.by import By

# NOTE: `webdriver`, `time` and `crawling_urls` are not defined in this cell;
# they must already exist in the kernel when it runs.

# For local users:
# wd_path = os.getenv('HOME')+'/aiffel/lib/chromedriver'
# driver = webdriver.Chrome(wd_path)   # launch the browser through chromedriver

# For cloud users:
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
# Current Selenium releases no longer accept the driver path as a positional
# argument; with only options given, Selenium locates chromedriver itself
# (for example on PATH).
driver = webdriver.Chrome(options=chrome_options)

try:
    # Start the web driver and navigate to the page
    driver.get(crawling_urls['산과공원'])      # go to the URL we want
    time.sleep(5)       # wait 5 seconds so the page can finish loading

    # Click the CSV download button.
    # find_element(By.CSS_SELECTOR, ...) replaces find_element_by_css_selector(),
    # which was removed in Selenium 4.
    driver.find_element(By.CSS_SELECTOR, "#btnCsv").click()
    time.sleep(3)     # wait 3 seconds for the download to complete
finally:
    driver.quit()      # always close the browser, even if a step above failed

In [33]:
# For local users:
# _dir = os.getenv('HOME')+'/다운로드'
#_dir = os.getenv('HOME')+'/Downloads'   # choose this path on an English-locale Ubuntu

# For cloud users:
# expanduser('~') returns $HOME when it is set and otherwise falls back to the
# account's home directory, whereas os.getenv('HOME') would return None.
_dir = os.path.expanduser('~')

# The following applies to both local and cloud users.
# glob returns matches in arbitrary order; sorting makes files[0] reproducible.
files = sorted(glob.glob('{}/서울시*.csv'.format(_dir)))
print(files)

['/aiffel/서울시 산과공원 생태관광 정보 (한국어).csv']


In [34]:
# Convert the CSV file into a DataFrame.
# If an encoding error occurs, pass the encoding option.
# NOTE(review): files[0] raises IndexError when the glob above matched nothing.
f_M_park = pd.read_csv(files[0], encoding='CP949')   # CP949: encoding used on Windows
f_M_park.head(3)

Unnamed: 0,키,명칭,대분류,주소,행정 시,행정 구,행정 동,대표전화,면적,지정일,교?안내
0,BE_IW14-0020,진관내동 생태경관보전지역,생태탐방,은평구 진관동 282-1번지 일대(북한산국립공원 북한산성 입구 주변 습지 ),서울특별시,은평구,진관동,02-2115-7550~5 02-350-1397,16639㎡,2002년 12월 30일,지하철 3호선 구파발역 1번 출구에서 704번 34번 버스를 타고 북한산성 입구에서...
1,BE_IW14-0109,안산공원,산과공원,서울특별시 서대문구 홍제동 산33번지 일대,서울특별시,서대문구,홍제1동,02-330-1395,,,
2,BE_IW14-0110,여의도공원,산과공원,서울특별시 영등포구 여의공원로68(여의도동 2번지),서울특별시,영등포구,여의동,02-761-4079,,,
