### HTTP messages are how data is exchanged between a server and a client. There are two types of messages: requests sent by the client to trigger an action on the server, and responses, the answer from the server.

The two most common HTTP methods are: GET and POST.

In [None]:
import requests

# Fetch the page with an HTTP GET. A timeout is passed so the cell raises
# requests.exceptions.Timeout instead of blocking forever when the server
# never answers (requests applies no timeout by default).
response = requests.get("http://yahoo.com.tw", timeout=10)
# Leaving the object as the last expression makes the notebook display it.
response

In [None]:
# Show the character encoding recorded on the response object.
response.encoding

`response.text` returns the content as a Unicode string (decoded using `response.encoding`)

response.content returns the content in bytes

In [None]:
# Display the raw response body as bytes.
response.content

### BeautifulSoup https://www.crummy.com/software/BeautifulSoup/

In [None]:
from bs4 import BeautifulSoup

# Build a parse tree from the downloaded page using the lxml parser, then
# print an indented rendering of that tree.
soup = BeautifulSoup(markup=response.text, features='lxml')
pretty_html = soup.prettify()
print(pretty_html)

In [None]:
# Small hand-written HTML document used as a fixed input for the
# BeautifulSoup lookups in the cells that follow.
html_doc = """
<html><head><title>Hello World</title></head>
<body><h2>Test Header</h2>
<p>This is a test.</p>
<table><tr><td>
<a id="link1" class="zzz" href="/my_link1">Link text</a>
</td><td>
<a id="link2" class="123" href="/my_link2">Link text</a>
</td></tr>
</table>
<p>Hello, <b class="boldtext">Bold text</b><b class="zzz">zzz text</b></p>
</body></html>
"""

# Parse the sample with the lxml parser and print the indented tree.
soup = BeautifulSoup(html_doc, 'lxml')
print(soup.prettify())

In [None]:
# find() returns the first tag that matches; here the first <a> element.
print(soup.find("a"))

In [None]:
# Attribute-style access: soup.a is shorthand for the first <a> tag.
print(soup.a)

In [None]:
# Navigate by tag name: the first <a> found underneath <html>.
print(soup.html.a)

In [None]:
# .text gives the text contained in the first <a> tag.
print(soup.find("a").text)

In [None]:
# .attrs is the dictionary of the tag's HTML attributes.
print(soup.find("a").attrs)

In [None]:
# Index a tag like a dict to read a single attribute value.
print(soup.find("a")["href"])

In [None]:
# find_all() collects every matching tag, not just the first one.
print(soup.find_all("a"))

In [None]:
# Collect every <a> tag, then print the string held by each one.
a_tags = soup.find_all('a')
for anchor in a_tags:
    link_text = anchor.string
    print(link_text)

In [None]:
# Print the href attribute of each <a> tag collected in a_tags.
for anchor in a_tags:
    href = anchor.get('href')
    print(href)


In [None]:
# Passing a list of names matches tags with any of those names.
wanted_names = ["a", "b"]
tags = soup.find_all(wanted_names)
print(tags)

In [None]:
# Find every tag, whatever its name, whose class is "zzz".
# The tag name is simply omitted: passing "" as the name only works as a
# wildcard by accident of how falsy names are handled, whereas filtering by
# attrs alone is the documented way to match on attributes only.
tags = soup.find_all(attrs={"class": "zzz"})
print(tags)

In [None]:
# Take the second <td> in the document and look up the first <a> inside it.
second_cell = soup.find_all("td")[1]
tags = second_cell.find("a")
print(tags)

In [None]:
# Find the text nodes whose content is exactly "Link text".
# `string=` is the current name of the argument formerly spelled `text=`
# (the old spelling is deprecated), and the empty-string tag name is dropped
# because searching by string alone does not need one.
tags = soup.find_all(string="Link text")
print(tags)

### Regular Expression

In [None]:
import re

# "a+b": one or more "a" characters followed by a single "b".
test_string = "aabb abb aab"
pattern = "a+b"
ans = re.compile(pattern).findall(test_string)
print(ans)

In [None]:
# "a*b+c": zero or more "a", then one or more "b", then a "c".
test_string = "find abbbbc, bc, skip c, acc"
pattern = "a*b+c"
ans = re.compile(pattern).findall(test_string)
print(ans)

In [None]:
# Extract every run of consecutive digits from the sentence.
test_string = "12 Drummers Drumming, 11 Pipers Piping, 10 Lords a Leaping"
pattern = "[0-9]+"
ans = re.compile(pattern).findall(test_string)
print(ans)

In [None]:
# Match "an" only when it is preceded by one of the letters c, m or f.
test_string = "find can, man, fan skip dan, ran, pan "
pattern = "[cmf]an"
ans = re.compile(pattern).findall(test_string)
print(ans)

In [None]:
# Matching a special character: any three characters followed by a literal
# dot. The pattern is a raw string so that the backslash reaches the regex
# engine untouched; "\." in a normal string literal is an invalid escape
# sequence that Python warns about.
test_string = "find 591., dot., yes., skip non! "
pattern = r".{3}\."
ans = re.findall(pattern, test_string)
print(ans)

In [None]:
# Alternation: "|" matches either the phrase on its left or on its right.
test_string = "find I love cats, I love dogs,  skip I love logs, I love cogs "
pattern = "I love cats|I love dogs"
ans = re.compile(pattern).findall(test_string)
print(ans)

### BeautifulSoup + Regular Expression

In [None]:
import re
import requests

# Download the listing page. The timeout makes the cell fail with
# requests.exceptions.Timeout instead of hanging if the site is unresponsive.
response = requests.get("http://yp.518.com.tw/service-life.html?ctf=10", timeout=10)
# Last expression: the notebook displays the response object.
response

In [None]:
from bs4 import BeautifulSoup

soup = BeautifulSoup(response.text, "lxml")
# Keep only the <li class="comp_loca"> items whose string matches the regex.
# `string=` replaces the deprecated `text=` keyword; the filter is the same.
matching_items = soup.find_all("li", {"class": "comp_loca"}, string=re.compile("新北"))
print(matching_items)

### Dynamic Website 

In [None]:
import requests
from bs4 import BeautifulSoup

# Form fields sent as the body of the POST request (placeholder names and
# values for demonstration).
form_data = {
    'name1': 'value1',
    'name2': 'value2',
}

# POST the form; the timeout keeps the cell from blocking indefinitely when
# the server does not respond.
response_post = requests.post(
    "http://mops.twse.com.tw/mops/web/t51sb01",
    data=form_data,
    timeout=10,
)
soup_post = BeautifulSoup(response_post.text, "lxml")
print(soup_post.prettify())

In [None]:
import re

# Print the text of every <option> inside the second <select> of the page.
selects = soup_post.find_all('select')
options = selects[1].find_all('option')
option_texts = [option.text for option in options]
for text in option_texts:
    print(text)

In [None]:
import requests
from bs4 import BeautifulSoup

# Form fields submitted with the POST request.
form_data = {
    "encodeURIComponent": "1",
    "step": "1",
    "firstin": "1",
    "TYPEK": "sii",
    "code": "01",
}

# POST the form; the timeout keeps the cell from blocking indefinitely when
# the server does not respond.
response_post = requests.post(
    "http://mops.twse.com.tw/mops/web/t51sb01",
    data=form_data,
    timeout=10,
)
soup_post = BeautifulSoup(response_post.text, "lxml")
print(soup_post.prettify())

In [None]:
# Print each row of the table at index 13 as one line of space-prefixed cell
# texts, then print the text of the first cell of the table at index 14.
tables = soup_post.find_all('table')
result_table = tables[13]
for tr in result_table.find_all('tr'):
    cell_texts = [td.text for td in tr.find_all('td')]
    # Every cell is preceded by a single space; the row ends with a newline
    # (print adds another, leaving a blank line between rows).
    line = "".join(" " + cell for cell in cell_texts) + "\n"
    print(line)
print(tables[14].td.text)

### Save scraped data to CSV

In [None]:
import re
import requests

# Download the listing page again. The timeout makes the cell fail with
# requests.exceptions.Timeout instead of hanging if the site is unresponsive.
response = requests.get("http://yp.518.com.tw/service-life.html?ctf=10", timeout=10)
# Last expression: display the encoding recorded on the response.
response.encoding

In [None]:
from bs4 import BeautifulSoup

# Parse the page and gather the text of every <li class="comp_tel"> item.
soup = BeautifulSoup(response.text, "lxml")
tel_items = soup.find_all("li", {"class": "comp_tel"})
name_phone = [item.text for item in tel_items]
name_phone

In [None]:
# Gather the text of every <li class="comp_loca"> item.
loca_items = soup.find_all("li", class_="comp_loca")
address = [item.text for item in loca_items]
address

In [None]:
# Extract one phone number per listing entry.
# Searching each entry separately (rather than the concatenation of all
# entries) keeps `phone` the same length and order as `name_phone`, and stops
# digits at the end of one entry from merging with digits at the start of the
# next. Entries with no match get an empty string so the positions still line
# up.
name_phone_str = "".join(name_phone)  # kept for any later cell that reads it
phone_pattern = re.compile("[0-9]{2}-[0-9]+")
phone = []
for entry in name_phone:
    found = phone_pattern.search(entry)
    phone.append(found.group() if found else "")
phone

In [None]:
# The name is whatever precedes the first "/" in each entry, with the
# surrounding whitespace removed.
name = [entry.partition("/")[0].strip() for entry in name_phone]
name

In [None]:
# pandas is imported here because nothing earlier in the notebook binds `pd`;
# without it this cell fails with a NameError.
import pandas as pd

# Assemble the three scraped lists into a table with one column each for
# name, address and phone, in that order.
df = pd.DataFrame(
    {"店名": name, "地址": address, "電話": phone},
    columns=["店名", "地址", "電話"],
)
df

In [None]:
import os

# to_csv does not create missing directories, so make sure the output folder
# exists before writing (a no-op when it is already there).
os.makedirs("csv_results", exist_ok=True)
df.to_csv("csv_results/518practice.csv", index=False, encoding="UTF-8")