# Accessing Web Data with Python

## Chapter 12: Networked Programs

In [5]:
# The world’s simplest web browser
#
# Opens a raw TCP connection to data.pr4e.org on port 80, sends a minimal
# HTTP/1.0 GET request by hand, and prints everything the server sends back
# (response headers followed by the document body).

import codecs
import socket

# The `with` block closes the socket even if connect/send/recv raises.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as mysock:
    mysock.connect(('data.pr4e.org', 80))
    cmd = 'GET http://data.pr4e.org/romeo.txt HTTP/1.0\r\n\r\n'.encode()
    # "\r\n" signifies an EOL (end of line), so "\r\n\r\n" signifies nothing
    # between two EOL sequences. That is the equivalent of a blank line,
    # which marks the end of the request.
    # sendall() keeps writing until the whole request has been handed to the
    # OS; plain send() is allowed to transmit only part of the buffer.
    mysock.sendall(cmd)

    # recv() cuts the byte stream at arbitrary positions, so a multi-byte
    # UTF-8 character may be split across two chunks. The incremental decoder
    # holds back the incomplete tail until the rest arrives.
    decoder = codecs.getincrementaldecoder('utf-8')()

    while True:
        data = mysock.recv(512)
        if len(data) < 1:
            # An empty read means the server closed the connection.
            break

        print(decoder.decode(data), end='')

    # Flush the decoder; raises if the stream ended in the middle of a character.
    print(decoder.decode(b'', final=True), end='')

HTTP/1.1 200 OK
Date: Mon, 23 May 2022 12:22:52 GMT
Server: Apache/2.4.18 (Ubuntu)
Last-Modified: Sat, 13 May 2017 11:22:22 GMT
ETag: "a7-54f6609245537"
Accept-Ranges: bytes
Content-Length: 167
Cache-Control: max-age=0, no-cache, no-store, must-revalidate
Pragma: no-cache
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Connection: close
Content-Type: text/plain

But soft what light through yonder window breaks
It is the east and Juliet is the sun
Arise fair sun and kill the envious moon
Who is already sick and pale with grief


In [6]:
# Same hand-written HTTP/1.0 request as a browser would send, this time for
# intro-short.txt: connect, send the GET line, print the raw response.

import codecs
import socket

# The `with` block closes the socket even if connect/send/recv raises.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as mysock:
    mysock.connect(('data.pr4e.org', 80))
    cmd = 'GET http://data.pr4e.org/intro-short.txt HTTP/1.0\r\n\r\n'.encode()
    # "\r\n" signifies an EOL (end of line), so "\r\n\r\n" signifies nothing
    # between two EOL sequences. That is the equivalent of a blank line,
    # which marks the end of the request.
    # sendall() keeps writing until the whole request has been handed to the
    # OS; plain send() is allowed to transmit only part of the buffer.
    mysock.sendall(cmd)

    # recv() cuts the byte stream at arbitrary positions, so a multi-byte
    # UTF-8 character may be split across two chunks. The incremental decoder
    # holds back the incomplete tail until the rest arrives.
    decoder = codecs.getincrementaldecoder('utf-8')()

    while True:
        data = mysock.recv(512)
        if len(data) < 1:
            # An empty read means the server closed the connection.
            break

        print(decoder.decode(data), end='')

    # Flush the decoder; raises if the stream ended in the middle of a character.
    print(decoder.decode(b'', final=True), end='')

HTTP/1.1 200 OK
Date: Mon, 23 May 2022 14:57:53 GMT
Server: Apache/2.4.18 (Ubuntu)
Last-Modified: Sat, 13 May 2017 11:22:22 GMT
ETag: "1d3-54f6609240717"
Accept-Ranges: bytes
Content-Length: 467
Cache-Control: max-age=0, no-cache, no-store, must-revalidate
Pragma: no-cache
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Connection: close
Content-Type: text/plain

Why should you learn to write programs?

Writing programs (or programming) is a very creative 
and rewarding activity.  You can write programs for 
many reasons, ranging from making your living to solving
a difficult data analysis problem to having fun to helping
someone else solve a problem.  This book assumes that 
everyone needs to know how to program, and that once 
you know how to program you will figure out what you want 
to do with your newfound skills.  


In [12]:
# Retrieving Web Pages
#
# urllib handles the HTTP exchange and strips the headers, so iterating over
# the response yields only the lines of the document body.

import urllib.request, urllib.parse, urllib.error

# The response wraps an open network connection; the `with` block releases it
# as soon as the last line has been printed (or if an error occurs).
with urllib.request.urlopen('http://data.pr4e.org/romeo.txt') as fhand:
    for line in fhand:
        # Lines arrive as bytes and keep their trailing newline, hence
        # decode() and end=''.
        print(line.decode(), end='')

But soft what light through yonder window breaks
It is the east and Juliet is the sun
Arise fair sun and kill the envious moon
Who is already sick and pale with grief


In [17]:
# Count how often each whitespace-separated word occurs in romeo.txt.

import urllib.request, urllib.parse, urllib.error

fhand = urllib.request.urlopen('http://data.pr4e.org/romeo.txt')

# Maps word -> number of occurrences, in order of first appearance.
counts = {}

for raw_line in fhand:
    # The response yields bytes; decode before splitting into words.
    for token in raw_line.decode().split():
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1

print(counts)

{'But': 1, 'soft': 1, 'what': 1, 'light': 1, 'through': 1, 'yonder': 1, 'window': 1, 'breaks': 1, 'It': 1, 'is': 3, 'the': 3, 'east': 1, 'and': 3, 'Juliet': 1, 'sun': 2, 'Arise': 1, 'fair': 1, 'kill': 1, 'envious': 1, 'moon': 1, 'Who': 1, 'already': 1, 'sick': 1, 'pale': 1, 'with': 1, 'grief': 1}


In [46]:
# Parsing Web Pages

# Install BeautifulSoup: pip install beautifulsoup4


# Get all the links from a page:
from urllib.request import urlopen
from bs4 import BeautifulSoup

url = 'http://www.dr-chuck.com/page1.htm'
# Call urlopen through the name imported above. The fully qualified
# urllib.request.urlopen only resolves if some other cell happened to run
# `import urllib.request` first, so it breaks on a fresh kernel.
html = urlopen(url).read()
soup = BeautifulSoup(html, 'html.parser')
# Show the parsed document before listing its links.
print(soup)
# Calling the soup object with a tag name returns every matching tag.
tags = soup('a')
for tag in tags:
    # get() returns None instead of raising when an anchor has no href.
    print(tag.get('href', None))

<h1>The First Page</h1>
<p>
If you like, you can switch to the 
<a href="http://www.dr-chuck.com/page2.htm">
Second Page</a>.
</p>

http://www.dr-chuck.com/page2.htm


In [62]:
# Assignment: Scraping HTML Data with BeautifulSoup
#
# Downloads the comments page, reads the number stored in every <span> tag,
# and reports how many there are and what they add up to.

from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

# Build an SSL context that skips hostname and certificate checks.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = 'http://py4e-data.dr-chuck.net/comments_1559329.html'
html = urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, "html.parser")

# Collect every <span> tag; the first child of each is parsed as an integer.
tags = soup('span')
total = sum(int(span.contents[0]) for span in tags)

print(f'Count {len(tags)}')
print(f'Sum {total}')

Count 50
Sum 2440


In [10]:
# Assignment: Following Links in HTML Using BeautifulSoup
#
# Starting from `url`, repeatedly fetch the page, pick the anchor at the
# 1-based `position`, and follow its href. The text of the anchor chosen on
# the `count`-th page is remembered and printed at the end.

import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Build an SSL context that skips hostname and certificate checks.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Interactive alternative to the hard-coded values below:
# url = input('Enter URL: ')
# count = int(input('Enter count: '))
# position = int(input('Enter position: '))
url = 'http://py4e-data.dr-chuck.net/known_by_Fikret.html'
count = 4
position = 3

last_name = ''

# count + 1 pages are fetched: the starting page plus one per followed link.
for step in range(count + 1):
    print(f'Retrieving: {url}')
    page = urllib.request.urlopen(url, context=ctx).read()
    anchors = BeautifulSoup(page, 'html.parser')('a')

    # `position` is 1-based, list indexing is 0-based.
    chosen = anchors[position - 1]

    if step == count - 1:
        # The link text on the count-th page is the answer to report.
        last_name = chosen.contents[0]

    url = chosen.get('href', None)

print(last_name)


Retrieving: http://py4e-data.dr-chuck.net/known_by_Fikret.html
Retrieving: http://py4e-data.dr-chuck.net/known_by_Montgomery.html
Retrieving: http://py4e-data.dr-chuck.net/known_by_Mhairade.html
Retrieving: http://py4e-data.dr-chuck.net/known_by_Butchi.html
Retrieving: http://py4e-data.dr-chuck.net/known_by_Anayah.html
Anayah


## Chapter 13: Using Web Services

### XML

In [25]:
# Parsing XML
#
# Parse a small XML document held in a string and print one element's text
# plus a few attribute values.

import xml.etree.ElementTree as ET

data = '''
<person>
    <name>Vitor</name>
    <phone type="intl">+55 19 123456789</phone>
    <email hide="yes"/>
</person>
'''

# fromstring() returns the root element (<person>).
tree = ET.fromstring(data)

# Look up <phone> once; both its text and its attribute are reported.
phone = tree.find('phone')

report = [
    ('Name:', tree.find('name').text),
    ('Phone:', phone.text),
    ('Phone Attr:', phone.get('type')),
    ('Email Attr:', tree.find('email').get('hide')),
]
for label, value in report:
    print(label, value)

Name: Vitor
Phone: +55 19 123456789
Phone Attr: intl
Email Attr: yes


In [98]:
# Assignment: Extracting Data from XML
#
# Downloads an XML document from a URL typed by the user, finds every
# <count> element at any depth, and prints how many there are and their sum.

import urllib.request, urllib.parse, urllib.error
import xml.etree.ElementTree as ET
import ssl

# Build an SSL context that skips hostname and certificate checks.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter location: ')
# Sample location for quick runs:
# url = 'http://py4e-data.dr-chuck.net/comments_42.xml'
print(f'Retrieving {url}')

xml = urllib.request.urlopen(url, context=ctx).read()
# len() is taken on the raw response body as returned by read().
print(f'Retrieved {len(xml)} characters')

tree = ET.fromstring(xml)
# './/count' matches <count> elements anywhere below the root.
counts = tree.findall('.//count')
total = sum(int(node.text) for node in counts)


print(f'Count: {len(counts)}')
print(f'Sum: {total}')


Retrieving http://py4e-data.dr-chuck.net/comments_305727.xml
Retrieved 4205 characters
Count: 50
Sum: 2549


### JSON

In [88]:
# Parsing JSON
#
# Decode a JSON object held in a string into nested Python dictionaries and
# print a few of its fields.

import json

data = '''
{
  "name" : "Chuck",
  "phone" : {
    "type" : "intl",
    "number" : "+1 734 303 4456"
   },
   "email" : {
     "hide" : "yes"
   }
}
'''

info = json.loads(data)

# Nested objects come back as nested dicts, so fields are reached by chained keys.
phone_number = info["phone"]["number"]
email_hidden = info["email"]["hide"]

for label, value in (
    ('Name:', info["name"]),
    ('Phone:', phone_number),
    ('Hide:', email_hidden),
):
    print(label, value)

Name: Chuck
Phone: +1 734 303 4456
Hide: yes


In [96]:
# Parsing JSON
#
# Decode a JSON array of objects into a list of dictionaries and print each
# record under a separator line.

import json

data = '''
[
  { "id" : "001",
    "x" : "2",
    "name" : "Chuck"
  },
  { "id" : "009",
    "x" : "7",
    "name" : "Brent"
  }
]'''

info = json.loads(data)
print('User count:', len(info))

for record in info:
    # One print call per record; sep='\n' puts each field on its own line.
    print(
        "-----",
        f"Name: {record['name']}",
        f"Id: {record['id']}",
        f"Attribute: {record['x']}",
        sep='\n',
    )

User count: 2
-----
Name: Chuck
Id: 001
Attribute: 2
-----
Name: Brent
Id: 009
Attribute: 7


### APIs

In [99]:
# Calling a JSON API
#
# Downloads a JSON document from a URL typed by the user and sums the
# "count" field of every entry in its "comments" list.

import urllib.request, urllib.parse, urllib.error
import json

url = input('Enter location: ')
# Sample location for quick runs:
# url = 'http://py4e-data.dr-chuck.net/comments_42.json'
print(f'Retrieving {url}')

data = urllib.request.urlopen(url).read()
# len() is taken on the raw response body as returned by read().
print(f'Retrieved {len(data)} characters')

info = json.loads(data)
comments = info["comments"]
print(f'Count: {len(comments)}')

# int() accepts the count whether the JSON stores it as a number or a string.
total = sum(int(entry['count']) for entry in comments)
print(f'Sum: {total}')

Retrieving http://py4e-data.dr-chuck.net/comments_305728.json
Retrieved 2731 characters
Count: 50
Sum: 2546


In [115]:
# Calling a JSON API
#
# Repeatedly asks for a location, sends it to the geocoding-style service at
# `serviceurl`, and prints the place_id of the first result. An empty input
# line ends the loop.

import urllib.request, urllib.parse, urllib.error
import json
import ssl

# Build an SSL context that skips hostname and certificate checks.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Setting api_key to False omits the 'key' query parameter entirely.
api_key = 42
serviceurl = 'http://py4e-data.dr-chuck.net/json?'

while True:
    address = input('Enter location: ')
    if len(address) < 1: break

    parms = dict()
    parms['address'] = address
    if api_key is not False: parms['key'] = api_key
    # urlencode() escapes spaces and punctuation in the address.
    url = serviceurl + urllib.parse.urlencode(parms)
    print(f'Retrieving {url}')

    # Close the HTTP response as soon as the body has been read.
    with urllib.request.urlopen(url, context=ctx) as uh:
        data = uh.read().decode()
    print(f'Retrieved {len(data)} characters')

    # Only a malformed body is treated as "no data"; anything else
    # (including Ctrl-C) is allowed to propagate.
    try:
        js = json.loads(data)
    except json.JSONDecodeError:
        js = None

    # Reject non-object payloads, a missing or non-OK status, and an empty
    # result list, so the lookup below cannot crash the loop.
    if (
        not isinstance(js, dict)
        or js.get('status') != 'OK'
        or not js.get('results')
    ):
        print('==== Failure To Retrieve ====')
        print(data)
        continue

    # print(json.dumps(js, indent=4))

    place_id = js['results'][0]['place_id']
    print(f'Place id {place_id}')

Retrieving http://py4e-data.dr-chuck.net/json?address=unicamp&key=42
Retrieved 2116 characters
Place id ChIJtU3SBbDGyJQRltcEzw5128Y
