# From "Using Web Services" PY4e

## Scraping a web page for its data using sockets

In [4]:
import socket

# Fetch romeo.txt over a raw TCP socket by writing the HTTP/1.0 request by hand.
# The `with` block guarantees the socket is closed even if connect/send/recv raises.
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as mysock:
    mysock.connect(('data.pr4e.org', 80))
    cmd = 'GET http://data.pr4e.org/romeo.txt HTTP/1.0\r\n\r\n'.encode()
    # sendall() keeps writing until the whole request is sent; send() may stop short.
    mysock.sendall(cmd)

    # Collect the raw bytes until the server closes the connection (empty recv).
    chunks = []
    while True:
        data = mysock.recv(512)
        if len(data) < 1:
            break
        chunks.append(data)

# Decode once over the full response: decoding each 512-byte chunk separately
# can fail when a multi-byte UTF-8 character straddles a chunk boundary.
print(b''.join(chunks).decode(), end='')

HTTP/1.1 200 OK
Date: Sun, 12 Apr 2020 06:01:54 GMT
Server: Apache/2.4.18 (Ubuntu)
Last-Modified: Sat, 13 May 2017 11:22:22 GMT
ETag: "a7-54f6609245537"
Accept-Ranges: bytes
Content-Length: 167
Cache-Control: max-age=0, no-cache, no-store, must-revalidate
Pragma: no-cache
Expires: Wed, 11 Jan 1984 05:00:00 GMT
Connection: close
Content-Type: text/plain

But soft what light through yonder window breaks
It is the east and Juliet is the sun
Arise fair sun and kill the envious moon
Who is already sick and pale with grief


## Using urllib instead

In [14]:
import urllib.request, urllib.parse, urllib.error

# urlopen() performs the HTTP request; iterating the response yields one
# bytes line at a time. The `with` block closes the connection when the loop
# finishes or if decoding raises.
with urllib.request.urlopen('http://toluwee.github.io') as fhand:
    for line in fhand:
        print(line.decode().strip())

<!DOCTYPE html>
<html lang="en"><head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1"><!-- Begin Jekyll SEO tag v2.6.1 -->
<title>My Machine Learning Journey | Toluwee’s Blog</title>
<meta name="generator" content="Jekyll v3.8.5" />
<meta property="og:title" content="My Machine Learning Journey" />
<meta property="og:locale" content="en_US" />
<meta name="description" content="I hope to document my personal advancement journey in Machine learning, Data Science and Artificial Intelligence training! All the best to me o!!" />
<meta property="og:description" content="I hope to document my personal advancement journey in Machine learning, Data Science and Artificial Intelligence training! All the best to me o!!" />
<link rel="canonical" href="https://toluwee.github.io/" />
<meta property="og:url" content="https://toluwee.github.io/" />
<meta property="og:site_name" content="Toluwee’s Blog" />


## Counting word frequency

In [12]:
import urllib.request, urllib.parse, urllib.error

# Tally how often each whitespace-separated token appears in the page.
# Tokens are not cleaned, so HTML fragments such as '<meta' are counted as words.
counts = dict()
# The `with` block closes the HTTP response once every line has been read.
with urllib.request.urlopen('http://toluwee.github.io/') as fhand:
    for line in fhand:
        words = line.decode().split()
        for word in words:
            # get() supplies 0 the first time a token is seen.
            counts[word] = counts.get(word, 0) + 1
print(counts)


{'<!DOCTYPE': 1, 'html>': 1, '<html': 1, 'lang="en"><head>': 1, '<meta': 10, 'charset="utf-8">': 1, 'http-equiv="X-UA-Compatible"': 1, 'content="IE=edge">': 1, 'name="viewport"': 1, 'content="width=device-width,': 1, 'initial-scale=1"><!--': 1, 'Begin': 1, 'Jekyll': 2, 'SEO': 2, 'tag': 2, 'v2.6.1': 1, '-->': 2, '<title>My': 1, 'Machine': 9, 'Learning': 5, 'Journey': 1, '|': 1, 'Toluwee’s': 1, 'Blog</title>': 1, 'name="generator"': 1, 'content="Jekyll': 1, 'v3.8.5"': 1, '/>': 11, 'property="og:title"': 1, 'content="My': 1, 'Journey"': 1, 'property="og:locale"': 1, 'content="en_US"': 1, 'name="description"': 1, 'content="I': 2, 'hope': 4, 'to': 10, 'document': 4, 'my': 5, 'personal': 4, 'advancement': 4, 'journey': 5, 'in': 4, 'learning,': 4, 'Data': 4, 'Science': 4, 'and': 5, 'Artificial': 4, 'Intelligence': 4, 'training!': 4, 'All': 4, 'the': 4, 'best': 4, 'me': 4, 'o!!"': 2, 'property="og:description"': 1, '<link': 3, 'rel="canonical"': 1, 'href="https://toluwee.github.io/"': 1, 'prop

# New Section

## Extracting links using urllib and BeautifulSoup


In [7]:
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
# NOTE(review): this turns off hostname and certificate verification for the
# request below; fine for a classroom exercise, not for fetching data you
# need to trust.
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
# Read the whole page inside `with` so the connection is closed before parsing.
with urllib.request.urlopen(url, context=ctx) as response:
    html = response.read()
soup = BeautifulSoup(html, 'html.parser')

# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
    # Anchors without an href attribute print as None.
    print(tag.get('href', None))

Enter - https://toluwee.github.io/
/
/about.html
/
https://www.fast.ai
/2020/03/28/My-Journey-So-Far.html
/2020/01/14/welcome.html
/feed.xml
https://github.com/toluwee


In [1]:
import urllib.request, urllib.parse, urllib.error
import json
import ssl

# Leave api_key as False to use the PY4E service; assign a real key on the
# commented line below to use the Google geocoding endpoint instead.
api_key = False
# If you have a Google Places API key, enter it here
# api_key = 'AIzaSy___IDByT70'
# https://developers.google.com/maps/documentation/geocoding/intro

# Choose the endpoint that matches the key. Without a real key, the PY4E
# service is used and api_key is replaced by the value 42.
if api_key is not False:
    serviceurl = 'https://maps.googleapis.com/maps/api/geocode/json?'
else:
    api_key = 42
    serviceurl = 'http://py4e-data.dr-chuck.net/json?'

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
# check_hostname must be switched off before verify_mode can be set to CERT_NONE.
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

# Prompt for locations until an empty line is entered; for each one, query the
# geocoding service and print the JSON reply, coordinates and formatted address.
while True:
    address = input('Enter location: ')
    if len(address) < 1: break

    # Build the query string; urlencode() escapes spaces and punctuation.
    parms = dict()
    parms['address'] = address
    if api_key is not False: parms['key'] = api_key
    url = serviceurl + urllib.parse.urlencode(parms)

    print('Retrieving', url)
    # The `with` block closes the HTTP response as soon as the body is read.
    with urllib.request.urlopen(url, context=ctx) as uh:
        data = uh.read().decode()
    print('Retrieved', len(data), 'characters')

    # Catch only malformed JSON; a bare `except:` would also swallow
    # KeyboardInterrupt and SystemExit.
    try:
        js = json.loads(data)
    except json.JSONDecodeError:
        js = None

    # Treat an unparsable reply, a missing status, or a non-OK status as failure.
    if not js or 'status' not in js or js['status'] != 'OK':
        print('==== Failure To Retrieve ====')
        print(data)
        continue

    print(json.dumps(js, indent=4))

    # Only the first result is reported.
    lat = js['results'][0]['geometry']['location']['lat']
    lng = js['results'][0]['geometry']['location']['lng']
    print('lat', lat, 'lng', lng)
    location = js['results'][0]['formatted_address']
    print(location)


KeyboardInterrupt: ignored

## twurl

In [0]:
import urllib.request, urllib.parse, urllib.error
import oauth
import hidden

# https://apps.twitter.com/
# Create App and get the four strings, put them in hidden.py

def augment(url, parameters):
    """Return `url` with `parameters` and an OAuth signature added.

    The four credential strings come from hidden.oauth(). A GET request is
    built for `url` with `parameters`, signed using the
    OAuthSignatureMethod_HMAC_SHA1 method object, and returned via to_url().
    """
    creds = hidden.oauth()

    # Wrap the credential strings in the consumer and token objects.
    consumer = oauth.OAuthConsumer(
        creds['consumer_key'],
        creds['consumer_secret'],
    )
    token = oauth.OAuthToken(creds['token_key'], creds['token_secret'])

    signed_request = oauth.OAuthRequest.from_consumer_and_token(
        consumer,
        token=token,
        http_method='GET',
        http_url=url,
        parameters=parameters,
    )
    signature_method = oauth.OAuthSignatureMethod_HMAC_SHA1()
    signed_request.sign_request(signature_method, consumer, token)
    return signed_request.to_url()


def test_me():
    """Fetch two tweets from the 'drchuck' timeline and print what comes back.

    Prints the signed URL produced by augment(), the raw response body, and
    the response headers as a dict. Returns nothing.
    """
    print('* Calling Twitter...')
    url = augment('https://api.twitter.com/1.1/statuses/user_timeline.json',
                  {'screen_name': 'drchuck', 'count': '2'})
    print(url)
    # The `with` block closes the HTTP connection even if read() raises.
    with urllib.request.urlopen(url) as connection:
        data = connection.read()
        print(data)
        headers = dict(connection.getheaders())
    print(headers)

## hidden

In [0]:
# Keep this file separate

# https://apps.twitter.com/
# Create new App and get the four strings

def oauth():
    """Return the four OAuth credential strings as a dict.

    The keys are 'consumer_key', 'consumer_secret', 'token_key' and
    'token_secret'. The values are the strings to replace with the ones
    from your own app.
    """
    credentials = dict(
        consumer_key="h7Lu...Ng",
        consumer_secret="dNKenAC3New...mmn7Q",
        token_key="10185562-eibxCp9n2...P4GEQQOSGI",
        token_secret="H0ycCFemmC4wyf1...qoIpBo",
    )
    return credentials