# Lesson 5

### Web Scraping with requests

https://pypi.org/project/requests/

https://docs.python-requests.org/en/master/


In [1]:
import requests

In [2]:
# Show the package-level help for requests: its docstring and package contents.
help(requests)

Help on package requests:

NAME
    requests

DESCRIPTION
    Requests HTTP Library
    ~~~~~~~~~~~~~~~~~~~~~
    
    Requests is an HTTP library, written in Python, for human beings.
    Basic GET usage:
    
       >>> import requests
       >>> r = requests.get('https://www.python.org')
       >>> r.status_code
       200
       >>> b'Python is a programming language' in r.content
       True
    
    ... or POST:
    
       >>> payload = dict(key1='value1', key2='value2')
       >>> r = requests.post('https://httpbin.org/post', data=payload)
       >>> print(r.text)
       {
         ...
         "form": {
           "key1": "value1",
           "key2": "value2"
         },
         ...
       }
    
    The other HTTP methods are supported - see `requests.api`. Full documentation
    is at <https://requests.readthedocs.io>.
    
    :copyright: (c) 2017 by Kenneth Reitz.
    :license: Apache 2.0, see LICENSE for more details.

PACKAGE CONTENTS
    __version__
    _internal_utils

In [3]:
# List the names available in the requests namespace (exceptions, classes,
# submodules and the HTTP helper functions such as get/post/put/delete).
dir(requests)

['ConnectTimeout',
 'ConnectionError',
 'HTTPError',
 'NullHandler',
 'PreparedRequest',
 'ReadTimeout',
 'Request',
 'RequestException',
 'Response',
 'Session',
 'Timeout',
 'TooManyRedirects',
 'URLRequired',
 '__author__',
 '__author_email__',
 '__build__',
 '__builtins__',
 '__cached__',
 '__cake__',
 '__copyright__',
 '__description__',
 '__doc__',
 '__file__',
 '__license__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__title__',
 '__url__',
 '__version__',
 '_check_cryptography',
 '_internal_utils',
 'adapters',
 'api',
 'auth',
 'certs',
 'chardet',
 'check_compatibility',
 'codes',
 'compat',
 'cookies',
 'delete',
 'exceptions',
 'get',
 'head',
 'hooks',
 'logging',
 'models',
 'options',
 'packages',
 'patch',
 'post',
 'put',
 'request',
 'session',
 'sessions',
 'ssl',
 'status_codes',
 'structures',
 'urllib3',
 'utils',

### An example based on "Text-only websites"


https://sjmulder.nl/en/textonly.html 
--> This is a directory of websites that primarily stick with simple, marked up, hyperlinked text.



In [4]:
# URL of the first example page to download.
page = "https://plumebio.com/"

In [5]:
# Send an HTTP GET request for `page`; `r` is the resulting Response object.
# Without a timeout, requests waits indefinitely on an unresponsive server;
# here it gives up if the server sends nothing for 10 seconds.
r = requests.get(page, timeout=10)

In [6]:
# Print the response body decoded as text (for this page, its HTML source).
print(r.text)

<!DOCTYPE html>
<html lang="en">
<head>
    <meta name="referrer" content="origin">
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <meta name="description" content="A simple and friendly personal bio to express who you are, what you do, and give others a safe way to contact you.">
    <meta name="author" content="PlumeBio">
    <title>PlumeBio - write your personal bio</title>
    <link rel="stylesheet" href="/style.css">
    <link rel="shortcut icon" href="/favicon.ico">

    </head>
<body>
    <div class="header">
        <a href="/" style="text-decoration: none;">
            <div class="logo">
                <svg class="logo-icon" height='20px' width='20px'  fill="#000000" xmlns="http://www.w3.org/2000/svg" viewBox="0 0 100 100" x="0px" y="0px"><path fill="#084468" d="M14,98H74A12,12,0,0,0,86,86V74a4,4,0,0,0-8,0V86a4,4,0,0,1-4,4H14a4,4,0,0,1-4-4V26a4,4,0,0,1,4-4H26a4,4,0,0,0,0-8H14A12,12,0,0,0,2,26V86A12,12,

In [None]:
# URL of the second example page; this rebinds `page` from the previous example.
page = "https://www.freesoft.org/CIE/Topics/57.htm"

In [None]:
# Send an HTTP GET request for `page`; this rebinds `r` to the new Response.
# Without a timeout, requests waits indefinitely on an unresponsive server;
# here it gives up if the server sends nothing for 10 seconds.
r = requests.get(page, timeout=10)

In [None]:
# URL of the response as reported by requests (the final location, so it can
# differ from `page` if the request was redirected).
r.url

In [None]:
# Keep the decoded response body in its own variable so it can be reused.
fullHtmlText = r.text

In [None]:
# Print the full text of the downloaded page.
print(fullHtmlText)

### An example based on the GitHub REST API


https://docs.github.com/en/rest/reference/users

In [None]:
# Default / example user, with HTTP basic authentication.
# 'user' / 'pass' are demo values only: never hard-code real credentials in a
# notebook (read them from the environment or prompt for them instead).
# The timeout stops the cell from hanging if the server sends nothing for 10 seconds.

r = requests.get('https://api.github.com/users/user', auth=('user', 'pass'), timeout=10)

https://github.com/user

Oops, she is a real human being ...
Well, choosing "user" as an alias is maybe not the best choice ever ;-D

In [None]:
# Response body as a string, shown as the cell's output.
r.text

In [None]:
# Numeric HTTP status code of the response (e.g. 200, 401, 404).
r.status_code

https://en.wikipedia.org/wiki/List_of_HTTP_status_codes

In [None]:
# Report the outcome for the status codes covered in this lesson. Any other
# code (for example 403 or 500) is printed explicitly instead of being
# silently ignored, so the cell always produces some output.
if r.status_code == 200:
    print('Success!')
elif r.status_code == 401:
    print('Unauthorized')
elif r.status_code == 404:
    print('Not Found')
else:
    print('Unexpected status code:', r.status_code)

In [None]:
# A Response object can be used directly as a condition: it is truthy when the
# request succeeded (status code below 400) and falsy otherwise.
print('Success!' if r else 'An error has occurred')

In [None]:
# Parse the response body as JSON and display the resulting Python object.
r.json()

Here the return value of `.json()` is a dictionary, so you can access its values by key.

In [None]:
# Parse the JSON body into a dictionary and look up the user's location.
# .get() is used instead of ['location'] because an error payload (such as the
# reply to a request with bad credentials) may not contain a 'location' key,
# and indexing would then raise KeyError and stop a "Run All".
json_response = r.json()
json_response.get('location')

In [None]:
# Text encoding requests uses to decode the body when accessing r.text.
r.encoding

In [None]:
# Response headers, exposed as a case-insensitive dictionary-like object.
r.headers

In [None]:
# Not every response carries a Last-Modified header (error responses often do
# not), so use .get(): it returns None when the header is absent instead of
# raising KeyError.
r.headers.get('Last-Modified')

In [None]:
# the end