In [1]:
import re

import requests

In [2]:
# Fetch the example page once; the cells below inspect this response object.
url = 'http://www.example.com'
# requests has no default timeout, so without one a stalled server would
# block the kernel indefinitely.
resp = requests.get(url, timeout=10)

In [3]:
# Inspect the response metadata: the headers returned by the server,
# followed by the URL recorded on the response object.
print(resp.headers, resp.url, sep='\n')

{'Content-Encoding': 'gzip', 'Age': '306957', 'Cache-Control': 'max-age=604800', 'Content-Type': 'text/html; charset=UTF-8', 'Date': 'Tue, 19 Sep 2023 01:42:31 GMT', 'Etag': '"3147526947+gzip"', 'Expires': 'Tue, 26 Sep 2023 01:42:31 GMT', 'Last-Modified': 'Thu, 17 Oct 2019 07:18:26 GMT', 'Server': 'ECS (sab/5693)', 'Vary': 'Accept-Encoding', 'X-Cache': 'HIT', 'Content-Length': '648'}
http://www.example.com/


In [4]:
# Show the numeric HTTP status code and, on the next line, the boolean `ok` flag.
print(f'{resp.status_code}\n{resp.ok}')

200
True


**Common status codes**
- 200: Success
- 401: Unauthorized (authentication required)
- 403: Forbidden
- 404: Not Found
- 503: Service Unavailable

In [7]:
# `content` is an attribute (not a method) holding the raw response body as bytes,
# which makes it the right choice for binary payloads such as images and PDFs.
# Print the body, a 30-dash separator, and the type of the body.
print(resp.content, '-'*30, type(resp.content), sep='\n')

b'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    

In [8]:
# Turn the raw bytes into a str. The UTF-8 codec is spelled out here;
# it is also what bytes.decode() falls back to when no encoding is given.
resp.content.decode('utf-8')

'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    <

In [9]:
# `text` is an attribute (not a method) that gives the response body as a str,
# which is convenient when working with the page source itself.
resp.text

'<!doctype html>\n<html>\n<head>\n    <title>Example Domain</title>\n\n    <meta charset="utf-8" />\n    <meta http-equiv="Content-type" content="text/html; charset=utf-8" />\n    <meta name="viewport" content="width=device-width, initial-scale=1" />\n    <style type="text/css">\n    body {\n        background-color: #f0f0f2;\n        margin: 0;\n        padding: 0;\n        font-family: -apple-system, system-ui, BlinkMacSystemFont, "Segoe UI", "Open Sans", "Helvetica Neue", Helvetica, Arial, sans-serif;\n        \n    }\n    div {\n        width: 600px;\n        margin: 5em auto;\n        padding: 2em;\n        background-color: #fdfdff;\n        border-radius: 0.5em;\n        box-shadow: 2px 3px 7px 2px rgba(0,0,0,0.02);\n    }\n    a:link, a:visited {\n        color: #38488f;\n        text-decoration: none;\n    }\n    @media (max-width: 700px) {\n        div {\n            margin: 0 auto;\n            width: auto;\n        }\n    }\n    </style>    \n</head>\n\n<body>\n<div>\n    <

In [10]:
# resp.text is a plain str, so regular expressions can be applied to it directly.
# `re` is imported in the notebook's import cell at the top.
# The pattern collects every run of word characters delimited by word boundaries.
re.findall(r'\b\w+\b', resp.text)

['doctype',
 'html',
 'html',
 'head',
 'title',
 'Example',
 'Domain',
 'title',
 'meta',
 'charset',
 'utf',
 '8',
 'meta',
 'http',
 'equiv',
 'Content',
 'type',
 'content',
 'text',
 'html',
 'charset',
 'utf',
 '8',
 'meta',
 'name',
 'viewport',
 'content',
 'width',
 'device',
 'width',
 'initial',
 'scale',
 '1',
 'style',
 'type',
 'text',
 'css',
 'body',
 'background',
 'color',
 'f0f0f2',
 'margin',
 '0',
 'padding',
 '0',
 'font',
 'family',
 'apple',
 'system',
 'system',
 'ui',
 'BlinkMacSystemFont',
 'Segoe',
 'UI',
 'Open',
 'Sans',
 'Helvetica',
 'Neue',
 'Helvetica',
 'Arial',
 'sans',
 'serif',
 'div',
 'width',
 '600px',
 'margin',
 '5em',
 'auto',
 'padding',
 '2em',
 'background',
 'color',
 'fdfdff',
 'border',
 'radius',
 '0',
 '5em',
 'box',
 'shadow',
 '2px',
 '3px',
 '7px',
 '2px',
 'rgba',
 '0',
 '0',
 '0',
 '0',
 '02',
 'a',
 'link',
 'a',
 'visited',
 'color',
 '38488f',
 'text',
 'decoration',
 'none',
 'media',
 'max',
 'width',
 '700px',
 'div',
 'mar

In [11]:
# Send a browser-like User-Agent header and query-string parameters with the request.
# Both `headers` and `params` are passed as dictionaries.
agent = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
}

info = {
    'page' : '2',
    'language' : 'en'
}

# timeout keeps the cell from hanging indefinitely if the server never answers
resp2 = requests.get(url, headers = agent, params = info, timeout = 10)
# the params dictionary shows up as the query string of the requested URL
print(resp2.url)

http://www.example.com/?page=2&language=en


In [2]:
# parameter: verify
# verify=True (the default) makes requests check the server's TLS certificate.
# timeout keeps the cell from hanging indefinitely if the server never answers.
requests.get('https://www.amazon.com', verify=True, timeout=10)

# Status 503: the request was not successful.
# NOTE(review): this is presumably the site rejecting the default python-requests
# User-Agent rather than a certificate problem; a failed certificate check raises
# an SSLError instead of returning a status code.

<Response [503]>

In [3]:
# Retry with a browser-like User-Agent header.
# WARNING: verify=False switches off TLS certificate verification, which leaves the
# connection open to man-in-the-middle attacks and makes urllib3 emit an
# InsecureRequestWarning. It is shown here only to demonstrate the parameter.
# NOTE(review): the successful response most likely comes from the User-Agent header,
# not from verify=False; confirm by re-running with verify=True.
agent = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
}
# timeout keeps the cell from hanging indefinitely if the server never answers
requests.get('https://www.amazon.com', headers = agent, verify=False, timeout=10)



<Response [200]>

In [5]:
# Turn off the warning message, but only the InsecureRequestWarning that
# verify=False triggers, instead of muting every urllib3 warning.
requests.packages.urllib3.disable_warnings(
    requests.packages.urllib3.exceptions.InsecureRequestWarning
)

agent = {
    'User-Agent' : 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36'
}
# timeout keeps the cell from hanging indefinitely if the server never answers
requests.get('https://www.amazon.com', headers = agent, verify=False, timeout=10)

# Reuse the headers + timeout structure from now on, but keep verify=False (and the
# warning suppression) for throwaway experiments only: it disables TLS certificate
# verification, so leave verify at its default of True for any real work.

<Response [200]>