In [30]:
import json
import re

import requests

import mysocket as sock

# Connection settings for the course web server used by the socket and
# requests examples in this notebook.
protocol = "http"
host = "hadoop.mathsci.denison.edu"
port = 80


def buildURL(s, protocol=protocol, host=host, port=port):
    """Return the full URL for resource path ``s`` on the configured server.

    The protocol, host and port are captured as keyword defaults when this
    function is defined, so rebinding the module-level names later in the
    notebook does not change the URLs built here. Each one can still be
    overridden per call.

    Parameters:
        s: resource path without a leading slash, e.g. "bookweb/data/basic.html".
        protocol: URL scheme without the "://" separator.
        host: server host name.
        port: TCP port number.

    Returns:
        A string of the form "<protocol>://<host>:<port>/<s>".
    """
    return "{}://{}:{}/{}".format(protocol, host, port, s)

In [31]:
def printByLine(s):
    """Print every non-empty line of ``s`` as a repr, one per output line.

    Lines are found with a multi-line regular expression, so the captured
    text excludes the newline itself; a newline is appended again before
    taking the repr so that line terminators show up as visible escapes.
    A carriage return is ordinary line content for the regular expression,
    so it is kept in the captured text. Lines whose captured text is empty
    are skipped.
    """
    for line_match in re.finditer(r'^(.*)$', s, flags=re.M):
        text = line_match.group(1)
        if not text:
            continue
        print(repr(text + '\n'))


In [13]:
# The raw HTTP/1.1 request, one entry per line: the request line, the header
# lines, and an empty entry that becomes the blank line ending the headers.
requestlines = [
    "GET /bookweb/data/basic.html HTTP/1.1",
    "Host: hadoop.mathsci.denison.edu",
    "Connection: close",
    "",
]
# Terminate every line, including the empty one, with CRLF.
request = "".join(line + "\r\n" for line in requestlines)
request

'GET /bookweb/data/basic.html HTTP/1.1\r\nHost: hadoop.mathsci.denison.edu\r\nConnection: close\r\n\r\n'

In [14]:
printByLine(request)

'GET /bookweb/data/basic.html HTTP/1.1\r\n'
'Host: hadoop.mathsci.denison.edu\r\n'
'Connection: close\r\n'
'\r\n'


In [15]:
# Connect using the host/port configured in the first cell rather than
# repeating the literals, then send the whole request as a single string.
webconn = sock.makeConnection(host, port)
sock.sendString(webconn, request)
# NOTE(review): presumably reads the response until the server closes the
# connection (the request asks for "Connection: close") -- confirm in mysocket.
s = sock.receiveTillClose(webconn)

gaierror: [Errno -2] Name or service not known

In [6]:
# Connect using the host/port configured in the first cell rather than
# repeating the literals, then send the request one line at a time, each
# entry of requestlines followed by an explicit CRLF.
webconn = sock.makeConnection(host, port)
for line in requestlines:
    sock.sendString(webconn, line)
    sock.sendCRLF(webconn)
# NOTE(review): presumably reads the response until the server closes the
# connection -- confirm in mysocket.
s = sock.receiveTillClose(webconn)

gaierror: [Errno -2] Name or service not known

In [None]:
s

In [None]:
printByLine(s)

In [None]:
url = buildURL("bookweb/data/basic.html")
url

In [None]:
response = requests.get(url)

In [None]:
type(response.status_code)

In [None]:
response.text

In [None]:
response.content

In [None]:
response.headers

In [None]:
D = response.headers

In [None]:
D['content-type']

In [None]:
request = response.request

In [None]:
request.headers

In [None]:
requestHeaders = {'Accept':'text/html'}
r = requests.get(url, headers=requestHeaders)

In [None]:
r.status_code

In [None]:
request = r.request
request.headers

## Student Practice

Do each of the following first from web browser, then from requests module.

- simple http get request of html from root of a location
- http get request of html from non-root (but default)
- http get request of html from non-root non-default
- request for non-existent resource
- http get request of csv

More advanced HTTP operations
- http get request with parameters
- http post request with parameters
- request for redirected resource
- request for protected resource

## Programmatic HTTP `GET` request

[`requests` module documentation](https://requests.readthedocs.io/en/master/)

In [None]:
import requests

Based on  our newfound understanding of a basic URL consisting of parts of a protocol, a location, and a resource identifier, we use separate variables for each and compose them using a format method on a template string:

### Basic request of URL from root of a location

In [17]:
# The three pieces of a basic URL: scheme prefix, server location, resource path.
protocol, location, resource = 'http://', 'personal.denison.edu', '/'

# One positional slot per URL piece, filled in order.
template = '{}{}{}'
url = str.format(template, protocol, location, resource)

url

'http://personal.denison.edu/'

The simplest use of the `requests` module is to perform a `GET` method using the `get()` function of the module, with a single argument: a string specifying the URL.

In [18]:
resp = requests.get(url)

**Whenever** we make a request, we **must** check for a successful return by looking at the `status_code` attribute of the response object.  If we get a non-successful code, our programming must either fix a bug, or recover from the error, informing the user of the problem.

In [19]:
# Any status other than 200 is reported to the user as a failed retrieval.
if resp.status_code != 200:
    print('Error retrieving request from', url)

In many cases, before we act on the **message body**, which is the **content** of what has been retrieved from the server, we want or need to know about additional metadata, like:
- How large is the content
- What type of data has been returned (html, csv, xml, xhtml, json, etc.)
- other information such as date, server, etc.

The `headers` attribute of the response object contains a dictionary containing all the meta-data information provided by the server that came from the header field part of the HTTP response.  Note that HTTP does not know about Python dictionaries ... the transformation/translation from the HTTP response, which is just a set of characters in the underlying data stream connection between the client and the server, happens because of the `requests` module. 

In [10]:
resp.headers

{'Date': 'Fri, 22 Oct 2021 02:27:36 GMT', 'Server': 'Apache/2.4.6 (CentOS)', 'Accept-Ranges': 'bytes', 'Content-Length': '464', 'Connection': 'close', 'Content-Type': 'text/html; charset=iso-8859-1'}

Finally, we can access the data of the response in two ways.  If it is textual data, and we want the string version of the data (appropriate to the encoding), we use the `text` attribute of the response object.

In [None]:
resp.text

While if the data is binary (like an image, or a compiled program), or if we want the form without applying the encoding, we use the `content` attribute of the response object.

In [None]:
resp.content

Notice the `b` prefix in the demonstration above, indicating that these are raw bytes.

A `requests` module response object also has an attribute (`request`) that references the original request corresponding to this particular response.  Some of the request's interesting attributes include:
- `headers`
- `path_url`
- `url`
- `method`
- `body`

In [None]:
resp.request.headers

In [None]:
resp.request.path_url

In [None]:
resp.request.url

In [None]:
resp.request.method

In [None]:
resp.request.body

### Basic URL request to non-root of a location

We can change our resource to request the HTML using a resource identifier other than the root:

In [None]:
resource = '/~kretchmar/'

template = '{}{}{}'
url = template.format(protocol, location, resource)

url

**A1** Add code to retrieve from this new url and then print the **length** of the content returned and print the first 100 characters in the body of the returned HTML.

**A2** Using Chrome, copy the HTTP raw request (the GET, including all header lines), and then copy the response, from the response line through the response headers.

### URL request to a non-root resource identifier that does not exist

**B1** Now change the resource to something that does not exist, and demonstrate how your code can detect when something does not exist and print an appropriate error message to the user.

**B2** Using Chrome, copy and paste the HTTP raw request and also the HTTP raw response corresponding to exactly the same request made above.

### URL request to a non-HTML, non-default resource identified location

The goal here is to find a resource, say within `discovercs.denison.edu` that gives us a URL for an explicit file provided by a Data System provider.  So look for a csv or text file and determine the URL.  Then give two cells below, one which requests the resource programmatically and uses the headers and content to print information about the response (size, type) and some of its data body.  Don't forget to include code to check the status return as well.

The second cell should again use Chrome to show and help understand the HTTP request and response going on.

### URL request to a JSON that uses URL parameters

Resources that we request of a Data System provider may not always be for an existing file (text or otherwise).  Just like when we make requests to an SQL server and specify (through our SQL SELECT query) some part of the data we are interested in, and the server processes from the stored files and tables and returns just the requested subset, we need such dynamic flexibility in the Data System providers that use HTTP.

This means that we need to pass additional information through HTTP, but still adhere to the methods/verbs that the protocol provides.  One means of passing additional information is to extend the use of our URL to include one or more key-value pairs.  These can be used by the server/provider, which can dynamically create our response.

In [32]:
import util

In [53]:
url = "http://datasystems.denison.edu/basic.html"

In [54]:
response = requests.get(url)
request = response.request

In [55]:
print("Response status:", response.status_code)

Response status: 200


In [56]:
print("URL:", request.url)

URL: http://datasystems.denison.edu/basic.html


In [57]:
url = "http://httpbin.org/get"
# Custom request headers: ask for JSON and identify this client to the server.
headerD = {
    "Accept": "application/json",
    "User-Agent": "datasystems-client",
}
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, headers=headerD, timeout=10)
# Keep the prepared request so the following cells can inspect what was sent.
request = response.request

In [58]:
print("Response status:", response.status_code)

Response status: 200


In [59]:
# Decode the JSON text of the response body into Python objects, then display them.
data = json.loads(response.text)
data

{'args': {},
 'headers': {'Accept': 'application/json',
  'Accept-Encoding': 'gzip, deflate',
  'Host': 'httpbin.org',
  'User-Agent': 'datasystems-client',
  'X-Amzn-Trace-Id': 'Root=1-61723f2e-4e68f1cc122b83b8202dfe9d'},
 'origin': '14.182.14.80',
 'url': 'http://httpbin.org/get'}

In [60]:
url = "http://httpbin.org/get"
# Key-value pairs that requests encodes into the URL's query string.
paramsD = {
    "user": "smith",
    "query": "movies tv",
}
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.get(url, params=paramsD, timeout=10)
# Keep the prepared request so the following cells can inspect what was sent.
request = response.request

In [61]:
print("Response status:", response.status_code)

Response status: 200


In [62]:
print("Path:", request.path_url)

Path: /get?user=smith&query=movies+tv


In [63]:
# Decode the JSON text of the response body into Python objects, then display them.
data = json.loads(response.text)
data

{'args': {'query': 'movies tv', 'user': 'smith'},
 'headers': {'Accept': '*/*',
  'Accept-Encoding': 'gzip, deflate',
  'Host': 'httpbin.org',
  'User-Agent': 'python-requests/2.26.0',
  'X-Amzn-Trace-Id': 'Root=1-61723f50-23d463bb7bdbc6371b25a33b'},
 'origin': '14.182.14.80',
 'url': 'http://httpbin.org/get?user=smith&query=movies+tv'}

In [65]:
url = "http://httpbin.org/post"

In [66]:
# Key-value pairs that requests encodes into the URL's query string.
paramsD = {
    "user": "jones",
    "query": "TV?episodes",
}
headerD = {
    "Accept": "application/json",
}
# A dict passed as `data` is sent form-encoded in the request body.
body = {"a": 1, "b": 2}
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.post(
    url,
    params=paramsD,
    headers=headerD,
    data=body,
    timeout=10,
)
# Keep the prepared request so the following cells can inspect what was sent.
request = response.request
print("Response status:", response.status_code)

Response status: 200


In [67]:
print("Request Path:", request.path_url)

Request Path: /post?user=jones&query=TV%3Fepisodes


In [68]:
print("Request Body:", request.body)

Request Body: a=1&b=2


In [69]:
response

<Response [200]>

In [70]:
# Decode the JSON text of the response body into Python objects, then display them.
data = json.loads(response.text)
data

{'args': {'query': 'TV?episodes', 'user': 'jones'},
 'data': '',
 'files': {},
 'form': {'a': '1', 'b': '2'},
 'headers': {'Accept': 'application/json',
  'Accept-Encoding': 'gzip, deflate',
  'Content-Length': '7',
  'Content-Type': 'application/x-www-form-urlencoded',
  'Host': 'httpbin.org',
  'User-Agent': 'python-requests/2.26.0',
  'X-Amzn-Trace-Id': 'Root=1-61723f73-39783cd75f33ec795cc60193'},
 'json': None,
 'origin': '14.182.14.80',
 'url': 'http://httpbin.org/post?user=jones&query=TV%3Fepisodes'}

In [44]:
import json
url = "http://httpbin.org/post"
# Key-value pairs that requests encodes into the URL's query string.
paramsD = {
    "user": "jones",
    "query": "TV",
}
headerD = {
    "Accept": "application/json",
}
# A Python structure passed as `json` is serialized to JSON text for the request body.
json_data = ["foo", "bar", {"a": 1, "b": 2}]
# Without a timeout, requests waits indefinitely on an unresponsive server.
response = requests.post(
    url,
    params=paramsD,
    headers=headerD,
    json=json_data,
    timeout=10,
)
# Keep the prepared request so the following cells can inspect what was sent.
request = response.request
print("Response status:", response.status_code)

Response status: 200


In [45]:
print("Request Path:", request.path_url)

Request Path: /post?user=jones&query=TV


In [47]:
print("Request Body:", request.body.decode('utf-8'))

Request Body: ["foo", "bar", {"a": 1, "b": 2}]


In [52]:
data = json.loads(response.text)
data


{'args': {'query': 'TV', 'user': 'jones'},
 'data': '["foo", "bar", {"a": 1, "b": 2}]',
 'files': {},
 'form': {},
 'headers': {'Accept': 'application/json',
  'Accept-Encoding': 'gzip, deflate',
  'Content-Length': '32',
  'Content-Type': 'application/json',
  'Host': 'httpbin.org',
  'User-Agent': 'python-requests/2.26.0',
  'X-Amzn-Trace-Id': 'Root=1-61723d37-1895dcbe651d96920d1b5c3f'},
 'json': ['foo', 'bar', {'a': 1, 'b': 2}],
 'origin': '14.182.14.80',
 'url': 'http://httpbin.org/post?user=jones&query=TV'}