# urllib.request

### Task 1 - Get my Public IP address

In [3]:
import urllib.request

In [4]:
# Open the dyndns "check IP" page; the returned response object is inspected in the next cells.
connection = urllib.request.urlopen(url="http://checkip.dyndns.org")

In [5]:
# Show basic metadata of the response. The f-string "=" specifier prints each
# expression's source text followed by the repr of its value.
print(
    f"""
    {connection.url    = }
    {connection.status = }
    {connection.reason = }
    {connection.msg    = }
    {connection.code   = }
"""
)


    connection.url    = 'http://checkip.dyndns.org'
    connection.status = 200
    connection.reason = 'OK'
    connection.msg    = 'OK'
    connection.code   = 200



In [6]:
# Read the response body (shown below as a bytes literal) and display it.
data = connection.read()

data

b'<html><head><title>Current IP Check</title></head><body>Current IP Address: 149.71.229.193</body></html>\r\n'

In [7]:
# Decode the raw response bytes into text.
# The isinstance guard keeps this cell idempotent: once `data` is already a
# str, re-running the cell no longer raises
# AttributeError: 'str' object has no attribute 'decode'.
if isinstance(data, bytes):
    data = data.decode("utf-8")
print(type(data))

<class 'str'>


In [8]:
# Display the decoded page text (now a str rather than bytes).
data

'<html><head><title>Current IP Check</title></head><body>Current IP Address: 149.71.229.193</body></html>\r\n'

#### Parsing the HTML
Method 1 - Brute force (string splitting)

In [9]:
# Show the text that follows the "Current IP Address: " label in the page.
label = "Current IP Address: "
text_after_label = data.split(label)[1]
text_after_label

'149.71.229.193</body></html>\r\n'

In [10]:
# Step 1: keep the text that follows the label.
text_after_label = data.split("Current IP Address: ")[1]
# Step 2: drop everything from the closing </body> tag onwards.
my_public_ip = text_after_label.split("</body>")[0]
print("My Public IP Address is: ", my_public_ip)

My Public IP Address is:  149.71.229.193


Method 2 - Using regular expressions

In [11]:
import re

In [12]:
# Dotted-quad pattern: four groups of 1-3 digits separated by literal dots.
# The dots must be escaped: an unescaped "." matches any character, so the
# pattern would also accept strings such as "149a71b229c193".
my_public_ip = re.findall(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", data)[0]
print("My Public IP Address is: ", my_public_ip)

My Public IP Address is:  149.71.229.193


### Task 2 - Get all valid weblinks from a website

In [13]:
import urllib.request

In [14]:
# Fetch the Stack Overflow front page; the response object is inspected in the next cells.
connection = urllib.request.urlopen(url="https://www.stackoverflow.com")

In [15]:
# Show basic metadata of the response. Note in the output that
# connection.url is 'https://stackoverflow.com/' although the request was made
# to "https://www.stackoverflow.com".
print(
    f"""
    {connection.url    = }
    {connection.status = }
    {connection.reason = }
    {connection.msg    = }
    {connection.code   = }
"""
)


    connection.url    = 'https://stackoverflow.com/'
    connection.status = 200
    connection.reason = 'OK'
    connection.msg    = 'OK'
    connection.code   = 200



In [16]:
# Read the whole response body of the page.
data = connection.read()

In [17]:
# Preview only the first 500 bytes: displaying the complete page would flood
# the notebook with one very large output.
data[:500]



In [18]:
# Save the raw page bytes to disk. The with-block closes the file when it
# exits, so an explicit close() call is not needed.
with open("stackoverflow.html", "wb") as f:
    f.write(data)

### Parsing HTML data

In [19]:
import lxml.html

# Parse the downloaded page bytes with lxml; `dom` is the resulting element
# that the XPath query further down is run against.
dom = lxml.html.fromstring(data)

In [20]:
dom  # root <html> element of the parsed page (its Document Object Model)

<Element html at 0x2516d395f40>

#### Select the URL in the `href` attribute of every `<a>` tag (link)

In [21]:
# Gather the href attribute of every <a> tag, keep only the values that start
# with "https", then print them one per line in document order.
https_links = [
    href for href in dom.xpath("//a/@href") if str(href).startswith("https")
]
for href in https_links:
    print(href)

https://stackoverflow.com
https://stackoverflow.co/
https://stackoverflow.co/talent
https://stackoverflow.co/advertising
https://stackoverflow.co/
https://stackoverflow.com
https://stackoverflow.com
https://stackoverflow.com/help
https://chat.stackoverflow.com/?tab=site&host=stackoverflow.com
https://meta.stackoverflow.com
https://stackoverflow.com/users/signup?ssrc=site_switcher&returnurl=https%3a%2f%2fstackoverflow.com%2f
https://stackoverflow.com/users/login?ssrc=site_switcher&returnurl=https%3a%2f%2fstackoverflow.com%2f
https://stackexchange.com/sites
https://stackoverflow.blog
https://stackoverflow.com/users/login?ssrc=head&returnurl=https%3a%2f%2fstackoverflow.com%2f
https://stackoverflow.com/users/signup?ssrc=head&returnurl=https%3a%2f%2fstackoverflow.com%2f
https://stackoverflow.com/jobs/companies?so_medium=stackoverflow&so_source=SiteNav
https://try.stackoverflow.co/why-teams/?utm_source=so-owned&utm_medium=side-bar&utm_campaign=campaign-38&utm_content=cta
https://stackoverflo

### Task 3 - Get Image data

In [22]:
# Build a Request object for the URL. Despite its name, `connection` holds a
# urllib.request.Request here (urlopen is not called in this cell).
URL = "https://http.cat/200"
connection = urllib.request.Request(URL)

In [23]:
# Inspect the attributes of the Request object built above. The f-string "="
# specifier prints each expression's source text followed by the repr of its value.
print(
    f"""
    {connection.unverifiable      =}
    {connection.type              =}
    {connection.full_url          =}
    {connection.host              =}
    {connection.origin_req_host   =}
    
    {connection.get_method()      =}
    {connection.headers           =}
    {connection.header_items()    =}
    {connection.unredirected_hdrs =}
"""
)


    connection.unverifiable      =False
    connection.type              ='https'
    connection.full_url          ='https://http.cat/200'
    connection.host              ='http.cat'
    connection.origin_req_host   ='http.cat'
    
    connection.get_method()      ='GET'
    connection.headers           ={}
    connection.header_items()    =[]
    connection.unredirected_hdrs ={}



In [24]:
# Request payload. The cell shows no output, i.e. the value is None
# (no data was passed when the Request was created).
connection.data

In [25]:
# Point at the JPEG itself and open it; from here on `connection` is the
# response returned by urlopen rather than a Request object.
URL = "https://http.cat/200.jpg"
connection = urllib.request.urlopen(url=URL)

In [26]:
# Show basic metadata of the image response. The f-string "=" specifier prints
# each expression's source text followed by the repr of its value.
print(
    f"""
    {connection.url    = }
    {connection.status = }
    {connection.reason = }
    {connection.msg    = }
    {connection.code   = }
"""
)


    connection.url    = 'https://http.cat/200.jpg'
    connection.status = 200
    connection.reason = 'OK'
    connection.msg    = 'OK'
    connection.code   = 200



In [27]:
# Read the whole response body (the image content).
data = connection.read()

In [28]:
# List every attribute and method name available on the response object.
print(dir(connection))

['__abstractmethods__', '__class__', '__del__', '__delattr__', '__dict__', '__dir__', '__doc__', '__enter__', '__eq__', '__exit__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__iter__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__next__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '_abc_impl', '_checkClosed', '_checkReadable', '_checkSeekable', '_checkWritable', '_check_close', '_close_conn', '_get_chunk_left', '_method', '_peek_chunked', '_read1_chunked', '_read_and_discard_trailer', '_read_chunked', '_read_next_chunk_size', '_read_status', '_readinto_chunked', '_safe_read', '_safe_readinto', 'begin', 'chunk_left', 'chunked', 'close', 'closed', 'code', 'debuglevel', 'detach', 'fileno', 'flush', 'fp', 'getcode', 'getheader', 'getheaders', 'geturl', 'headers', 'info', 'isatty', 'isclosed', 'length', 'msg', 'peek', 'read', 'read1', 'readable',

In [29]:
# Response headers object (an http.client.HTTPMessage, as the output shows).
connection.headers

<http.client.HTTPMessage at 0x2516c85c690>

In [30]:
# Confirm that the image payload was read as raw bytes.
type(data)

bytes

In [31]:
# Write the image bytes to cat.jpg in binary mode; the with-block closes the
# file automatically when it exits.
with open("cat.jpg", "wb") as image_file:
    image_file.write(data)