## Different source files

In [13]:
import nltk
nltk.download('punkt')
import urllib.parse
import urllib.request
#from urllib.request import Request, urlopen
from urllib.error import URLError
from bs4 import BeautifulSoup

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
# HTTP status lookup table.  Each row is (code, short message, long message);
# the rows are folded into a dict whose entries have the form
# {code: (shortmessage, longmessage)}, in the order listed here.
_STATUS_ROWS = [
    # 1xx codes
    (100, 'Continue', 'Request received, please continue'),
    (101, 'Switching Protocols', 'Switching to new protocol; obey Upgrade header'),

    # 2xx codes
    (200, 'OK', 'Request fulfilled, document follows'),
    (201, 'Created', 'Document created, URL follows'),
    (202, 'Accepted', 'Request accepted, processing continues off-line'),
    (203, 'Non-Authoritative Information', 'Request fulfilled from cache'),
    (204, 'No Content', 'Request fulfilled, nothing follows'),
    (205, 'Reset Content', 'Clear input form for further input.'),
    (206, 'Partial Content', 'Partial content follows.'),

    # 3xx codes
    (300, 'Multiple Choices', 'Object has several resources -- see URI list'),
    (301, 'Moved Permanently', 'Object moved permanently -- see URI list'),
    (302, 'Found', 'Object moved temporarily -- see URI list'),
    (303, 'See Other', 'Object moved -- see Method and URL list'),
    (304, 'Not Modified', 'Document has not changed since given time'),
    (305, 'Use Proxy',
     'You must use proxy specified in Location to access this resource.'),
    (307, 'Temporary Redirect', 'Object moved temporarily -- see URI list'),

    # 4xx codes
    (400, 'Bad Request', 'Bad request syntax or unsupported method'),
    (401, 'Unauthorized', 'No permission -- see authorization schemes'),
    (402, 'Payment Required', 'No payment -- see charging schemes'),
    (403, 'Forbidden', 'Request forbidden -- authorization will not help'),
    (404, 'Not Found', 'Nothing matches the given URI'),
    (405, 'Method Not Allowed', 'Specified method is invalid for this server.'),
    (406, 'Not Acceptable', 'URI not available in preferred format.'),
    (407, 'Proxy Authentication Required',
     'You must authenticate with this proxy before proceeding.'),
    (408, 'Request Timeout', 'Request timed out; try again later.'),
    (409, 'Conflict', 'Request conflict.'),
    (410, 'Gone', 'URI no longer exists and has been permanently removed.'),
    (411, 'Length Required', 'Client must specify Content-Length.'),
    (412, 'Precondition Failed', 'Precondition in headers is false.'),
    (413, 'Request Entity Too Large', 'Entity is too large.'),
    (414, 'Request-URI Too Long', 'URI is too long.'),
    (415, 'Unsupported Media Type', 'Entity body in unsupported format.'),
    (416, 'Requested Range Not Satisfiable', 'Cannot satisfy request range.'),
    (417, 'Expectation Failed', 'Expect condition could not be satisfied.'),

    # 5xx codes
    (500, 'Internal Server Error', 'Server got itself in trouble'),
    (501, 'Not Implemented', 'Server does not support this operation'),
    (502, 'Bad Gateway', 'Invalid responses from another server/proxy.'),
    (503, 'Service Unavailable',
     'The server cannot process the request due to a high load'),
    (504, 'Gateway Timeout',
     'The gateway server did not receive a timely response'),
    (505, 'HTTP Version Not Supported', 'Cannot fulfill request.'),
]

responses = {
    code: (short_msg, long_msg) for code, short_msg, long_msg in _STATUS_ROWS
}

Some websites [1] dislike being browsed by programs, or send different versions to different browsers [2]. By default urllib identifies itself as Python-urllib/x.y (where x and y are the major and minor version numbers of the Python release, e.g. Python-urllib/3.8), which may confuse the site, or just plain not work. The way a browser identifies itself is through the User-Agent header [3]. When you create a Request object you can pass in a dictionary of headers. The following cells build such a request, which identifies itself with a browser-style Mozilla/5.0 (Windows) User-Agent string instead of urllib's default.

In [15]:
# Page to download.  A Wikipedia article can be used instead:
# "https://en.wikipedia.org/wiki/Python_(programming_language)"
url = "https://www.cnet.com/news/uber-air-flying-cars-are-closer-than-you-think"

# Send a browser-like User-Agent header with the request.
user_agent = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'
headers = {'User-Agent': user_agent}

# Form fields, URL-encoded and then converted to ASCII bytes.
values = {
    'name': 'Michael Foord',
    'location': 'Northampton',
    'language': 'Python',
}
data = urllib.parse.urlencode(values).encode('ascii')

In [16]:
# Build the request from the URL, body and headers prepared earlier.
# NOTE(review): because a `data` body is supplied, urllib issues this as a
# POST rather than a GET -- confirm that is intended for a plain page fetch.
req = urllib.request.Request(url, data, headers)

try:
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(req) as response:
        html = response.read()
except urllib.error.HTTPError as e:
    # HTTPError is a subclass of URLError, so it must be caught first;
    # only HTTPError carries the numeric `code` attribute.
    print('The server couldn\'t fulfill the request.')
    # Fall back to the bare code for statuses missing from `responses`.
    print('Error code: ', responses.get(e.code, e.code))
except URLError as e:
    # No HTTP response at all (DNS failure, refused connection, ...).
    print('We failed to reach a server.')
    print('Reason: ', e.reason)
else:
    print("everything is fine")

everything is fine


In [17]:
#html

## Strip away the HTML markup with the BeautifulSoup library

In [18]:
# Parse the downloaded page with the lxml parser, then keep only its text.
# NOTE(review): get_text() is called without a separator; the later token
# output contains fused words such as 'late.Back', which looks like adjacent
# text nodes being joined with nothing between them -- confirm.
soup = BeautifulSoup(html, "lxml")
web_str = soup.get_text()

In [19]:
# Tokenize the whole page text, then display how many tokens were produced.
web_tokens = nltk.word_tokenize(web_str)
web_token_count = len(web_tokens)
web_token_count

16923

In [20]:
# Preview the first 25 tokens of the page.
web_tokens[:25]

['How',
 'Uber',
 'is',
 'getting',
 'flying',
 'cars',
 'off',
 'the',
 'ground',
 '-',
 'CNET',
 '{',
 '``',
 '@',
 'context',
 "''",
 ':',
 "''",
 'https',
 ':',
 '\\/\\/schema.org',
 "''",
 ',',
 "''",
 '@']

In [21]:
# First and last sentences of the passage to extract from the page text.
# Each is written once here so the search and the length used for slicing
# cannot drift apart.
INTRO_FIRST_SENTENCE = "It's 6 p.m. in Tokyo and my flying car is late"
INTRO_LAST_SENTENCE = "Because I might have seen the brave new world of transport in Tokyo."

start = web_str.find(INTRO_FIRST_SENTENCE)
end = web_str.find(INTRO_LAST_SENTENCE)
last_sent = len(INTRO_LAST_SENTENCE)

# str.find() returns -1 when the text is absent; slicing with -1 would
# silently yield the wrong passage, so fail loudly instead.
if start == -1:
    raise ValueError("first sentence of the passage was not found in web_str")
if end == -1:
    raise ValueError("last sentence of the passage was not found in web_str")
if end < start:
    raise ValueError("last sentence of the passage occurs before the first one")

# Slice from the start of the first sentence through the end of the last one.
intro = web_str[start:end + last_sent]

In [25]:
# Tokenize only the extracted passage.
intro_tokens = nltk.tokenize.word_tokenize(intro)

In [26]:
# Print the passage tokens as a single list.
print(intro_tokens)

['It', "'s", '6', 'p.m.', 'in', 'Tokyo', 'and', 'my', 'flying', 'car', 'is', 'late', '.', 'Three', 'years', 'late.Back', 'to', 'the', 'Future', 'promised', 'me', 'flying', 'cars', '(', 'and', 'hoverboards', ')', 'by', '2015', '.', 'Yet', 'here', 'I', 'am', 'in', '2018', ',', 'standing', 'in', 'one', 'of', 'the', 'world', "'s", 'most', 'high-tech', 'cities', 'and', 'I', 'have', 'to', 'walk', '.', 'I', 'do', "n't", 'even', 'get', 'to', 'do', 'it', 'in', 'self-lacing', 'shoes.I', "'m", 'in', 'Tokyo', 'for', 'Uber', 'Elevate', ',', 'Uber', "'s", 'third', 'conference', 'outlining', 'its', 'plans', 'to', 'get', 'flying', 'cars', 'off', 'the', 'silver', 'screen', 'and', 'into', 'our', 'skies', 'in', 'as', 'little', 'as', 'two', 'years', '.', 'It', "'s", 'a', 'lofty', 'ambition', ',', 'but', 'Uber', 'has', 'partnered', 'with', 'some', 'big', 'names', 'in', 'aviation', 'and', 'picked', 'up', 'its', 'share', 'of', 'NASA', 'alumni', 'to', 'help', 'it', 'get', 'there.The', 'goal', '?', 'UberAir', 