# Web Scraping 101 - From ZERO to HERO

* Lets look at extracting html

In [3]:
#import packages
import urllib.request, urllib.parse, urllib.error
import bs4

In [19]:
# Reading a Text File
import urllib.request

fhand = urllib.request.urlopen('http://data.pr4e.org/romeo.txt')\
 
# reads line by line
#for line in fhand:
#    print(line.decode().strip())

# reads line by line and counts the words
# Word count
counts = dict()
for line in fhand:
    words = line.decode().split()
    for word in words:
        counts[word] = counts.get(word, 0) + 1
print(counts)

the clown ran after the car and the car ran into the tent and the tent fell down on the clown and the car



AttributeError: 'str' object has no attribute 'decode'

In [6]:
# Reading a html page
import urllib.request, urllib.parse, urllib.error

fhand = urllib.request.urlopen('http://www.dr-chuck.com/page1.htm')
for line in fhand:
    print(line.decode().strip())
    

URLError: <urlopen error [WinError 10060] A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond>

In [None]:
# Read a html page and extract its 'a' anchor tags
from urllib.request import urlopen
from bs4 import BeautifulSoup
import ssl

# Ignore SSL certificate errors
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

url = input('Enter - ')
html = urlopen(url, context=ctx).read()
soup = BeautifulSoup(html, "html.parser")

# Retrieve all of the anchor tags
tags = soup('a')
for tag in tags:
    # Look at the parts of a tag
    print('TAG:', tag)
    print('URL:', tag.get('href', None))
    print('Contents:', tag.contents[0])
    print('Attrs:', tag.attrs)

In [4]:
# Reads a url and its all sublinks
import urllib.request, urllib.parse, urllib.error
from bs4 import BeautifulSoup
import ssl

# ignore SSL certificate error for https
ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

todo = list()
visited = list()
url = input('Enter -')
todo.append(url)
count = int(input('How many to retrieve? - '))

while len(todo) > 0  and count > 0 :
    print("==== To Retrieve:", count, "Queue Length:", len(todo))
    url = todo.pop()
    count = count - 1
    
    if (not url.startswith('http')):
        print('Skipping ', url)
        continue
        
    if (url.find('facebook')> 0 ):
        continue
        
    if (url.find('linkedin') > 0):
        continue
        
    if (url in visited):
        print("Visited", url)
        continue
    
    print("===Retrieveing", url)
    
    try:
        html = urllib.request.urlopen(url, context=ctx).read()
    except:
        print("*** Error in retrieval")
        continue
        
    soup = BeautifulSoup(html,'html.parser')
    visited.append(url)
    
    # Retrieve all of the anchor tags
    tags = soup('a')
    for tag in tags:
        newurl = tag.get('href',None)
        if (newurl is not None):
            todo.append(newurl)

## Lets look at extracting xml

In [9]:
import xml.etree.ElementTree as ET

input = '''
<stuff>
    <users>
        <user x="2">
            <id>001</id>
            <name>Chuck</name>
        </user>
        <user x="7">
            <id>009</id>
            <name>Brent</name>
        </user>
    </users>
</stuff>'''

stuff = ET.fromstring(input)
lst = stuff.findall('users/user')
print('User count:', len(lst))

for item in lst:
    print('Name', item.find('name').text)
    print('Id', item.find('id').text)
    print('Attribute', item.get('x'))
    print('====')

User count: 2
Name Chuck
Id 001
Attribute 2
====
Name Brent
Id 009
Attribute 7
====


## Lets look at json webpage

In [55]:
# extract data from json, which has dictionary
import json
data = '''
{
  "name" : "Chuck",
  "phone" : {
    "type" : "intl",
    "number" : "+1 734 303 4456"
   },
   "email" : {
     "hide" : "yes"
   }
}'''
    
info = json.loads(data)
print('Name:',info['name'])
print('Hide:',info['email']['hide'])


Name: Chuck
Hide: yes


In [56]:
# extract data from json which has dictionaries in array
import json

data = '''
[
  { "id" : "001",
    "x" : "2",
    "name" : "Chuck"
  } ,
  { "id" : "009",
    "x" : "7",
    "name" : "Chuck"
  }
]'''

info = json.loads(data)
print('User count:', len(info))

for item in info:
    print('Name', item['name'])
    print('Id', item['id'])
    print('Attribute', item['x'])

User count: 2
Name Chuck
Id 001
Attribute 2
Name Chuck
Id 009
Attribute 7


In [57]:
class PartyAnimal:
   x = 0
   name = ''
   def __init__(self, nam):
     self.name = nam
     print(self.name,'constructed')

   def party(self) :
     self.x = self.x + 1
     print(self.name,'party count',self.x)

s = PartyAnimal('Sally')
j = PartyAnimal('Jim')

s.party()
j.party()
s.party()

Sally constructed
Jim constructed
Sally party count 1
Jim party count 1
Sally party count 2


In [60]:
# Search for lines that contain 'From'
import re
hand = open('mbox-short.txt')
for line in hand:
    line = line.rstrip()
    if re.search('From:', line):
        print(line)

FileNotFoundError: [Errno 2] No such file or directory: 'mbox-short.txt'

## Lets look at API

# Lets look at reading a text file

In [40]:
# read a text file and do the work count
import string
ftext = open('./code3/romeo-full.txt')
 
counts = dict()
for line in ftext:
    line = line.translate(str.maketrans('','',string.punctuation))
    line = line.lower()
    words = line.split()
    for word in words:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
        
#sort the dictionary based on the count
lst = list()
for key, val in list(counts.items()):
    lst.append((val,key))

lst.sort(reverse = True)

for key, val in lst[:10]:
    print(key,val)

61 i
42 and
40 romeo
34 to
34 the
32 thou
32 juliet
30 that
29 my
24 thee


In [51]:
# sql Usage in Python SQLLITE3
import sqlite3

conn = sqlite3.connect('music.sqlite')
cur = conn.cursor()

cur.execute('DROP TABLE IF EXISTS Tracks')
cur.execute('CREATE TABLE Tracks (title TEXT, plays INTEGER)')
cur.execute('INSERT INTO Tracks (title, plays) VALUES (?, ?)', ('Thunderstruck', 20))
cur.execute('INSERT INTO Tracks (title, plays) VALUES (?,?)', ('Life goes on',21))
conn.commit()
            
print('Tracks')
cur.execute('SELECT title, plays FROM Tracks')
for row in cur:
            print(row)
            
cur.execute('DELETE FROM Tracks WHERE plays < 100')
conn.close()



Tracks
('Thunderstruck', 20)
('Life goes on', 21)
