In [7]:
from urllib.request import Request, urlopen
import requests
from bs4 import BeautifulSoup
import csv

In [8]:
import pandas as pd
import numpy as np

In [9]:
# not necessary to import these for this exercise but time.sleep(3) gives a pause of 3 seconds. 
# necessary when you are crawling a lot of data - you need to pause in between crawls 
# so they don't ban you  
import os
import sys
import time
time.sleep(3)

In [None]:
# if you want to know where your packages are installed at
sys.path

In [None]:
# if you want to know what packages you have
# could take some time to load
help("modules")

### Example 1 

In [None]:
site = "http://edition.cnn.com/travel/article/most-visited-cities-euromonitor-2018/index.html"
hdr = {'User-Agent': 'Mozilla/5.0'}
bookpage = requests.get(site)
soup = BeautifulSoup(bookpage.text, "lxml")

In [None]:
print(soup.prettify)

In [None]:
soup.find_all('div', class_="Paragraph__component")

In [None]:
soup.find_all('div', class_="Paragraph__component")[23]

In [None]:
# dropping html tags with .get_text()
# the output has all the information - I want to get the variables part by part from this output
# i.e. rank, country, count of arrivals in 2017, count of arrivals in 2018
soup.find_all('div', class_="Paragraph__component")[23].get_text()

In [None]:
state = soup.find_all('div', class_="Paragraph__component")[23].get_text()
# i get the part before the '.'. before is represented by 0 in the [].
rank = state.split('.')[0]
rank

In [None]:
# i get the city after the '.' and before the ':'. 
# before is represented by 0 in the [] and after is represented by 1.
city = state.split('. ')[1].split(':')[0]
city

In [None]:
# i get the arrival count in 2017 by taking the part after ':' and before ' arrivals (2017)'.
# before is represented by 0 in the [] and after is represented by 1.
arrivals_2017 = state.split(': ')[1].split(' arrivals (2017)')[0]
arrivals_2017

In [None]:
# i get the arrival count in 2018 by taking the part after '/' and before ' arrivals (2018)'.
# before is represented by 0 in the [] and after is represented by 1.
arrivals_2018 = state.split('/ ')[1].split(' arrivals (2018)')[0]
arrivals_2018

In [None]:
# i realised that the arrival counts are not in numeric but string format
# i split them up into three elements in a list
arrivals_2017.split(',')

In [None]:
# i take the first number in the list and multiply by 1000000
int(arrivals_2017.split(',')[0])*1000000

In [None]:
# i take the second number in the list and multiply by 1000
int(arrivals_2017.split(',')[1])*1000

In [None]:
arrivals = []
for i in range(23,43):
    state = soup.find_all('div', class_="Paragraph__component")[i].get_text()
    rank = state.split('.')[0]
    cc = state.split('. ')[1].split(':')[0]
    try:
        city = cc.split(', ')[0]
    except:
        city = cc
    try: 
        country = cc.split(', ')[1]
    except:
        country = city
    arrivals_2017 = state.split(': ')[1].split(' arrivals (2017)')[0]
    arrivals_2017 = int(int(arrivals_2017.split(',')[0])*1000000 + int(arrivals_2017.split(',')[1])*1000 + int(arrivals_2017.split(',')[2]))
    arrivals_2018 = state.split('/ ')[1].split(' arrivals (2018)')[0]
    arrivals_2018 = int(int(arrivals_2018.split(',')[0])*1000000 + int(arrivals_2018.split(',')[1])*1000 + int(arrivals_2018.split(',')[2]))
    
    arrivals.append((rank, city, country, arrivals_2017, arrivals_2018))

In [None]:
df = pd.DataFrame(np.array(arrivals))
df.columns = ['rank', 'city', 'country', 'arrivals_2017', 'arrivals_2018']

In [None]:
df

In [None]:
# somehow Python still doesn't recognise the values as integers  
df['arrivals_2017'].describe()

In [None]:
df[["arrivals_2017", "arrivals_2018"]] = df[["arrivals_2017", "arrivals_2018"]].apply(pd.to_numeric)

In [None]:
# now the variable is in numeric format
df['arrivals_2017'].describe()

In [None]:
# now i can do some data manipulation
df['Diff'] = df['arrivals_2018'] - df['arrivals_2017'] 
df

In [None]:
# extra: if i want to print a number with comma separator
print("Formatted Number with comma separator: "+"{:,}".format(3000));

In [None]:
df.to_csv('arrivals2018.csv')

### Example 2

In [None]:
site = "https://www.businessinsider.sg/highest-paying-jobs-america-best-jobs-millennials-2017-10/"
hdr = {'User-Agent': 'Mozilla/5.0'}
bookpage = requests.get(site)
soup = BeautifulSoup(bookpage.text, "lxml")

In [None]:
soup.find_all('h3')

In [None]:
soup.find_all('p')

In [None]:
soup.find_all('p')[10]

In [None]:
soup.find_all('p')[11]

In [None]:
soup.find_all('p')[12]

In [None]:
soup.find_all('p')[16]

In [None]:
soup.find_all('p')[21]

In [None]:
soup.find_all('p')[26]

In [None]:
soup.find_all('p')[51]

In [None]:
job = []
for i in range(0,9):
    title = soup.find_all('h3')[i].get_text()
    rank = title.split('. ')[0]
    tle = title.split('. ')[1]
    job.append((rank,tle))

In [None]:
job

In [None]:
metrics = []
for i in range(0,2):
    unemp = soup.find_all('p')[10+i*6].get_text()
    rate = unemp.split(': ')[1]
    rate = rate.split('%')[0]
    sal_med = soup.find_all('p')[11+i*6].get_text()
    median = sal_med.split('$')[1]
    sal_ave = soup.find_all('p')[12+i*6].get_text()
    average = sal_ave.split('$')[1]
    metrics.append((rate,median,average))
for i in range(0,7):
    unemp = soup.find_all('p')[21+i*5].get_text()
    rate = unemp.split(': ')[1]
    rate = rate.split('%')[0]
    sal_med = soup.find_all('p')[22+i*5].get_text()
    median = sal_med.split('$')[1]
    sal_ave = soup.find_all('p')[23+i*5].get_text()
    average = sal_ave.split('$')[1]
    metrics.append((rate,median,average))

In [None]:
metrics

In [None]:
len(metrics)

In [None]:
df1 = pd.DataFrame(np.array(job))
df1.columns = ['rank', 'title']
df2 = pd.DataFrame(np.array(metrics))
df2.columns = ['unemprate', 'mediansal', 'avesal']
df1.join(df2, how="left")

In [None]:
df = df1.join(df2, how="left")
df

In [None]:
df.to_csv('ushighpayjobs2017.csv')

### Practice

In [10]:
site = "https://stockx.com/sneakers/most-popular"
hdr = {'User-Agent': 'Mozilla/5.0'}
bookpage = requests.get(site, headers= hdr)
soup = BeautifulSoup(bookpage.text, "lxml")

In [None]:
print(soup.prettify)

In [12]:
soup.find_all('p', class_="chakra-text css-3lpefb")[0].get_text()

'adidas Yeezy Slide Pure (Restock Pair)'

In [None]:
soup.find_all('p', class_="chakra-text css-9ryi0c")[0].get_text()

In [13]:
soup.find_all('p', class_="chakra-text css-guwsyb")[0].get_text()

'2788 sold'

In [14]:
sneakers = []
for i in range(40):
    name = soup.find_all('p', class_="chakra-text css-3lpefb")[i].get_text()
    askPrice = soup.find_all('p', class_="chakra-text css-9ryi0c")[i].get_text()
    quanity = soup.find_all('p', class_="chakra-text css-guwsyb")[i].get_text()[:4]
    sneakers.append((name, askPrice, quanity))
    
df = pd.DataFrame(np.array(sneakers))
df.coloums = ['name', 'askPrice', 'quanity']
df

  df.coloums = ['name', 'askPrice', 'quanity']


Unnamed: 0,0,1,2
0,adidas Yeezy Slide Pure (Restock Pair),$113,2788
1,adidas Yeezy Slide Ochre,$95,2180
2,Jordan 6 Retro UNC White,$275,2071
3,adidas Yeezy Slide Onyx,$145,1775
4,Nike Air Force 1 Low '07 White,$96,1682
5,adidas Yeezy Boost 350 V2 Dazzling Blue,$295,1383
6,Nike Dunk Low Retro White Black (2021),$253,1165
7,Jordan 12 Retro Playoffs (2022),$251,995
8,Jordan 6 Retro Mint Foam (W),$222,928
9,Jordan 3 Retro Cardinal Red,$250,900


In [None]:
### Create a dataframe with data on Name of sneakers, Lowest Ask price and Quantity sold

#### Resources: 
http://altitudelabs.com/blog/web-scraping-with-python-and-beautiful-soup/
http://web.stanford.edu/~zlotnick/TextAsData/Web_Scraping_with_Beautiful_Soup.html  
https://www.crummy.com/software/BeautifulSoup/bs4/doc/  
https://www.dataquest.io/blog/web-scraping-beautifulsoup/