Lets find what is the most often text in 'code' tag on a website

In [1]:
url_string = "https://stepik.org/media/attachments/lesson/209719/2.html"

import ssl
from urllib.request import urlopen

ctx = ssl.create_default_context()
ctx.check_hostname = False
ctx.verify_mode = ssl.CERT_NONE

html = urlopen(url_string, context=ctx).read().decode('utf-8')

In [2]:
html.count('<code>')

91

In [3]:
import re
code_arr = re.findall('<code>(.+?)</code>', html)
result = {c : code_arr.count(c) for c in code_arr}
{k : v for k, v in result.items() if v == max(result.values())}

{'else': 4, 'except': 4, 'finally': 4}

In [4]:
# Lets find all links
links = re.findall('<a\*? href="(.+?)"', html)

# BeautifulSoup

In [5]:
from bs4 import BeautifulSoup

In [6]:
url_string2 = "https://dataquestio.github.io/web-scraping-pages/simple.html"
html2 = urlopen(url_string2).read().decode('utf-8')   

In [7]:
soup = BeautifulSoup(html2, 'html.parser')

In [8]:
# We can now print out the HTML content of the page, formatted nicely
print(soup.prettify())

<!DOCTYPE html>
<html>
 <head>
  <title>
   A simple example page
  </title>
 </head>
 <body>
  <p>
   Here is some simple content for this page.
  </p>
 </body>
</html>


In [9]:
[type(item) for item in list(soup.children)]

[bs4.element.Doctype, bs4.element.NavigableString, bs4.element.Tag]

In [10]:
# Each item in the list returned by the children property is also a BeautifulSoup object,
# so we can also call the children method on html.
# Now, we can find the children inside the html tag:
html = list(soup.children)[2]
list(html.children)

['\n',
 <head>
 <title>A simple example page</title>
 </head>,
 '\n',
 <body>
 <p>Here is some simple content for this page.</p>
 </body>,
 '\n']

In [11]:
# As we can see above, there are two tags here, head, and body.
# We want to extract the text inside the p tag, so we’ll dive into the body:
body = list(html.children)[3]
body

<body>
<p>Here is some simple content for this page.</p>
</body>

In [12]:
# Now, we can get the p tag by finding the children of the body tag:
list(body.children)

['\n', <p>Here is some simple content for this page.</p>, '\n']

In [13]:
# We can now isolate the p tag:
p = list(body.children)[1]
p

<p>Here is some simple content for this page.</p>

In [14]:
# Once we’ve isolated the tag, we can use the get_text method to extract all of the text inside the tag:
p.get_text()

'Here is some simple content for this page.'

In [15]:
# Finding all instances of a tag at once
soup.find_all('p')[0]

<p>Here is some simple content for this page.</p>

## Searching for tags by class and id

In [18]:
import requests
page = requests.get("https://dataquestio.github.io/web-scraping-pages/ids_and_classes.html")
soup = BeautifulSoup(page.content, 'html.parser')

In [20]:
soup.find_all('p', class_='outer-text')
soup.find_all(class_="outer-text")
soup.find_all(id="first")

[<p class="inner-text first-item" id="first">
                 First paragraph.
             </p>]

## Downloading weather data

In [22]:
import requests
page = requests.get("https://forecast.weather.gov/MapClick.php?lat=37.7772&lon=-122.4168#.YVseOppBxPY")
soup = BeautifulSoup(page.content, 'html.parser')

In [25]:
seven_day = soup.find(id='seven-day-forecast')
forecast_items = seven_day.find_all(class_="tombstone-container")
tonight = forecast_items[0]
print(tonight.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Today
  <br/>
  <br/>
 </p>
 <p>
  <img alt="Today: Mostly sunny, with a high near 78. Light west northwest wind becoming west 13 to 18 mph in the afternoon. Winds could gust as high as 24 mph. " class="forecast-icon" src="newimages/medium/sct.png" title="Today: Mostly sunny, with a high near 78. Light west northwest wind becoming west 13 to 18 mph in the afternoon. Winds could gust as high as 24 mph. "/>
 </p>
 <p class="short-desc">
  Mostly Sunny
 </p>
 <p class="temp temp-high">
  High: 78 °F
 </p>
</div>


In [52]:
tonight = soup.find(id='seven-day-forecast').find_all(class_="tombstone-container")[0]
print(tonight.prettify())

<div class="tombstone-container">
 <p class="period-name">
  Today
  <br/>
  <br/>
 </p>
 <p>
  <img alt="Today: Mostly sunny, with a high near 78. Light west northwest wind becoming west 13 to 18 mph in the afternoon. Winds could gust as high as 24 mph. " class="forecast-icon" src="newimages/medium/sct.png" title="Today: Mostly sunny, with a high near 78. Light west northwest wind becoming west 13 to 18 mph in the afternoon. Winds could gust as high as 24 mph. "/>
 </p>
 <p class="short-desc">
  Mostly Sunny
 </p>
 <p class="temp temp-high">
  High: 78 °F
 </p>
</div>


In [38]:
period = tonight.find(class_="period-name").get_text()
short_desc = tonight.find(class_="short-desc").get_text()
temp = tonight.find(class_="temp").get_text()
print(period)
print(short_desc)
print(temp)

Today
Mostly Sunny
High: 78 °F


In [50]:
tonight.find('img')['title']

'Today: Mostly sunny, with a high near 78. Light west northwest wind becoming west 13 to 18 mph in the afternoon. Winds could gust as high as 24 mph. '

## Extracting all the information from the page

Now that we know how to extract each individual piece of information, we can combine our knowledge with CSS selectors and list comprehensions to extract everything at once.

    * Select all items with the class period-name inside an item with the class tombstone-container in seven_day.
    * Use a list comprehension to call the get_text method on each BeautifulSoup object.

In [53]:
period_tags = seven_day.select(".tombstone-container .period-name")
periods = [pt.get_text() for pt in period_tags]
periods

['Today',
 'Tonight',
 'Tuesday',
 'TuesdayNight',
 'Wednesday',
 'WednesdayNight',
 'Thursday',
 'ThursdayNight',
 'Friday']

In [55]:
short_descs = [sd.get_text() for sd in seven_day.select(".tombstone-container .short-desc")]
temps = [t.get_text() for t in seven_day.select(".tombstone-container .temp")]
descs = [d["title"] for d in seven_day.select(".tombstone-container img")]
print(short_descs)
print(temps)
print(descs)

['Mostly Sunny', 'Partly Cloudy', 'Partly Sunny', 'Mostly Cloudy', 'Mostly Cloudy', 'Mostly Cloudy', 'Mostly Cloudy', 'Mostly Cloudy', 'Mostly Sunny']
['High: 78 °F', 'Low: 56 °F', 'High: 68 °F', 'Low: 57 °F', 'High: 66 °F', 'Low: 55 °F', 'High: 62 °F', 'Low: 53 °F', 'High: 62 °F']
['Today: Mostly sunny, with a high near 78. Light west northwest wind becoming west 13 to 18 mph in the afternoon. Winds could gust as high as 24 mph. ', 'Tonight: Partly cloudy, with a low around 56. West southwest wind 11 to 20 mph, with gusts as high as 25 mph. ', 'Tuesday: Partly sunny, with a high near 68. West southwest wind 9 to 18 mph, with gusts as high as 24 mph. ', 'Tuesday Night: Mostly cloudy, with a low around 57. West wind 16 to 18 mph, with gusts as high as 24 mph. ', 'Wednesday: Mostly cloudy, with a high near 66. West wind 15 to 21 mph, with gusts as high as 28 mph. ', 'Wednesday Night: Mostly cloudy, with a low around 55.', 'Thursday: Mostly cloudy, with a high near 62.', 'Thursday Night: 

## Combining our data into a Pandas Dataframe

Each dictionary key will become a column in the DataFrame, and each list will become the values in the column:

In [56]:
import pandas as pd
weather = pd.DataFrame({
    "period": periods,
    "short_desc": short_descs,
    "temp": temps,
    "desc":descs
})
weather

Unnamed: 0,period,short_desc,temp,desc
0,Today,Mostly Sunny,High: 78 °F,"Today: Mostly sunny, with a high near 78. Ligh..."
1,Tonight,Partly Cloudy,Low: 56 °F,"Tonight: Partly cloudy, with a low around 56. ..."
2,Tuesday,Partly Sunny,High: 68 °F,"Tuesday: Partly sunny, with a high near 68. We..."
3,TuesdayNight,Mostly Cloudy,Low: 57 °F,"Tuesday Night: Mostly cloudy, with a low aroun..."
4,Wednesday,Mostly Cloudy,High: 66 °F,"Wednesday: Mostly cloudy, with a high near 66...."
5,WednesdayNight,Mostly Cloudy,Low: 55 °F,"Wednesday Night: Mostly cloudy, with a low aro..."
6,Thursday,Mostly Cloudy,High: 62 °F,"Thursday: Mostly cloudy, with a high near 62."
7,ThursdayNight,Mostly Cloudy,Low: 53 °F,"Thursday Night: Mostly cloudy, with a low arou..."
8,Friday,Mostly Sunny,High: 62 °F,"Friday: Mostly sunny, with a high near 62."


We can now do some analysis on the data. 

For example, we can use a regular expression and the Series.str.extract method 

to pull out the numeric temperature values:

In [64]:
temp_nums = weather["temp"].str.extract("([0-9]+)", expand=False)
weather["temp_num"] = temp_nums.astype('int')
temp_nums

0    78
1    56
2    68
3    57
4    66
5    55
6    62
7    53
8    62
Name: temp, dtype: object

In [65]:
is_night = weather["temp"].str.contains("Low")
weather["is_night"] = is_night
is_night

0    False
1     True
2    False
3     True
4    False
5     True
6    False
7     True
8    False
Name: temp, dtype: bool

In [66]:
weather[is_night]

Unnamed: 0,period,short_desc,temp,desc,temp_num,is_night
1,Tonight,Partly Cloudy,Low: 56 °F,"Tonight: Partly cloudy, with a low around 56. ...",56,True
3,TuesdayNight,Mostly Cloudy,Low: 57 °F,"Tuesday Night: Mostly cloudy, with a low aroun...",57,True
5,WednesdayNight,Mostly Cloudy,Low: 55 °F,"Wednesday Night: Mostly cloudy, with a low aro...",55,True
7,ThursdayNight,Mostly Cloudy,Low: 53 °F,"Thursday Night: Mostly cloudy, with a low arou...",53,True


In [3]:
objects = [1, 2, 3, 1, 2]
print(len(set(objects)))


3
