In [1]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# URL request and content loading

In [2]:
# Fetch the example page. The timeout keeps this cell from hanging forever on a
# stalled connection, and raise_for_status() turns an HTTP error (4xx/5xx) into
# an exception here instead of silently passing an error page to the parser.
r = requests.get("https://pythonizing.github.io/data/example.html", timeout=10)
r.raise_for_status()
c = r.content  # Raw HTML source of the page, as bytes

In [3]:
# Show the payload's type, a 50-dash divider, then the raw content itself
print(type(c), "-" * 50, c, sep="\n")

<class 'bytes'>
--------------------------------------------------
b'\n<!DOCTYPE html>\n<html>\n<head>\n<style>\ndiv.cities {\n    background-color:black;\n    color:white;\n    margin:20px;\n    padding:20px;\n} \n</style>\n</head>\n<body>\n\n<h1 align="center"> Here are three big cities </h1>\n\n<div class="cities">\n<h2>London</h2>\n<p>London is the capital of England and it\'s been a British settlement since 2000 years ago. </p>\n</div>\n\n<div class="cities">\n<h2>Paris</h2>\n<p>Paris is the capital city of France. It was declared capital since 508.</p>\n</div>\n\n<div class="cities">\n<h2>Tokyo</h2>\n<p>Tokyo is the capital of Japan and one of the most populated cities in the world.</p>\n</div>\n\n</body>\n</html>\n'


# BeautifulSoup

In [4]:
# Parse the downloaded HTML with Python's built-in "html.parser" backend
soup = BeautifulSoup(markup=c, features="html.parser")

## Extracting the tags
* `find`: returns only the first matching element, as a single `Tag` (or `None` if nothing matches)
* `find_all`: returns every matching element, as a list-like `ResultSet`

### Div tags

In [5]:
# Collect every <div class="cities"> element on the page.
# NOTE(review): the name `all` shadows Python's built-in all(); it is kept
# as-is because the cells below refer to it.
all = soup.find_all("div", class_="cities")
all

[<div class="cities">
 <h2>London</h2>
 <p>London is the capital of England and it's been a British settlement since 2000 years ago. </p>
 </div>,
 <div class="cities">
 <h2>Paris</h2>
 <p>Paris is the capital city of France. It was declared capital since 508.</p>
 </div>,
 <div class="cities">
 <h2>Tokyo</h2>
 <p>Tokyo is the capital of Japan and one of the most populated cities in the world.</p>
 </div>]

### H2 & P tags inside the div tags
`find_all` returns a list, so pick one `div` by index first (e.g. `all[0]`) before searching inside it.

In [6]:
# Text of the first <h2> inside the first city <div>
all[0].find_all("h2")[0].get_text()

'London'

In [7]:
# Text of the first <p> inside the first city <div>
all[0].find_all("p")[0].get_text()

"London is the capital of England and it's been a British settlement since 2000 years ago. "

In [8]:
# Print the <h2> heading of every city <div>, one per line
for city_div in all:
    heading = city_div.find_all("h2")[0]
    print(heading.get_text())

London
Paris
Tokyo


# Loading into Dataframe

In [9]:
# Walk the city <div>s once, collecting each one's heading and paragraph text,
# then assemble the two columns into a DataFrame.
names, infos = [], []
for city_div in all:
    names.append(city_div.find_all("h2")[0].get_text())
    infos.append(city_div.find_all("p")[0].get_text())
df = pd.DataFrame({"City": names, "Info": infos})

In [10]:
# Display the resulting City/Info table
df

Unnamed: 0,City,Info
0,London,London is the capital of England and it's been...
1,Paris,Paris is the capital city of France. It was de...
2,Tokyo,Tokyo is the capital of Japan and one of the m...
