# Web Scraping - Data Collection
Web scraping lets you collect data directly from pages on the internet.

In [1]:
# Install the HTTP client (requests), the HTML parsing library (beautifulsoup4) and the lxml parser.
%pip install requests beautifulsoup4 lxml

Collecting lxml
  Using cached lxml-5.3.0-cp312-cp312-win_amd64.whl.metadata (3.9 kB)
Using cached lxml-5.3.0-cp312-cp312-win_amd64.whl (3.8 MB)
Installing collected packages: lxml
Successfully installed lxml-5.3.0
Note: you may need to restart the kernel to use updated packages.


In [2]:
from warnings import filterwarnings

# Silence warnings of every category from this point on in the kernel session.
filterwarnings(action="ignore")

### Get the head title for any webpage

In [3]:
# Address of the Wikipedia article used as the scraping target.
url1 = "https://en.wikipedia.org/wiki/World_population"

# Echo the address so the cell output records which page is being requested.
print(url1)

https://en.wikipedia.org/wiki/World_population


In [4]:
import requests

# Download the page. requests has no default timeout, so without one this cell
# could hang forever if the server never answers.
response = requests.get(url1, timeout=30)

# Stop here with an HTTPError on a 4xx/5xx reply instead of parsing an error page later.
response.raise_for_status()

# Show the response object; <Response [200]> means the request succeeded.
response

<Response [200]>

### Status code 200 (OK) - the request succeeded and the page content was returned

In [7]:
# Body of the HTTP response as raw bytes.
data = response.content

# Preview only the first 1000 bytes rather than printing the whole page.
print(data[:1000])

b'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-1 vector-feature-appearance-pinned-clientpref-1 vector-feature-night-mode-enabled skin-theme-clientpref-day vector-toc-available" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8">\n<title>World population - Wikipedia</title>\n<script>(function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 

### Get the title with the Beautiful Soup package

In [9]:
from bs4 import BeautifulSoup

# Name the parser explicitly. When it is omitted, Beautiful Soup picks whichever
# parser happens to be installed and emits a GuessedAtParserWarning, so the parsed
# tree can differ from one machine to another. lxml is installed by the %pip cell
# at the top of this notebook.
soup = BeautifulSoup(data, "lxml")

In [11]:
# Tag-name attribute access is shorthand for find(): it returns the first <title> tag.
title_tag = soup.title
title_tag

<title>World population - Wikipedia</title>

In [12]:
# Keep only the text inside the <title> tag, without the surrounding markup.
title_text = title_tag.get_text()
title_text

'World population - Wikipedia'

### Find the h1 tag in the body

In [13]:
# First <h1> whose class list contains "firstHeading".
h1_tag = soup.find("h1", attrs={"class": "firstHeading"})
h1_tag

<h1 class="firstHeading mw-first-heading" id="firstHeading"><span class="mw-page-title-main">World population</span></h1>

In [14]:
# Text content of the main heading, with the nested <span> markup stripped.
h1_text = h1_tag.get_text()
h1_text

'World population'

### Get all the section headings (h2 and h3 tags)

In [16]:
# Every <div> carrying the "mw-heading" class; in the output below each one wraps an <h2> or <h3>.
heading_divs = soup.find_all("div", attrs={"class": "mw-heading"})
heading_divs

[<div class="mw-heading mw-heading2"><h2 id="History">History</h2></div>,
 <div class="mw-heading mw-heading3"><h3 id="Ancient_and_post-classical_history">Ancient and post-classical history</h3></div>,
 <div class="mw-heading mw-heading3"><h3 id="Modern_history">Modern history</h3></div>,
 <div class="mw-heading mw-heading3"><h3 id="Milestones_by_the_billions">Milestones by the billions</h3></div>,
 <div class="mw-heading mw-heading2"><h2 id="Global_demographics">Global demographics</h2></div>,
 <div class="mw-heading mw-heading2"><h2 id="Population_by_region">Population by region</h2></div>,
 <div class="mw-heading mw-heading2"><h2 id="Largest_populations_by_country">Largest populations by country</h2></div>,
 <div class="mw-heading mw-heading3"><h3 id="Ten_most_populous_countries">Ten most populous countries</h3></div>,
 <div class="mw-heading mw-heading3"><h3 id="Most_densely_populated_countries">Most densely populated countries</h3></div>,
 <div class="mw-heading mw-heading2"><h2 i

In [21]:
# Text of the <h2> inside the first heading wrapper.
heading_divs[0].h2.get_text()

'History'

In [34]:
# Titles of the top-level sections: look up the <h2> of each wrapper once and
# skip wrappers that have none (those hold lower-level headings).
h2_tags = [
    h2.text
    for h2 in (div.find("h2") for div in heading_divs)
    if h2 is not None
]
h2_tags

['History',
 'Global demographics',
 'Population by region',
 'Largest populations by country',
 'Fluctuation',
 'Mathematical approximations',
 'Number of humans who have ever lived',
 'Human population as a function of food availability',
 'See also',
 'Explanatory notes',
 'References',
 'Further reading',
 'External links']

In [35]:
# Titles of the sub-sections: look up the <h3> of each wrapper once and
# skip wrappers that do not contain one.
h3_tags = [
    h3.text
    for h3 in (div.find("h3") for div in heading_divs)
    if h3 is not None
]

In [36]:
# Display the collected <h3> heading texts.
h3_tags

['Ancient and post-classical history',
 'Modern history',
 'Milestones by the billions',
 'Ten most populous countries',
 'Most densely populated countries',
 'Annual population growth',
 'Population growth by region',
 'Past population',
 'Projections',
 'Years for world population to double',
 'Citations',
 'General and cited sources']

In [26]:
# Text of every heading wrapper, h2 and h3 alike, in document order.
heading_text = [div.get_text() for div in heading_divs]

In [27]:
# Display the text of all heading wrappers.
heading_text

['History',
 'Ancient and post-classical history',
 'Modern history',
 'Milestones by the billions',
 'Global demographics',
 'Population by region',
 'Largest populations by country',
 'Ten most populous countries',
 'Most densely populated countries',
 'Fluctuation',
 'Annual population growth',
 'Population growth by region',
 'Past population',
 'Projections',
 'Mathematical approximations',
 'Years for world population to double',
 'Number of humans who have ever lived',
 'Human population as a function of food availability',
 'See also',
 'Explanatory notes',
 'References',
 'Citations',
 'General and cited sources',
 'Further reading',
 'External links']