# Wikipedia Webscraping
Goal - You should be able to collect any data from any given Wikipedia URL

In [1]:
%pip install beautifulsoup4 requests pydantic lxml

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [14]:
from warnings import filterwarnings
filterwarnings("ignore")

### How to do Webscraping

In [3]:
url1 = "https://en.wikipedia.org/wiki/World_population"
print(url1)

https://en.wikipedia.org/wiki/World_population


In [4]:
import requests
response = requests.get(url1)
response

<Response [200]>

In [5]:
response.content[0:100]

b'<!DOCTYPE html>\n<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-la'

In [6]:
# Get beautiful soup to fetch particular content
from bs4 import BeautifulSoup
soup = BeautifulSoup(response.content)

### Get the head title for website

In [7]:
soup.title.text

'World population - Wikipedia'

In [8]:
h1_tag = soup.find("h1")
h1_tag

<h1 class="firstHeading mw-first-heading" id="firstHeading"><span class="mw-page-title-main">World population</span></h1>

In [9]:
h1_tag.text

'World population'

In [10]:
h2_tags = soup.find_all("h2")
h2_tags

[<h2 class="vector-pinnable-header-label">Contents</h2>,
 <h2 id="History">History</h2>,
 <h2 id="Global_demographics">Global demographics</h2>,
 <h2 id="Population_by_region">Population by region</h2>,
 <h2 id="Largest_populations_by_country">Largest populations by country</h2>,
 <h2 id="Fluctuation">Fluctuation</h2>,
 <h2 id="Mathematical_approximations">Mathematical approximations</h2>,
 <h2 id="Number_of_humans_who_have_ever_lived">Number of humans who have ever lived</h2>,
 <h2 id="Human_population_as_a_function_of_food_availability">Human population as a function of food availability</h2>,
 <h2 id="See_also">See also</h2>,
 <h2 id="Explanatory_notes">Explanatory notes</h2>,
 <h2 id="References">References</h2>,
 <h2 id="Further_reading">Further reading</h2>,
 <h2 id="External_links">External links</h2>]

In [11]:
h2_texts = [tag.text for tag in h2_tags]
h2_texts

['Contents',
 'History',
 'Global demographics',
 'Population by region',
 'Largest populations by country',
 'Fluctuation',
 'Mathematical approximations',
 'Number of humans who have ever lived',
 'Human population as a function of food availability',
 'See also',
 'Explanatory notes',
 'References',
 'Further reading',
 'External links']

In [12]:
div_tags = soup.find_all(name="div", class_="mw-heading")
div_tags

[<div class="mw-heading mw-heading2"><h2 id="History">History</h2></div>,
 <div class="mw-heading mw-heading3"><h3 id="Ancient_and_post-classical_history">Ancient and post-classical history</h3></div>,
 <div class="mw-heading mw-heading3"><h3 id="Modern_history">Modern history</h3></div>,
 <div class="mw-heading mw-heading3"><h3 id="Milestones_by_the_billions">Milestones by the billions</h3></div>,
 <div class="mw-heading mw-heading2"><h2 id="Global_demographics">Global demographics</h2></div>,
 <div class="mw-heading mw-heading2"><h2 id="Population_by_region">Population by region</h2></div>,
 <div class="mw-heading mw-heading2"><h2 id="Largest_populations_by_country">Largest populations by country</h2></div>,
 <div class="mw-heading mw-heading3"><h3 id="Ten_most_populous_countries">Ten most populous countries</h3></div>,
 <div class="mw-heading mw-heading3"><h3 id="Most_densely_populated_countries">Most densely populated countries</h3></div>,
 <div class="mw-heading mw-heading2"><h2 i

In [13]:
heading_texts = [tag.text for tag in div_tags]
heading_texts

['History',
 'Ancient and post-classical history',
 'Modern history',
 'Milestones by the billions',
 'Global demographics',
 'Population by region',
 'Largest populations by country',
 'Ten most populous countries',
 'Most densely populated countries',
 'Fluctuation',
 'Annual population growth',
 'Population growth by region',
 'Past population',
 'Projections',
 'Mathematical approximations',
 'Years for world population to double',
 'Number of humans who have ever lived',
 'Human population as a function of food availability',
 'See also',
 'Explanatory notes',
 'References',
 'Citations',
 'General and cited sources',
 'Further reading',
 'External links']

### Getting images from the website

In [14]:
a_tags = soup.find_all("a", class_="mw-file-description")
a_tags

[<a class="mw-file-description" href="/wiki/File:World_Population_Prospects.svg"><img class="mw-file-element" data-file-height="676" data-file-width="900" decoding="async" height="225" src="//upload.wikimedia.org/wikipedia/commons/thumb/0/0e/World_Population_Prospects.svg/300px-World_Population_Prospects.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/0/0e/World_Population_Prospects.svg/450px-World_Population_Prospects.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/0/0e/World_Population_Prospects.svg/600px-World_Population_Prospects.svg.png 2x" width="300"/></a>,
 <a class="mw-file-description" href="/wiki/File:Illustration_of_contemporary_and_past_human_populations_Our_World_in_Data.png"><img class="mw-file-element" data-file-height="7747" data-file-width="5201" decoding="async" height="328" src="//upload.wikimedia.org/wikipedia/commons/thumb/d/d2/Illustration_of_contemporary_and_past_human_populations_Our_World_in_Data.png/220px-Illustration_of_contempor

In [15]:
a_tags[0]

<a class="mw-file-description" href="/wiki/File:World_Population_Prospects.svg"><img class="mw-file-element" data-file-height="676" data-file-width="900" decoding="async" height="225" src="//upload.wikimedia.org/wikipedia/commons/thumb/0/0e/World_Population_Prospects.svg/300px-World_Population_Prospects.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/0/0e/World_Population_Prospects.svg/450px-World_Population_Prospects.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/0/0e/World_Population_Prospects.svg/600px-World_Population_Prospects.svg.png 2x" width="300"/></a>

In [16]:
a_tags[0].get("href")

'/wiki/File:World_Population_Prospects.svg'

In [17]:
home_page = "https://en.wikipedia.org/"

In [18]:
home_page + a_tags[0].get("href")

'https://en.wikipedia.org//wiki/File:World_Population_Prospects.svg'

In [19]:
img_links = [home_page + tag.get("href") for tag in a_tags]
img_links

['https://en.wikipedia.org//wiki/File:World_Population_Prospects.svg',
 'https://en.wikipedia.org//wiki/File:Illustration_of_contemporary_and_past_human_populations_Our_World_in_Data.png',
 'https://en.wikipedia.org//wiki/File:2006megacities.svg',
 'https://en.wikipedia.org//wiki/File:Expectancy_of_life.svg',
 'https://en.wikipedia.org//wiki/File:Population_pyramid_of_the_world_in_continental_groupings_2023.svg',
 'https://en.wikipedia.org//wiki/File:Global_population_cartogram.png',
 'https://en.wikipedia.org//wiki/File:People%27s_-Km%C2%B2_for_all_countries_(and_us_states,_uk_kingdoms).png',
 'https://en.wikipedia.org//wiki/File:Top_5_Country_Population_Graph_1901_to_2021.svg',
 'https://en.wikipedia.org//wiki/File:Population_Density,_v4.11,_2020_(48009093621).jpg',
 'https://en.wikipedia.org//wiki/File:World_population_(UN).svg',
 'https://en.wikipedia.org//wiki/File:Total_Fertility_Rate_Map_by_Country.svg',
 'https://en.wikipedia.org//wiki/File:World_population_counter,_Eureka,_Hal

### Get all the tables and store results in pandas dataframe

In [20]:
table_tags = soup.find_all("table", class_="wikitable")

In [21]:
table_tags[0]

<table class="wikitable" style="text-align:center; float:right; clear:right; margin-left:8px; margin-right:0;">
<caption>World population milestones in billions<sup class="reference" id="cite_ref-:6_58-0"><a href="#cite_note-:6-58"><span class="cite-bracket">[</span>58<span class="cite-bracket">]</span></a></sup> (Worldometers estimates)
</caption>
<tbody><tr>
<th scope="row">Population
</th>
<th scope="col">1
</th>
<th scope="col">2
</th>
<th scope="col">3
</th>
<th scope="col">4
</th>
<th scope="col">5
</th>
<th scope="col">6
</th>
<th scope="col">7
</th>
<th scope="col">8
</th>
<th scope="col">9
</th>
<th scope="col">10
</th></tr>
<tr>
<th scope="row">Year
</th>
<td>1804</td>
<td>1927</td>
<td>1960</td>
<td>1974</td>
<td>1987</td>
<td>1999</td>
<td>2011</td>
<td>2022</td>
<td><i>2037</i></td>
<td><i>2057</i>
</td></tr>
<tr>
<th scope="row">Years elapsed
</th>
<td>200,000+</td>
<td>123</td>
<td>33</td>
<td>14</td>
<td>13</td>
<td>12</td>
<td>12</td>
<td>11</td>
<td><i>15</i></td>
<td

In [22]:
import pandas as pd
df = pd.read_html(str(table_tags[0]))[0]

In [23]:
df

Unnamed: 0,Population,1,2,3,4,5,6,7,8,9,10
0,Year,1804,1927,1960,1974,1987,1999,2011,2022,2037,2057
1,Years elapsed,"200,000+",123,33,14,13,12,12,11,15,20


In [24]:
dfs = []
for tag in table_tags:
    t = pd.read_html(str(tag))[0]
    dfs.append(t)

In [25]:
for i in dfs[0:3]:
    display(i)

Unnamed: 0,Population,1,2,3,4,5,6,7,8,9,10
0,Year,1804,1927,1960,1974,1987,1999,2011,2022,2037,2057
1,Years elapsed,"200,000+",123,33,14,13,12,12,11,15,20


Unnamed: 0,Region,2022 (percent),2030 (percent),2050 (percent)
0,Sub-Saharan Africa,"1,152 (14.51%)","1,401 (16.46%)","2,094 (21.62%)"
1,Northern Africa and Western Asia,549 (6.91%),617 (7.25%),771 (7.96%)
2,Central Asia and Southern Asia,"2,075 (26.13%)","2,248 (26.41%)","2,575 (26.58%)"
3,Eastern Asia and Southeastern Asia,"2,342 (29.49%)","2,372 (27.87%)","2,317 (23.92%)"
4,Europe and Northern America,"1,120 (14.10%)","1,129 (13.26%)","1,125 (11.61%)"
5,Latin America and the Caribbean,658 (8.29%),695 (8.17%),749 (7.73%)
6,Australia and New Zealand,31 (0.39%),34 (0.40%),38 (0.39%)
7,Oceania,14 (0.18%),15 (0.18%),20 (0.21%)
8,World,7942,8512,9687


Unnamed: 0,Region,Density (inhabitants/km2),Population (millions),Most populous country,Most populous city (metropolitan area)
0,Asia,104.1,4641,"1,439,090,595 – India","13,515,000 – Tokyo Metropolis (37,400,000 – Gr..."
1,Africa,44.4,1340,"0,211,401,000 – Nigeria","09,500,000 – Cairo (20,076,000 – Greater Cairo)"
2,Europe,73.4,747,"0,146,171,000 – Russia, approx. 110 million in...","13,200,000 – Moscow (20,004,000 – Moscow metro..."
3,Latin America,24.1,653,"0,214,103,000 – Brazil","12,252,000 – São Paulo City (21,650,000 – São ..."
4,Northern America[note 1],14.9,368,"0,332,909,000 – United States","08,804,000 – New York City (23,582,649 – New Y..."
5,Oceania,5,42,"0,025,917,000 – Australia","05,367,000 – Sydney"
6,Antarctica,~0,0.004[86],N/A[note 2],"00,001,258 – McMurdo Station"


### Create a wikipedia scraper class to scrape any wikipedia webpage

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
from pydantic import BaseModel, HttpUrl

In [2]:
class WikiScraper(BaseModel):
    url: HttpUrl

    def get_content(self):
        response = requests.get(self.url)
        response.raise_for_status()
        return response.content
    
    def get_soup(self):
        content = self.get_content()
        soup = BeautifulSoup(content)
        return soup
    
    def get_title(self):
        soup = self.get_soup()
        return soup.title.text
    
    def get_h1_tag(self):
        soup = self.get_soup()
        h1_tag = soup.find("h1")
        return h1_tag.text
    
    def get_subheadings(self):
        soup = self.get_soup()
        div_tags = soup.find_all(name="div", class_="mw-heading")
        subheading_texts = [tag.text for tag in div_tags]
        return subheading_texts

    def get_image_links(self):
        soup = self.get_soup()
        a_tags = soup.find_all("a", class_="mw-file-description")
        home_page = "https://en.wikipedia.org/"
        img_links = [home_page + tag.get("href") for tag in a_tags]
        return img_links
    
    def get_tables(self):
        soup = self.get_soup()
        table_tags = soup.find_all("table", class_="wikitable")
        dfs = []
        for tag in table_tags:
            t = pd.read_html(str(tag))[0]
            dfs.append(t)
        return dfs


In [3]:
scraper1 = WikiScraper(url="RandomText")

ValidationError: 1 validation error for WikiScraper
url
  Input should be a valid URL, relative URL without a base [type=url_parsing, input_value='RandomText', input_type=str]
    For further information visit https://errors.pydantic.dev/2.9/v/url_parsing

In [4]:
scraper1 = WikiScraper(url = "https://en.wikipedia.org/wiki/World_population")
scraper1

WikiScraper(url=Url('https://en.wikipedia.org/wiki/World_population'))

In [5]:
scraper1.get_title()

'World population - Wikipedia'

In [6]:
scraper1.get_image_links()

['https://en.wikipedia.org//wiki/File:World_Population_Prospects.svg',
 'https://en.wikipedia.org//wiki/File:Illustration_of_contemporary_and_past_human_populations_Our_World_in_Data.png',
 'https://en.wikipedia.org//wiki/File:2006megacities.svg',
 'https://en.wikipedia.org//wiki/File:Expectancy_of_life.svg',
 'https://en.wikipedia.org//wiki/File:Population_pyramid_of_the_world_in_continental_groupings_2023.svg',
 'https://en.wikipedia.org//wiki/File:Global_population_cartogram.png',
 'https://en.wikipedia.org//wiki/File:People%27s_-Km%C2%B2_for_all_countries_(and_us_states,_uk_kingdoms).png',
 'https://en.wikipedia.org//wiki/File:Top_5_Country_Population_Graph_1901_to_2021.svg',
 'https://en.wikipedia.org//wiki/File:Population_Density,_v4.11,_2020_(48009093621).jpg',
 'https://en.wikipedia.org//wiki/File:World_population_(UN).svg',
 'https://en.wikipedia.org//wiki/File:Total_Fertility_Rate_Map_by_Country.svg',
 'https://en.wikipedia.org//wiki/File:World_population_counter,_Eureka,_Hal

In [7]:
scraper1.get_subheadings()

['History',
 'Ancient and post-classical history',
 'Modern history',
 'Milestones by the billions',
 'Global demographics',
 'Population by region',
 'Largest populations by country',
 'Ten most populous countries',
 'Most densely populated countries',
 'Fluctuation',
 'Annual population growth',
 'Population growth by region',
 'Past population',
 'Projections',
 'Mathematical approximations',
 'Years for world population to double',
 'Number of humans who have ever lived',
 'Human population as a function of food availability',
 'See also',
 'Explanatory notes',
 'References',
 'Citations',
 'General and cited sources',
 'Further reading',
 'External links']

### Scraping other website

In [8]:
scraper2 = WikiScraper(url="https://en.wikipedia.org/wiki/Python_(programming_language)")
scraper2

WikiScraper(url=Url('https://en.wikipedia.org/wiki/Python_(programming_language)'))

In [9]:
scraper2.get_title()

'Python (programming language) - Wikipedia'

In [10]:
scraper2.get_h1_tag()

'Python (programming language)'

In [11]:
scraper2.get_image_links()

['https://en.wikipedia.org//wiki/File:Python-logo-notext.svg',
 'https://en.wikipedia.org//wiki/File:Wikibooks-logo-en-noslogan.svg',
 'https://en.wikipedia.org//wiki/File:Guido_van_Rossum_OSCON_2006_cropped.png',
 'https://en.wikipedia.org//wiki/File:Hello_World_in_Python.png',
 'https://en.wikipedia.org//wiki/File:Af-Helloworld_(C_Sharp).svg',
 'https://en.wikipedia.org//wiki/File:Python_3._The_standard_type_hierarchy-en.svg',
 'https://en.wikipedia.org//wiki/File:Python_Powered.png',
 'https://en.wikipedia.org//wiki/File:Octicons-terminal.svg',
 'https://en.wikipedia.org//wiki/File:Python-logo-notext.svg',
 'https://en.wikipedia.org//wiki/File:Symbol_portal_class.svg',
 'https://en.wikipedia.org//wiki/File:Symbol_portal_class.svg']

In [12]:
scraper2.get_subheadings()

['History[edit]',
 'Design philosophy and features[edit]',
 'Syntax and semantics[edit]',
 'Indentation[edit]',
 'Statements and control flow[edit]',
 'Expressions[edit]',
 'Methods[edit]',
 'Typing[edit]',
 'Arithmetic operations[edit]',
 'Programming examples[edit]',
 'Libraries[edit]',
 'Development environments[edit]',
 'Implementations[edit]',
 'Reference implementation[edit]',
 'Other implementations[edit]',
 'No longer supported implementations[edit]',
 'Cross-compilers to other languages[edit]',
 'Performance[edit]',
 'Development[edit]',
 'API documentation generators[edit]',
 'Naming[edit]',
 'Popularity[edit]',
 'Uses[edit]',
 'Languages influenced by Python[edit]',
 'See also[edit]',
 'References[edit]',
 'Sources[edit]',
 'Further reading[edit]',
 'External links[edit]']

In [16]:
dfs2 = scraper2.get_tables()

In [17]:
dfs2[0]

Unnamed: 0,Type,Mutability,Description,Syntax examples
0,bool,immutable,Boolean value,True False
1,bytearray,mutable,Sequence of bytes,"bytearray(b'Some ASCII') bytearray(b""Some ASCI..."
2,bytes,immutable,Sequence of bytes,"b'Some ASCII' b""Some ASCII"" bytes([119, 105, 1..."
3,complex,immutable,Complex number with real and imaginary parts,3+2.7j 3 + 2.7j
4,dict,mutable,Associative array (or dictionary) of key and v...,"{'key1': 1.0, 3: False} {}"
5,types.EllipsisType,immutable,An ellipsis placeholder to be used as an index...,... Ellipsis
6,float,immutable,Double-precision floating-point number. The pr...,1.33333
7,frozenset,immutable,"Unordered set, contains no duplicates; can con...","frozenset([4.0, 'string', True])"
8,int,immutable,Integer of unlimited magnitude[121],42
9,list,mutable,"List, can contain mixed types","[4.0, 'string', True] []"


### Scraping one more url

In [18]:
scraper3 = WikiScraper(url="https://en.wikipedia.org/wiki/Data_science")
scraper3

WikiScraper(url=Url('https://en.wikipedia.org/wiki/Data_science'))

In [19]:
scraper3.get_title()

'Data science - Wikipedia'

In [20]:
scraper3.get_subheadings()

['Foundations[edit]',
 'Relationship to statistics[edit]',
 'Etymology[edit]',
 'Early usage[edit]',
 'Modern usage[edit]',
 'Data science and data analysis[edit]',
 'Cloud computing for data science[edit]',
 'Ethical consideration in data science[edit]',
 'See also[edit]',
 'References[edit]']

In [21]:
scraper3.get_image_links()

['https://en.wikipedia.org//wiki/File:PIA23792-1600x1200(1).jpg',
 'https://en.wikipedia.org//wiki/File:EDA_example_-_Always_plot_your_data.jpg',
 'https://en.wikipedia.org//wiki/File:Cloud_computing_in_enabling_data_science_at_scale.jpg']

### From given 5 wikipedia urls scrape all the image links as dictionary

In [22]:
urls = [
    "https://en.wikipedia.org/wiki/Data_science",
    "https://en.wikipedia.org/wiki/Data_analysis",
    "https://en.wikipedia.org/wiki/JavaScript",
    "https://en.wikipedia.org/wiki/Rust_(programming_language)",
    "https://en.wikipedia.org/wiki/Python_(programming_language)"
]

In [29]:
images = {}
for i in urls:
    scraper = WikiScraper(url=i)
    print(scraper)
    h1 = scraper.get_h1_tag()
    links = scraper.get_image_links()
    images[h1] = links
    print("\n========================================\n")

url=Url('https://en.wikipedia.org/wiki/Data_science')


url=Url('https://en.wikipedia.org/wiki/Data_analysis')


url=Url('https://en.wikipedia.org/wiki/JavaScript')


url=Url('https://en.wikipedia.org/wiki/Rust_(programming_language)')


url=Url('https://en.wikipedia.org/wiki/Python_(programming_language)')




In [30]:
images

{'Data science': ['https://en.wikipedia.org//wiki/File:PIA23792-1600x1200(1).jpg',
  'https://en.wikipedia.org//wiki/File:EDA_example_-_Always_plot_your_data.jpg',
  'https://en.wikipedia.org//wiki/File:Cloud_computing_in_enabling_data_science_at_scale.jpg'],
 'Data analysis': ['https://en.wikipedia.org//wiki/File:Rayleigh-Taylor_instability.jpg',
  'https://en.wikipedia.org//wiki/File:Data_visualization_process_v1.png',
  'https://en.wikipedia.org//wiki/File:Relationship_of_data,_information_and_intelligence.png',
  'https://en.wikipedia.org//wiki/File:Social_Network_Analysis_Visualization.png',
  'https://en.wikipedia.org//wiki/File:Total_Revenues_and_Outlays_as_Percent_GDP_2013.png',
  'https://en.wikipedia.org//wiki/File:U.S._Phillips_Curve_2000_to_2013.png',
  'https://en.wikipedia.org//wiki/File:US_Employment_Statistics_-_March_2015.png',
  'https://en.wikipedia.org//wiki/File:User-activities.png'],
 'JavaScript': ['https://en.wikipedia.org//wiki/File:JavaScript_code.png',
  'htt

In [26]:
images.keys()

dict_keys(['Data science', 'Data analysis', 'JavaScript', 'Rust (programming language)', 'Python (programming language)'])

In [27]:
images.get("JavaScript")

['https://en.wikipedia.org//wiki/File:JavaScript_code.png',
 'https://en.wikipedia.org//wiki/File:Wikibooks-logo-en-noslogan.svg',
 'https://en.wikipedia.org//wiki/File:Node.js_logo.svg',
 'https://en.wikipedia.org//wiki/File:Octicons-terminal.svg']

In [31]:
images.get('Data science')

['https://en.wikipedia.org//wiki/File:PIA23792-1600x1200(1).jpg',
 'https://en.wikipedia.org//wiki/File:EDA_example_-_Always_plot_your_data.jpg',
 'https://en.wikipedia.org//wiki/File:Cloud_computing_in_enabling_data_science_at_scale.jpg']