In [1]:
# Import the libraries used to download, parse and tabulate the data (pandas, NumPy, BeautifulSoup, requests).

from io import StringIO

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as BS

In [2]:
# Download the GDP-by-country ranking page and parse its HTML with BeautifulSoup.

URL = "https://wisevoter.com/country-rankings/gdp-by-country/"
# Without a timeout, requests waits forever on an unresponsive server and the
# cell never finishes; 30 seconds is a generous upper bound for one page.
response = requests.get(URL, timeout=30)

soup = BS(response.content, "html.parser")

In [3]:
# Show the type of the object that requests.get() returned.
type(response)

requests.models.Response

In [4]:
# Report whether the GET request came back with HTTP status 200.

status_message = 'Request was successful' if response.status_code == 200 else 'Not successful'
print(status_message)

Request was successful


In [5]:
# Grab the first <thead> element of the parsed page (the table's header section).
GDP = soup.find('thead')
GDP

<thead><tr><th>#</th><th>Country</th><th>GDP</th><th>GDP Growth</th><th>GDP per capita</th></tr></thead>

In [6]:
# Collect the column names from the table header.
# Only the <th> cells inside the first <thead> are read, so header cells that
# belong to other tables or to body rows cannot leak into the column list.
# If the page has no <thead>, fall back to every <th> in the document.
header_section = soup.find('thead')
header_cells = header_section.find_all('th') if header_section is not None else soup.find_all('th')
GDP_row = [cell.text for cell in header_cells]
GDP_row

['#', 'Country', 'GDP', 'GDP Growth', 'GDP per capita']

In [7]:
# Render the first <tbody> element of the page as an indented HTML string.
GDP_body = soup.tbody.prettify()

In [8]:
# Build one list of cell texts per <tr> found on the page.
# Each inner list holds the text of that row's <td> cells, in order.
# A row that has no <td> cells (such as the header row) contributes an empty list.
table_value = [
    [cell.text for cell in row.find_all('td')]
    for row in soup.find_all('tr')
]

In [9]:
# Build the GDP DataFrame from the scraped rows, using the header texts as column names.
# Rows scraped from <tr> elements without <td> cells (the header row) arrive as
# all-missing records; dropna(how='all') removes exactly those rows instead of
# assuming the unwanted row is always the first one.
GDP_df = pd.DataFrame(table_value, columns=GDP_row).dropna(how='all')
GDP_df.head()

Unnamed: 0,#,Country,GDP,GDP Growth,GDP per capita
1,1,United States of America,23.3 trillion,3.7%,"$70,248.63"
2,2,People's Republic of China,17.7 trillion,4.4%,"$12,556.33"
3,3,Japan,4.9 trillion,2.4%,"$39,312.66"
4,4,Germany,4.3 trillion,2.1%,"$51,203.55"
5,5,India,3.2 trillion,8.2%,"$2,256.59"


In [10]:
# Remove the dollar sign from the 'GDP per capita' values; no numeric conversion is done here.
# The pattern is a raw string so that the regex escape \$ is not parsed as an
# (invalid) Python string escape, which triggers a warning on recent Python versions.
GDP_df['GDP per capita'] = GDP_df['GDP per capita'].replace({r'\$': ''}, regex=True)
GDP_df.head()

Unnamed: 0,#,Country,GDP,GDP Growth,GDP per capita
1,1,United States of America,23.3 trillion,3.7%,70248.63
2,2,People's Republic of China,17.7 trillion,4.4%,12556.33
3,3,Japan,4.9 trillion,2.4%,39312.66
4,4,Germany,4.3 trillion,2.1%,51203.55
5,5,India,3.2 trillion,8.2%,2256.59


In [11]:
# Save the GDP table as a CSV file, leaving out the DataFrame index.
# The file name previously ended in '.to_csv' (a pasted method name); it now
# carries a regular '.csv' extension like the other exports in this notebook.
GDP_df.to_csv('GDP_df.csv', index=False)

In [12]:
# Download the Wikipedia page listing HIV/AIDS adult prevalence by country and print the HTTP status.
wikiurl = "https://en.wikipedia.org/wiki/List_of_countries_by_HIV/AIDS_adult_prevalence_rate"
# Class string recorded for the target table; it is not referenced by the cells shown in this notebook.
table_class = "ve-ce-branchNode ve-ce-tableNode static-row-numbers ve-ce-mwTableNode wikitable sortable jquery-tablesorter"
# Without a timeout, requests waits forever on an unresponsive server.
response2 = requests.get(wikiurl, timeout=30)
print(response2.status_code)

200


In [13]:
# Parse the downloaded HTML and pick the first <table> carrying the "wikitable" class.
soup2 = BS(response2.text, 'html.parser')
HIV = soup2.find('table', class_='wikitable')
HIV

<table class="wikitable sortable static-row-numbers" style="text-align:right">
<caption>
</caption>
<tbody><tr>
<th>Country/Region
</th>
<th>Adult prevalence <br/> of HIV/AIDS<sup class="reference" id="cite_ref-CIAHIVprevalence_1-2"><a href="#cite_note-CIAHIVprevalence-1">[1]</a></sup>
</th>
<th>Number of people <br/> with HIV/AIDS
</th>
<th>Annual deaths <br/> from HIV/AIDS<sup class="reference" id="cite_ref-CIAHIVdeaths_3-1"><a href="#cite_note-CIAHIVdeaths-3">[3]</a></sup>
</th>
<th>Year of estimate
</th></tr>
<tr>
<td style="text-align:left;"><span class="flagicon"><span class="mw-image-border" typeof="mw:File"><span><img alt="" class="mw-file-element" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/f/fb/Flag_of_Eswatini.svg/23px-Flag_of_Eswatini.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/f/fb/Flag_of_Eswatini.svg/35px-Flag_of_Eswatini.svg.png 1.5x, //upload.wikimedia.org/wikipedia/co

In [14]:
# Convert the HTML table to a DataFrame.
# pd.read_html returns a list of DataFrames; a single <table> was passed in, so element 0 is taken.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas versions.
HIV_df = pd.read_html(StringIO(str(HIV)))[0]


In [15]:
# Shorten the HIV table's column names and call the key column 'Country',
# the same name the GDP and population tables use.
# Note: '-' placeholders are left untouched; no NaN replacement or row dropping is done here.
HIV_df = HIV_df.rename(columns={
    'Country/Region': 'Country',
    'Adult prevalence of HIV/AIDS[1]': '% of Adult with HIV',
    'Number of people with HIV/AIDS': 'people with HIV',
    'Annual deaths from HIV/AIDS[3]': 'Annual deaths from HIV',
})
HIV_df.head()

Unnamed: 0,Country,% of Adult with HIV,people with HIV,Annual deaths from HIV,Year of estimate
0,Eswatini,28.30%,225000,2500,2023
1,Lesotho,23.40%,350000,5000,2023
2,Botswana,22.35%,392000,4990,2023
3,Zimbabwe,21.60%,1600000,25000,2023
4,South Africa,14%,9000000,75000,2023


In [16]:
# Save the HIV table as a CSV file, leaving out the DataFrame index.
HIV_df.to_csv(path_or_buf='HIV5.csv', index=False)

In [17]:
# Download the Wikipedia page listing countries by population (United Nations) and print the HTTP status.
world = "https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)"
# Class string recorded for the target table; it is not referenced by the cells shown in this notebook.
table_class = "wikitable sortable static-row-numbers plainrowheaders srn-white-background jquery-tablesorter"
# Without a timeout, requests waits forever on an unresponsive server.
response3 = requests.get(world, timeout=30)
print(response3.status_code)

200


In [18]:
# Parse the downloaded HTML and pick the first <table> carrying the "wikitable" class.
soup3 = BS(response3.text, 'html.parser')
world_pop = soup3.find('table', class_='wikitable')


In [19]:
# Convert the population table to a DataFrame and shorten its column names.
# pd.read_html returns a list of DataFrames; a single <table> was passed in, so element 0 is taken.
# The markup is wrapped in StringIO because passing a literal HTML string to
# read_html is deprecated in recent pandas versions.
world_pop_df = pd.read_html(StringIO(str(world_pop)))[0].rename(columns={
    'Country / Area': 'Country',
    'UN continental region[4]': 'UN region',
    'UN statistical subregion[4]': 'UN subregion',
})
world_pop_df.head()

Unnamed: 0,Country,UN region,UN subregion,Population (1 July 2022),Population (1 July 2023),Change
0,India,Asia,Southern Asia,1417173173,1428627663,+0.81%
1,China[a],Asia,Eastern Asia,1425887337,1425671352,−0.02%
2,United States[b],Americas,Northern America,338289857,339996564,+0.50%
3,Indonesia,Asia,Southeastern Asia,275501339,277534123,+0.74%
4,Pakistan,Asia,Southern Asia,235824863,240485658,+1.98%


In [20]:
# Save the population table as a CSV file, leaving out the DataFrame index.
world_pop_df.to_csv(path_or_buf='world_pop.csv', index=False)

In [21]:
# Inner-join the three tables (GDP_df, HIV_df, world_pop_df) on 'Country';
# only countries present in all three end up in world_GDP_HIV_df.
# The Wikipedia tables append footnote markers to some names (e.g. "China[a]"),
# which would keep otherwise identical names from matching, so the key is
# normalised on copies of the frames before joining.
# Names that genuinely differ between sources (e.g. "United States of America"
# vs "United States") still do not match and are dropped by the inner join.
def _clean_country(frame):
    """Return a copy of ``frame`` with ``[...]`` markers and outer whitespace removed from 'Country'."""
    cleaned = frame['Country'].str.replace(r'\[[^\]]*\]', '', regex=True).str.strip()
    return frame.assign(Country=cleaned)


world_GDP_HIV_df = pd.merge(
    _clean_country(GDP_df),
    pd.merge(_clean_country(HIV_df), _clean_country(world_pop_df), on='Country'),
    on='Country',
)
world_GDP_HIV_df.head()

Unnamed: 0,#,Country,GDP,GDP Growth,GDP per capita,% of Adult with HIV,people with HIV,Annual deaths from HIV,Year of estimate,UN region,UN subregion,Population (1 July 2022),Population (1 July 2023),Change
0,3,Japan,4.9 trillion,2.4%,39312.66,0.02%,21739,-,2022,Asia,Eastern Asia,123951692,123294513,−0.53%
1,4,Germany,4.3 trillion,2.1%,51203.55,0.25%,100000,-,2023,Europe,Western Europe,83369843,83294633,−0.09%
2,5,India,3.2 trillion,8.2%,2256.59,0.20%,2100000,69000,2017[13],Asia,Southern Asia,1417173173,1428627663,+0.81%
3,6,United Kingdom,3.1 trillion,3.7%,46510.28,0.20%,100000,-,2017,Europe,Northern Europe,67508936,67736802,+0.34%
4,8,Italy,2.1 trillion,2.3%,35657.5,0.30%,130000,-,2016,Europe,Southern Europe,59037474,58870763,−0.28%


In [22]:
# Save the merged table as a CSV file, leaving out the DataFrame index.
world_GDP_HIV_df.to_csv(path_or_buf='world_GDP_HIV.csv', index=False)