# 04 - More BeautifulSoup practice
Using the sovereign states Wikipedia page, you're going to identify different parts of the unconventionally structured table.

## 1. Select all the rows of the first table on the page

In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import requests

# raise pandas' row display limit to 255 so the long tables built below are shown in full rather than truncated
pd.set_option('display.max_rows', 255)

In [3]:
# URL of Wikipedia's "List of sovereign states" page, which is the page scraped in this notebook
states_url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states'

In [10]:
# request the page using a GET request
# - the timeout keeps this cell from hanging indefinitely if the server never answers
# - raise_for_status() stops the notebook here on a 4xx/5xx reply instead of parsing an error page later

states_r = requests.get(states_url, timeout=30)
states_r.raise_for_status()
states_r

<Response [200]>

In [13]:
# parse the downloaded HTML into a BeautifulSoup object using Python's built-in parser

doc = BeautifulSoup(markup=states_r.text, features='html.parser')

In [14]:
# preview only the start of the parsed page; displaying the whole document floods the notebook output
print(str(doc)[:1000])

<!DOCTYPE html>

<html class="client-nojs" dir="ltr" lang="en">
<head>
<meta charset="utf-8"/>
<title>List of sovereign states - Wikipedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"12e69c10-d11b-4b83-b2ff-0872a0d466ca","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_sovereign_states","wgTitle":"List of sovereign states","wgCurRevisionId":1033218383,"wgRevisionId":1033218383,"wgArticleId":68253,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive template wayback links","CS1 maint: archived copy as title","Webarchive template other archives","CS1 uses Russian-language 

In [38]:
# grab the first <table> element in the document (attribute access is shorthand for find('table'))
table = doc.table

In [39]:
# collect every <tr> found inside that table
trs = table.find_all(name='tr')

## 2. Extract text only from all td tags
You'll do this to identify where to separate the two tables

In [40]:
# for every row after the first one, keep just the text of each of its td tags
full_table_data = [
    [cell.text for cell in row.find_all('td')]
    for row in trs[1:]
]

In [41]:
# peek at the first five extracted rows to see where the real data starts
full_table_data[:5]

[['A AAA\n', 'A AAA\n', 'A AAA\n', '\n'],
 ['ZZZ↓ UN member states and observer states ↓\n', 'A AAA\n', 'ZZZ\n', '\n'],
 ['ZZZAbkhazia → See Abkhazia listing\n'],
 ['\xa0Afghanistan\xa0– Islamic Republic of Afghanistan\n',
  'A UN member state\n',
  'A None\n',
  '\n'],
 ['\xa0Albania\xa0– Republic of Albania\n',
  'A UN member state\n',
  'A None\n',
  '\n']]

In [32]:
# separate the first table from the second with list indexing
# NOTE(review): 3 and 226 are positions read off the extracted rows above - re-check them if the page changes
states_df = pd.DataFrame(data=full_table_data[3:226])
states_df

Unnamed: 0,0,1,2,3
0,Afghanistan – Islamic Republic of Afghanistan\n,A UN member state\n,A None\n,\n
1,Albania – Republic of Albania\n,A UN member state\n,A None\n,\n
2,Algeria – People's Democratic Republic of Alg...,A UN member state\n,A None\n,\n
3,Andorra – Principality of Andorra\n,A UN member state\n,A None\n,Andorra is a co-principality in which the offi...
4,Angola – Republic of Angola\n,A UN member state\n,A None\n,\n
5,Antigua and Barbuda\n,A UN member state\n,A None\n,Antigua and Barbuda is a Commonwealth realm[e]...
6,Argentina – Argentine Republic[g]\n,A UN member state\n,A None\n,Argentina is a federation of 23 provinces and ...
7,Armenia – Republic of Armenia\n,A UN member state\n,Not recognised by Pakistan.\n,Armenia is not recognised by Pakistan due to t...
8,ZZZArtsakh → See Artsakh listing\n,,,
9,Australia – Commonwealth of Australia\n,A UN member state\n,A None\n,Australia is a Commonwealth realm[e] and a fed...


In [42]:
# look at the first five rows of the first table
states_df.head(n=5)

Unnamed: 0,0,1,2,3
0,Afghanistan – Islamic Republic of Afghanistan\n,A UN member state\n,A None\n,\n
1,Albania – Republic of Albania\n,A UN member state\n,A None\n,\n
2,Algeria – People's Democratic Republic of Alg...,A UN member state\n,A None\n,\n
3,Andorra – Principality of Andorra\n,A UN member state\n,A None\n,Andorra is a co-principality in which the offi...
4,Angola – Republic of Angola\n,A UN member state\n,A None\n,\n


In [43]:
# the second table: everything from position 229 up to (not including) the last two extracted rows
other_states_df = pd.DataFrame(data=full_table_data[229:-2])

In [44]:
# look at the first five rows of the second table
other_states_df.head(n=5)

Unnamed: 0,0,1,2,3
0,Abkhazia – Republic of Abkhazia\n,D No membership\n,B Claimed by Georgia Claimed by North Korea Cl...,"Recognised by Russia, Nauru, Nicaragua, Syria,..."
1,Artsakh – Republic of Artsakh[ag]\n,D No membership\n,B Claimed by Georgia Claimed by North Korea Cl...,"A de facto independent state,[56][57][58] reco..."
2,Cook Islands\n,D Member of eight UN specialized agencies\n,A None(See political status)\n,"A state in free association with New Zealand, ..."
3,Kosovo – Republic of Kosovo\n,D Member of two UN specialized agencies\n,B Claimed by Georgia Claimed by North Korea Cl...,Pursuant to United Nations Security Council Re...
4,Niue\n,D Member of five UN specialized agencies\n,A None(See political status)\n,"A state in free association with New Zealand, ..."


## 3. Explore further the contents of each table
### a. States (table 1)
![image](../answers/assets/sovereign-states-tr.png)

In [58]:
# gather the td tags themselves (not just their text) for each row of the first table,
# using the row positions found from the data frames above
# rows whose first td has a colspan are NOT removed here, so list positions stay the same as in
# the data frame; they are filtered out later, in step 4
# NOTE(review): full_table_data[3:226] corresponds to trs[4:227], so this slice ends one row
# earlier than the data frame does - confirm which end is intended

states_list = []
for tr in trs[4:226]:
    # every row gets its own fresh list of td tags, including a row that has no td at all
    states_list.append(list(tr.find_all('td')))

In [77]:
# inspect the td tags of a single row; position 10 is Austria's row (see the output)
states_list[10]

[<td style="vertical-align:top;"><span id="Austria"></span><b><span class="flagicon"><img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/41/Flag_of_Austria.svg/23px-Flag_of_Austria.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/4/41/Flag_of_Austria.svg/35px-Flag_of_Austria.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/4/41/Flag_of_Austria.svg/45px-Flag_of_Austria.svg.png 2x" width="23"/> </span><a href="/wiki/Austria" title="Austria">Austria</a></b> – Republic of Austria
 </td>,
 <td><span style="display:none">A</span> UN member state
 </td>,
 <td><span style="display:none">A</span> None
 </td>,
 <td style="vertical-align:top;text-align:left;font-size:90%;">Member of the <a href="/wiki/European_Union" title="European Union">European Union</a>.<sup class="reference" id="cite_ref-EU_4-1"><a href="#cite_note-EU-4">[c]</a></sup> Austria is a <a href="/

This is the same information as the image above except in a BeautifulSoup object and not html. Identify what information might be useful apart from the text within each td tag.

- href link to wiki page
- href link to flag icon

Find those attributes using a single row as an example

In [78]:
# assign a list item to a variable (i.e. country_name = list[index])
# states_list[10] is the list of td tags for Austria's row

austria = states_list[10]

In [79]:
# check the results: the list of td tags for the chosen row
austria

[<td style="vertical-align:top;"><span id="Austria"></span><b><span class="flagicon"><img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/4/41/Flag_of_Austria.svg/23px-Flag_of_Austria.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/4/41/Flag_of_Austria.svg/35px-Flag_of_Austria.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/4/41/Flag_of_Austria.svg/45px-Flag_of_Austria.svg.png 2x" width="23"/> </span><a href="/wiki/Austria" title="Austria">Austria</a></b> – Republic of Austria
 </td>,
 <td><span style="display:none">A</span> UN member state
 </td>,
 <td><span style="display:none">A</span> None
 </td>,
 <td style="vertical-align:top;text-align:left;font-size:90%;">Member of the <a href="/wiki/European_Union" title="European Union">European Union</a>.<sup class="reference" id="cite_ref-EU_4-1"><a href="#cite_note-EU-4">[c]</a></sup> Austria is a <a href="/

In [80]:
# wiki page: href of the first link inside the first cell
austria[0].a['href']

'/wiki/Austria'

In [81]:
# flag link: src of the first image inside the first cell
austria[0].img['src']

'//upload.wikimedia.org/wikipedia/commons/thumb/4/41/Flag_of_Austria.svg/23px-Flag_of_Austria.svg.png'

In [83]:
# citations: print each <sup> tag found in the fourth cell
for citation in austria[3].find_all(name='sup'):
    print(citation)

<sup class="reference" id="cite_ref-EU_4-1"><a href="#cite_note-EU-4">[c]</a></sup>


### b. Other states (table 2)
![image](../answers/assets/sovereign-states-other-tr-1.png)
![image](../answers/assets/sovereign-states-other-tr-2.png)

In [85]:
# one inner list of td tags per row of the second table
other_states_list = [list(row.find_all('td')) for row in trs[230:-2]]

In [87]:
# inspect the td tags of the first row of the second table (Abkhazia, see the output)
other_states_list[0]

[<td style="vertical-align:top;"><span id="Abkhazia"></span><b><span class="flagicon"><img alt="" class="thumbborder" data-file-height="300" data-file-width="600" decoding="async" height="12" src="//upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Flag_of_the_Republic_of_Abkhazia.svg/23px-Flag_of_the_Republic_of_Abkhazia.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Flag_of_the_Republic_of_Abkhazia.svg/35px-Flag_of_the_Republic_of_Abkhazia.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/7/7a/Flag_of_the_Republic_of_Abkhazia.svg/46px-Flag_of_the_Republic_of_Abkhazia.svg.png 2x" width="23"/> </span><a href="/wiki/Abkhazia" title="Abkhazia">Abkhazia</a></b> – Republic of Abkhazia
 </td>,
 <td style="background:LemonChiffon;"><span style="display:none">D</span> No membership
 </td>,
 <td style="background:LightCoral;"><span style="display:none">B </span>Claimed by <a href="#Georgia">Georgia</a><span style="display:none"> Claimed by <a href="#Korea_North

This is the same information as the image above except in a BeautifulSoup object and not html. Identify what information might be useful apart from the text within each td tag.

- Link to the flag?
- Citations?
- Recognized by?

Find those attributes in a single row 

In [90]:
# assign one row of the second table to a variable (i.e. country_name = list[index])
northern_cyprus = other_states_list[5]

In [91]:
# check the results: the list of td tags for the chosen row
northern_cyprus

[<td style="vertical-align:top;"><span id="Northern_Cyprus"></span><b><span class="flagicon"><img alt="" class="thumbborder" data-file-height="600" data-file-width="900" decoding="async" height="15" src="//upload.wikimedia.org/wikipedia/commons/thumb/1/1e/Flag_of_the_Turkish_Republic_of_Northern_Cyprus.svg/23px-Flag_of_the_Turkish_Republic_of_Northern_Cyprus.svg.png" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/1/1e/Flag_of_the_Turkish_Republic_of_Northern_Cyprus.svg/35px-Flag_of_the_Turkish_Republic_of_Northern_Cyprus.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/1/1e/Flag_of_the_Turkish_Republic_of_Northern_Cyprus.svg/45px-Flag_of_the_Turkish_Republic_of_Northern_Cyprus.svg.png 2x" width="23"/> </span><a href="/wiki/Northern_Cyprus" title="Northern Cyprus">Northern Cyprus</a></b> – Turkish Republic of Northern Cyprus
 </td>,
 <td style="background:LemonChiffon;"><span style="display:none">D</span> No membership
 </td>,
 <td style="background:LightCoral;"><spa

In [92]:
# citations? print each <sup> tag found in the fourth cell
for citation in northern_cyprus[3].find_all(name='sup'):
    print(citation)

<sup class="reference" id="cite_ref-97"><a href="#cite_note-97">[64]</a></sup>


In [93]:
# Link to the flag? src of the first image inside the first cell
northern_cyprus[0].img['src']


'//upload.wikimedia.org/wikipedia/commons/thumb/1/1e/Flag_of_the_Turkish_Republic_of_Northern_Cyprus.svg/23px-Flag_of_the_Turkish_Republic_of_Northern_Cyprus.svg.png'

In [107]:
# recognized by?
# takes the second link in the fourth cell, which is Turkey for this row (see the output)
# NOTE(review): a fixed position works for this row only - other rows may order their links differently
northern_cyprus[3].find_all('a')[1]

<a href="#Turkey">Turkey</a>

## 4. Extract the information from step 3 for each row
Create a header row based on the new columns created
### a. States

In [108]:
# apply to all rows the BeautifulSoup queries written for one row in 3a
# each output row is: [flag image src, wiki page href, list of citation ids, text of every td ...]
# NOTE(review): full_table_data[3:226] corresponds to trs[4:227]; this slice ends one row
# earlier - confirm which end is intended

states_list_4 = []
for tr in trs[4:226]:
    tds = tr.find_all('td')

    # skip rows without any td and rows whose first td carries a colspan attribute;
    # an explicit check is used so that unrelated errors are not silently swallowed
    if not tds or tds[0].has_attr('colspan'):
        continue

    # tds[0] is the first td tag of the row: it holds the flag image and the wiki link
    name_cell = tds[0]
    flag = name_cell.find('img')
    link = name_cell.find('a')

    # ids of the <sup> tags in the fourth td; empty when the row has fewer than four tds
    citations = []
    if len(tds) > 3:
        citations = [sup['id'] for sup in tds[3].find_all('sup') if sup.has_attr('id')]

    new_row = [
        flag['src'] if flag is not None else None,
        link['href'] if link is not None else None,
        citations,
    ]
    # followed by the plain text of every td in the row
    new_row.extend(td.text for td in tds)
    states_list_4.append(new_row)

In [109]:
# turn the list of enriched rows into a dataframe
states_df_4 = pd.DataFrame(data=states_list_4)

In [110]:
# column names: the three extracted fields first, then the text of the four td tags
states_columns_4 = [
    'flag_img',
    'wiki_link',
    'citations',
    'name',
    'un_membership',
    'dispute',
    'more_info',
]
states_df_4.columns = states_columns_4

In [111]:
# look at the first five rows of the enriched dataframe
states_df_4.head()

Unnamed: 0,flag_img,wiki_link,citations,name,un_membership,dispute,more_info
0,//upload.wikimedia.org/wikipedia/commons/thumb...,/wiki/Afghanistan,[],Afghanistan – Islamic Republic of Afghanistan\n,A UN member state\n,A None\n,\n
1,//upload.wikimedia.org/wikipedia/commons/thumb...,/wiki/Albania,[],Albania – Republic of Albania\n,A UN member state\n,A None\n,\n
2,//upload.wikimedia.org/wikipedia/commons/thumb...,/wiki/Algeria,[],Algeria – People's Democratic Republic of Alg...,A UN member state\n,A None\n,\n
3,//upload.wikimedia.org/wikipedia/commons/thumb...,/wiki/Andorra,[cite_ref-6],Andorra – Principality of Andorra\n,A UN member state\n,A None\n,Andorra is a co-principality in which the offi...
4,//upload.wikimedia.org/wikipedia/commons/thumb...,/wiki/Angola,[],Angola – Republic of Angola\n,A UN member state\n,A None\n,\n


### b. Other states

In [114]:
# apply to all rows of the second table the queries explored for one row in step 3
# each output row is: [style attribute of the second td,
#                      HTML of the first td split on '–',
#                      HTML of the third td split on '</a>' (or [] when it has no link),
#                      text of every td ...]
# NOTE(review): the two split columns hold raw HTML fragments rather than clean text
# (see the output) - confirm that is what is wanted

other_states_list_4 = []
for tr in trs[230:-2]:
    tds = tr.find_all('td')

    # skip rows without any td and rows whose first td carries a colspan attribute;
    # an explicit check is used so that unrelated errors are not silently swallowed
    if not tds or tds[0].has_attr('colspan'):
        continue

    new_row = [
        tds[1].get('style'),
        # str.split always returns at least one piece, so no length check is needed
        str(tds[0]).split('–'),
        str(tds[2]).split('</a>') if tds[2].find('a') is not None else [],
    ]
    # followed by the plain text of every td in the row
    new_row.extend(td.text for td in tds)
    other_states_list_4.append(new_row)

In [115]:
# turn the list of enriched rows into a dataframe
other_states_df_4 = pd.DataFrame(data=other_states_list_4)

In [116]:
# rename the columns for the new dataframe: the three extracted fields first, then the text of the four td tags
other_states_df_4.columns = ['background_color', 'secondary_name', 'claimed_disputed', 'name', 'un_membership', 'notes', 'other_notes']

In [117]:
# look at the first five rows of the renamed dataframe
other_states_df_4.head(n=5)

Unnamed: 0,background_color,secondary_name,claimed_disputed,name,un_membership,notes,other_notes
0,background:LemonChiffon;,"[<td style=""vertical-align:top;""><span id=""Abk...","[<td style=""background:LightCoral;""><span styl...",Abkhazia – Republic of Abkhazia\n,D No membership\n,B Claimed by Georgia Claimed by North Korea Cl...,"Recognised by Russia, Nauru, Nicaragua, Syria,..."
1,background:LemonChiffon;,"[<td style=""vertical-align:top;""><span id=""Art...","[<td style=""background:LightCoral;""><span styl...",Artsakh – Republic of Artsakh[ag]\n,D No membership\n,B Claimed by Georgia Claimed by North Korea Cl...,"A de facto independent state,[56][57][58] reco..."
2,background:lightgreen;,"[<td style=""vertical-align:top;""><span id=""Coo...","[<td><span style=""display:none"">A</span> None<...",Cook Islands\n,D Member of eight UN specialized agencies\n,A None(See political status)\n,"A state in free association with New Zealand, ..."
3,background:lightgreen;,"[<td style=""vertical-align:top;""><span id=""Kos...","[<td style=""background:LightCoral;""><span styl...",Kosovo – Republic of Kosovo\n,D Member of two UN specialized agencies\n,B Claimed by Georgia Claimed by North Korea Cl...,Pursuant to United Nations Security Council Re...
4,background:lightgreen;,"[<td style=""vertical-align:top;""><span id=""Niu...","[<td><span style=""display:none"">A</span> None<...",Niue\n,D Member of five UN specialized agencies\n,A None(See political status)\n,"A state in free association with New Zealand, ..."
