# The two-letter country code

The code was adapted from this [Medium article](https://medium.com/analytics-vidhya/how-to-web-scrape-tables-online-using-python-and-beautifulsoup-36d5bafeb982).

In [1]:
import re
from bs4 import BeautifulSoup
import time
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import pandas as pd

In [2]:
# Wikipedia page whose table pairs continent codes with country codes.
url = 'https://en.wikipedia.org/wiki/List_of_sovereign_states_and_dependent_territories_by_continent_(data_file)'
# Bound the wait so a stalled connection cannot hang the notebook, and fail
# loudly on an HTTP error instead of letting later cells parse an error page.
page = requests.get(url, timeout=30)
page.raise_for_status()

In [None]:
# Display the decoded body of the HTTP response for a quick visual check.
page.text

In [4]:
# Parse the response body into a searchable tree using the lxml parser.
html = page.text
soup = BeautifulSoup(html, 'lxml')

In [5]:
# Every <table> whose class attribute is exactly "wikitable sortable".
# NOTE(review): a multi-word class string is matched verbatim, so a table
# carrying extra classes (e.g. "jquery-tablesorter") would not match -- confirm
# against the fetched HTML if this ever comes back empty.
results = soup.find_all('table', attrs={'class': "wikitable sortable"})

In [6]:
# Check which container type find_all handed back.
type(results)

bs4.element.ResultSet

In [None]:
# Display the first matched table; all later cells read from this one.
results[0]

In [8]:
# Column labels of the first table, in document order. The text is kept
# verbatim, so a label may still carry a trailing newline.
first_table = results[0]
headers = [cell.text for cell in first_table.find_all('th')]

headers


['CC', 'a-2', 'a-3', '#', 'Name\n']

In [9]:
# Look at the first <td> cell of the first table.
results[0].find('td')

<td>AS</td>

In [10]:

# Print only the first row after the leading <tr> (which is skipped), leaving
# it bound to `tr` so it can be inspected further.
rows = results[0].find_all('tr')[1:]
if rows:
    tr = rows[0]
    print(tr.text)


AS
AF
AFG
004
Afghanistan, Islamic Republic of



In [11]:
# Whitespace-split the text of `tr`, the row left bound by the preview loop.
# Multi-word names come apart into several tokens.
tr.text.split()

['AS', 'AF', 'AFG', '004', 'Afghanistan,', 'Islamic', 'Republic', 'of']

In [12]:
# One empty list per column we keep: the first two header labels.
kept_columns = headers[:2]
data = {column: [] for column in kept_columns}

In [13]:
# Show the still-empty per-column lists.
data

{'CC': [], 'a-2': []}

In [14]:
# Fill `data` with the first two cells of every row after the first <tr>.
#
# Values are read from the <td> cells themselves rather than from the
# whitespace-split row text: with the split, a blank cell shifts every later
# column one position left, and a row with fewer than two tokens raises
# IndexError.
continent_codes = []
country_codes = []
for tr in results[0].find_all('tr')[1:]:
    cells = tr.find_all('td')
    if len(cells) < 2:
        # Not a data row (no <td> pair to read), so there is nothing to record.
        continue
    continent_codes.append(cells[0].get_text(strip=True))
    country_codes.append(cells[1].get_text(strip=True))

# Assign instead of appending so re-running this cell does not duplicate rows.
data['CC'] = continent_codes
data['a-2'] = country_codes

In [15]:
# Build the frame with one column per key of `data`.
df = pd.DataFrame.from_dict(data, orient='columns')

In [16]:
# Preview the first rows of the frame.
df.head()

Unnamed: 0,CC,a-2
0,AS,AF
1,EU,AL
2,AN,AQ
3,AF,DZ
4,OC,AS


In [17]:
# Number of rows per continent code.
df['CC'].value_counts()

AF    60
EU    58
AS    57
NA    42
OC    27
SA    14
AN     4
Name: CC, dtype: int64

In [18]:
# Continent code -> continent name, used to label each row's 'CC' value.
maper = {
    'AF': 'Africa',
    'AN': 'Antarctica',
    'AS': 'Asia',
    'EU': 'Europe',
    'NA': 'North america',
    'OC': 'Oceania',
    'SA': 'South america',
}

In [19]:
# Display the code-to-name mapping.
maper

{'AF': 'Africa',
 'AN': 'Antarctica',
 'AS': 'Asia',
 'EU': 'Europe',
 'NA': 'North america',
 'OC': 'Oceania',
 'SA': 'South america'}

In [20]:
# Translate each continent code into its name and store it as a new column.
continent_names = df['CC'].map(maper)
df['continent'] = continent_names

In [21]:
# Number of rows per continent name.
df['continent'].value_counts()

Africa           60
Europe           58
Asia             57
North america    42
Oceania          27
South america    14
Antarctica        4
Name: continent, dtype: int64

In [22]:
# Persist the lookup table without the index column. The output folder is
# created first, because to_csv raises OSError when `data/` does not exist.
import os

os.makedirs('data', exist_ok=True)
df.to_csv('data/continent_country_code.csv', index=False)

In [23]:
# Reload the saved table. keep_default_na=False stops pandas from parsing the
# 'NA' continent code as a missing value.
csv_path = 'data/continent_country_code.csv'
df = pd.read_csv(csv_path, keep_default_na=False)

In [24]:
# Re-count continent codes after the CSV round-trip; 'NA' should still appear
# as a regular code.
df['CC'].value_counts()

AF    60
EU    58
AS    57
NA    42
OC    27
SA    14
AN     4
Name: CC, dtype: int64

In [25]:
# Re-count continent names after the CSV round-trip.
df['continent'].value_counts()

Africa           60
Europe           58
Asia             57
North america    42
Oceania          27
South america    14
Antarctica        4
Name: continent, dtype: int64