In [1]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd

In [2]:
# Wikipedia page holding the nominal GDP tables scraped below.
url = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)'
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() stops the notebook on an HTTP error instead of letting
# the following cells silently parse an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
html = response.content

In [3]:
# Build the document tree from the downloaded bytes, using lxml as the parser.
soup = BeautifulSoup(markup=html, features='lxml')

In [4]:
# Every <tr> element found in the document, then the text of each one.
rows = soup.find_all('tr')
rows_parsed = list(map(lambda tr: tr.text, rows))

In [5]:
def parser(row_text):
    """Split the text of one table row into a list of stripped cell strings.

    Doubled newlines are collapsed once, leading/trailing newlines are
    dropped, numeric footnote markers such as ``[3]`` or ``[12]`` are removed,
    and the remainder is split on newlines.

    Args:
        row_text: Text content of a table row.

    Returns:
        list[str]: One whitespace-stripped string per line of the row.
    """
    row_text = row_text.replace('\n\n', '\n').strip('\n')
    # Raw string avoids the invalid-escape warning of the plain literal, and
    # "\d+" also removes multi-digit markers that a single "\d" left in place.
    row_text = re.sub(r'\[\d+\]', '', row_text)
    return [cell.strip() for cell in row_text.split('\n')]

# Run the row parser over the text of every table row.
gdp_parsed = [parser(row_text) for row_text in rows_parsed]

In [6]:
# Parsed row 199 supplies the column names and rows 200..390 supply the
# records. NOTE(review): these positions are hard-coded and presumably tied to
# the page layout at the time of scraping — verify before re-running.
colnames_country, data_country = gdp_parsed[199], gdp_parsed[200:391]

df_gdp = pd.DataFrame(data=data_country, columns=colnames_country)
df_gdp

Unnamed: 0,Rank,Country/Territory,GDP(US$million)
0,,World,85804391
1,1,United States,20494100
2,2,China[n 5],13608152
3,3,Japan,4970916
4,4,Germany,3996759
...,...,...,...
186,181,Palau,310
187,182,Marshall Islands,212
188,183,Kiribati,188
189,184,Nauru,115


In [7]:
def country_cleaning(c):
    """Remove a bracketed note, and everything after it, from a country name.

    Text from the first ``[`` that has a closing ``]`` later on the same line
    through the end of that line is dropped; input without such a pair is
    returned unchanged.

    Args:
        c: Raw country cell text, e.g. ``'China[n 5]'``.

    Returns:
        str: The text with the bracketed note and its trailing text removed.
    """
    # Raw string: the plain literal relied on invalid escape sequences.
    return re.sub(r"\[.*\].*", "", c)

def gdp_cleaning(gdp):
    """Strip thousands separators and parenthesised notes from a GDP string.

    Every comma is removed, and any ``(...)`` group is removed together with
    whatever follows it on the same line.

    Args:
        gdp: Raw GDP cell text, e.g. ``'20,494,100'``.

    Returns:
        str: The cleaned text (not yet converted to a number).
    """
    # Raw string: the plain literal relied on invalid escape sequences.
    return re.sub(r",|\(.*\).*", "", gdp)

def gdp_to_number(gdp):
    """Convert a GDP value to ``int``.

    Args:
        gdp: Value accepted by the built-in ``int`` constructor.

    Returns:
        int: The converted value.

    Raises:
        ValueError: If ``gdp`` is a string that is not a valid integer literal.
    """
    as_integer = int(gdp)
    return as_integer

# Reuse the cleaning helpers defined earlier in this cell instead of repeating
# their regexes inline, so the cleaning rules live in one place.
df_gdp['country'] = df_gdp['Country/Territory'].apply(country_cleaning)
# NOTE(review): the source column is labelled "US$million", so the values
# stored under 'GDP_BUSD' appear to be millions rather than billions of USD.
# The column name is kept unchanged for compatibility — confirm the unit.
df_gdp['GDP_BUSD'] = df_gdp['GDP(US$million)'].apply(gdp_cleaning).apply(gdp_to_number)
columns = ['country', 'GDP_BUSD']
df_gdpcountry = df_gdp[columns]
df_gdpcountry.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191 entries, 0 to 190
Data columns (total 2 columns):
country     191 non-null object
GDP_BUSD    191 non-null int64
dtypes: int64(1), object(1)
memory usage: 3.1+ KB


In [8]:
# Wikipedia page holding the GDP-per-capita tables scraped below.
url_percapita = 'https://en.wikipedia.org/wiki/List_of_countries_by_GDP_(nominal)_per_capita'
# Timeout prevents an indefinite hang; raise_for_status() fails fast on an
# HTTP error instead of letting an error page be parsed as if it were data.
response_percapita = requests.get(url_percapita, timeout=30)
response_percapita.raise_for_status()
html_percapita = response_percapita.content

# Parse the page with the lxml backend.
soup_percapita = BeautifulSoup(html_percapita, 'lxml')

# Text content of every <tr> element in the document.
rows_percapita = soup_percapita.find_all('tr')
rows_parsed_percapita = [row.text for row in rows_percapita]

def parser_percapita(row_text):
    """Split the text of one per-capita table row into stripped cell strings.

    Doubled newlines are collapsed once, leading/trailing newlines are
    dropped, numeric footnote markers such as ``[3]`` or ``[12]`` are removed,
    and the remainder is split on newlines.

    Args:
        row_text: Text content of a table row.

    Returns:
        list[str]: One whitespace-stripped string per line of the row.
    """
    row_text = row_text.replace('\n\n', '\n').strip('\n')
    # Raw string avoids the invalid-escape warning of the plain literal, and
    # "\d+" also removes multi-digit markers that a single "\d" left in place.
    row_text = re.sub(r'\[\d+\]', '', row_text)
    return [cell.strip() for cell in row_text.split('\n')]

# Run the per-capita row parser over the text of every table row.
parsed_percapita = [parser_percapita(row_text) for row_text in rows_parsed_percapita]


# Parsed row 197 supplies the column names and rows 198..385 supply the
# records. NOTE(review): these positions are hard-coded and presumably tied to
# the page layout at the time of scraping — verify before re-running.
colnames_percapita, data_percapita = parsed_percapita[197], parsed_percapita[198:386]

df_percapita = pd.DataFrame(data=data_percapita, columns=colnames_percapita)


# Apply the same cleaning helpers used for the GDP table (country_cleaning,
# gdp_cleaning, gdp_to_number) rather than repeating their regexes inline.
df_percapita['country'] = df_percapita['Country/Territory'].apply(country_cleaning)
df_percapita['GDP_per_capita_USD'] = df_percapita['US$'].apply(gdp_cleaning).apply(gdp_to_number)
columns = ['country', 'GDP_per_capita_USD']
df_percapita_clean = df_percapita[columns]
df_percapita_clean

Unnamed: 0,country,GDP_per_capita_USD
0,Luxembourg,114341
1,Macau,86365
2,Switzerland,82839
3,Norway,81807
4,Ireland,77450
...,...,...
183,Somalia,499
184,Madagascar,461
185,Burundi,412
186,Niger,389


In [9]:
# Inner join on the cleaned country name: only countries present in both
# frames are kept.
df_mergegdp = df_gdpcountry.merge(df_percapita_clean, how='inner', on='country')
df_mergegdp

Unnamed: 0,country,GDP_BUSD,GDP_per_capita_USD
0,World,85804391,11297
1,United States,20494100,62641
2,China,13608152,9771
3,Japan,4970916,39287
4,Germany,3996759,48196
...,...,...,...
173,Palau,310,17318
174,Marshall Islands,212,3621
175,Kiribati,188,1625
176,Nauru,115,9030


In [10]:
# Plain string literal: the path has no placeholders, so the f-prefix was redundant.
df_forbes = pd.read_csv('../data/processed/forbeslist_withcountry.csv')

In [15]:
# Attach the GDP figures to each Forbes entry by country. pd.merge defaults to
# an inner join, so entries whose country has no match are dropped.
df_forbesgdp = pd.merge(df_forbes, df_mergegdp, on='country')
columns = ['name', 'gender', 'age', 'worth_amount_(BUSD)', 'country', 'company_sector', 'company_name', 'GDP_per_capita_USD']
df_definitive = df_forbesgdp[columns]
# Plain string literal: the path has no placeholders, so the f-prefix was
# redundant. The index is still written, as before.
df_definitive.to_csv('../data/processed/definitive_dataframe.csv')

In [16]:
# Preview the first five rows of the final frame.
df_definitive.head(n=5)

Unnamed: 0,name,gender,age,worth_amount_(BUSD),country,company_sector,company_name,GDP_per_capita_USD
0,Jeff Bezos,M,54.0,112.0,United States,Technology,Amazon,62641
1,Bill Gates,M,62.0,90.0,United States,Technology,Microsoft,62641
2,Warren Buffett,M,87.0,84.0,United States,Finance and Investments,Berkshire Hathaway,62641
3,Mark Zuckerberg,M,33.0,71.0,United States,Technology,Facebook,62641
4,Charles Koch,M,82.0,60.0,United States,Diversified,Koch Industries,62641
