In [2]:
import pandas as pd
import numpy as np
import matplotlib as plt
import country_converter as coco
import seaborn as sns

# Show every column when a wide frame is displayed.
# The fully-qualified key is required: the bare pattern "max_columns" also
# matches "styler.render.max_columns" on recent pandas and raises OptionError.
pd.set_option("display.max_columns", None)

In [3]:
# Shared CountryConverter instance; later cells call cc.convert(..., to='ISO3')
# to turn each dataset's country names into the 'Country Code' merge key.
cc = coco.CountryConverter()

In [4]:
# Load the COVID-19 table, keeping only the location fields and the 5/25/20 snapshot.
covid_raw = pd.read_csv("data/covid.csv")
covid_df = covid_raw.loc[:, ['Country/Region', 'Lat', 'Long', '5/25/20']]

# Drop cruise ships (rows labelled 48 and 104).
# NOTE(review): these labels are hard-coded to this particular file's row order - confirm
# they still point at the cruise-ship rows if data/covid.csv is refreshed.
covid_df = covid_df.drop(index=[48, 104])

# Add the ISO3 country code used as the merge key by every later cell.
covid_codes = cc.convert(names=covid_df['Country/Region'].tolist(), to='ISO3')
covid_df['Country Code'] = covid_codes

# Give the columns their final names and put them in their final order.
covid_df = covid_df.rename(
    columns={'5/25/20': 'COVID-19 Cases', 'Country/Region': 'Country'}
)[['Country', 'Country Code',  'Lat', 'Long', 'COVID-19 Cases']]

In [5]:
# Malaria data: rename the columns, attach ISO3 codes, then left-join onto the COVID table.

malaria_df = pd.read_csv('data/Malaria_WHO.csv').rename(
    columns={"Country":"country", "Estimated number of malaria cases":"Malaria Cases"}
)

# The country name is only needed to derive the code; drop it afterwards.
malaria_codes = cc.convert(names=malaria_df['country'].tolist(), to='ISO3')
malaria_df = malaria_df.assign(**{'Country Code': malaria_codes}).drop(columns=['country'])

# The left join keeps every COVID row, including countries absent from the malaria file.
df = covid_df.merge(malaria_df, how='left', on='Country Code')

In [6]:
# Human development index data: attach ISO3 codes and left-join onto the running table.

# ',' is parsed as a thousands separator in numeric fields.
hdi_raw = pd.read_csv('data/hdi.csv', thousands=',')
hdi_codes = cc.convert(names=hdi_raw['Country'].tolist(), to='ISO3')

# Swap the country name for its code so only 'Country Code' is shared with df.
hdi_df = hdi_raw.assign(**{'Country Code': hdi_codes}).drop(columns='Country')

df = df.merge(hdi_df, how='left', on='Country Code')

In [7]:
# World happiness index data: one row per country, then left-join onto the running table.

whi_df = pd.read_csv('data/whi.csv')
whi_df['Country Code'] = cc.convert(names=whi_df['Country name'].tolist(), to='ISO3')

# Keep the last row listed for each country.
# NOTE(review): presumably the file is sorted by Year so this is the latest year - confirm.
whi_last = whi_df.groupby('Country name', as_index=False).nth(-1)
whi_df = whi_last.drop(columns=['Year', 'Country name'])

df = df.merge(whi_df, how='left', on='Country Code')

In [8]:
# Air travel data: keep the 2018 column under a descriptive name and left-join it.
# This file already carries a 'Country Code' column, so no name conversion is done here.

air_df = (
    pd.read_csv('data/air.csv')
    .loc[:, ['Country Code', '2018']]
    .rename(columns={'2018': 'Air Passengers Carried'})
)

df = df.merge(air_df, how='left', on='Country Code')

In [9]:
#clean and merge testing data

test_df = pd.read_csv('data/tests.csv')
# Keep only the first 230 rows.
# NOTE(review): the 230 cutoff is specific to this file's layout - confirm what follows it.
# .copy() detaches the slice from the parent frame so the column assignment below
# writes to an independent frame instead of raising SettingWithCopyWarning.
test_df = test_df.iloc[0:230].copy()

# Derive the ISO3 merge key from the country names.
countries = test_df['Country'].tolist()
standard_countries = cc.convert(names=countries, to='ISO3')
test_df['Country Code'] = standard_countries

# Keep only the merge key and the testing figures.
test_df = test_df[['Country Code', 'Tested', 'Positive', 'Positive/Tested %']]

test_df = test_df.rename(columns={'Tested': 'Tests Conducted', 'Positive': 'Positive Tests'})

df = pd.merge(df, test_df, how='left', on='Country Code')

In [10]:
#clean and merge hospital bed data

# ',' is parsed as a thousands separator in numeric fields.
beds_df = pd.read_csv('data/hospital_beds.csv', thousands=',')

# Derive the ISO3 merge key from the country/territory names.
countries = beds_df['Country/territory'].tolist()
standard_countries = cc.convert(names=countries, to='ISO3')
beds_df['Country Code'] = standard_countries

# Keep only the merge key and the two bed metrics, under display-friendly names.
beds_df = beds_df[['Country Code', 'hosp_beds_per_1000_2017', 'Occupancy_percent']]
beds_df = beds_df.rename(columns={'hosp_beds_per_1000_2017': 'Hospital Beds/1000', 'Occupancy_percent': 'Occupancy%'})

# (A stray mid-cell `beds_df.dtypes` expression was removed: it was never displayed and had no effect.)
df = pd.merge(df, beds_df, how='left', on='Country Code')



In [11]:
#clean and merge population data

# ',' is parsed as a thousands separator and 'N.A.' entries are read as missing values.
pop_df = pd.read_csv('data/population.csv', thousands=',', na_values='N.A.')

# Derive the ISO3 merge key from the country names.
countries = pop_df['Country'].tolist()
standard_countries = cc.convert(names=countries, to='ISO3')
pop_df['Country Code'] = standard_countries

# .copy() makes the column subset an independent frame, so the in-place column
# assignment further down does not raise SettingWithCopyWarning.
pop_df = pop_df[['Country Code', 'Population_2020', 'Density_KM2m', 'Migrants', 'Fertility_rate', 'Median_age', 'Urban_pop_pct']].copy()

# Rename by name rather than by position so a change to the selection above
# cannot silently mislabel a column. 'Country Code' and 'Migrants' keep their names.
pop_df = pop_df.rename(columns={
    'Population_2020': 'Population',
    'Density_KM2m': 'Population Density',
    'Fertility_rate': 'Fertility Rate',
    'Median_age': 'Median Age',
    'Urban_pop_pct': 'Urban Population%',
})

# Convert strings such as '25%' to a fraction (0.25); missing values stay missing.
pop_df['Urban Population%'] = pop_df['Urban Population%'].str.rstrip('%').astype('float') / 100.0

df = pd.merge(df, pop_df, how='left', on='Country Code')



In [12]:
# Quality of life index data: attach ISO3 codes and left-join onto the running table.

qol_raw = pd.read_csv('data/qol.csv')
qol_codes = cc.convert(names=qol_raw['Country'].tolist(), to='ISO3')

# Rank and the country name are not carried into the merged table.
qol_df = qol_raw.assign(**{'Country Code': qol_codes}).drop(columns=['Rank', 'Country'])

df = df.merge(qol_df, how='left', on='Country Code')

In [13]:
#clean and merge country weather data

weather_df = pd.read_csv('data/weather.csv')

# Average each weather measure per country. The measures are selected BEFORE
# aggregating: calling .mean() on the whole frame raises TypeError on pandas >= 2.0
# if the file contains any non-numeric column, whereas older versions silently dropped them.
weather_df = weather_df.groupby('Country/Region', as_index=False)[
    ['temp', 'min', 'max', 'stp', 'wdsp', 'prcp', 'fog']
].mean()

# Derive the ISO3 merge key from the country names.
countries = weather_df['Country/Region'].tolist()
standard_countries = cc.convert(names=countries, to='ISO3')
weather_df['Country Code'] = standard_countries

weather_df = weather_df.drop(['Country/Region'], axis=1)

# Rename by name rather than by position so the labels cannot drift from the data.
weather_df = weather_df.rename(columns={
    'temp': 'Avg Temp',
    'min': 'Min Temp',
    'max': 'Max Temp',
    'stp': 'Avg Pressure',
    'wdsp': 'Wind Speed',
    'prcp': 'Precipitation',
    'fog': 'Fog',
})

df = pd.merge(df, weather_df, how='left', on='Country Code')



In [14]:
# Collapse the merged table to one row per country code, keeping the first row seen.
# NOTE(review): if any source table has several rows per country, only the values from
# the first merged row survive - confirm that is the intended choice.
df = df.drop_duplicates(subset=['Country Code'], keep='first')
df

Unnamed: 0,Country,Country Code,Lat,Long,COVID-19 Cases,Malaria Cases,Human development index (HDI) 2018,Life expectancy at birth 2018,Expected years of schooling 2018,Mean years of schooling 2018,Gross national income (GNI) per capita 2018,Life Ladder,Log GDP per capita,Social support,Healthy life expectancy at birth,Freedom to make life choices,Generosity,Perceptions of corruption,Positive affect,Negative affect,Confidence in national government,Democratic Quality,Delivery Quality,Standard deviation of ladder by country-year,Standard deviation/Mean of ladder by country-year,GINI index (World Bank estimate),"GINI index (World Bank estimate), average 2000-16","gini of household income reported in Gallup, by wp5-year","Most people can be trusted, Gallup","Most people can be trusted, WVS round 1981-1984","Most people can be trusted, WVS round 1989-1993","Most people can be trusted, WVS round 1994-1998","Most people can be trusted, WVS round 1999-2004","Most people can be trusted, WVS round 2005-2009","Most people can be trusted, WVS round 2010-2014",Air Passengers Carried,Tests Conducted,Positive Tests,Positive/Tested %,Hospital Beds/1000,Occupancy%,Population,Population Density,Migrants,Fertility Rate,Median Age,Urban Population%,Quality of Life Index,Purchasing Power Index,Safety Index,Health Care Index,Cost of Living Index,Property Price to Income Ratio,Traffic Commute Time Index,Pollution Index,Climate Index,Avg Temp,Min Temp,Max Temp,Avg Pressure,Wind Speed,Precipitation,Fog
0,Afghanistan,AFG,33.000000,65.000000,11173,630308.0,0.496,64.5,10.1,3.9,1746.0,2.694303,7.494588,0.507516,52.599998,0.373536,-0.084888,0.927606,0.424125,0.404904,0.364666,,,1.408344,0.522712,,,0.290681,,,,,,,,1722612.61,,,,,,38928346.0,60.0,-62920.0,4.6,18.0,0.25,,,,,,,,,,33.693443,25.444262,43.968852,816.267213,5.247541,1.925738,0.737705
1,Albania,ALB,41.153300,20.168300,1004,,0.791,78.5,15.2,10.1,12300.0,5.004403,9.412399,0.683592,68.699997,0.824212,0.005385,0.899129,0.713300,0.318997,0.435338,,,2.640531,0.527642,,0.30325,0.456174,,,,0.243243,0.232000,,,303137.00,12024.0,946.0,7.90,,,2877797.0,105.0,-14000.0,1.6,36.0,0.63,,,,,,,,,,52.313115,43.640984,62.755738,267.388525,2.103279,0.143770,0.344262
3,Algeria,DZA,28.033900,1.659600,8503,0.0,0.759,76.7,14.7,8.0,13639.0,5.043086,9.557952,0.798651,65.900002,0.583381,-0.172413,0.758704,0.591043,0.292946,,,,1.973943,0.391416,,0.27600,0.667872,,,,,0.107644,,0.179286,6442442.00,,,,,,43851044.0,18.0,-10000.0,3.1,29.0,0.73,,,,,,,,,,69.324590,55.801639,82.947541,999.900000,10.960656,0.000000,0.000000
4,Andorra,AND,42.506300,1.521800,763,,0.857,81.8,13.3,10.2,48641.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,77265.0,164.0,,,,0.88,,,,,,,,,,34.662295,25.162295,48.598361,857.677049,2.834426,0.044262,0.000000
5,Angola,AGO,-11.202700,17.873900,70,4615605.0,0.574,60.8,11.8,5.1,5555.0,3.794838,8.741481,0.754615,54.599998,0.374542,-0.157062,0.834076,0.578517,0.367864,0.572346,-0.739363,-1.168539,2.196711,0.578868,,0.47350,0.440699,,,,,,,,1516628.00,,,,,,32866272.0,26.0,6413.0,5.6,17.0,0.67,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
301,West Bank and Gaza,PSE,31.952200,35.233200,423,,0.690,73.9,12.8,9.1,5314.0,4.553922,,0.819479,,0.654535,,0.813780,0.610405,0.418929,0.392373,,,2.580389,0.566630,,,0.482421,,,,,,,0.158000,,34511.0,547.0,1.60,,,5101414.0,847.0,-10563.0,3.7,21.0,0.80,,,,,,,,,,,,,,,,
303,Western Sahara,ESH,24.215500,-12.885800,9,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,597339.0,2.0,5582.0,2.4,28.0,0.87,,,,,,,,,,,,,,,,
304,Yemen,YEM,15.552727,48.516388,233,762995.0,0.463,66.1,8.7,3.2,1433.0,3.057514,,0.789422,56.700001,0.552726,,0.792587,0.461114,0.314870,0.308151,,,2.402008,0.785608,,0.35700,0.448597,,,,,,,0.385000,336310.00,,,,,,29825964.0,56.0,-30000.0,3.8,20.0,0.38,,,,,,,,,,,,,,,,
305,Zambia,ZMB,-15.416700,28.283300,920,3475522.0,0.591,63.5,12.1,7.1,3582.0,4.041488,8.223958,0.717720,55.299999,0.790626,0.036644,0.810731,0.702698,0.350963,0.606715,,,2.783419,0.688711,,0.52740,0.619443,,,,,,0.110429,,8904.00,,,,,,18383955.0,25.0,-8000.0,4.7,18.0,0.45,,,,,,,,,,71.863934,64.203279,81.411475,999.900000,5.608197,21.384754,0.475410


In [15]:
# Write the merged table to disk without the row index.
df.to_csv(path_or_buf='data/final_data.csv', index=False)