In [1]:
def country_regularizer(country):
    """Map a country-name variant used by a raw dataset to one canonical name.

    Parameters
    ----------
    country
        Name as it appears in a source file. Any value without an entry in
        the alias table (including non-string values such as NaN) is
        returned unchanged.

    Returns
    -------
    The canonical name when ``country`` is a known alias, otherwise
    ``country`` itself.
    """
    aliases = {
        "United States of America (the)": "United States",
        "US": "United States",
        "USA": "United States",
        # The Democratic People's Republic of Korea is North Korea; South
        # Korea is the Republic of Korea.
        "Dem. People's Republic of Korea": "North Korea",
        "Congo (Kinshasa)": "Democratic Republic of the Congo",
        "DRC": "Democratic Republic of the Congo",
        "Congo (Brazzaville)": "Republic of the Congo",
        "Congo": "Republic of the Congo",
        # One source keeps the surrounding double quotes in the value itself.
        "\"Bahamas, The\"": "The Bahamas",
        "Bahamas": "The Bahamas",
        "Curaçao": "Curacao",
        "Gambia": "The Gambia",
        "Republic of Ireland": "Ireland",
        "UAE": "United Arab Emirates",
        "Turks and Caicos": "Turks and Caicos Islands",
        "French Guinea": "Guinea",
        "Portuguese Guinea": "Guinea-Bissau",
        "Spanish Guinea": "Equatorial Guinea",
        "St. Martin": "Saint Martin",
        "Timor-Leste": "East Timor",
    }
    # Only strings can be aliases; this also keeps unhashable inputs from
    # reaching the dict lookup.
    if not isinstance(country, str):
        return country
    return aliases.get(country, country)

In [2]:
import pandas as pd
import os

#################
### Happiness ###
#################
# (country column, score column) for each year's file; the header names
# differ between years.
happiness_columns = {
    "2020": ('Country name', 'Ladder score'),
    "2019": ('Country or region', 'Score'),
    "2018": ('Country or region', 'Score'),
    "2017": ('Country', 'Happiness.Score'),
    "2016": ('Country', 'Happiness Score'),
    "2015": ('Country', 'Happiness Score'),
}

dir_name = "Original_Data/happiness/"
# os.listdir returns entries in arbitrary order; sort so the output file is
# written in the same order on every run.
files = sorted(os.listdir(dir_name))
happiness_index = {}
for file_name in files:
    year = file_name[:-4]  # drop a 4-character extension such as ".csv"
    if year not in happiness_columns:
        # Skip the file: there is no column mapping to clean or export it with.
        print("No data for year", year)
        continue
    happiness_index[year] = pd.read_csv(dir_name + file_name).dropna(subset = list(happiness_columns[year]))

with open('processed/Happiness.csv', 'w') as f:
    #print("country,year,score", file = f)
    for year, table in happiness_index.items():
        # Columns are selected by name rather than by position.
        # NOTE(review): confirm these named columns sit at the positions the
        # positional indexing used before (0/2, 1/2 and 0/3 depending on year).
        country_col, score_col = happiness_columns[year]
        for country, score in zip(table[country_col], table[score_col]):
            print("%s,%s,%.2f" % (country_regularizer(country), year, score), file = f)

In [8]:
###############
### Country ###
###############
country_info = {}
country_list = []

# (key stored in country_info, CSV file, value column) for each indicator.
# Column 0 of every row is used as the country and column 1 as the value.
indicator_sources = (
    ("GDP", "GDP_per_capita.csv", "GDP per capita"),
    ("life_exp", "Life_expectancy.csv", "Life expectancy"),
    ("med_age", "Median_age.csv", "Median age"),
)
for field, csv_name, value_column in indicator_sources:
    rows = pd.read_csv("Original_Data/country_info/" + csv_name).dropna(subset = ['Country', value_column]).values.tolist()
    for row in rows:
        # Create the per-country record on first sight, then attach the value.
        country_info.setdefault(row[0], {})[field] = row[1]

with open("processed/Country.csv", 'w') as f:
    #print("country,med_age,life_expectancy,gdp", file = f)
    for country, record in country_info.items():
        # Indicators a country lacks are written as NaN.
        med_age = record.get("med_age", float('nan'))
        life_exp = record.get("life_exp", float('nan'))
        GDP = record.get("GDP", float('nan'))

        print("%s,%.1f,%.1f,%.1f" % (country, med_age, life_exp, GDP), file = f)
        country_list.append(country)

In [38]:
# Number of countries appended to country_list while writing Country.csv.
len(country_list)

224

In [41]:
import json
# Save the collected country names as a JSON array.
with open("processed/country_list.json", "w") as list_file:
    json.dump(country_list, list_file)

In [4]:
###################
### Case_Gender ###
###################
with open("processed/Case_Gender.csv", 'w') as f:
    gender_frame = pd.read_csv("Original_Data/gender.csv")
    # Keep only rows that have a country and both case percentages.
    gender_rows = gender_frame.dropna(subset = ["Country", "Cases (% male)", "Cases (% female)"]).values.tolist()
    #print("country,female_percent,male_percent", file = f)
    for row in gender_rows:
        # Positions 1, 5 and 6 are presumably the country and the two
        # case-percentage columns -- TODO confirm which of 5/6 is male/female.
        fields = (country_regularizer(row[1]), row[5], row[6])
        print("%s,%s,%s" % fields, file = f)

In [5]:
###################
### Air_Traffic ###
###################
with open("processed/Air_Traffic.csv", 'w') as f:
    traffic_frame = pd.read_csv("Original_Data/covid_impact_on_airport_traffic.csv")
    # Rows without a date or an airport name are dropped.
    traffic_rows = traffic_frame.dropna(subset = ['Date', 'AirportName']).values.tolist()
    #print("airport,date,country,state,city,baseline_percent", file = f)
    for row in traffic_rows:
        # Only the country (position 9) is normalised; the last field is
        # formatted with %d, so it is written as an integer.
        fields = (row[3], row[1], country_regularizer(row[9]), row[7], row[6], row[4])
        print("%s,%s,%s,%s,%s,%d" % fields, file = f)

In [49]:
#############
### COVID ###
#############
def delete_comma(instance):
    """Replace commas in a string with spaces; return non-strings unchanged.

    Defined here because this cell needs it and the only other definition
    sits in a later cell, which breaks a top-to-bottom run on a fresh kernel.
    """
    if isinstance(instance, str):
        return instance.replace(",", " ")
    return instance

count_skip = 0            # rows whose country is not in country_list
country_covid = []        # distinct countries, in order of first appearance
seen_countries = set()    # same content as country_covid, for O(1) lookups
known_countries = set(country_list)
with open("processed/COVID.csv", 'w') as f:
    covid = pd.read_csv("Original_Data/covid_19_all.csv").dropna(subset = ['Country/Region']).values.tolist()
    #print("country,province,date,longitude,latitude,confirmed,death,recovered", file = f)
    for instance in covid:
        # The date field is split on "/" as month/day/two-digit year; the year
        # is prefixed with "20" and the parts are joined as YYYY-M-D.
        date_parts = instance[7].split("/")
        MM = date_parts[0]
        DD = date_parts[1]
        YYYY = "20" + date_parts[2]
        # Normalise the country once per row and reuse it below.
        country = country_regularizer(instance[0])
        print("%s,%s,%s,%.2f,%.2f,%.f,%.f,%.f" % (country, delete_comma(instance[1]), YYYY + "-" + MM + "-" + DD, instance[3], instance[2], instance[4], instance[6], instance[5]), file = f)
        if country not in known_countries:
            count_skip += 1
        if country not in seen_countries:
            seen_countries.add(country)
            country_covid.append(country)
    print(count_skip)
    print(len(country_covid))

1687
212


In [36]:
##################
### First_Case ###
##################
import datetime
def delete_comma(instance):
    """Return ``instance`` with every comma replaced by a space.

    Values that are not strings are handed back untouched.
    """
    return instance.replace(",", " ") if isinstance(instance, str) else instance
with open("processed/First_Case_First_Dead.csv", "w") as f:
    first_case = pd.read_csv("Original_Data/Covid-19_world-wide_Dataset.csv",encoding= 'unicode_escape').dropna(subset = ['Continent', 'Country', 'Date of First Case(s)']).values.tolist()
    #print("continent,country,date_of_first_case,last_visited_country,confirmed_case_at_first_day,age_of_first_case,date_of_first_death,age_of_first_death", file = f)
    for instance in first_case:
        # The date is split on spaces as "<day> <month name> <year>"; the month
        # name is converted to its number and the parts joined as YYYY-M-D.
        date_parts = instance[2].split(" ")
        DD = date_parts[0]
        MM = str(datetime.datetime.strptime(date_parts[1], "%B").month)
        YYYY = date_parts[2]
        # Normalise the country (position 1), not the continent (position 0),
        # so names line up with the other processed files.
        # NOTE(review): positions follow the header comment above -- confirm
        # against the CSV.
        print("%s,%s,%s,%s,%s,%s,%s,%s" % (instance[0], country_regularizer(instance[1]), YYYY + "-" + MM + "-" + DD, delete_comma(instance[3]), delete_comma(instance[4]), delete_comma(instance[5]), delete_comma(instance[6]), delete_comma(instance[7])), file = f)

In [46]:
##############
### LatLng ###
##############
import json
with open("processed/LatLng.csv", "w") as f, open("Original_Data/latitude_longtitude_country.json") as file:
    latlng = json.load(file)
    #print("country,latitude,longitude")
    for entry in latlng:
        name = country_regularizer(entry['name'])
        # Only countries already present in country_list are exported.
        if name not in country_list:
            continue
        coords = entry['latlng']
        # NOTE(review): %i writes only the integer part of each coordinate --
        # confirm that dropping the fraction is intended.
        print("%s,%i,%i" % (name, coords[0], coords[1]), file = f)

In [None]:
# Load the three country indicator tables as DataFrames; no rows are dropped
# here, so missing values are kept.
GDP = pd.read_csv("Original_Data/country_info/GDP_per_capita.csv")
life_exp = pd.read_csv("Original_Data/country_info/Life_expectancy.csv")
med_age = pd.read_csv("Original_Data/country_info/Median_age.csv")

In [None]:
# Count the distinct country names that appear in any of the three tables.
# No sort is needed for a count, and sorting a mix of strings and NaN would
# raise TypeError; Series.nunique() skips missing values by default.
pd.concat([GDP["Country"], med_age["Country"], life_exp["Country"]]).nunique()

In [None]:
# NOTE(review): this cell repeats the computation of the preceding cell and
# can probably be removed.
# Count the distinct country names that appear in any of the three tables.
# No sort is needed for a count, and sorting a mix of strings and NaN would
# raise TypeError; Series.nunique() skips missing values by default.
pd.concat([GDP["Country"], med_age["Country"], life_exp["Country"]]).nunique()