In [1]:
import numpy as np
import pandas as pd

In [2]:
pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 0)

In [3]:
data_path = r'C:\Users\DucTRung\Desktop\country_by_freedom_indexes_in_covid19'

# Covid-19

In [86]:
covid_f = pd.read_csv(data_path + '\\raw_data\\covid19_ds.csv')

In [87]:
covid_f

Unnamed: 0,Country,Total_cases,Total_deaths,Total_recovered,Total_tests,Cases/1m,Deaths/1m,Tests/1m
0,USA,1238052,72284,200669,7727938,3740,218,23347
1,Spain,250561,25613,154718,1932455,5359,548,41332
2,Italy,213013,29315,85231,2246666,3523,485,37158
3,UK,194990,29427,,1383842,2872,433,20385
4,France,170551,25531,52736,1100228,2613,391,16856
...,...,...,...,...,...,...,...,...
209,St. Barth,6,,6,,607,,
210,Western Sahara,6,,5,,10,,
211,Anguilla,3,,3,,200,,
212,Comoros,3,,,,3,,


## Rename features

In [88]:
covid_f.rename(columns={'Country': 'country',
                        'Total_cases': 'total_cases',
                        'Total_deaths': 'total_deaths',
                        'Total_recovered': 'total_recovered',
                        'Total_tests': 'total_tests',
                        'Cases/1m': 'cases/1m',
                        'Deaths/1m':'deaths/1m',
                        'Tests/1m': 'tests/1m'}, inplace=True)

## Covert 'object' to 'numerical'

In [92]:
covid_f.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 214 entries, 0 to 213
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   country          214 non-null    object 
 1   total_cases      214 non-null    float64
 2   total_deaths     176 non-null    float64
 3   total_recovered  206 non-null    float64
 4   total_tests      181 non-null    float64
 5   cases/1m         212 non-null    float64
 6   deaths/1m        174 non-null    float64
 7   tests/1m         181 non-null    float64
dtypes: float64(7), object(1)
memory usage: 13.5+ KB


In [90]:
def convert_float(data):
    if data is not np.nan:
        return data.replace(',', '')
    else:
        return data

In [91]:
covid_f['total_cases'] = covid_f['total_cases'].apply(convert_float).astype('float')
covid_f['total_deaths'] = covid_f['total_deaths'].apply(convert_float).astype('float')
covid_f['total_recovered'] = covid_f['total_recovered'].apply(convert_float).astype('float')
covid_f['total_tests'] = covid_f['total_tests'].apply(convert_float).astype('float')
covid_f['cases/1m'] = covid_f['cases/1m'].apply(convert_float).astype('float')
covid_f['deaths/1m'] = covid_f['deaths/1m'].apply(convert_float).astype('float')
covid_f['tests/1m'] = covid_f['tests/1m'].apply(convert_float).astype('float')

## Save

In [93]:
covid_f.to_csv(data_path + '\\wrangled_frame\\covid_f.csv', index=False)

# Press Freedom Index 2020

In [18]:
pfi_f = pd.read_csv(data_path + '\\raw_data\\press_freedom_index_2020.csv')

## Filter right features

In [14]:
pfi_f.columns

Index(['ISO', 'Rank2020', 'FR_Country', 'EN_country', 'ES_country', 'Score A',
       'Sco Exa', 'Score 2020', 'Progression RANK', 'Rank 2019', 'Score 2019',
       'Zone', 'AR_country', 'FA_country'],
      dtype='object')

In [19]:
pfi_f = pfi_f[['EN_country', 'Score 2020']]

## Rename features

In [20]:
pfi_f.rename(columns={'EN_country': 'country',
                      'Score 2020': 'press_freedom_index_2020'}, inplace=True)

## Checking 'country' feature

In [21]:
pfi_f.set_index(pfi_f['country'].isin(covid_f['country'])).loc[False]

Unnamed: 0_level_0,country,press_freedom_index_2020
country,Unnamed: 1_level_1,Unnamed: 2_level_1
False,Cape Verde,2015
False,Cyprus North,2979
False,Lesotho,3045
False,Macedonia,3128
False,Democratic People's Republic of Korea,8582
False,Swaziland,4515
False,Turkmenistan,8544
False,Tonga,2727
False,Samoa,1825
False,OECS,2378


## Correct name of country

In [43]:
pfi_f.loc[pfi_f['country'] == 'DRC', 'country'] = 'DR Congo'

## Convert format of 'press_freedom_index_2020'

In [59]:
replace = lambda x: x.replace(',', '.')

In [60]:
pfi_f['press_freedom_index_2020'] = pfi_f['press_freedom_index_2020'].apply(replace).astype('float')

AttributeError: 'float' object has no attribute 'replace'

## Save

In [25]:
pfi_f.to_csv(data_path + '\\wrangled_frame\\pfi_f.csv', index=False)

# Country by Region

In [29]:
region_f = pd.read_csv(data_path + '\\raw_data\\country_by_region.csv')

## Filter features

In [30]:
region_f = region_f[['country', 'region']]

## Checking merge_key: 'country'

In [31]:
region_f.set_index(region_f['country'].isin(covid_f['country'])).loc[False]

Unnamed: 0_level_0,country,region
country,Unnamed: 1_level_1,Unnamed: 2_level_1
False,North Korea,Eastern Asia
False,Turkmenistan,Central Asia
False,Lesotho,Southern Africa
False,Swaziland,Southern Africa
False,Puerto Rico,Caribbean
False,United States Virgin Islands,Caribbean
False,Solomon Islands,Melanesia
False,Vanuatu,Melanesia
False,Samoa,Polynesia
False,Tonga,Polynesia


## Save

In [32]:
region_f.to_csv(data_path + '\\wrangled_frame\\region_f.csv', index=False)

# Country by Continent

In [35]:
continent_f = pd.read_csv(data_path + '\\raw_data\\country_by_continent.csv')

## Checking merge_key: 'country'

In [36]:
continent_f.set_index(continent_f['country'].isin(covid_f['country'])).loc[False]

KeyError: False

## Correct name of country

In [103]:
continent_f.loc[continent_f['country'] == 'DRC', 'country'] = 'DR Congo'

## Save

In [37]:
continent_f.to_csv(data_path + '\\wrangled_frame\\countinent_f.csv', index=False)

# Handle Missing Values

## Covid_f

In [109]:
covid_f.isnull().sum()

country             0
total_cases         0
total_deaths       38
total_recovered     8
total_tests        33
cases/1m            2
deaths/1m          40
tests/1m           33
dtype: int64

# Merging

## Left set

In [120]:
merged_f.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 169 entries, 0 to 168
Data columns (total 11 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   country                   169 non-null    object 
 1   total_cases               169 non-null    float64
 2   total_deaths              150 non-null    float64
 3   total_recovered           164 non-null    float64
 4   total_tests               150 non-null    float64
 5   cases/1m                  169 non-null    float64
 6   deaths/1m                 150 non-null    float64
 7   tests/1m                  150 non-null    float64
 8   press_freedom_index_2020  169 non-null    float64
 9   region                    169 non-null    object 
 10  continent                 169 non-null    object 
dtypes: float64(8), object(3)
memory usage: 15.8+ KB


## Right set

In [118]:
continent_f.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 212 entries, 0 to 211
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   country    212 non-null    object
 1   continent  212 non-null    object
dtypes: object(2)
memory usage: 3.4+ KB


## Merging

In [119]:
merged_f = pd.merge(merged_f, continent_f, on='country', how='inner')

## Save

In [121]:
merged_f.to_csv(data_path + '\\wrangled_frame\\wrangled_f_5.csv', index=False)