In [50]:
import pandas as pd
import numpy as np

# Keep wide DataFrames on a single printed block instead of wrapping columns.
pd.options.display.expand_frame_repr = False

Read in the data containing death numbers per country

In [51]:
# Load the raw per-country HIV/AIDS death records from the local CSV export.
deaths_by_country_df = pd.read_csv(
    filepath_or_buffer='data/hiv_aids_data/hiv_aids_death_data.csv',
)

Let's check out the first few rows to get a feel for how the data looks.

In [52]:
# Preview the first five rows of the raw frame.
deaths_by_country_df.head(n=5)

Unnamed: 0,IndicatorCode,Indicator,ValueType,ParentLocationCode,ParentLocation,Location type,SpatialDimValueCode,Location,Period type,Period,...,FactValueUoM,FactValueNumericLowPrefix,FactValueNumericLow,FactValueNumericHighPrefix,FactValueNumericHigh,Value,FactValueTranslationID,FactComments,Language,DateModified
0,HIV_0000000006,Number of deaths due to HIV/AIDS,numeric,EUR,Europe,Country,ALB,Albania,Year,2020,...,,<,100.0,<,100.0,<100 [<100 – <100],,,EN,2021-10-11 00:00:00
1,HIV_0000000006,Number of deaths due to HIV/AIDS,numeric,EUR,Europe,Country,ARM,Armenia,Year,2020,...,,<,100.0,<,100.0,<100 [<100 – <100],,,EN,2021-10-11 00:00:00
2,HIV_0000000006,Number of deaths due to HIV/AIDS,numeric,WPR,Western Pacific,Country,AUS,Australia,Year,2020,...,,<,100.0,<,100.0,<100 [<100 – <100],,,EN,2021-10-11 00:00:00
3,HIV_0000000006,Number of deaths due to HIV/AIDS,numeric,AFR,Africa,Country,CPV,Cabo Verde,Year,2020,...,,<,100.0,<,100.0,<100 [<100 – <100],,,EN,2021-10-11 00:00:00
4,HIV_0000000006,Number of deaths due to HIV/AIDS,numeric,AFR,Africa,Country,COM,Comoros,Year,2020,...,,<,100.0,<,100.0,<100 [<100 – <100],,,EN,2021-10-11 00:00:00


Looking more closely at the data locally as a CSV, a lot of these columns have no values. Let's drop all the columns that contain no values.

In [53]:
# Remove, in place, every column whose values are all missing, then display the frame.
deaths_by_country_df.dropna(how='all', axis='columns', inplace=True)
deaths_by_country_df

Unnamed: 0,IndicatorCode,Indicator,ValueType,ParentLocationCode,ParentLocation,Location type,SpatialDimValueCode,Location,Period type,Period,...,FactValueNumericPrefix,FactValueNumeric,FactValueNumericLowPrefix,FactValueNumericLow,FactValueNumericHighPrefix,FactValueNumericHigh,Value,FactValueTranslationID,Language,DateModified
0,HIV_0000000006,Number of deaths due to HIV/AIDS,numeric,EUR,Europe,Country,ALB,Albania,Year,2020,...,<,100.0,<,100.0,<,100.0,<100 [<100 – <100],,EN,2021-10-11 00:00:00
1,HIV_0000000006,Number of deaths due to HIV/AIDS,numeric,EUR,Europe,Country,ARM,Armenia,Year,2020,...,<,100.0,<,100.0,<,100.0,<100 [<100 – <100],,EN,2021-10-11 00:00:00
2,HIV_0000000006,Number of deaths due to HIV/AIDS,numeric,WPR,Western Pacific,Country,AUS,Australia,Year,2020,...,<,100.0,<,100.0,<,100.0,<100 [<100 – <100],,EN,2021-10-11 00:00:00
3,HIV_0000000006,Number of deaths due to HIV/AIDS,numeric,AFR,Africa,Country,CPV,Cabo Verde,Year,2020,...,<,100.0,<,100.0,<,100.0,<100 [<100 – <100],,EN,2021-10-11 00:00:00
4,HIV_0000000006,Number of deaths due to HIV/AIDS,numeric,AFR,Africa,Country,COM,Comoros,Year,2020,...,<,100.0,<,100.0,<,100.0,<100 [<100 – <100],,EN,2021-10-11 00:00:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3607,HIV_0000000006,Number of deaths due to HIV/AIDS,numeric,EUR,Europe,Country,SWE,Sweden,Year,2000,...,,,,,,,No data,827.0,EN,2021-10-11 00:00:00
3608,HIV_0000000006,Number of deaths due to HIV/AIDS,numeric,EUR,Europe,Country,TUR,Turkey,Year,2000,...,,,,,,,No data,827.0,EN,2021-10-11 00:00:00
3609,HIV_0000000006,Number of deaths due to HIV/AIDS,numeric,EUR,Europe,Country,TKM,Turkmenistan,Year,2000,...,,,,,,,No data,827.0,EN,2021-10-11 00:00:00
3610,HIV_0000000006,Number of deaths due to HIV/AIDS,numeric,EUR,Europe,Country,GBR,United Kingdom of Great Britain and Northern I...,Year,2000,...,,,,,,,No data,827.0,EN,2021-10-11 00:00:00


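The IndicatorCode, ValueType, ParentLocationCode, ParentLocation, Location type, Period type, IsLatestYear and Language columns (along with the other metadata columns listed in the cell below) are not useful for this analysis, so let's drop them.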

In [54]:
# Drop the metadata, prefix and range columns that are not needed for the analysis.
# `columns=` already selects the column axis, so no separate `axis` argument is passed.
# NOTE(review): 'FactValueNumericLowPrefix' appeared twice in the original list; the
# duplicate is removed here (the resulting frame is unchanged). 'FactValueNumericHigh'
# is dropped while 'FactValueNumericLow' is kept -- if the duplicate was meant to be
# 'FactValueNumericLow', add it to the list explicitly.
deaths_by_country_df.drop(
    columns=[
        'IndicatorCode',
        'ValueType',
        'ParentLocationCode',
        'ParentLocation',
        'Location type',
        'Period type',
        'IsLatestYear',
        'Language',
        'DateModified',
        'FactValueTranslationID',
        'FactValueNumericPrefix',
        'FactValueNumericLowPrefix',
        'FactValueNumericHighPrefix',
        'FactValueNumericHigh',
        'Value',
    ],
    inplace=True,
)

There is still missing data in the value column (FactValueNumeric), which is the most important column for the analysis, so let's drop all the rows where that value is missing.

In [55]:
# Discard, in place, every row that has no numeric death count, then display the frame.
deaths_by_country_df.dropna(
    axis='index',
    subset=['FactValueNumeric'],
    inplace=True,
)
deaths_by_country_df

Unnamed: 0,Indicator,SpatialDimValueCode,Location,Period,FactValueNumeric,FactValueNumericLow
0,Number of deaths due to HIV/AIDS,ALB,Albania,2020,100.0,100.0
1,Number of deaths due to HIV/AIDS,ARM,Armenia,2020,100.0,100.0
2,Number of deaths due to HIV/AIDS,AUS,Australia,2020,100.0,100.0
3,Number of deaths due to HIV/AIDS,CPV,Cabo Verde,2020,100.0,100.0
4,Number of deaths due to HIV/AIDS,COM,Comoros,2020,100.0,100.0
...,...,...,...,...,...,...
3565,Number of deaths due to HIV/AIDS,NAM,Namibia,2000,7600.0,7000.0
3566,Number of deaths due to HIV/AIDS,MLI,Mali,2000,8600.0,6200.0
3567,Number of deaths due to HIV/AIDS,TZA,United Republic of Tanzania,2000,86000.0,78000.0
3568,Number of deaths due to HIV/AIDS,UGA,Uganda,2000,89000.0,82000.0


After getting rid of all of those rows, only 131 countries remain.

In [56]:
# Print the distinct country names, then show how many there are.
print(deaths_by_country_df['Location'].unique())
deaths_by_country_df['Location'].unique().size

['Albania' 'Armenia' 'Australia' 'Cabo Verde' 'Comoros' 'Croatia'
 'Denmark' 'Estonia' 'Fiji' 'Georgia' 'Greece' 'Iceland' 'Ireland' 'Japan'
 'Jordan' 'Lebanon' 'Libya' 'Mongolia' 'New Zealand' 'Norway' 'Qatar'
 'Sao Tome and Principe' 'Serbia' 'Singapore' 'Syrian Arab Republic'
 'Timor-Leste' 'United Arab Emirates' 'Bhutan' 'Kyrgyzstan' 'Netherlands'
 'Montenegro' 'Azerbaijan' 'Guyana' 'Saudi Arabia' 'Sri Lanka' 'Suriname'
 'Uruguay' 'Belize' 'Switzerland' 'Tunisia' 'Algeria' 'Belarus'
 'Bolivia (Plurinational State of)' 'Trinidad and Tobago' 'Kazakhstan'
 'Nicaragua' 'Romania' 'Eritrea' 'Djibouti' 'Costa Rica' 'Egypt'
 "Lao People's Democratic Republic" 'Portugal' 'Tajikistan' 'Yemen'
 'Morocco' 'Somalia' 'Panama' 'Republic of Moldova' 'Cuba' 'Mauritania'
 'Paraguay' 'Germany' 'Papua New Guinea' 'Ecuador' 'Niger' 'Senegal'
 'Cambodia' 'Liberia' 'Gambia' 'Argentina' 'Guinea-Bissau' 'Burundi'
 'Madagascar' 'Dominican Republic' 'Malawi' 'Thailand' 'Ghana' 'Brazil'
 'Ethiopia' 'Cameroon'

131

Let's write the finished dataset to a CSV file.

In [None]:
def init_data(csv):
    """Read ``csv`` with pandas when its name contains ``"swine_flu_clean"``.

    Args:
        csv: Value tested with ``in`` for the substring ``"swine_flu_clean"``
            and then handed to ``pd.read_csv`` -- presumably a file path
            string; TODO confirm against callers.

    Returns:
        The DataFrame produced by ``pd.read_csv(csv, skiprows = 4)`` when the
        substring is present. No other branch is visible in this cell, so in
        the code shown here any other input falls through and yields ``None``.

    NOTE(review): the ``swine_flu_clean`` check and the local name
    ``education`` do not obviously relate to the HIV/AIDS frame used in the
    rest of this notebook, and nothing in this cell writes a CSV even though
    the preceding markdown says it will -- confirm whether this cell is a
    leftover from another notebook.
    """
    if "swine_flu_clean" in csv:
        # skiprows = 4 makes pandas skip the first four lines of the file before parsing.
        education = pd.read_csv(csv, skiprows = 4)
        return education