In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
%matplotlib inline

import os
import glob

# Part I: Process influenza cases data
## 1 Combine all csv files

In [2]:
os.chdir("./Influenza_165")

In [3]:
# set extension
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]

# combine all files in the list
combined_csv = pd.concat([pd.read_csv(f, header = 2, engine="python") for f in all_filenames], sort=False)

# export to csv
combined_csv.to_csv("a_combined_data.csv", index=False, encoding='utf-8-sig')

# reference: https://www.freecodecamp.org/news/how-to-combine-multiple-csv-files-with-8-lines-of-code-265183e0854/

In [4]:
# read df
df = pd.read_csv("a_combined_data.csv", header = 0, low_memory=False)
df.head()

Unnamed: 0,Country,WHOREGION,FLUREGION,Year,Week,SDATE,EDATE,SPEC_RECEIVED_NB,SPEC_PROCESSED_NB,AH1,...,AH5,ANOTSUBTYPED,INF_A,BYAMAGATA,BVICTORIA,BNOTDETERMINED,INF_B,ALL_INF,ALL_INF2,TITLE
0,Afghanistan,Eastern Mediterranean Region of WHO,Southern Asia,2005,1,2005-01-03,2005-01-09,,,,...,,,,,,,,,,No Report
1,Afghanistan,Eastern Mediterranean Region of WHO,Southern Asia,2005,2,2005-01-10,2005-01-16,,,,...,,,,,,,,,,No Report
2,Afghanistan,Eastern Mediterranean Region of WHO,Southern Asia,2005,3,2005-01-17,2005-01-23,,,,...,,,,,,,,,,No Report
3,Afghanistan,Eastern Mediterranean Region of WHO,Southern Asia,2005,4,2005-01-24,2005-01-30,,,,...,,,,,,,,,,No Report
4,Afghanistan,Eastern Mediterranean Region of WHO,Southern Asia,2005,5,2005-01-31,2005-02-06,,,,...,,,,,,,,,,No Report


In [5]:
df.columns

Index(['Country', 'WHOREGION', 'FLUREGION', 'Year', 'Week', 'SDATE', 'EDATE',
       'SPEC_RECEIVED_NB', 'SPEC_PROCESSED_NB', 'AH1', 'AH1N12009', 'AH3',
       'AH5', 'ANOTSUBTYPED', 'INF_A', 'BYAMAGATA', 'BVICTORIA',
       'BNOTDETERMINED', 'INF_B', 'ALL_INF', 'ALL_INF2', 'TITLE'],
      dtype='object')

## 2 Calculate the influenza cases by country and by year 

In [6]:
# subset only useful columns
df = df[['Country', 'Year', 'ALL_INF']]
df.head(2)

Unnamed: 0,Country,Year,ALL_INF
0,Afghanistan,2005,
1,Afghanistan,2005,


In [7]:
df_orig = df.copy()
df = df.fillna(0)
df.head(2)

Unnamed: 0,Country,Year,ALL_INF
0,Afghanistan,2005,0.0
1,Afghanistan,2005,0.0


In [8]:
# apply group by function to the dataframe
df_allinf = df.groupby(['Country', 'Year'])['ALL_INF'].sum()
df_allinf = pd.DataFrame(df_allinf).reset_index()
df_allinf.head(2)

Unnamed: 0,Country,Year,ALL_INF
0,Afghanistan,2005,0.0
1,Afghanistan,2006,0.0


In [9]:
df_allinf.isnull().any()

Country    False
Year       False
ALL_INF    False
dtype: bool

In [10]:
# Pivot table
df_allinf = df_allinf.pivot(index = 'Country', columns = 'Year',
                            values = 'ALL_INF').add_prefix('Influenza').reset_index()
df_allinf.head(2)

Year,Country,Influenza2005,Influenza2006,Influenza2007,Influenza2008,Influenza2009,Influenza2010,Influenza2011,Influenza2012,Influenza2013,Influenza2014,Influenza2015,Influenza2016,Influenza2017,Influenza2018,Influenza2019
0,Afghanistan,0.0,0.0,0.0,0.0,283.0,26.0,5.0,2.0,0.0,0.0,13.0,257.0,108.0,187.0,278.0
1,Albania,0.0,0.0,0.0,0.0,0.0,10.0,133.0,122.0,97.0,157.0,67.0,314.0,136.0,350.0,527.0


In [11]:
len(df_allinf.Country.unique())

165

In [12]:
df_allinf.isnull().any()

Year
Country          False
Influenza2005     True
Influenza2006     True
Influenza2007     True
Influenza2008     True
Influenza2009     True
Influenza2010    False
Influenza2011    False
Influenza2012    False
Influenza2013    False
Influenza2014    False
Influenza2015    False
Influenza2016    False
Influenza2017    False
Influenza2018    False
Influenza2019    False
dtype: bool

In [13]:
#df_allinf[df_allinf['Influenza2009'].isnull()]

In [14]:
df_allinf = df_allinf.fillna(0)
df_allinf.isnull().any()

Year
Country          False
Influenza2005    False
Influenza2006    False
Influenza2007    False
Influenza2008    False
Influenza2009    False
Influenza2010    False
Influenza2011    False
Influenza2012    False
Influenza2013    False
Influenza2014    False
Influenza2015    False
Influenza2016    False
Influenza2017    False
Influenza2018    False
Influenza2019    False
dtype: bool

In [15]:
df_allinf.dtypes

Year
Country           object
Influenza2005    float64
Influenza2006    float64
Influenza2007    float64
Influenza2008    float64
Influenza2009    float64
Influenza2010    float64
Influenza2011    float64
Influenza2012    float64
Influenza2013    float64
Influenza2014    float64
Influenza2015    float64
Influenza2016    float64
Influenza2017    float64
Influenza2018    float64
Influenza2019    float64
dtype: object

## 3 Check

In [16]:
# get country names from the country df
country_list = df_allinf.Country.unique()

In [17]:
arr = os.listdir()
k = len(arr)
arr_short = []
for i in range(k):
    name = arr[i][:-4]
    arr_short.append(name)

In [18]:
a = set(arr_short)
b = set(country_list)
diff = a.difference(b)
diff

{"Côte d'Ivoire",
 "Dem. People's Republic of Korea",
 'Turks and Caicos Islands',
 'a_combined_data'}

# Part II: Merge with CountryNodes csv
## 1 unify country names

In [19]:
path = "C:/Users/Winnie/Documents/NeilJohnson/Migration/Version_3_March_5/"
df_cn = pd.read_csv(path + 'CountryNodes_All.csv')
df_cn.head(3)

Unnamed: 0,Country,Region,UN_Region
0,Australia,Australia New Zealand,Oceania
1,New Zealand,Australia New Zealand,Oceania
2,Anguilla,Caribbean,Latin America and The Caribbean


In [20]:
# replace names in df_allinf
df_allinf_unified = df_allinf.replace({"United Kingdom of Great Britain and Northern Ireland": "United Kingdom",
                  "Lao People's Democratic Republic": "Lao People's Democratic Republic",
                  "Turks and Caicos is.": "Turks and Caicos Islands",
                  "Democratic People's Republic of Korea": "Dem. People's Republic of Korea"})


In [21]:
# check
df_allinf_unified.iloc[153]

# The following results show that Turks and Caicos Is.'s name has not been changed yet

Year
Country          Turks and Caicos Is.
Influenza2005                       0
Influenza2006                       0
Influenza2007                       0
Influenza2008                       0
Influenza2009                       0
Influenza2010                       0
Influenza2011                       0
Influenza2012                       0
Influenza2013                       0
Influenza2014                       0
Influenza2015                       0
Influenza2016                       0
Influenza2017                       0
Influenza2018                       0
Influenza2019                      12
Name: 153, dtype: object

In [22]:
# change name by hand
df_allinf_unified.at[153, 'Country'] = "Turks and Caicos Islands"
df_allinf_unified.iloc[153]

Year
Country          Turks and Caicos Islands
Influenza2005                           0
Influenza2006                           0
Influenza2007                           0
Influenza2008                           0
Influenza2009                           0
Influenza2010                           0
Influenza2011                           0
Influenza2012                           0
Influenza2013                           0
Influenza2014                           0
Influenza2015                           0
Influenza2016                           0
Influenza2017                           0
Influenza2018                           0
Influenza2019                          12
Name: 153, dtype: object

## 2 Merge 2 dfs

In [23]:
# merge
df_merge = pd.merge(df_cn, df_allinf_unified, how = 'left', 
                    left_on = 'Country', right_on = 'Country').fillna(0)
df_merge.head()

Unnamed: 0,Country,Region,UN_Region,Influenza2005,Influenza2006,Influenza2007,Influenza2008,Influenza2009,Influenza2010,Influenza2011,Influenza2012,Influenza2013,Influenza2014,Influenza2015,Influenza2016,Influenza2017,Influenza2018,Influenza2019
0,Australia,Australia New Zealand,Oceania,712.0,259.0,1355.0,768.0,5107.0,1252.0,2040.0,4798.0,2002.0,3473.0,3622.0,6705.0,10509.0,4264.0,14002.0
1,New Zealand,Australia New Zealand,Oceania,269.0,285.0,207.0,548.0,1542.0,326.0,1218.0,2261.0,2185.0,3418.0,5102.0,294.0,945.0,475.0,957.0
2,Anguilla,Caribbean,Latin America and The Caribbean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0
3,Antigua and Barbuda,Caribbean,Latin America and The Caribbean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,Aruba,Caribbean,Latin America and The Caribbean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,52.0,62.0,166.0


In [24]:
df_merge.head(2)

Unnamed: 0,Country,Region,UN_Region,Influenza2005,Influenza2006,Influenza2007,Influenza2008,Influenza2009,Influenza2010,Influenza2011,Influenza2012,Influenza2013,Influenza2014,Influenza2015,Influenza2016,Influenza2017,Influenza2018,Influenza2019
0,Australia,Australia New Zealand,Oceania,712.0,259.0,1355.0,768.0,5107.0,1252.0,2040.0,4798.0,2002.0,3473.0,3622.0,6705.0,10509.0,4264.0,14002.0
1,New Zealand,Australia New Zealand,Oceania,269.0,285.0,207.0,548.0,1542.0,326.0,1218.0,2261.0,2185.0,3418.0,5102.0,294.0,945.0,475.0,957.0


In [25]:
df_merge.isnull().sum().sum()

0

## 3 Export csv files 

### 3.1 232 countries 

In [26]:
# export csv files for 232 countres
df_merge.to_csv(path + 'CountryNodes_Influenza232.csv', index=False)

In [27]:
df_merge.head(2)

Unnamed: 0,Country,Region,UN_Region,Influenza2005,Influenza2006,Influenza2007,Influenza2008,Influenza2009,Influenza2010,Influenza2011,Influenza2012,Influenza2013,Influenza2014,Influenza2015,Influenza2016,Influenza2017,Influenza2018,Influenza2019
0,Australia,Australia New Zealand,Oceania,712.0,259.0,1355.0,768.0,5107.0,1252.0,2040.0,4798.0,2002.0,3473.0,3622.0,6705.0,10509.0,4264.0,14002.0
1,New Zealand,Australia New Zealand,Oceania,269.0,285.0,207.0,548.0,1542.0,326.0,1218.0,2261.0,2185.0,3418.0,5102.0,294.0,945.0,475.0,957.0


### 3.2 165 countries 

In [28]:
# export csv files for 165 countries
df_merge_165 = pd.merge(df_allinf_unified, df_cn, how = 'left', 
                    left_on = 'Country', right_on = 'Country').fillna(0)
df_merge_165.tail(2)

Unnamed: 0,Country,Influenza2005,Influenza2006,Influenza2007,Influenza2008,Influenza2009,Influenza2010,Influenza2011,Influenza2012,Influenza2013,Influenza2014,Influenza2015,Influenza2016,Influenza2017,Influenza2018,Influenza2019,Region,UN_Region
163,Yemen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,239.0,Western Asia,Northern Africa and Western Asia
164,Zambia,0.0,0.0,0.0,0.0,0.0,61.0,76.0,137.0,115.0,82.0,163.0,166.0,154.0,391.0,426.0,Eastern Africa,Sub_Saharan Arica


In [29]:
len(df_merge_165.Country)

165

In [30]:
# move region and UN_region up to the front
cols = df_merge_165.columns.tolist()
cols = cols[:1] + cols[-2:] + cols[1:-2]
cols

['Country',
 'Region',
 'UN_Region',
 'Influenza2005',
 'Influenza2006',
 'Influenza2007',
 'Influenza2008',
 'Influenza2009',
 'Influenza2010',
 'Influenza2011',
 'Influenza2012',
 'Influenza2013',
 'Influenza2014',
 'Influenza2015',
 'Influenza2016',
 'Influenza2017',
 'Influenza2018',
 'Influenza2019']

In [31]:
df_merge_165 = df_merge_165[cols]
df_merge_165.head(2)

Unnamed: 0,Country,Region,UN_Region,Influenza2005,Influenza2006,Influenza2007,Influenza2008,Influenza2009,Influenza2010,Influenza2011,Influenza2012,Influenza2013,Influenza2014,Influenza2015,Influenza2016,Influenza2017,Influenza2018,Influenza2019
0,Afghanistan,Southern Asia,Central and Southern Asia,0.0,0.0,0.0,0.0,283.0,26.0,5.0,2.0,0.0,0.0,13.0,257.0,108.0,187.0,278.0
1,Albania,Southern Europe,Europe and Northern America,0.0,0.0,0.0,0.0,0.0,10.0,133.0,122.0,97.0,157.0,67.0,314.0,136.0,350.0,527.0


In [32]:
# export csv files for 165 countres
df_merge_165.to_csv(path + 'CountryNodes_Influenza165.csv', index=False)

In [33]:
df_merge_165.isnull().sum().sum()

0