In [172]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

## In this notebook, we'll guarantee that all the countries are uniform between 2015-2020 dataframes.
- This is a preprocessing notebook where all the data will be modified such that there is a 1-1 mapping between countries of each dataframe.
- Use 2020 as our standard because it is the year we want to predict
- Remove countries from 2015-2019 that are NOT in 2020
- [Drop data](https://www.shanelynn.ie/pandas-drop-delete-dataframe-rows-columns/)

In [173]:
def _load_pickle(path):
    """Return the object unpickled from the file at `path`.

    The file is opened in binary mode and closed by the `with` block, so no
    explicit close() is needed afterwards.
    """
    # NOTE(review): pickle.load can execute arbitrary code while unpickling;
    # only use it on files produced by this project.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# One frame per survey year; these are the frames filtered in the cells below.
df2015 = _load_pickle('dataframes/df2015')
df2016 = _load_pickle('dataframes/df2016')
df2017 = _load_pickle('dataframes/df2017')
df2018 = _load_pickle('dataframes/df2018')
df2019 = _load_pickle('dataframes/df2019')
df2020 = _load_pickle('dataframes/df2020')

# Additional pickled objects loaded alongside the per-year frames.
df_all = _load_pickle('dataframes/df_all')
df_feats = _load_pickle('dataframes/df_feats')

In [174]:
# Unused draft: per-year country lists. Each "check" cell below rebuilds the
# two lists it compares, so these lines are intentionally left commented out.
# c_2020 = df2020['Country or region'].tolist()
# c_2019 = df2019['Country or region'].tolist()
# c_2018 = df2018['Country or region'].tolist()
# c_2017 = df2017['Country or region'].tolist()
# c_2016 = df2016['Country or region'].tolist()
# c_2015 = df2015['Country or region'].tolist()

## Remove countries not found in 2020 
- Countries whose names are spelled differently across dataframes are not reconciled (e.g. United States and Hong Kong); they are treated as missing and dropped.
- Start from 2020, move back towards 2015 and remove countries not found in next year's dataframe.
- Once all the countries have been removed from previous years, we will do the same starting from 2015 towards 2020.
- This will remove countries in 2016..2020 based on the years before.
- After this, we will have the intersection of all countries across all six years (2015–2020).

In [175]:
# Backward pass, 2019 vs 2020: locate the 2019 rows whose country is absent from 2020.
c_2019 = df2019['Country or region'].tolist()
c_2020 = df2020['Country or region'].tolist()
# Positional (0-based) offsets into df2019, not index labels.
drop_index = [pos for pos, country in enumerate(c_2019) if country not in c_2020]
for pos in drop_index:
    print(c_2019[pos])  # echo every country about to be removed
drop_index

United States
Taiwan
Qatar
Trinidad & Tobago
Northern Cyprus
Hong Kong
North Macedonia
Bhutan
Somalia
Syria


[18, 24, 28, 38, 63, 75, 83, 94, 111, 148]

In [176]:
# Keep only the 2019 rows whose country also appears in the 2020 frame.
# A boolean mask replaces the in-place drop of the positions cached in
# `drop_index`, which makes this cell idempotent: re-running it, or running it
# after `drop_index` was overwritten by another cell, can no longer delete
# unrelated rows. `.copy()` gives an independent frame rather than a slice.
df2019 = df2019[df2019['Country or region'].isin(df2020['Country or region'])].copy()

In [177]:
# Backward pass, 2018 vs 2019: locate the 2018 rows whose country is absent from 2019.
c_2018 = df2018['Country or region'].tolist()
c_2019 = df2019['Country or region'].tolist()
# Positional (0-based) offsets into df2018, not index labels.
drop_index = [pos for pos, country in enumerate(c_2018) if country not in c_2019]
for pos in drop_index:
    print(c_2018[pos])  # echo every country about to be removed
drop_index

United States
Taiwan
Qatar
Trinidad & Tobago
Belize
Northern Cyprus
Hong Kong
Macedonia
Bhutan
Somalia
Sudan
Angola
Syria


[17, 25, 31, 37, 48, 57, 75, 88, 96, 97, 136, 141, 149]

In [178]:
# Keep only the 2018 rows whose country also appears in the (already filtered)
# 2019 frame. A boolean mask replaces the in-place drop of the positions cached
# in `drop_index`, so the cell is idempotent and cannot delete unrelated rows
# when re-run or run after `drop_index` has changed.
df2018 = df2018[df2018['Country or region'].isin(df2019['Country or region'])].copy()

In [179]:
# Backward pass, 2017 vs 2018: locate the 2017 rows whose country is absent from 2018.
c_2017 = df2017['Country or region'].tolist()
c_2018 = df2018['Country or region'].tolist()
# Positional (0-based) offsets into df2017, not index labels.
drop_index = [pos for pos, country in enumerate(c_2017) if country not in c_2018]
for pos in drop_index:
    print(c_2017[pos])  # echo every country about to be removed
drop_index

United States
Taiwan Province of China
Qatar
Trinidad and Tobago
Belize
North Cyprus
Hong Kong S.A.R., China
Macedonia
Somalia
Bhutan
Sudan
Angola
Syria


[13, 32, 34, 37, 49, 60, 70, 91, 92, 96, 129, 139, 151]

In [180]:
# Keep only the 2017 rows whose country also appears in the (already filtered)
# 2018 frame. A boolean mask replaces the in-place drop of the positions cached
# in `drop_index`, so the cell is idempotent and cannot delete unrelated rows
# when re-run or run after `drop_index` has changed.
df2017 = df2017[df2017['Country or region'].isin(df2018['Country or region'])].copy()

In [181]:
# Backward pass, 2016 vs 2017: locate the 2016 rows whose country is absent from 2017.
c_2016 = df2016['Country or region'].tolist()
c_2017 = df2017['Country or region'].tolist()
# Positional (0-based) offsets into df2016, not index labels.
drop_index = [pos for pos, country in enumerate(c_2016) if country not in c_2017]
for pos in drop_index:
    print(c_2016[pos])  # echo every country about to be removed
drop_index

United States
Puerto Rico
Taiwan
Qatar
Suriname
Trinidad and Tobago
Belize
North Cyprus
Hong Kong
Somalia
Bhutan
Macedonia
Somaliland Region
Laos
Sudan
Comoros
Angola
Syria


[12, 14, 34, 35, 39, 42, 51, 61, 74, 75, 83, 94, 96, 101, 132, 137, 140, 155]

In [182]:
# Keep only the 2016 rows whose country also appears in the (already filtered)
# 2017 frame. A boolean mask replaces the in-place drop of the positions cached
# in `drop_index`, so the cell is idempotent and cannot delete unrelated rows
# when re-run or run after `drop_index` has changed.
df2016 = df2016[df2016['Country or region'].isin(df2017['Country or region'])].copy()

In [183]:
# Backward pass, 2015 vs 2016: locate the 2015 rows whose country is absent from 2016.
c_2015 = df2015['Country or region'].tolist()
c_2016 = df2016['Country or region'].tolist()
# Positional (0-based) offsets into df2015, not index labels.
drop_index = [pos for pos, country in enumerate(c_2015) if country not in c_2016]
for pos in drop_index:
    print(c_2015[pos])  # echo every country about to be removed
drop_index

United States
Oman
Qatar
Taiwan
Suriname
Trinidad and Tobago
North Cyprus
Hong Kong
Bhutan
Somaliland region
Macedonia
Mozambique
Lesotho
Laos
Swaziland
Sudan
Djibouti
Angola
Comoros
Central African Republic
Syria


[14,
 21,
 27,
 37,
 39,
 40,
 65,
 71,
 78,
 90,
 92,
 93,
 96,
 98,
 100,
 117,
 125,
 136,
 139,
 147,
 155]

In [184]:
# Keep only the 2015 rows whose country also appears in the (already filtered)
# 2016 frame. A boolean mask replaces the in-place drop of the positions cached
# in `drop_index`, so the cell is idempotent and cannot delete unrelated rows
# when re-run or run after `drop_index` has changed.
df2015 = df2015[df2015['Country or region'].isin(df2016['Country or region'])].copy()

In [185]:
# Report (rows, columns) for each yearly frame after the backward pass,
# newest year first.
for yearly_frame in (df2020, df2019, df2018, df2017, df2016, df2015):
    print(yearly_frame.shape)

(153, 10)
(146, 10)
(143, 10)
(142, 10)
(139, 10)
(137, 10)


In [186]:
# Forward pass, 2016 vs 2015: locate the 2016 rows whose country is absent from 2015.
c_2015 = df2015['Country or region'].tolist()
c_2016 = df2016['Country or region'].tolist()
# Positional (0-based) offsets into df2016, not index labels.
drop_index = [pos for pos, country in enumerate(c_2016) if country not in c_2015]
drop_index

[98, 125]

In [187]:
# Forward pass: keep only the 2016 rows whose country also appears in the
# 2015 frame. A boolean mask replaces the in-place drop of the positions cached
# in `drop_index`, so the cell is idempotent and cannot delete unrelated rows
# when re-run or run after `drop_index` has changed.
df2016 = df2016[df2016['Country or region'].isin(df2015['Country or region'])].copy()

In [188]:
# Forward pass, 2017 vs 2016: locate the 2017 rows whose country is absent from 2016.
c_2016 = df2016['Country or region'].tolist()
c_2017 = df2017['Country or region'].tolist()
# Positional (0-based) offsets into df2017, not index labels.
drop_index = [pos for pos, country in enumerate(c_2017) if country not in c_2016]
for pos in drop_index:
    print(c_2017[pos])  # echo every country about to be removed
drop_index

Namibia
Mozambique
Lesotho
South Sudan
Central African Republic


[100, 102, 127, 134, 141]

In [189]:
# Forward pass: keep only the 2017 rows whose country also appears in the
# 2016 frame. A boolean mask replaces the in-place drop of the positions cached
# in `drop_index`, so the cell is idempotent and cannot delete unrelated rows
# when re-run or run after `drop_index` has changed.
df2017 = df2017[df2017['Country or region'].isin(df2016['Country or region'])].copy()

In [190]:
# Forward pass, 2018 vs 2017: locate the 2018 rows whose country is absent from 2017.
c_2017 = df2017['Country or region'].tolist()
c_2018 = df2018['Country or region'].tolist()
# Positional (0-based) offsets into df2018, not index labels.
drop_index = [pos for pos, country in enumerate(c_2018) if country not in c_2017]
for pos in drop_index:
    print(c_2018[pos])  # echo every country about to be removed
drop_index

Laos
Namibia
Mozambique
Lesotho
South Sudan
Central African Republic


[99, 108, 112, 129, 140, 141]

In [191]:
# Forward pass: keep only the 2018 rows whose country also appears in the
# 2017 frame. A boolean mask replaces the in-place drop of the positions cached
# in `drop_index`, so the cell is idempotent and cannot delete unrelated rows
# when re-run or run after `drop_index` has changed.
df2018 = df2018[df2018['Country or region'].isin(df2017['Country or region'])].copy()

In [192]:
# Forward pass, 2019 vs 2018: locate the 2019 rows whose country is absent from 2018.
c_2018 = df2018['Country or region'].tolist()
c_2019 = df2019['Country or region'].tolist()
# Positional (0-based) offsets into df2019, not index labels.
drop_index = [pos for pos, country in enumerate(c_2019) if country not in c_2018]
for pos in drop_index:
    print(c_2019[pos])  # echo every country about to be removed
drop_index

Laos
Namibia
Gambia
Mozambique
Swaziland
Comoros
Lesotho
Central African Republic
South Sudan


[96, 103, 110, 113, 125, 132, 134, 144, 145]

In [193]:
# Forward pass: keep only the 2019 rows whose country also appears in the
# 2018 frame. A boolean mask replaces the in-place drop of the positions cached
# in `drop_index`, so the cell is idempotent and cannot delete unrelated rows
# when re-run or run after `drop_index` has changed.
df2019 = df2019[df2019['Country or region'].isin(df2018['Country or region'])].copy()

In [194]:
# Forward pass, 2020 vs 2019: locate the 2020 rows whose country is absent from 2019.
c_2019 = df2019['Country or region'].tolist()
c_2020 = df2020['Country or region'].tolist()
# Positional (0-based) offsets into df2020, not index labels.
drop_index = [pos for pos, country in enumerate(c_2020) if country not in c_2019]
for pos in drop_index:
    print(c_2020[pos])  # echo every country about to be removed
drop_index

United States of America
Taiwan Province of China
Trinidad and Tobago
North Cyprus
Hong Kong S.A.R. of China
Maldives
Macedonia
Laos
Gambia
Mozambique
Namibia
Swaziland
Comoros
Lesotho
Central African Republic
South Sudan


[17, 24, 41, 75, 77, 86, 89, 103, 112, 119, 121, 131, 133, 142, 148, 151]

In [195]:
# Forward pass: keep only the 2020 rows whose country also appears in the
# 2019 frame. A boolean mask replaces the in-place drop of the positions cached
# in `drop_index`, so the cell is idempotent and cannot delete unrelated rows
# when re-run or run after `drop_index` has changed.
df2020 = df2020[df2020['Country or region'].isin(df2019['Country or region'])].copy()

In [196]:
# Report (rows, columns) for each yearly frame after the forward pass,
# newest year first; all six row counts should now agree.
for yearly_frame in (df2020, df2019, df2018, df2017, df2016, df2015):
    print(yearly_frame.shape)

(137, 10)
(137, 10)
(137, 10)
(137, 10)
(137, 10)
(137, 10)


In [201]:
# Persist the filtered 2015 frame. The `with` block closes the handle even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
# NOTE: this overwrites the same file the notebook loaded df2015 from.
with open('dataframes/df2015', 'wb') as out_file:
    pickle.dump(df2015, out_file)

In [202]:
# Persist the filtered 2016 frame. The `with` block closes the handle even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
# NOTE: this overwrites the same file the notebook loaded df2016 from.
with open('dataframes/df2016', 'wb') as out_file:
    pickle.dump(df2016, out_file)

In [203]:
# Persist the filtered 2017 frame. The `with` block closes the handle even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
# NOTE: this overwrites the same file the notebook loaded df2017 from.
with open('dataframes/df2017', 'wb') as out_file:
    pickle.dump(df2017, out_file)

In [204]:
# Persist the filtered 2018 frame. The `with` block closes the handle even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
# NOTE: this overwrites the same file the notebook loaded df2018 from.
with open('dataframes/df2018', 'wb') as out_file:
    pickle.dump(df2018, out_file)

In [205]:
# Persist the filtered 2019 frame. The `with` block closes the handle even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
# NOTE: this overwrites the same file the notebook loaded df2019 from.
with open('dataframes/df2019', 'wb') as out_file:
    pickle.dump(df2019, out_file)

In [206]:
# Persist the filtered 2020 frame. The `with` block closes the handle even if
# pickle.dump raises, which the manual open()/close() pair did not guarantee.
# NOTE: this overwrites the same file the notebook loaded df2020 from.
with open('dataframes/df2020', 'wb') as out_file:
    pickle.dump(df2020, out_file)