In [1]:
import os

import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm

import re
import unidecode

from IPython.display import display

# Prepare the Refugee dateset

## Extraction and cleanup of the refugee data

### Import the data, remove unused fields and reorganize

First we import the dataset downloaded from [the UNHCR website](http://www.unhcr.org/) making sure the different fields have the appropriate type:


In [2]:
REFUGEE_DATA_PATH = os.path.join("data", "unhcr_refugee.csv")
RAW_COLUMN_NAMES = ["year", "country_dest", "origin", "refugee",
                "asylum", "returned_refugee", "internally_displaced", "returned_idp",
                "stateless", "others", "total"
               ]
RAW_COLUMN_TYPE = {"year": int, "coutry_dest" : object, "origin" : object, "refugee" : float,
               "asylum" : float, "returned_refugee" : float, "idp" : float, "returned_idp" : float,
               "stateless" : float, "others" : float, "total" : float
              }
raw_refugee_df = pd.read_csv(REFUGEE_DATA_PATH,
                             skiprows=4,
                             names=RAW_COLUMN_NAMES,
                             dtype=RAW_COLUMN_TYPE,
                             na_values=["*"])
raw_refugee_df.head(5)

Unnamed: 0,year,country_dest,origin,refugee,asylum,returned_refugee,internally_displaced,returned_idp,stateless,others,total
0,1989,Angola,Dem. Rep. of the Congo,9654.0,,,,,,,9654.0
1,1989,Angola,Namibia,1145.0,,,,,,,1145.0
2,1989,Angola,South Africa,2100.0,,,,,,,2100.0
3,1989,United Arab Emirates,Various/Unknown,70.0,,,,,,,70.0
4,1989,Argentina,Various/Unknown,12634.0,,1060.0,,,,,13694.0


We replace the missing values with 0

In [3]:
raw_refugee_df.fillna(value=0, inplace=True)
raw_refugee_df.head(5)

Unnamed: 0,year,country_dest,origin,refugee,asylum,returned_refugee,internally_displaced,returned_idp,stateless,others,total
0,1989,Angola,Dem. Rep. of the Congo,9654.0,0.0,0.0,0.0,0.0,0.0,0.0,9654.0
1,1989,Angola,Namibia,1145.0,0.0,0.0,0.0,0.0,0.0,0.0,1145.0
2,1989,Angola,South Africa,2100.0,0.0,0.0,0.0,0.0,0.0,0.0,2100.0
3,1989,United Arab Emirates,Various/Unknown,70.0,0.0,0.0,0.0,0.0,0.0,0.0,70.0
4,1989,Argentina,Various/Unknown,12634.0,0.0,1060.0,0.0,0.0,0.0,0.0,13694.0


### Group by year and origin of migration

We drop all the origins that are `Various/Unknown`: we are interested in the countries of origin of the migrants, so this identifient is useless to our analysis.
We also drop the returned columns because it is symptomatic of past refugee and doesn't really fit in our analysis

In [4]:
raw_refugee_df = raw_refugee_df[(raw_refugee_df.origin != 'Various/Unknown') & (raw_refugee_df.origin != 'Stateless')]

try:
    raw_refugee_df.drop(['returned_refugee', 'returned_idp'], axis=1, inplace=True)
except: # avoid error if we re-run this code
    pass

raw_refugee_df.head(5)

Unnamed: 0,year,country_dest,origin,refugee,asylum,internally_displaced,stateless,others,total
0,1989,Angola,Dem. Rep. of the Congo,9654.0,0.0,0.0,0.0,0.0,9654.0
1,1989,Angola,Namibia,1145.0,0.0,0.0,0.0,0.0,1145.0
2,1989,Angola,South Africa,2100.0,0.0,0.0,0.0,0.0,2100.0
7,1989,Burundi,Dem. Rep. of the Congo,59557.0,0.0,0.0,0.0,0.0,59557.0
8,1989,Burundi,Rwanda,207486.0,0.0,0.0,0.0,0.0,207486.0


We reorganize the data to have values grouped by year and country of origin. We are only interested in the country of origin as we want to study the correlation between the conflict and the outgoing flows of population

In [5]:
REFUGEE_COLUMNS = ["year", "origin", "refugee", "asylum", "internally_displaced", "stateless", "others", "total"]
refugee_df = pd.DataFrame(columns=REFUGEE_COLUMNS)

for year in tqdm(raw_refugee_df.year.unique()):
    for origin in raw_refugee_df[raw_refugee_df.year == year].origin.unique():
        index = (raw_refugee_df.year == year) & (raw_refugee_df.origin == origin)
        temp_df_no_dest = raw_refugee_df[index].drop(["country_dest"], axis=1)
        sum_series = temp_df_no_dest.sum(numeric_only=True)
        # drop the row if the column of interest are zero 
        # we don't check total, because it might take into account returne, which we dropped
        if (sum_series[1:5] == 0).all():
            continue
            
        sum_series["year"] = year
        sum_series["origin"] = origin
        sum_series["total"] = sum_series[1:5].sum()
        
        refugee_df = refugee_df.append(sum_series, ignore_index=True)
        
refugee_df.head(5)

100%|██████████| 28/28 [01:24<00:00,  3.00s/it]


Unnamed: 0,year,origin,refugee,asylum,internally_displaced,stateless,others,total
0,1989.0,Dem. Rep. of the Congo,100786.0,0.0,0.0,0.0,0.0,100786.0
1,1989.0,Namibia,3704.0,0.0,0.0,0.0,0.0,3704.0
2,1989.0,South Africa,17137.0,0.0,0.0,0.0,0.0,17137.0
3,1989.0,Rwanda,319501.0,0.0,0.0,0.0,0.0,319501.0
4,1989.0,Uganda,21358.0,0.0,0.0,0.0,0.0,21358.0


## Merge the Refugee Dataset with the Conflicts dataset

The GWNO in the conflict dataset corresponds to the the *Gleditsch and Ward* number which identifies uniquely every independent state. We download two datafiles associating this number of the country name. Here we import them, clean them and store them into a dataframe to be able to merge later:

In [6]:
def extract_gnwo_countries_to_df():
    """Extract the countries from the gnwo and their id from the gnwo files"""
    countries_list = []
    # First gnwo file, contains id, code, name for all gnow numbers
    with open(os.path.join("data", "gnwo.txt"), "r") as gnow:
        for line in gnow:
            split_line = re.split(r'\t+', line)
            countries_list.append(split_line[0:3])
            
    # We add the gnwo2 file, it contains more countries
    with open(os.path.join("data", "gnwo2.txt"), "r") as gnow2:
        for line in gnow2:
            val = line.strip().split(";")
            split_line = [val[0], None, val[1]]
            countries_list.append(split_line)
        
    return pd.DataFrame(countries_list, columns=["id", "code" ,"name"])
countries = extract_gnwo_countries_to_df()
display(countries.head(2))
display(countries.tail(2))

Unnamed: 0,id,code,name
0,2,USA,United States of America
1,20,CAN,Canada


Unnamed: 0,id,code,name
829,552,,Zimbabwe
830,552,,Zimbabwe (Rhodesia)


In [7]:
def get_country_id(country_df, country, use_special_case_dict=True):
    """Check if the country name is in the standard countries dataset
       Particular conditions must be respected in the balkans in the years of interest.
       Returns the gwn corresponding to the provided country name
    """
    country_id=None
    # Force ascii names of country
    country = unidecode.unidecode(country)
    
    # Strip leading/trailing whitespaces
    country = country.strip()
    
    # Manage diminutive problem
    country = re.sub(r"Dem\.", "Democratic", country)
    country = re.sub(r"Rep\.", "Republic", country)
    
    # Manage China
    country = re.sub(r".*China.*", "China", country)
    
    # Manage special cases
    # Note: We decided to do this instead of dropping the values
    #       The values will be reported to their "controlling" states
    special_case_dict = {
        "Western Sahara" : 600, # Contested, but set as Morroco
        "Bolivia (Plurinational State of)" : 145, # Bolivia
        "Palestinian" : 666, # Israel
        "Serbia and Kosovo (S/RES/1244 (1999))" :345, # Serbia 
        "Tibetan" : 711, # Tibet
        "Cabo Verde" : 402, # Cape Verde
        "French Guiana" : 220, # France
        "The former Yugoslav Republic of Macedonia" : 343, # Macedonia
        "Puerto Rico" : 2, # USA
        "Martinique" : 220, # France
        "Bermuda" : 200, # United Kingdom
        "Micronesia (Federated States of)" : 987, # Micronesia
        "Cayman Islands" : 200, # United Kingdom
        "Gibraltar" : 200, # United Kingdom
        "Turks and Caicos Islands" : 200, # United Kingdom
        "Niue" : 920, # New Zealand
        "French Polynesia" : 220, # France
        "Holy See (the)" : None, # Vatican
        "New Caledonia" : 220, # France
        "Cook Islands" : 920, # New Zealand
        "Curacao" : 210, # Netherlands
        "British Virgin Islands" : 200, # United Kingdom
        "Guadeloupe" : 220, # France
        "Norfolk Island" : 900, # Australia
        "Wallis and Futuna Islands " : 220, # France
        "Saint-Pierre-et-Miquelon" : 220, # France
        "Svalbard and Jan Mayen" : 385, # Norway
    }
    
    check_result = country_df.name.str.contains(country, regex=False) 
    index = country_df.index[check_result]
    if index.empty:
        if use_special_case_dict:
            try:
                country_id = special_case_dict[country]
            except KeyError:
                print("Warning: failed to find index of country {}".format(country))
                country_id = None
        else:
            print("Warning: failed to find index of country {}".format(country))
    else:
        country_id = country_df.loc[index[0], "id"]
        
    return country_id

We create a Series containing the gwn of each country in the refugee dataset:

In [8]:
id_with_index_list = []
for row in tqdm(refugee_df.itertuples()):
    country_id = get_country_id(countries, row.origin)
    index = row.Index
    id_with_index_list.append([index, country_id])
country_id_df = pd.DataFrame(id_with_index_list, columns=["index", "country_id"])
country_ids = pd.Series(country_id_df.country_id, index=country_id_df.index)
display(country_ids.head(5))

4871it [00:02, 1659.06it/s]



5043it [00:03, 1617.62it/s]


0    490
1    565
2    560
3    517
4    500
Name: country_id, dtype: object

In [9]:
if not "country_id" in refugee_df.columns:
    refugee_df = pd.concat([refugee_df, country_ids], axis=1)
try:
    refugee_df = refugee_df.drop("origin", axis=1)
except:
    pass

In [10]:
refugee_df["country_id"] = pd.to_numeric(refugee_df["country_id"])
display(refugee_df.head(5))

Unnamed: 0,year,refugee,asylum,internally_displaced,stateless,others,total,country_id
0,1989.0,100786.0,0.0,0.0,0.0,0.0,100786.0,490.0
1,1989.0,3704.0,0.0,0.0,0.0,0.0,3704.0,565.0
2,1989.0,17137.0,0.0,0.0,0.0,0.0,17137.0,560.0
3,1989.0,319501.0,0.0,0.0,0.0,0.0,319501.0,517.0
4,1989.0,21358.0,0.0,0.0,0.0,0.0,21358.0,500.0


We want to merge rows with duplicated `year` and `country_id`, so we have one value per tuple `(country, year)`

In [11]:
# Identify the duplicates
year_id_dup = refugee_df.duplicated(subset=["year", "country_id"])
not_dup_refugee_df = refugee_df.loc[~year_id_dup]
dup_refugee_df = refugee_df.loc[year_id_dup]

In [12]:
refugee_df = dup_refugee_df.copy()

In [13]:
for row in tqdm(dup_refugee_df.itertuples()):
    values = row[1:]
    not_dup_loc = (not_dup_refugee_df["year"] == row.year) & (not_dup_refugee_df["country_id"] == row.country_id)
    # Add values
    not_dup_refugee_df.loc[not_dup_loc] = not_dup_refugee_df.loc[not_dup_loc].add(values)
    # Year and country_id where added, set the proper values
    not_dup_refugee_df.loc[not_dup_loc, "year"] = row.year
    not_dup_refugee_df.loc[not_dup_loc, "country_id"] = row.country_id

294it [00:01, 290.45it/s]


In [14]:
not_dup_refugee_df.duplicated(subset=["year", "country_id"]).any()

False

In [15]:
refugee_df = not_dup_refugee_df

In [None]:
with open(os.path.join('pickle', 'refugee.pickle'), 'wb') as out:
    pickle.dump(refugee_df, out)

In [None]:
with open(os.path.join('pickle', 'refugee.pickle'), 'rb') as data_source:
    refugee_df = pickle.load(data_source)

# Prepare the GDP Dataset 

We want to combine the gross domestic product with the conflict dataset to analyze whether there is a correlation between the economic situation of a country and its involvement in armed conflicts. We obtained the GDP data from [UNdata](http://data.un.org)

In [18]:
gdp_df = pd.read_csv(os.path.join("data","UNdata_gdp.csv"), usecols=[0,1,3])
gdp_df[["country", "year", "gdp"]] = gdp_df[["Country or Area", "Year", "Value"]]
gdp_df = gdp_df.drop(["Country or Area", "Year", "Value"], axis=1)

In [19]:
def get_country_id_gdp(country_df, country, not_indexed=[]):
    """Check if the country name is in the standard countries dataset
       Particular conditions must be respected in the balkans in the years of interest
    """
    country_id=None
    # Force ascii names of country
    country = unidecode.unidecode(country)
    
    # Strip leading/trailing whitespaces
    country = country.strip()
    
    # Manage diminutive problem
    country = re.sub(r"Dem\.", "Democratic", country)
    country = re.sub(r"Rep\.", "Republic", country)
    
    # Manage China
    country = re.sub(r".*China.*", "China", country)

    check_result = country_df.name.str.contains(country, regex=False) 
    index = country_df.index[check_result]
    if index.empty:
        print("Warning: failed to find index of country '{}'".format(country))
        # We do no management of None values on gdp
        country_id = None
    else:
        country_id = country_df.loc[index[0], "id"]
        
    if not_indexed:    
        return country_id, not_indexed
    else:
        return country_id
        

In [20]:
display(gdp_df.head(5))

Unnamed: 0,country,year,gdp
0,Afghanistan,2015,623.184798
1,Afghanistan,2014,667.88342
2,Afghanistan,2013,704.322178
3,Afghanistan,2012,717.563696
4,Afghanistan,2011,665.429433


In [21]:
country_id = []
for row in gdp_df.itertuples():
    try:
        country_id.append(get_country_id(countries, row.country, use_special_case_dict=False))
    except:
        country_id.append(None)
country_id = pd.Series(country_id, name="country_id")







In [26]:
if not "country_id" in gdp_df.columns:
    gdp_df = pd.concat([gdp_df, country_id], axis=1)
    gdp_df = gdp_df.drop("country", axis=1)
gdp_df.dropna(inplace=True)
display(gdp_df.head(5))

Unnamed: 0,year,gdp,country_id
0,2015,623.184798,700.0
1,2014,667.88342,700.0
2,2013,704.322178,700.0
3,2012,717.563696,700.0
4,2011,665.429433,700.0


In [27]:
gdp_df["year"] = pd.to_numeric(gdp_df["year"])
gdp_df["country_id"] = pd.to_numeric(gdp_df["country_id"])
gdp_df = gdp_df.dropna(axis=0, how="any")
display(gdp_df.head(5))

Unnamed: 0,year,gdp,country_id
0,2015,623.184798,700.0
1,2014,667.88342,700.0
2,2013,704.322178,700.0
3,2012,717.563696,700.0
4,2011,665.429433,700.0


In [28]:
# Drop duplicated gdp
gdp_df.drop_duplicates(["year", "country_id"], inplace=True)
print(gdp_df.duplicated(["year", "country_id"]).sum())

0


In [29]:
gdp_df = gdp_df.set_index(["year", "country_id"]).sort_index()

In [30]:
with open(os.path.join('pickle','gdp.pickle'), 'wb') as out:
    pickle.dump(gdp_df, out)

# Prepare the HDI Dataset

In addition to the GDP of country, we would like to use the [Human Development Index](http://hdr.undp.org/en/content/human-development-index-hdi) to perform a similar analysis.

In [31]:
hdi_df = pd.read_csv(os.path.join("data","hdi_undp.csv"), skiprows=[0])

In [32]:
display(hdi_df.head(5))

Unnamed: 0,HDI Rank,Country,1990,1991,1992,1993,1994,1995,1996,1997,...,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015
0,169,Afghanistan,0.295,0.3,0.309,0.305,0.3,0.324,0.328,0.332,...,0.415,0.433,0.434,0.448,0.454,0.463,0.47,0.476,0.479,0.479
1,75,Albania,0.635,0.618,0.603,0.608,0.616,0.628,0.637,0.636,...,0.703,0.713,0.721,0.725,0.738,0.752,0.759,0.761,0.762,0.764
2,83,Algeria,0.577,0.581,0.587,0.591,0.595,0.6,0.609,0.617,...,0.69,0.697,0.705,0.714,0.724,0.732,0.737,0.741,0.743,0.745
3,32,Andorra,,,,,,,,,...,,,,,0.819,0.819,0.843,0.85,0.857,0.858
4,150,Angola,,,,,,,,,...,0.454,0.468,0.48,0.488,0.495,0.508,0.523,0.527,0.531,0.533


In [33]:
countries = extract_gnwo_countries_to_df()
country_id = []
for row in hdi_df.itertuples():
    local_country_id = get_country_id(countries, row.Country, False)
    country_id.append(local_country_id)
hdi_country_id = pd.Series(country_id, name="country_id")



In [34]:
if not "country_id" in hdi_df.columns:
    hdi_df = pd.concat([hdi_df, hdi_country_id], axis=1)
display(hdi_df.head(1))

Unnamed: 0,HDI Rank,Country,1990,1991,1992,1993,1994,1995,1996,1997,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,country_id
0,169,Afghanistan,0.295,0.3,0.309,0.305,0.3,0.324,0.328,0.332,...,0.433,0.434,0.448,0.454,0.463,0.47,0.476,0.479,0.479,700


In [35]:
if "Country" in hdi_df.columns:
    hdi_df = hdi_df.drop(["HDI Rank", "Country"], axis=1)
display(hdi_df.head(1))

Unnamed: 0,1990,1991,1992,1993,1994,1995,1996,1997,1998,1999,...,2007,2008,2009,2010,2011,2012,2013,2014,2015,country_id
0,0.295,0.3,0.309,0.305,0.3,0.324,0.328,0.332,0.335,0.338,...,0.433,0.434,0.448,0.454,0.463,0.47,0.476,0.479,0.479,700


In [36]:
col_dict_mapper= {}
for col in hdi_df.columns:
    try:
        int(col)
    except:
        pass
    else:
        col_dict_mapper[col] = "y{}".format(col)
hdi_df = hdi_df.rename(index=str, columns=col_dict_mapper)
display(hdi_df.head(1))

Unnamed: 0,y1990,y1991,y1992,y1993,y1994,y1995,y1996,y1997,y1998,y1999,...,y2007,y2008,y2009,y2010,y2011,y2012,y2013,y2014,y2015,country_id
0,0.295,0.3,0.309,0.305,0.3,0.324,0.328,0.332,0.335,0.338,...,0.433,0.434,0.448,0.454,0.463,0.47,0.476,0.479,0.479,700


In [37]:
new_hdi_df = pd.DataFrame([], columns=["year", "hdi", "country_id"])
for row in tqdm(hdi_df.itertuples()):
    for year in range(1990,2016):
        local_series = pd.Series([year, row[year-1989], row.country_id],
                                 index=["year", "hdi", "country_id"])
        new_hdi_df = new_hdi_df.append(local_series, ignore_index=True)
display(new_hdi_df.head(5))

188it [00:12, 15.43it/s]


Unnamed: 0,year,hdi,country_id
0,1990,0.295,700
1,1991,0.3,700
2,1992,0.309,700
3,1993,0.305,700
4,1994,0.3,700


In [38]:
new_hdi_df = new_hdi_df.dropna()

In [39]:
# Make sure everything is numeric
new_hdi_df.year =  pd.to_numeric(new_hdi_df.year)
new_hdi_df.hdi =  pd.to_numeric(new_hdi_df.hdi)
new_hdi_df.country_id =  pd.to_numeric(new_hdi_df.country_id)

In [41]:
new_hdi_df.head(5)

Unnamed: 0,year,hdi,country_id
0,1990,0.295,700
1,1991,0.3,700
2,1992,0.309,700
3,1993,0.305,700
4,1994,0.3,700


In [42]:
with open(os.path.join('pickle','hdi.pickle'), 'wb') as out:
    pickle.dump(new_hdi_df, out)