## Extraction and cleanup of the refugee data

In [20]:
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm

import re
import unidecode

In [21]:
REFUGEE_DATA_PATH = "unhcr_refugee.csv"
RAW_COLUMN_NAMES = ["year", "country_dest", "origin", "refugee",
                "asylum", "returned_refugee", "internally_displaced", "returned_idp",
                "stateless", "others", "total"
               ]
RAW_COLUMN_TYPE = {"year": int, "coutry_dest" : object, "origin" : object, "refugee" : float,
               "asylum" : float, "returned_refugee" : float, "idp" : float, "returned_idp" : float,
               "stateless" : float, "others" : float, "total" : float
              }
raw_refugee_df = pd.read_csv(REFUGEE_DATA_PATH, skiprows=4, names=RAW_COLUMN_NAMES, dtype=RAW_COLUMN_TYPE, na_values=["*"])
raw_refugee_df.head(5)

Unnamed: 0,year,country_dest,origin,refugee,asylum,returned_refugee,internally_displaced,returned_idp,stateless,others,total
0,1989,Angola,Dem. Rep. of the Congo,9654.0,,,,,,,9654.0
1,1989,Angola,Namibia,1145.0,,,,,,,1145.0
2,1989,Angola,South Africa,2100.0,,,,,,,2100.0
3,1989,United Arab Emirates,Various/Unknown,70.0,,,,,,,70.0
4,1989,Argentina,Various/Unknown,12634.0,,1060.0,,,,,13694.0


In [22]:
raw_refugee_df.fillna(value=0, inplace=True)
raw_refugee_df.head(5)

Unnamed: 0,year,country_dest,origin,refugee,asylum,returned_refugee,internally_displaced,returned_idp,stateless,others,total
0,1989,Angola,Dem. Rep. of the Congo,9654.0,0.0,0.0,0.0,0.0,0.0,0.0,9654.0
1,1989,Angola,Namibia,1145.0,0.0,0.0,0.0,0.0,0.0,0.0,1145.0
2,1989,Angola,South Africa,2100.0,0.0,0.0,0.0,0.0,0.0,0.0,2100.0
3,1989,United Arab Emirates,Various/Unknown,70.0,0.0,0.0,0.0,0.0,0.0,0.0,70.0
4,1989,Argentina,Various/Unknown,12634.0,0.0,1060.0,0.0,0.0,0.0,0.0,13694.0


In [23]:
# We drop all the origins that are 'Various/Unknown', we are interested in the country of origins, so
# this identifient is useless to our analysis
raw_refugee_df = raw_refugee_df[(raw_refugee_df.origin != 'Various/Unknown') & (raw_refugee_df.origin != 'Stateless')]

# We also drop the returned columns because it is symptomatic of past refugee and doesn't really fit in our analysis
try:
    raw_refugee_df.drop(['returned_refugee', 'returned_idp'], axis=1, inplace=True)
except: # avoid error if we re-run this code
    pass

display(raw_refugee_df.head(5))

Unnamed: 0,year,country_dest,origin,refugee,asylum,internally_displaced,stateless,others,total
0,1989,Angola,Dem. Rep. of the Congo,9654.0,0.0,0.0,0.0,0.0,9654.0
1,1989,Angola,Namibia,1145.0,0.0,0.0,0.0,0.0,1145.0
2,1989,Angola,South Africa,2100.0,0.0,0.0,0.0,0.0,2100.0
7,1989,Burundi,Dem. Rep. of the Congo,59557.0,0.0,0.0,0.0,0.0,59557.0
8,1989,Burundi,Rwanda,207486.0,0.0,0.0,0.0,0.0,207486.0


In [24]:
REFUGEE_COLUMNS = ["year", "origin", "refugee", "asylum", "internally_displaced", "stateless", "others", "total"]
refugee_df = pd.DataFrame(columns=REFUGEE_COLUMNS)

for year in tqdm(raw_refugee_df.year.unique()):
    for origin in raw_refugee_df[raw_refugee_df.year == year].origin.unique():
        index = (raw_refugee_df.year == year) & (raw_refugee_df.origin == origin)
        temp_df_no_dest = raw_refugee_df[index].drop(["country_dest"], axis=1)
        sum_series = temp_df_no_dest.sum(numeric_only=True)
        # drop the row if the column of interest are zero 
        # we don't check total, because it might take into account returne, which we dropped
        if (sum_series[1:5] == 0).all():
            continue
            
        sum_series["year"] = year
        sum_series["origin"] = origin
        sum_series["total"] = sum_series[1:5].sum()
        
        refugee_df = refugee_df.append(sum_series, ignore_index=True)
        
display(refugee_df.head(5))

100%|██████████| 28/28 [00:51<00:00,  1.84s/it]


Unnamed: 0,year,origin,refugee,asylum,internally_displaced,stateless,others,total
0,1989.0,Dem. Rep. of the Congo,100786.0,0.0,0.0,0.0,0.0,100786.0
1,1989.0,Namibia,3704.0,0.0,0.0,0.0,0.0,3704.0
2,1989.0,South Africa,17137.0,0.0,0.0,0.0,0.0,17137.0
3,1989.0,Rwanda,319501.0,0.0,0.0,0.0,0.0,319501.0
4,1989.0,Uganda,21358.0,0.0,0.0,0.0,0.0,21358.0


### Find the country id with the gnwo reference used in the conflict_df

In [25]:
def extract_gnwo_countries_to_df():
    """Extract the countries from the gnwo and their id from the gnwo files"""
    countries_list = []
    # First gnwo file, contains id, code, name for all gnow numbers
    with open("gnwo.txt", "r") as gnow:
        for line in gnow:
            split_line = re.split(r'\t+', line)
            countries_list.append(split_line[0:3])
            
    # We add the gnwo2 file, it contains more countries
    with open("gnwo2.txt", "r") as gnow2:
        for line in gnow2:
            val = line.strip().split(";")
            split_line = [val[0], None, val[1]]
            countries_list.append(split_line)
        
    return pd.DataFrame(countries_list, columns=["id", "code" ,"name"])
countries = extract_gnwo_countries_to_df()
display(countries.head(2))
display(countries.tail(2))

Unnamed: 0,id,code,name
0,2,USA,United States of America
1,20,CAN,Canada


Unnamed: 0,id,code,name
825,552,,Zimbabwe
826,552,,Zimbabwe (Rhodesia)


In [26]:
def get_country_id(country_df, country, year, not_indexed=[]):
    """Check if the country name is in the standard countries dataset
       Particular conditions must be respected in the balkans in the years of interest
    """
    country_id=None
    # Force ascii names of country
    country = unidecode.unidecode(country)
    
    # Manage diminutive problem
    country = re.sub(r"Dem\.", "Democratic", country)
    country = re.sub(r"Rep\.", "Republic", country)
    
    # Manage China
    country = re.sub(r".*China.*", "China", country)
    
    # Manage special cases
    # Note: We decided to do this instead of dropping the values
    #       The values will be reported to their "controlling" states
    special_case_dict = {
        "Western Sahara" : 600, # Contested, but set as Morroco
        "Bolivia (Plurinational State of)" : 145, # Bolivia
        "Palestinian" : 666, # Israel
        "Serbia and Kosovo (S/RES/1244 (1999))" :345, # Serbia 
        "Tibetan" : 711, # Tibet
        "Cabo Verde" : 402, # Cape Verde
        "French Guiana" : 220, # France
        "The former Yugoslav Republic of Macedonia" : 343, # Macedonia
        "Puerto Rico" : 2, # USA
        "Martinique" : 220, # France
        "Bermuda" : 200, # United Kingdom
        "Micronesia (Federated States of)" : 987, # Micronesia
        "Cayman Islands" : 200, # United Kingdom
        "Gibraltar" : 200, # United Kingdom
        "Turks and Caicos Islands" : 200, # United Kingdom
        "Niue" : 920, # New Zealand
        "French Polynesia" : 220, # France
        "Holy See (the)" : None, # Vatican
        "New Caledonia" : 220, # France
        "Cook Islands" : 920, # New Zealand
        "Curacao" : 210, # Netherlands
        "British Virgin Islands" : 200, # United Kingdom
        "Guadeloupe" : 220, # France
        "Norfolk Island" : 900, # Australia
        "Wallis and Futuna Islands " : 220, # France
        "Saint-Pierre-et-Miquelon" : 220, # France
        "Svalbard and Jan Mayen" : 385 # Norway
    }
    
    check_result = country_df.name.str.contains(country, regex=False) 
    index = country_df.index[check_result]
    if index.empty:
        #print("Warning: failed to find index of country {}".format(country))
        country_id = special_case_dict[country]
    else:
        country_id = country_df.loc[index[0], "id"]
        
    if not_indexed:    
        return country_id, not_indexed
    else:
        return country_id
        

In [27]:
# Create a Series containing the gnwo index of each country in the refugee list
id_with_index_list = []
for row in refugee_df.itertuples():
    country_id = get_country_id(countries, row.origin, row.year)
    index = row.Index
    id_with_index_list.append([index, country_id])
country_id_df = pd.DataFrame(id_with_index_list, columns=["index", "country_id"])
country_ids = pd.Series(country_id_df.country_id, index=country_id_df.index)
display(country_ids.head(5))

0    490
1    565
2    560
3    517
4    500
Name: country_id, dtype: object

In [31]:
if not "country_id" in refugee_df.columns:
    refugee_df = pd.concat([refugee_df, country_ids], axis=1)
try:
    refugee_df = refugee_df.drop("origin", axis=1)
except:
    pass
display(refugee_df.head(5))

Unnamed: 0,year,refugee,asylum,internally_displaced,stateless,others,total,country_id
0,1989.0,100786.0,0.0,0.0,0.0,0.0,100786.0,490
1,1989.0,3704.0,0.0,0.0,0.0,0.0,3704.0,565
2,1989.0,17137.0,0.0,0.0,0.0,0.0,17137.0,560
3,1989.0,319501.0,0.0,0.0,0.0,0.0,319501.0,517
4,1989.0,21358.0,0.0,0.0,0.0,0.0,21358.0,500


In [29]:
with open('refugee.pickle', 'wb') as out:
    pickle.dump(refugee_df, out)

In [30]:
with open('refugee.pickle', 'rb') as data_source:
    refugee_df = pickle.load(data_source)