## Extraction and cleanup of the refugee data

In [93]:
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm

import re
import unidecode

In [94]:
REFUGEE_DATA_PATH = "unhcr_refugee.csv"
RAW_COLUMN_NAMES = ["year", "country_dest", "origin", "refugee",
                "asylum", "returned_refugee", "internally_displaced", "returned_idp",
                "stateless", "others", "total"
               ]
RAW_COLUMN_TYPE = {"year": int, "coutry_dest" : object, "origin" : object, "refugee" : float,
               "asylum" : float, "returned_refugee" : float, "idp" : float, "returned_idp" : float,
               "stateless" : float, "others" : float, "total" : float
              }
raw_refugee_df = pd.read_csv(REFUGEE_DATA_PATH, skiprows=4, names=RAW_COLUMN_NAMES, dtype=RAW_COLUMN_TYPE, na_values=["*"])
raw_refugee_df.head(5)

Unnamed: 0,year,country_dest,origin,refugee,asylum,returned_refugee,internally_displaced,returned_idp,stateless,others,total
0,1989,Angola,Dem. Rep. of the Congo,9654.0,,,,,,,9654.0
1,1989,Angola,Namibia,1145.0,,,,,,,1145.0
2,1989,Angola,South Africa,2100.0,,,,,,,2100.0
3,1989,United Arab Emirates,Various/Unknown,70.0,,,,,,,70.0
4,1989,Argentina,Various/Unknown,12634.0,,1060.0,,,,,13694.0


In [95]:
raw_refugee_df.fillna(value=0, inplace=True)
raw_refugee_df.head(5)

Unnamed: 0,year,country_dest,origin,refugee,asylum,returned_refugee,internally_displaced,returned_idp,stateless,others,total
0,1989,Angola,Dem. Rep. of the Congo,9654.0,0.0,0.0,0.0,0.0,0.0,0.0,9654.0
1,1989,Angola,Namibia,1145.0,0.0,0.0,0.0,0.0,0.0,0.0,1145.0
2,1989,Angola,South Africa,2100.0,0.0,0.0,0.0,0.0,0.0,0.0,2100.0
3,1989,United Arab Emirates,Various/Unknown,70.0,0.0,0.0,0.0,0.0,0.0,0.0,70.0
4,1989,Argentina,Various/Unknown,12634.0,0.0,1060.0,0.0,0.0,0.0,0.0,13694.0


In [96]:
# We drop all the origins that are 'Various/Unknown', we are interested in the country of origins, so
# this identifient is useless to our analysis
raw_refugee_df = raw_refugee_df[(raw_refugee_df.origin != 'Various/Unknown') & (raw_refugee_df.origin != 'Stateless')]

# We also drop the returned columns because it is symptomatic of past refugee and doesn't really fit in our analysis
try:
    raw_refugee_df.drop(['returned_refugee', 'returned_idp'], axis=1, inplace=True)
except: # avoid error if we re-run this code
    pass

display(raw_refugee_df.head(5))

Unnamed: 0,year,country_dest,origin,refugee,asylum,internally_displaced,stateless,others,total
0,1989,Angola,Dem. Rep. of the Congo,9654.0,0.0,0.0,0.0,0.0,9654.0
1,1989,Angola,Namibia,1145.0,0.0,0.0,0.0,0.0,1145.0
2,1989,Angola,South Africa,2100.0,0.0,0.0,0.0,0.0,2100.0
7,1989,Burundi,Dem. Rep. of the Congo,59557.0,0.0,0.0,0.0,0.0,59557.0
8,1989,Burundi,Rwanda,207486.0,0.0,0.0,0.0,0.0,207486.0


In [97]:
REFUGEE_COLUMNS = ["year", "origin", "refugee", "asylum", "internally_displaced", "stateless", "others", "total"]
refugee_df = pd.DataFrame(columns=REFUGEE_COLUMNS)

for year in tqdm(raw_refugee_df.year.unique()):
    for origin in raw_refugee_df[raw_refugee_df.year == year].origin.unique():
        index = (raw_refugee_df.year == year) & (raw_refugee_df.origin == origin)
        temp_df_no_dest = raw_refugee_df[index].drop(["country_dest"], axis=1)
        sum_series = temp_df_no_dest.sum(numeric_only=True)
        # drop the row if the column of interest are zero 
        # we don't check total, because it might take into account returne, which we dropped
        if (sum_series[1:5] == 0).all():
            continue
            
        sum_series["year"] = year
        sum_series["origin"] = origin
        sum_series["total"] = sum_series[1:5].sum()
        
        refugee_df = refugee_df.append(sum_series, ignore_index=True)
        
display(refugee_df.head(5))

100%|██████████| 28/28 [00:52<00:00,  1.87s/it]


Unnamed: 0,year,origin,refugee,asylum,internally_displaced,stateless,others,total
0,1989.0,Dem. Rep. of the Congo,100786.0,0.0,0.0,0.0,0.0,100786.0
1,1989.0,Namibia,3704.0,0.0,0.0,0.0,0.0,3704.0
2,1989.0,South Africa,17137.0,0.0,0.0,0.0,0.0,17137.0
3,1989.0,Rwanda,319501.0,0.0,0.0,0.0,0.0,319501.0
4,1989.0,Uganda,21358.0,0.0,0.0,0.0,0.0,21358.0


### Find the country id with the gnwo reference used in the conflict_df

In [98]:
def extract_gnwo_countries_to_df():
    """Extract the countries from the gnwo and their id from the gnwo files"""
    countries_list = []
    # First gnwo file, contains id, code, name for all gnow numbers
    with open("gnwo.txt", "r") as gnow:
        for line in gnow:
            split_line = re.split(r'\t+', line)
            countries_list.append(split_line[0:3])
            
    # We add the gnwo2 file, it contains more countries
    with open("gnwo2.txt", "r") as gnow2:
        for line in gnow2:
            val = line.strip().split(";")
            split_line = [val[0], None, val[1]]
            countries_list.append(split_line)
        
    return pd.DataFrame(countries_list, columns=["id", "code" ,"name"])
countries = extract_gnwo_countries_to_df()
display(countries.head(2))
display(countries.tail(2))

Unnamed: 0,id,code,name
0,2,USA,United States of America
1,20,CAN,Canada


Unnamed: 0,id,code,name
824,552,,Zimbabwe
825,552,,Zimbabwe (Rhodesia)


In [99]:
def get_country_id(country_df, country, not_indexed=[]):
    """Check if the country name is in the standard countries dataset
       Particular conditions must be respected in the balkans in the years of interest
    """
    country_id=None
    # Force ascii names of country
    country = unidecode.unidecode(country)
    
    # Manage diminutive problem
    country = re.sub(r"Dem\.", "Democratic", country)
    country = re.sub(r"Rep\.", "Republic", country)
    
    # Manage China
    country = re.sub(r".*China.*", "China", country)
    
    # Remove Former
#    country = re.sub(r"Former ", "", country)
    
    # Manage special cases
    # Note: We decided to do this instead of dropping the values
    #       The values will be reported to their "controlling" states
    special_case_dict = {
        "Western Sahara" : 600, # Contested, but set as Morroco
        "Bolivia (Plurinational State of)" : 145, # Bolivia
        "Palestinian" : 666, # Israel
        "Serbia and Kosovo (S/RES/1244 (1999))" :345, # Serbia 
        "Tibetan" : 711, # Tibet
        "Cabo Verde" : 402, # Cape Verde
        "French Guiana" : 220, # France
        "The former Yugoslav Republic of Macedonia" : 343, # Macedonia
        "Puerto Rico" : 2, # USA
        "Martinique" : 220, # France
        "Bermuda" : 200, # United Kingdom
        "Micronesia (Federated States of)" : 987, # Micronesia
        "Cayman Islands" : 200, # United Kingdom
        "Gibraltar" : 200, # United Kingdom
        "Turks and Caicos Islands" : 200, # United Kingdom
        "Niue" : 920, # New Zealand
        "French Polynesia" : 220, # France
        "Holy See (the)" : None, # Vatican
        "New Caledonia" : 220, # France
        "Cook Islands" : 920, # New Zealand
        "Curacao" : 210, # Netherlands
        "British Virgin Islands" : 200, # United Kingdom
        "Guadeloupe" : 220, # France
        "Norfolk Island" : 900, # Australia
        "Wallis and Futuna Islands " : 220, # France
        "Saint-Pierre-et-Miquelon" : 220, # France
        "Svalbard and Jan Mayen" : 385, # Norway
        # Errors with gdp data
#        "Greenland" : 390, # Denmark
#        "Sint Maarten (Dutch part)" : 210, #Netherlands
#        "State of Palestine" : 666, # Israel
#        "United Kingdom of Great Britain and Northern Ireland" : 200, #United Kingdom
#        "United Republic of Tanzania: Mainland" : 510, # Tanzania
#        "United Republic of Tanzania: Zanzibar" : 510, # Tanzania
#        "Yemen: Democratic Yemen" : 678, # Yemen
#        "Yemen: Yemen Arab Republic" : 678 # Yemen
    }
    
    check_result = country_df.name.str.contains(country, regex=False) 
    index = country_df.index[check_result]
    if index.empty:
        #print("Warning: failed to find index of country {}".format(country))
        country_id = special_case_dict[country]
    else:
        country_id = country_df.loc[index[0], "id"]
        
    if not_indexed:    
        return country_id, not_indexed
    else:
        return country_id
        

In [100]:
# Create a Series containing the gnwo index of each country in the refugee list
id_with_index_list = []
for row in tqdm(refugee_df.itertuples()):
    country_id = get_country_id(countries, row.origin)
    index = row.Index
    id_with_index_list.append([index, country_id])
country_id_df = pd.DataFrame(id_with_index_list, columns=["index", "country_id"])
country_ids = pd.Series(country_id_df.country_id, index=country_id_df.index)
display(country_ids.head(5))

5043it [00:01, 2853.99it/s]


0    490
1    565
2    560
3    517
4    500
Name: country_id, dtype: object

In [101]:
if not "country_id" in refugee_df.columns:
    refugee_df = pd.concat([refugee_df, country_ids], axis=1)
try:
    refugee_df = refugee_df.drop("origin", axis=1)
except:
    pass
display(refugee_df.head(5))

Unnamed: 0,year,refugee,asylum,internally_displaced,stateless,others,total,country_id
0,1989.0,100786.0,0.0,0.0,0.0,0.0,100786.0,490
1,1989.0,3704.0,0.0,0.0,0.0,0.0,3704.0,565
2,1989.0,17137.0,0.0,0.0,0.0,0.0,17137.0,560
3,1989.0,319501.0,0.0,0.0,0.0,0.0,319501.0,517
4,1989.0,21358.0,0.0,0.0,0.0,0.0,21358.0,500


In [102]:
refugee_df["country_id"] = pd.to_numeric(refugee_df["country_id"])

In [103]:
year_id_dup = refugee_df.duplicated(subset=["year", "country_id"])
not_dup_refugee_df = refugee_df.loc[~year_id_dup]
dup_refugee_df = refugee_df.loc[year_id_dup]

In [104]:
# We want to merge rows with duplicated year and country_id, so we have 1 value per country per year
for row in tqdm(dup_refugee_df.itertuples()):
    values = row[1:]
    not_dup_loc = (not_dup_refugee_df["year"] == row.year) & (not_dup_refugee_df["country_id"] == row.country_id)
    # Add values
    not_dup_refugee_df.loc[not_dup_loc] = not_dup_refugee_df.loc[not_dup_loc].add(values)
    # Year and country_id where added, set the proper values
    not_dup_refugee_df.loc[not_dup_loc, "year"] = row.year
    not_dup_refugee_df.loc[not_dup_loc, "country_id"] = row.country_id

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._setitem_with_indexer(indexer, value)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
294it [00:54,  5.42it/s]


In [105]:
not_dup_refugee_df.duplicated(subset=["year", "country_id"]).any()

False

In [106]:
refugee_df = not_dup_refugee_df

In [107]:
with open('refugee.pickle', 'wb') as out:
    pickle.dump(refugee_df, out)

In [108]:
with open('refugee.pickle', 'rb') as data_source:
    refugee_df = pickle.load(data_source)

### Add gdp to refugee df

In [123]:
gdp_df = pd.read_csv("UNdata_gdp.csv", usecols=[0,1,3])
gdp_df[["country", "year", "gdp"]] = gdp_df[["Country or Area", "Year", "Value"]]
gdp_df = gdp_df.drop(["Country or Area", "Year", "Value"], axis=1)

In [124]:
def get_country_id_gdp(country_df, country, not_indexed=[]):
    """Check if the country name is in the standard countries dataset
       Particular conditions must be respected in the balkans in the years of interest
    """
    country_id=None
    # Force ascii names of country
    country = unidecode.unidecode(country)
    
    # Manage diminutive problem
    country = re.sub(r"Dem\.", "Democratic", country)
    country = re.sub(r"Rep\.", "Republic", country)
    
    # Manage China
    country = re.sub(r".*China.*", "China", country)


    check_result = country_df.name.str.contains(country, regex=False) 
    index = country_df.index[check_result]
    if index.empty:
        #print("Warning: failed to find index of country {}".format(country))
        # We do no management of None values on gdp
        country_id = None
    else:
        country_id = country_df.loc[index[0], "id"]
        
    if not_indexed:    
        return country_id, not_indexed
    else:
        return country_id
        

In [125]:
display(gdp_df.head(5))

Unnamed: 0,country,year,gdp
0,Afghanistan,2015,623.184798
1,Afghanistan,2014,667.88342
2,Afghanistan,2013,704.322178
3,Afghanistan,2012,717.563696
4,Afghanistan,2011,665.429433


In [126]:
country_id = []
for row in gdp_df.itertuples():
    try:
        country_id.append(get_country_id_gdp(countries, row.country))
    except:
        country_id.append(None)
country_id = pd.Series(country_id, name="country_id")

In [127]:
if not "country_id" in gdp_df.columns:
    gdp_df = pd.concat([gdp_df, country_id], axis=1)
    gdp_df = gdp_df.drop("country", axis=1)
gdp_df.dropna(inplace=True)
display(gdp_df)

Unnamed: 0,year,gdp,country_id
0,2015,623.184798,700
1,2014,667.883420,700
2,2013,704.322178,700
3,2012,717.563696,700
4,2011,665.429433,700
5,2010,574.987555,700
6,2009,462.104004,700
7,2008,405.917513,700
8,2007,401.126368,700
9,2006,298.378993,700


In [128]:
gdp_df["year"] = pd.to_numeric(gdp_df["year"])
gdp_df["country_id"] = pd.to_numeric(gdp_df["country_id"])

In [129]:
gdp_df.set_index(["year", "country_id"]).sort_index()

Unnamed: 0_level_0,Unnamed: 1_level_0,gdp
year,country_id,Unnamed: 2_level_1
1989,2.0,22599.992143
1989,20.0,20700.799311
1989,31.0,14213.543372
1989,40.0,2577.207019
1989,41.0,393.347528
1989,42.0,2604.425557
1989,42.0,1188.182910
1989,51.0,1945.132761
1989,52.0,3562.104294
1989,53.0,7812.567309


In [130]:
# Drop duplicated gdp
gdp_df.drop_duplicates(["year", "country_id"], inplace=True)
print(gdp_df.duplicated(["year", "country_id"]).sum())

0


In [131]:
with open('gdp.pickle', 'wb') as out:
    pickle.dump(gdp_df, out)