# Extraction and cleanup of the refugee data

In [None]:
import numpy as np
import pandas as pd
import pickle
from tqdm import tqdm

In [None]:
# Location of the raw UNHCR export, relative to the notebook's working directory
REFUGEE_DATA_PATH = "unhcr_refugee.csv"

# Column names imposed on the file, in file order
RAW_COLUMN_NAMES = [
    "year", "country_dest", "origin", "refugee",
    "asylum", "returned_refugee", "internally_displaced", "returned_idp",
    "stateless", "others", "total",
]

# One dtype per entry of RAW_COLUMN_NAMES. The keys must match those names
# exactly: a key that matches no column cannot apply its dtype to anything.
RAW_COLUMN_TYPE = {
    "year": int, "country_dest": object, "origin": object, "refugee": float,
    "asylum": float, "returned_refugee": float, "internally_displaced": float, "returned_idp": float,
    "stateless": float, "others": float, "total": float,
}

# "*" cells are parsed as NaN.
# NOTE(review): skiprows=4 presumably skips a preamble plus the file's own header row - confirm against the CSV.
raw_refugee_df = pd.read_csv(
    REFUGEE_DATA_PATH,
    skiprows=4,
    names=RAW_COLUMN_NAMES,
    dtype=RAW_COLUMN_TYPE,
    na_values=["*"],
)
raw_refugee_df.head(5)

In [None]:
# Every missing cell (including the "*" cells parsed as NaN when the file was read) becomes 0
raw_refugee_df = raw_refugee_df.fillna(0)
raw_refugee_df.head(5)

In [None]:
# We drop every row whose origin is 'Various/Unknown' or 'Stateless': we are interested in the
# country of origin, so these placeholder identifiers are useless to our analysis
unusable_origin = raw_refugee_df.origin.isin(['Various/Unknown', 'Stateless'])
raw_refugee_df = raw_refugee_df[~unusable_origin]

# We also drop the "returned" columns because they describe past refugees and don't really fit in our analysis.
# errors='ignore' keeps this cell safe to re-run once the columns are already gone, without
# hiding unrelated failures the way a bare `except: pass` would.
raw_refugee_df = raw_refugee_df.drop(['returned_refugee', 'returned_idp'], axis=1, errors='ignore')

display(raw_refugee_df.head(5))

In [None]:
REFUGEE_COLUMNS = ["year", "origin", "refugee", "asylum", "internally_displaced", "stateless", "others", "total"]

# Categories that decide whether a (year, origin) row is kept and that make up `total`.
# NOTE(review): "others" is carried through but is neither tested nor counted in `total`;
# this matches the previous positional slice [1:5] - confirm that the exclusion is intended.
COUNTED_COLUMNS = ["refugee", "asylum", "internally_displaced", "stateless"]

# Collapse the destination countries: one row per (year, origin) pair holding the sum of
# each category. A single groupby replaces the per-pair boolean masks and the repeated
# DataFrame.append calls (quadratic, and removed in pandas 2.0).
refugee_df = (
    raw_refugee_df
    .groupby(["year", "origin"], sort=False, as_index=False)[COUNTED_COLUMNS + ["others"]]
    .sum()
)

# Keep the rows grouped by year, with years (and origins within a year) in order of first
# appearance in the raw data. The stable sort preserves the per-year origin order that
# groupby(sort=False) produced.
year_rank = {year: rank for rank, year in enumerate(raw_refugee_df.year.unique())}
row_order = np.argsort(refugee_df["year"].map(year_rank).to_numpy(), kind="stable")
refugee_df = refugee_df.iloc[row_order]

# Drop the row if the columns of interest are all zero (the raw total is not used for this
# test, because it might take "returned" people into account)
has_people = (refugee_df[COUNTED_COLUMNS] != 0).any(axis=1)
refugee_df = refugee_df[has_people].copy()

# Recompute the total from the counted categories only
refugee_df["total"] = refugee_df[COUNTED_COLUMNS].sum(axis=1)

refugee_df = refugee_df[REFUGEE_COLUMNS].reset_index(drop=True)

display(refugee_df.head(5))

In [None]:
# Persist the cleaned per-origin table so that it can be reloaded without redoing the cleanup
with open('refugee.pickle', 'wb') as pickle_file:
    pickle.dump(refugee_df, pickle_file)

In [None]:
# Reload the table from 'refugee.pickle'.
# NOTE(review): unpickling executes arbitrary code - only load a file that this notebook produced.
with open('refugee.pickle', 'rb') as pickle_file:
    refugee_df = pickle.load(pickle_file)