In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
pip install openpyxl

In [None]:
## load the raw "Table 1" worksheet exactly as stored (header rows included)
df = pd.read_excel(io="UN_MigrantStockTotal_2015.xlsx", sheet_name="Table 1")

In [None]:
## The column labels are spread over two sheet rows: row 13 names the
## identifier columns and row 14 holds the years of the value columns.
## Stitch both into one flat header of the form "<year><sex>".
header_row = df.iloc[13]
year_row = df.iloc[14]
ids = list(header_row.iloc[:5])
new_cols = ["both", "male", "female"]
## each run of six consecutive year columns belongs to the next sex label
values = [
    f"{int(year)}{new_cols[position // 6]}"
    for position, year in enumerate(year_row.iloc[5:])
]
df.columns = [*ids, *values]
## discard the first 15 rows, which only held header material
df = df.drop(index=df.index[:15])

In [None]:
## go from wide to long: the combined year/sex labels become a single column
df = pd.melt(df, id_vars=ids, var_name="year/sex", value_name="tmp")

In [None]:
## "year/sex" packs two variables into one label: the first four characters
## are the year, everything after them is the sex
combined = df["year/sex"]
df = df.assign(Year=combined.str[:4], Sex=combined.str[4:], Migrant_Stock=df["tmp"])
## remove the helper columns together with the uninformative "Notes" column
df = df.drop(columns=["Notes", "year/sex", "tmp"])

In [None]:
## The sheet marks missing figures with ".."; convert the column to a real
## numeric dtype so those placeholders become NaN and seaborn receives numbers
## instead of an object column. Note that errors="coerce" also maps any other
## non-numeric token to NaN.
df["Migrant_Stock"] = pd.to_numeric(df["Migrant_Stock"], errors="coerce")
## maintain sort order
df = df.sort_values(by=["Sort\norder", "Year", "Sex"], ignore_index=True).drop(columns=["Sort\norder"])
## country code 900 is the aggregate this figure labels "WORLD"
ax = sns.barplot(data=df[df["Country code"] == 900], x="Year", y="Migrant_Stock", hue="Sex")
## set the title before tight_layout so the layout can leave room for it
ax.set_title("WORLD Migrant Stock")
plt.tight_layout()
plt.show()

In [None]:
 df = pd.read_excel("UN_MigrantStockTotal_2015.xlsx", sheet_name = "Table 2")

## read from two rows and merge the sex info to the year
ids = list(df.iloc[13])[:4]
new_cols = ["both", "male", "female"]
values = [str(int(value)) + new_cols[i//6] for i,value in enumerate(list(df.iloc[14])[4:])]
df.columns = ids + values
## drop the header rows
df = df.drop(df.index[0:15])

In [None]:
## go from wide to long, keyed on the combined year/sex column label
df = pd.melt(df, id_vars=ids, var_name="year/sex", value_name="tmp")

## the combined label holds two variables: the first four characters are the
## year, everything after them is the sex
combined = df["year/sex"]
df = df.assign(Year=combined.str[:4], Sex=combined.str[4:], Population=df["tmp"])
## remove the helper columns together with the uninformative "Notes" column
df = df.drop(columns=["Notes", "year/sex", "tmp"])

In [None]:
## The sheet marks missing figures with ".."; convert the column to a real
## numeric dtype so those placeholders become NaN and seaborn receives numbers
## instead of an object column. Note that errors="coerce" also maps any other
## non-numeric token to NaN.
df["Population"] = pd.to_numeric(df["Population"], errors="coerce")
## maintain sort order
df = df.sort_values(by=["Sort\norder", "Year", "Sex"], ignore_index=True).drop(columns=["Sort\norder"])
## country code 900 is the aggregate this figure labels "WORLD"
ax = sns.pointplot(data=df[df["Country code"] == 900], x="Year", y="Population", hue="Sex")
## set the title before tight_layout so the layout can leave room for it
ax.set_title("WORLD Population")
plt.tight_layout()
plt.show()

In [None]:
## load the raw "Table 3" worksheet exactly as stored (header rows included)
df = pd.read_excel(io="UN_MigrantStockTotal_2015.xlsx", sheet_name="Table 3")

## row 13 names the identifier columns, row 14 holds the years of the value
## columns; combine them into one flat header of the form "<year><sex>"
header_row = df.iloc[13]
year_row = df.iloc[14]
ids = list(header_row.iloc[:5])
new_cols = ["both", "male", "female"]
## each run of six consecutive year columns belongs to the next sex label
values = [
    f"{int(year)}{new_cols[position // 6]}"
    for position, year in enumerate(year_row.iloc[5:])
]
df.columns = [*ids, *values]
## discard the first 15 rows, which only held header material
df = df.drop(index=df.index[:15])

In [None]:
## go from wide to long, keyed on the combined year/sex column label
df = pd.melt(df, id_vars=ids, var_name="year/sex", value_name="tmp")

## the combined label holds two variables: the first four characters are the
## year, everything after them is the sex
combined = df["year/sex"]
df = df.assign(Year=combined.str[:4], Sex=combined.str[4:])
## remove the combined label and the uninformative "Notes" column;
## "tmp" is kept because the values are still read from it afterwards
df = df.drop(columns=["Notes", "year/sex"])

In [None]:
## The sheet marks missing figures with ".."; convert the values to a real
## numeric dtype so those placeholders become NaN and seaborn receives numbers
## instead of an object column. Note that errors="coerce" also maps any other
## non-numeric token to NaN.
df["Stock/Population"] = pd.to_numeric(df["tmp"], errors="coerce")
## maintain sort order, then drop the sort key and the raw value column
df = df.sort_values(by=["Sort\norder", "Year", "Sex"], ignore_index=True).drop(columns=["Sort\norder", "tmp"])
## country code 900 is the aggregate this figure labels "WORLD"
ax = sns.boxplot(data=df[df["Country code"] == 900], x="Year", y="Stock/Population")
## set the title before tight_layout so the layout can leave room for it
ax.set_title("WORLD Stock/Population")
plt.tight_layout()
plt.show()

In [None]:
## load the raw "Table 4" worksheet exactly as stored (header rows included)
df = pd.read_excel(io="UN_MigrantStockTotal_2015.xlsx", sheet_name="Table 4")

## row 13 names the five identifier columns; row 14 labels the value columns
ids = list(df.iloc[13].iloc[:5])
values = list(df.iloc[14].iloc[5:])
df.columns = [*ids, *values]
## discard the first 15 rows, which only held header material
df = df.drop(index=df.index[:15])

In [None]:
## go from wide to long: every value column label becomes a "Year" entry
df = df.melt(id_vars=ids, var_name="Year", value_name="tmp")
df["Year"] = df["Year"].astype(int)
## drop columns that are not informative
df = df.drop(columns=["Notes"])

## The sheet marks missing figures with ".."; convert the values to a real
## numeric dtype so those placeholders become NaN instead of leaving an object
## column. Note that errors="coerce" also maps any other non-numeric token to NaN.
df["Female Stock/Population"] = pd.to_numeric(df["tmp"], errors="coerce")
## maintain sort order, then drop the sort key and the raw value column
df = df.sort_values(by=["Sort\norder", "Year"], ignore_index=True).drop(columns=["Sort\norder", "tmp"])

In [None]:
## country code 900 is the aggregate this figure labels "WORLD"
ax = sns.lineplot(data=df[df["Country code"] == 900], x="Year", y="Female Stock/Population")
## set the title before tight_layout so the layout can leave room for it
ax.set_title("WORLD Female Stock/Population")
plt.tight_layout()
plt.show()

In [None]:
## load the raw "Table 5" worksheet exactly as stored (header rows included)
df = pd.read_excel(io="UN_MigrantStockTotal_2015.xlsx", sheet_name="Table 5")

## row 13 names the identifier columns, row 14 holds the period labels of the
## value columns; combine them into one flat header of the form "<period><sex>"
header_row = df.iloc[13]
period_row = df.iloc[14]
ids = list(header_row.iloc[:5])
new_cols = ["both", "male", "female"]
## each run of five consecutive period columns belongs to the next sex label
values = [
    period + new_cols[position // 5]
    for position, period in enumerate(period_row.iloc[5:])
]
df.columns = [*ids, *values]
## discard the first 15 rows, which only held header material
df = df.drop(index=df.index[:15])

In [None]:
## go from wide to long, keyed on the combined period/sex column label
df = pd.melt(df, id_vars=ids, var_name="period/sex", value_name="tmp")

## the combined label holds two variables: the first nine characters are the
## period, everything after them is the sex
combined = df["period/sex"]
df = df.assign(Period=combined.str[:9], Sex=combined.str[9:])
## remove the combined label and the uninformative "Notes" column;
## "tmp" is kept because the values are still read from it afterwards
df = df.drop(columns=["Notes", "period/sex"])

In [None]:
## The sheet marks missing figures with ".."; convert the values to a real
## numeric dtype so those placeholders become NaN and seaborn receives numbers
## instead of an object column. Note that errors="coerce" also maps any other
## non-numeric token to NaN.
df["Annual rate of change for Stock"] = pd.to_numeric(df["tmp"], errors="coerce")
## maintain sort order, then drop the sort key and the raw value column
df = df.sort_values(by=["Sort\norder", "Period", "Sex"], ignore_index=True).drop(columns=["Sort\norder", "tmp"])
## Country code 900 is the aggregate this figure labels "WORLD". The frame
## holds one series per Sex value, so draw them as separate lines instead of
## pooling all of them into a single line.
ax = sns.lineplot(
    data=df[df["Country code"] == 900],
    x="Period",
    y="Annual rate of change for Stock",
    hue="Sex",
)
## the plot covers every Sex group, so the title no longer claims "Female";
## it is set before tight_layout so the layout can leave room for it
ax.set_title("WORLD Annual rate of change for Stock")
plt.tight_layout()
plt.show()

In [None]:
## load the raw "Table 6" worksheet exactly as stored (header rows included)
df = pd.read_excel(io="UN_MigrantStockTotal_2015.xlsx", sheet_name="Table 6")

## Row 13 names the identifier columns; row 14 labels three groups of value
## columns. Each label gets a five-character prefix naming its group so the
## groups can be told apart (and the prefix stripped again) later on.
ids = list(df.iloc[13].iloc[:5])
label_row = list(df.iloc[14])
stock_values = [f"stock{int(year)}" for year in label_row[5:11]]
per_values = [f"perce{int(year)}" for year in label_row[11:17]]
change_values = ["chang" + period for period in label_row[17:]]

df.columns = [*ids, *stock_values, *per_values, *change_values]


In [None]:
## discard the first 15 rows, which only held header material
df = df.drop(index=df.index[:15])
## split the wide frame into one frame per column group, each keeping the ids
df_stock = df.drop(columns=[*per_values, *change_values])
df_percentage = df.drop(columns=[*stock_values, *change_values])
df_rc = df.drop(columns=[*stock_values, *per_values])
## reshape the stock group to long form, strip the five-character group prefix
## from the label to leave the year, and drop the uninformative "Notes" column
df_stock = (
    pd.melt(df_stock, id_vars=ids, var_name="Year", value_name="tmp")
    .assign(Year=lambda frame: frame["Year"].str[5:])
    .drop(columns=["Notes"])
)

In [None]:
## The sheet marks missing figures with ".."; convert the values to a real
## numeric dtype so those placeholders become NaN instead of leaving an object
## column. Note that errors="coerce" also maps any other non-numeric token to NaN.
df_stock["Refugee Stock"] = pd.to_numeric(df_stock["tmp"], errors="coerce")
## maintain sort order, then drop the sort key and the raw value column
df_stock = df_stock.sort_values(by=["Sort\norder", "Year"], ignore_index=True)
df_stock = df_stock.drop(columns=["Sort\norder", "tmp"])

## reshape the percentage group to long form; the five-character group prefix
## is stripped from the label to leave the year
df_percentage = df_percentage.melt(id_vars=ids, var_name="Year", value_name="tmp")
df_percentage["Year"] = df_percentage["Year"].str[5:]
## drop columns that are not informative
df_percentage = df_percentage.drop(columns=["Notes"])

In [None]:
## The sheet marks missing figures with ".."; convert the values to a real
## numeric dtype so those placeholders become NaN instead of leaving an object
## column. Note that errors="coerce" also maps any other non-numeric token to NaN.
df_percentage["Refugee Stock/Population"] = pd.to_numeric(df_percentage["tmp"], errors="coerce")
## maintain sort order, then drop the sort key and the raw value column
df_percentage = df_percentage.sort_values(by=["Sort\norder", "Year"], ignore_index=True)
df_percentage = df_percentage.drop(columns=["Sort\norder", "tmp"])

## reshape the rate-of-change group to long form; the five-character group
## prefix is stripped from the label to leave the period
df_rc = df_rc.melt(id_vars=ids, var_name="Period", value_name="tmp")
df_rc["Period"] = df_rc["Period"].str[5:]
## drop columns that are not informative
df_rc = df_rc.drop(columns=["Notes"])

In [None]:
## The sheet marks missing figures with ".."; convert the values to a real
## numeric dtype so those placeholders become NaN and seaborn receives numbers
## instead of an object column. Note that errors="coerce" also maps any other
## non-numeric token to NaN.
df_rc["Refugee Stock Rate of Change"] = pd.to_numeric(df_rc["tmp"], errors="coerce")
## maintain sort order, then drop the sort key and the raw value column
df_rc = df_rc.sort_values(by=["Sort\norder", "Period"], ignore_index=True)
df_rc = df_rc.drop(columns=["Sort\norder", "tmp"])

## first figure: two panels side by side for country code 900, the aggregate
## these figures label "WORLD"
sns.set()
fig, ax = plt.subplots(1, 2)
sns.barplot(data=df_stock[df_stock["Country code"] == 900], x="Year", y="Refugee Stock", ax=ax[0])
sns.pointplot(data=df_percentage[df_percentage["Country code"] == 900], x="Year", y="Refugee Stock/Population", ax=ax[1])
ax[0].set_title("WORLD Refugee Stock")
ax[1].set_title("WORLD Refugee Stock/Population")
plt.tight_layout()
plt.show()

## second figure: rate of change per period for the same aggregate
sns.lineplot(data=df_rc[df_rc["Country code"] == 900], x="Period", y="Refugee Stock Rate of Change")
plt.title(label="WORLD Refugee Stock Rate of Change")
plt.tight_layout()
plt.show()
