In [None]:
# Colab notebook: https://colab.research.google.com/drive/1X14qyoCgksP7WJdPXR_YJ5Wew2AYCGKH

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly

In [None]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


In [None]:
#Problem: Column names are values not variable names
#tidy data principle #1: Column names need to be informative, variable names and not values

#problem: there are multiple variables stored in 1 column
#tidy data principle #2: each column needs to consist of one and only one variable

#problem: variables are stored in both rows and columns
#tidy data principle #3: variables need to be in cells, not rows and columns

#problem: there are multiple types of data stored in 1 table
#tidy data principle #4: each table column needs to have a singular data type 

#problem: a single observational unit is spread across multiple tables
#tidy data principle #5: a single observational unit must be in 1 table

In [None]:
# Table 1
# Step 1: load the sheet, discard the 15 leading rows above the data and
# give the columns explicit names.
# Value columns are coded <sex><year>: b = both sexes, m = male, f = female.
id_columns = ["Sort order", "Major area, region, country or area of destination", "Notes",
              "Country code", "Type of data (a)"]
value_columns = [f"{sex}{year}" for sex in "bmf" for year in range(1990, 2020, 5)]

UN_1 = pd.read_excel("UN_MigrantStockTotal_2015.xlsx", sheet_name="Table 1")
UN_1 = UN_1.drop(index=UN_1.index[:15])
UN_1.columns = id_columns + value_columns
UN_1.head()

Unnamed: 0,Sort order,"Major area, region, country or area of destination",Notes,Country code,Type of data (a),b1990,b1995,b2000,b2005,b2010,...,m2000,m2005,m2010,m2015,f1990,f1995,f2000,f2005,f2010,f2015
15,1,WORLD,,900,,152563212,160801752,172703309,191269100,221714243.0,...,87884839,97866674,114613714.0,126115435.0,74815702,79064275,84818470,93402426,107100529.0,117584801.0
16,2,Developed regions,(b),901,,82378628,92306854,103375363,117181109,132560325.0,...,50536796,57217777,64081077.0,67618619.0,42115231,47214055,52838567,59963332,68479248.0,72863336.0
17,3,Developing regions,(c),902,,70184584,68494898,69327946,74087991,89153918.0,...,37348043,40648897,50532637.0,58496816.0,32700471,31850220,31979903,33439094,38621281.0,44721465.0
18,4,Least developed countries,(d),941,,11075966,11711703,10077824,9809634,10018128.0,...,5361902,5383009,5462714.0,6463217.0,5236216,5573685,4721920,4432371,4560536.0,5493028.0
19,5,Less developed regions excluding least develop...,,934,,59105261,56778501,59244124,64272611,79130668.0,...,31986141,35265888,45069923.0,52033599.0,27464255,26276535,27257983,29006723,34060745.0,39228437.0


In [None]:
# Step 2: reshape from wide to long so that every sex/year column becomes
# rows of a single "genderyear" variable plus one value column.
identifier_columns = [
    "Sort order",
    "Major area, region, country or area of destination",
    "Notes",
    "Country code",
    "Type of data (a)",
]
UN_1 = pd.melt(
    UN_1,
    id_vars=identifier_columns,
    var_name="genderyear",
    value_name="International migrant stock at mid-year",
)
UN_1.head()

Unnamed: 0,Sort order,"Major area, region, country or area of destination",Notes,Country code,Type of data (a),genderyear,International migrant stock at mid-year
0,1,WORLD,,900,,b1990,152563212
1,2,Developed regions,(b),901,,b1990,82378628
2,3,Developing regions,(c),902,,b1990,70184584
3,4,Least developed countries,(d),941,,b1990,11075966
4,5,Less developed regions excluding least develop...,,934,,b1990,59105261


In [None]:
# Step 3: split the combined "genderyear" code into two separate variables:
# the first character is the sex code, the remainder is the year.
gender_year = UN_1["genderyear"]
UN_1 = UN_1.drop(columns="genderyear").assign(
    gender=gender_year.str[0].astype(str),
    year=gender_year.str[1:].astype(str),
)
UN_1.head()

Unnamed: 0,Sort order,"Major area, region, country or area of destination",Notes,Country code,Type of data (a),International migrant stock at mid-year,gender,year
0,1,WORLD,,900,,152563212,b,1990
1,2,Developed regions,(b),901,,82378628,b,1990
2,3,Developing regions,(c),902,,70184584,b,1990
3,4,Least developed countries,(d),941,,11075966,b,1990
4,5,Less developed regions excluding least develop...,,934,,59105261,b,1990


In [None]:
# Step 4: spell out the sex codes and use "Sort order" as the index.
# The mapping is applied to the "gender" column only: a frame-wide replace would
# also rewrite any cell in another column whose value is exactly "b", "m" or "f".
gender_labels = {"b": "both sexes", "m": "male", "f": "female"}
UN_1 = UN_1.assign(gender=UN_1["gender"].replace(gender_labels)).set_index("Sort order")

UN_1.head()

Unnamed: 0_level_0,"Major area, region, country or area of destination",Notes,Country code,Type of data (a),International migrant stock at mid-year,gender,year
Sort order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,WORLD,,900,,152563212,both sexes,1990
2,Developed regions,(b),901,,82378628,both sexes,1990
3,Developing regions,(c),902,,70184584,both sexes,1990
4,Least developed countries,(d),941,,11075966,both sexes,1990
5,Less developed regions excluding least develop...,,934,,59105261,both sexes,1990


In [None]:
# Table 2
# Same tidy-up as Table 1; this sheet has no "Type of data (a)" column.
# Value columns are coded <sex><year>: b = both sexes, m = male, f = female.
id_columns = ["Sort order", "Major area, region, country or area of destination", "Notes",
              "Country code"]
value_columns = [f"{sex}{year}" for sex in "bmf" for year in range(1990, 2020, 5)]
gender_labels = {"b": "both sexes", "m": "male", "f": "female"}

# Read the sheet, discard the 15 leading rows above the data and name the columns.
UN_2 = pd.read_excel('UN_MigrantStockTotal_2015.xlsx', sheet_name='Table 2')
UN_2 = UN_2.drop(index=UN_2.index[:15])
UN_2.columns = id_columns + value_columns

# Wide -> long: one row per (area, sex, year).
UN_2 = UN_2.melt(id_vars=id_columns, var_name="genderyear",
                 value_name="Total population at mid-year (thousands)")

# Split the code into gender and year. The label mapping is applied to the
# "gender" column only, so cells in other columns that happen to equal
# "b", "m" or "f" are never rewritten.
UN_2 = (
    UN_2
    .assign(gender=lambda x: x["genderyear"].str[0].replace(gender_labels),
            year=lambda x: x["genderyear"].str[1:])
    .drop(columns="genderyear")
    .set_index("Sort order")
)
UN_2.head()

Unnamed: 0_level_0,"Major area, region, country or area of destination",Notes,Country code,Total population at mid-year (thousands),gender,year
Sort order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,WORLD,,900,5309667.699,both sexes,1990
2,Developed regions,(b),901,1144463.062,both sexes,1990
3,Developing regions,(c),902,4165204.637,both sexes,1990
4,Least developed countries,(d),941,510057.629,both sexes,1990
5,Less developed regions excluding least develop...,,934,3655147.008,both sexes,1990


In [None]:
# Table 3
# Same tidy-up as Table 1, applied to the "Table 3" sheet.
# Value columns are coded <sex><year>: b = both sexes, m = male, f = female.
id_columns = ["Sort order", "Major area, region, country or area of destination", "Notes",
              "Country code", "Type of data (a)"]
value_columns = [f"{sex}{year}" for sex in "bmf" for year in range(1990, 2020, 5)]
gender_labels = {"b": "both sexes", "m": "male", "f": "female"}

# Read the sheet, discard the 15 leading rows above the data and name the columns.
UN_3 = pd.read_excel('UN_MigrantStockTotal_2015.xlsx', sheet_name='Table 3')
UN_3 = UN_3.drop(index=UN_3.index[:15])
UN_3.columns = id_columns + value_columns

# Wide -> long: one row per (area, sex, year).
UN_3 = UN_3.melt(id_vars=id_columns, var_name="genderyear",
                 value_name="International migrant stock as a percentage of the total population")

# Split the code into gender and year. The label mapping is applied to the
# "gender" column only, so cells in other columns that happen to equal
# "b", "m" or "f" are never rewritten.
UN_3 = (
    UN_3
    .assign(gender=lambda x: x["genderyear"].str[0].replace(gender_labels),
            year=lambda x: x["genderyear"].str[1:])
    .drop(columns="genderyear")
    .set_index("Sort order")
)

UN_3.tail()

Unnamed: 0_level_0,"Major area, region, country or area of destination",Notes,Country code,Type of data (a),International migrant stock as a percentage of the total population,gender,year
Sort order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
261,Samoa,,882,B,2.628654,female,2015
262,Tokelau,,772,B,..,female,2015
263,Tonga,,776,B,4.919612,female,2015
264,Tuvalu,,798,C,..,female,2015
265,Wallis and Futuna Islands,,876,B,..,female,2015


In [None]:
# Table 4
# Step 1: Read the table, drop the 15 leading rows above the data and assign column names.
# This sheet only provides the female columns (f1990 ... f2015).
years = range(1990, 2020, 5)
id_columns = ["Sort order", "Major area, region, country or area of destination", "Notes",
              "Country code", "Type of data (a)"]
both_columns = [f"b{year}" for year in years]
male_columns = [f"m{year}" for year in years]
female_columns = [f"f{year}" for year in years]
gender_labels = {"b": "both sexes", "m": "male", "f": "female"}

UN_4 = pd.read_excel('UN_MigrantStockTotal_2015.xlsx', sheet_name='Table 4')
UN_4 = UN_4.drop(index=UN_4.index[:15])
UN_4.columns = id_columns + female_columns

# Step 2: Complete the other half of the information (male) in the table.
# male share = 100 - female share, computed one whole column at a time.
# Cells holding the ".." placeholder are masked to NaN first, so the male value
# stays missing for them; any other non-numeric text raises instead of being hidden.
for male_col, female_col in zip(male_columns, female_columns):
    female_share = UN_4[female_col]
    UN_4[male_col] = 100 - female_share.where(female_share != "..").astype(float)

# A single reindex puts the blocks in b, m, f order; the "both sexes" columns do
# not exist in this sheet, so reindex creates them filled with NaN.
UN_4 = UN_4.reindex(columns=id_columns + both_columns + male_columns + female_columns)

# Step 3: Melt the data to make all columns be variables
UN_4 = UN_4.melt(id_vars=id_columns, var_name="genderyear",
                 value_name="Migrants as a percentage of the international migrant stock")

# Step 4: Split the genderyear column into two columns, gender and year.
# Step 5: Change the names to proper form and set "Sort order" as the index.
# The label mapping is applied to the "gender" column only, so cells in other
# columns that happen to equal "b", "m" or "f" are never rewritten.
UN_4 = (
    UN_4
    .assign(gender=lambda x: x["genderyear"].str[0].replace(gender_labels),
            year=lambda x: x["genderyear"].str[1:])
    .drop(columns="genderyear")
    .set_index("Sort order")
)

UN_4.sample(10)

Unnamed: 0_level_0,"Major area, region, country or area of destination",Notes,Country code,Type of data (a),Migrants as a percentage of the international migrant stock,gender,year
Sort order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
44,Sudan,(3),729,B R,,both sexes,1995
148,Latvia,,428,B,,both sexes,2000
259,French Polynesia,,258,B,42.993867,female,2005
83,Japan,,392,C,47.254393,male,2000
23,Somalia,,706,I R,45.593294,female,2015
117,Kuwait,,414,C R,67.531482,male,2000
226,Guyana,,328,B,,both sexes,2015
90,Lao People's Democratic Republic,,418,C R,,both sexes,2000
115,Israel,,376,B R,52.791382,female,1990
257,American Samoa,,16,B,47.493305,female,1990


In [None]:
# Table 5
years = range(1990, 2020, 5)
id_columns = ["Sort order", "Major area, region, country or area of destination", "Notes",
              "Country code", "Type of data (a)"]
all_value_columns = [f"{sex}{year}" for sex in "bmf" for year in years]
# The sheet itself has no 1990 columns: its first period column is 1995.
sheet_value_columns = [col for col in all_value_columns if not col.endswith("1990")]
gender_labels = {"b": "both sexes", "m": "male", "f": "female"}

# Read the sheet, discard the 15 leading rows above the data and name the columns.
UN_5 = pd.read_excel('UN_MigrantStockTotal_2015.xlsx', sheet_name='Table 5')
UN_5 = UN_5.drop(index=UN_5.index[:15])
UN_5.columns = id_columns + sheet_value_columns

# Marked Step: give "year" the same set of values as the other tables.
# One reindex inserts b1990 / m1990 / f1990 (all NaN) in front of each sex block.
UN_5 = UN_5.reindex(columns=id_columns + all_value_columns)

# Wide -> long: one row per (area, sex, year).
UN_5 = UN_5.melt(id_vars=id_columns, var_name="genderyear",
                 value_name="Annual rate of change of the migrant stock in past 5 years")

# Split the code into gender and year. The label mapping is applied to the
# "gender" column only, so cells in other columns that happen to equal
# "b", "m" or "f" are never rewritten.
UN_5 = (
    UN_5
    .assign(gender=lambda x: x["genderyear"].str[0].replace(gender_labels),
            year=lambda x: x["genderyear"].str[1:])
    .drop(columns="genderyear")
    .set_index("Sort order")
)

UN_5.sample(10)

Unnamed: 0_level_0,"Major area, region, country or area of destination",Notes,Country code,Type of data (a),Annual rate of change of the migrant stock in past 5 years,gender,year
Sort order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
263,Tonga,,776,B,1.787022,female,1995
119,Oman,,512,C,2.594227,female,2010
84,Mongolia,,496,C,8.997419,male,2010
94,Singapore,,702,B,6.198016,both sexes,2000
33,Chad,,148,B R,3.381933,both sexes,2010
101,Bhutan,,64,B,,female,1990
232,Northern America,,905,,2.118884,male,2010
32,Central African Republic,,140,C,-5.391552,female,2005
23,Somalia,,706,I R,0.572211,both sexes,2005
114,Iraq,,368,C R,-9.197892,both sexes,2005


In [None]:
# Table 6
# Step 1: Read the table, drop the 15 leading rows above the data and assign column names.
# Value columns are coded <type><sex><year>:
#   type e = estimated refugee stock, r = refugees as % of migrant stock,
#        a = annual rate of change; sex is always b (both sexes) in this sheet.
years = range(1990, 2020, 5)
id_columns = ["Sort order", "Major area, region, country or area of destination", "Notes",
              "Country code", "Type of data (a)"]
stock_columns = [f"eb{year}" for year in years]
share_columns = [f"rb{year}" for year in years]
change_columns = [f"ab{year}" for year in years]
gender_labels = {"b": "both sexes", "m": "male", "f": "female"}
measure_names = {
    "e": "Estimated refugee stock at mid-year",
    "r": "Refugees as a percentage of the international migrant stock",
    "a": "Annual rate of change of the refugee stock in past 5 years",
}
key_columns = ["Sort order", "gender", "year"]

UN_6 = pd.read_excel('UN_MigrantStockTotal_2015.xlsx', sheet_name='Table 6')
UN_6 = UN_6.drop(index=UN_6.index[:15])
# The sheet has no ab1990 column, hence change_columns[1:].
UN_6.columns = id_columns + stock_columns + share_columns + change_columns[1:]

# Marked Step 1.5: add ab1990 (all NaN) so every type covers the same years.
UN_6 = UN_6.reindex(columns=id_columns + stock_columns + share_columns + change_columns)

# Step 2: melt the data to make all columns be variables
UN_6 = UN_6.melt(id_vars=id_columns, var_name="typegenderyear", value_name="value")

# Step 3: Split typegenderyear into three columns: type, gender and year.
# The label mapping is applied to the "gender" column only, so cells in other
# columns that happen to equal "b", "m" or "f" are never rewritten.
UN_6 = UN_6.assign(
    type=lambda x: x["typegenderyear"].str[0],
    gender=lambda x: x["typegenderyear"].str[1].replace(gender_labels),
    year=lambda x: x["typegenderyear"].str[2:],
).drop(columns="typegenderyear")

# Step 4: Turn the three values of "type" into three separate measure columns.
# Rows are matched explicitly on (Sort order, gender, year) instead of relying on
# the three type blocks sharing the same row order; validate="one_to_one" makes
# pandas raise if that key is not unique on either side.
UN_6_long = UN_6
UN_6 = (
    UN_6_long.loc[UN_6_long["type"] == "e"]
    .drop(columns="type")
    .rename(columns={"value": measure_names["e"]})
)
for type_code in ("r", "a"):
    measure = (
        UN_6_long.loc[UN_6_long["type"] == type_code, key_columns + ["value"]]
        .rename(columns={"value": measure_names[type_code]})
    )
    UN_6 = UN_6.merge(measure, on=key_columns, how="left", validate="one_to_one")

UN_6 = UN_6.set_index("Sort order")

UN_6.sample(10)

Unnamed: 0_level_0,"Major area, region, country or area of destination",Notes,Country code,Type of data (a),Estimated refugee stock at mid-year,gender,year,Refugees as a percentage of the international migrant stock,Annual rate of change of the refugee stock in past 5 years
Sort order,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
25,Uganda,,800,B R,145718.0,both sexes,1990,26.099977,
60,Guinea,,324,C R,63525.0,both sexes,2005,27.666357,-20.282973
182,Anguilla,,660,B,0.0,both sexes,2010,0.0,..
44,Sudan,(3),729,B R,1031050.0,both sexes,1990,73.4944,
231,Venezuela (Bolivarian Republic of),,862,B,1613.0,both sexes,1995,0.158138,-1.532346
43,Morocco,,504,C,792.0,both sexes,2010,1.116925,20.401407
265,Wallis and Futuna Islands,,876,B,0.0,both sexes,2015,0.0,..
54,Benin,,204,C B R,219.0,both sexes,2015,0.089242,-72.870615
169,The former Yugoslav Republic of Macedonia,,807,B,959.0,both sexes,2010,0.739393,-5.996645
240,Australia,(22),36,B,71246.0,both sexes,2000,1.624303,-3.177964


In [None]:
# Now combining the six tables (maybe unnecessary)
UN_list = [UN_1, UN_2, UN_3, UN_4, UN_5, UN_6]

# Start from Table 1 with the column at position 4 (its measure column) moved
# to the end, so the identifying columns come first in the combined frame.
name_list = list(UN_1.columns)
name_list.append(name_list.pop(4))
UN = UN_1[name_list]

# Outer-merge the remaining tables one at a time. With no `on=` argument,
# pandas joins on every column name the two frames have in common.
for other_table in UN_list[1:]:
    UN = UN.merge(other_table, how="outer")

UN.sample(10)

Unnamed: 0,"Major area, region, country or area of destination",Notes,Country code,Type of data (a),gender,year,International migrant stock at mid-year,Total population at mid-year (thousands),International migrant stock as a percentage of the total population,Migrants as a percentage of the international migrant stock,Annual rate of change of the migrant stock in past 5 years,Estimated refugee stock at mid-year,Refugees as a percentage of the international migrant stock,Annual rate of change of the refugee stock in past 5 years
2322,Saint Vincent and the Grenadines,,670,B,male,2000,2099.0,54.332,3.863285,48.734618,0.856579,,,
4309,Togo,,768,C R,female,2010,125982.0,3242.855,3.88491,49.353997,4.519228,,,
2418,Congo,,178,B,male,2005,171420.0,1749.984,9.795518,54.377962,2.317283,,,
2198,China,(5),156,C,male,2000,254082.0,652374.453,0.038947,50.012794,2.608091,,,
609,"China, Hong Kong Special Administrative Region",(6),344,B,both sexes,2000,2669122.0,6783.502,39.347258,,1.763924,979.0,0.036679,-36.413841
2583,Montserrat,,500,B,male,2005,639.0,..,..,51.366559,0.539287,,,
140,Denmark,,208,B,both sexes,1990,235189.0,5140.332,4.575366,,,31827.0,13.532521,
2821,Belgium,,56,C,male,2010,534608.0,5355.795,9.981861,50.777513,3.637305,,,
128,Belarus,,112,B,both sexes,1990,1248977.0,10231.983,12.206598,,,0.0,0.0,
3038,Turkey,,792,B R,male,2015,1514468.0,38674.562,3.915928,51.079626,17.670661,,,
