In [1]:
import pandas as pd
import numpy as np
!pip install openpyxl



In [2]:
# Location of the source workbook (UN_MigrantStockTotal_2015.xlsx). The URL
# embeds a commit hash, so it presumably always resolves to the same revision.
excel_name = (
    'https://github.com/shionguha/inf1340-programmingfordatascience-fa22/raw/'
    '60b7f5d757553308a4b5db8c439c360ea244e07d/project%20data/'
    'UN_MigrantStockTotal_2015.xlsx'
)

In [3]:
def update_one_table(table, sheet_title, gender, years=(1990, 1995, 2000, 2005, 2010, 2015)):
    """Reshape a wide single-block sheet into a long table for one gender.

    Parameters
    ----------
    table : pandas.DataFrame
        Wide table holding the country code in column ``'Unnamed: 3'`` and one
        column per entry of ``years`` (labelled with the year itself). The
        frame passed in is not modified.
    sheet_title : str
        Name given to the value column of the result.
    gender : str
        Label written into every row of the ``'gender'`` column.
    years : iterable of int, optional
        Year columns to keep and unpivot. Defaults to the six years
        1990, 1995, ..., 2015.

    Returns
    -------
    pandas.DataFrame
        Long table with the columns ``'Country Code'``, ``'year'`` (int),
        ``sheet_title`` (float) and ``'gender'`` (string), one row per
        (country code, year) pair.

    Raises
    ------
    KeyError
        If the country-code column or one of the year columns is missing.
    """
    renamed = table.rename(columns={'Unnamed: 3': 'Country Code'})
    wide = renamed[['Country Code'] + list(years)]

    # Move the year labels from the header into a "year" column.
    long_table = wide.melt(id_vars="Country Code", var_name="year", value_name=sheet_title)
    # Tag every row with the gender requested by the caller.
    long_table['gender'] = gender

    # '..' is the placeholder for a missing value; it has to become NaN before
    # the value column can be cast to float.
    return long_table.replace('..', np.nan).astype(
        {"year": int, "gender": 'string', sheet_title: float}
    )
    

def update_tables(table, sheet_title, years=(1990, 1995, 2000, 2005, 2010, 2015)):
    """Reshape a wide three-block sheet into one long table covering all genders.

    The sheet is expected to contain the year columns three times: labelled
    with the plain integer year (e.g. ``1990``) for both sexes, with the
    string ``'<year>.1'`` for male and with the string ``'<year>.2'`` for
    female.

    Parameters
    ----------
    table : pandas.DataFrame
        Wide table holding the country code in column ``'Unnamed: 3'`` plus
        the three groups of year columns described above. The frame passed in
        is not modified.
    sheet_title : str
        Name given to the value column of the result.
    years : iterable of int, optional
        Years to unpivot for each gender. Defaults to the six years
        1990, 1995, ..., 2015.

    Returns
    -------
    pandas.DataFrame
        Long table with the columns ``'Country Code'``, ``'year'`` (int),
        ``sheet_title`` (float) and ``'gender'`` (string). The blocks are
        stacked in the order both sexes, male, female, and each block keeps
        its own melt index, so index labels repeat across blocks.

    Raises
    ------
    KeyError
        If the country-code column or one of the expected year columns is
        missing.
    """
    years = list(years)
    # Rename on a copy: an in-place rename would silently change the columns
    # of the caller's DataFrame.
    table = table.rename(columns={'Unnamed: 3': 'Country Code'})

    # (suffix of the year-column labels, value for the gender column)
    gender_blocks = [('', 'both sexes'), ('.1', 'male'), ('.2', 'female')]

    long_parts = []
    for suffix, gender in gender_blocks:
        if suffix:
            # e.g. '1990.1' -> 1990, so every block ends up with int labels
            to_year = {str(year) + suffix: year for year in years}
        else:
            to_year = {year: year for year in years}
        block = table[['Country Code'] + list(to_year)].rename(columns=to_year)
        # Move the year labels from the header into a "year" column.
        block = block.melt(id_vars="Country Code", var_name="year", value_name=sheet_title)
        block['gender'] = gender
        long_parts.append(block)

    # Stack the per-gender blocks; '..' is the placeholder for a missing value
    # and has to become NaN before the value column can be cast to float.
    combined = pd.concat(long_parts)
    return combined.replace('..', np.nan).astype(
        {"year": int, "gender": 'string', sheet_title: float}
    )

# Process Table 1

In [4]:
# Sheet "Table 1": header=15 makes pandas take the column labels from
# 0-indexed row 15 of the worksheet.
table = pd.read_excel(excel_name, header=15, sheet_name="Table 1")
table1 = update_tables(table, sheet_title='International migrant stock at mid-year')
table1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4770 entries, 0 to 1589
Data columns (total 4 columns):
 #   Column                                   Non-Null Count  Dtype  
---  ------                                   --------------  -----  
 0   Country Code                             4770 non-null   int64  
 1   year                                     4770 non-null   int64  
 2   International migrant stock at mid-year  4725 non-null   float64
 3   gender                                   4770 non-null   string 
dtypes: float64(1), int64(2), string(1)
memory usage: 186.3 KB


# Process Table 2

In [5]:
# Sheet "Table 2": header=15 makes pandas take the column labels from
# 0-indexed row 15 of the worksheet.
table = pd.read_excel(excel_name, header=15, sheet_name="Table 2")
table2 = update_tables(table, sheet_title='Total population at mid-year')
table2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4770 entries, 0 to 1589
Data columns (total 4 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Country Code                  4770 non-null   int64  
 1   year                          4770 non-null   int64  
 2   Total population at mid-year  4386 non-null   float64
 3   gender                        4770 non-null   string 
dtypes: float64(1), int64(2), string(1)
memory usage: 186.3 KB


# Process Table 3

In [6]:
# Sheet "Table 3": header=15 makes pandas take the column labels from
# 0-indexed row 15 of the worksheet.
table = pd.read_excel(excel_name, header=15, sheet_name="Table 3")
table3 = update_tables(
    table,
    sheet_title='International migrant stock as a percentage of the total population',
)
table3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4770 entries, 0 to 1589
Data columns (total 4 columns):
 #   Column                                                               Non-Null Count  Dtype  
---  ------                                                               --------------  -----  
 0   Country Code                                                         4770 non-null   int64  
 1   year                                                                 4770 non-null   int64  
 2   International migrant stock as a percentage of the total population  4343 non-null   float64
 3   gender                                                               4770 non-null   string 
dtypes: float64(1), int64(2), string(1)
memory usage: 186.3 KB


# Process Table 4

In [7]:
# Sheet "Table 4" goes through update_one_table because only the female
# series is needed from it.
table4 = (
    pd.read_excel(excel_name, sheet_name='Table 4', header=15)
    .pipe(update_one_table, 'Migrants as a percentage of the international migrant stock', 'female')
)
table4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1590 entries, 0 to 1589
Data columns (total 4 columns):
 #   Column                                                       Non-Null Count  Dtype  
---  ------                                                       --------------  -----  
 0   Country Code                                                 1590 non-null   int64  
 1   year                                                         1590 non-null   int64  
 2   Migrants as a percentage of the international migrant stock  1575 non-null   float64
 3   gender                                                       1590 non-null   string 
dtypes: float64(1), int64(2), string(1)
memory usage: 49.8 KB


# Process Table 5

In [8]:
table5 = pd.read_excel(excel_name, header=15, sheet_name="Table 5")
# Build the relabelling "year1-year5[.n]" -> "year5[.n]": a period column is
# named after its end year. Unsuffixed period labels become ints, suffixed
# ones stay strings, and every label without a '-' maps to itself.
renames = {}
for col in table5.columns:
    if '-' not in col:
        renames[col] = col
    elif '.' in col:
        renames[col] = col[5:]
    else:
        renames[col] = int(col[5:])
renames

{'Unnamed: 0': 'Unnamed: 0',
 'Unnamed: 1': 'Unnamed: 1',
 'Unnamed: 2': 'Unnamed: 2',
 'Unnamed: 3': 'Unnamed: 3',
 'Unnamed: 4': 'Unnamed: 4',
 '1990-1995': 1995,
 '1995-2000': 2000,
 '2000-2005': 2005,
 '2005-2010': 2010,
 '2010-2015': 2015,
 '1990-1995.1': '1995.1',
 '1995-2000.1': '2000.1',
 '2000-2005.1': '2005.1',
 '2005-2010.1': '2010.1',
 '2010-2015.1': '2015.1',
 '1990-1995.2': '1995.2',
 '1995-2000.2': '2000.2',
 '2000-2005.2': '2005.2',
 '2005-2010.2': '2010.2',
 '2010-2015.2': '2015.2'}

In [9]:
# Apply the end-year labels, then add all-NaN placeholder columns for 1990
# (both sexes, male, female): the sheet has no period ending in 1990, but
# update_tables selects a 1990 column for every gender.
table5 = table5.rename(columns=renames)
for placeholder_label in (1990, '1990.1', '1990.2'):
    table5[placeholder_label] = np.nan
table5.columns

Index(['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
               1995,         2000,         2005,         2010,         2015,
           '1995.1',     '2000.1',     '2005.1',     '2010.1',     '2015.1',
           '1995.2',     '2000.2',     '2005.2',     '2010.2',     '2015.2',
               1990,     '1990.1',     '1990.2'],
      dtype='object')

In [10]:
# Reshape the relabelled Table 5 into the long per-gender layout.
table5 = update_tables(
    table5,
    sheet_title='Annual rate of change of the migrant stock for last five years',
)
table5.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4770 entries, 0 to 1589
Data columns (total 4 columns):
 #   Column                                                          Non-Null Count  Dtype  
---  ------                                                          --------------  -----  
 0   Country Code                                                    4770 non-null   int64  
 1   year                                                            4770 non-null   int64  
 2   Annual rate of change of the migrant stock for last five years  3930 non-null   float64
 3   gender                                                          4770 non-null   string 
dtypes: float64(1), int64(2), string(1)
memory usage: 186.3 KB


# Process Table 6

In [11]:
# Sheet "Table 6" bundles several sub-tables side by side; list its column
# labels to see how they can be told apart.
table6 = pd.read_excel(excel_name, header=15, sheet_name="Table 6")
table6.columns

Index(['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4',
               1990,         1995,         2000,         2005,         2010,
               2015,     '1990.1',     '1995.1',     '2000.1',     '2005.1',
           '2010.1',     '2015.1',  '1990-1995',  '1995-2000',  '2000-2005',
        '2005-2010',  '2010-2015'],
      dtype='object')

In [12]:
# First sub-table of Table 6, "Estimated refugee stock at mid-year":
# the country code plus the plain integer year columns.
table6_1 = (
    table6[['Unnamed: 3', 1990, 1995, 2000, 2005, 2010, 2015]]
    .pipe(update_one_table, 'Estimated refugee stock at mid-year', 'both sexes')
)
table6_1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1590 entries, 0 to 1589
Data columns (total 4 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Country Code                         1590 non-null   int64  
 1   year                                 1590 non-null   int64  
 2   Estimated refugee stock at mid-year  1579 non-null   float64
 3   gender                               1590 non-null   string 
dtypes: float64(1), int64(2), string(1)
memory usage: 49.8 KB


In [13]:
# Second sub-table of Table 6, "Refugees as a percentage of the international
# migrant stock". Its year labels carry a ".1" suffix; strip it and cast to
# int so that the columns are recognized by update_one_table.
table6_2 = (
    table6[['Unnamed: 3', '1990.1', '1995.1', '2000.1', '2005.1', '2010.1', '2015.1']]
    .rename(columns=lambda col: int(col[:-2]) if '.' in col else col)
    .pipe(update_one_table, 'Refugees as a percentage of the international migrant stock', 'both sexes')
)
table6_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1590 entries, 0 to 1589
Data columns (total 4 columns):
 #   Column                                                       Non-Null Count  Dtype  
---  ------                                                       --------------  -----  
 0   Country Code                                                 1590 non-null   int64  
 1   year                                                         1590 non-null   int64  
 2   Refugees as a percentage of the international migrant stock  1575 non-null   float64
 3   gender                                                       1590 non-null   string 
dtypes: float64(1), int64(2), string(1)
memory usage: 49.8 KB


In [14]:
# Third sub-table of Table 6, "Annual rate of change of the refugee stock".
# Each period column "year1-year5" is relabelled with its end year as an int.
table6_3 = (
    table6[['Unnamed: 3', '1990-1995', '1995-2000', '2000-2005', '2005-2010', '2010-2015']]
    .rename(columns=lambda col: int(col[5:]) if '-' in col else col)
)
# No period ends in 1990, so add an all-NaN 1990 column for update_one_table.
table6_3[1990] = np.nan
table6_3 = table6_3.pipe(
    update_one_table,
    'Annual rate of change of the refugee stock for last five years',
    'both sexes',
)
table6_3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1590 entries, 0 to 1589
Data columns (total 4 columns):
 #   Column                                                          Non-Null Count  Dtype  
---  ------                                                          --------------  -----  
 0   Country Code                                                    1590 non-null   int64  
 1   year                                                            1590 non-null   int64  
 2   Annual rate of change of the refugee stock for last five years  890 non-null    float64
 3   gender                                                          1590 non-null   string 
dtypes: float64(1), int64(2), string(1)
memory usage: 49.8 KB


# Merge tables

In [15]:
# Outer-join every long table on the shared key columns, starting from
# table1 and adding the remaining tables one at a time in a fixed order.
merge_keys = ['Country Code', 'year', 'gender']
table_all = table1
for next_table in (table2, table3, table4, table5, table6_1, table6_2, table6_3):
    table_all = pd.merge(table_all, next_table, how="outer", on=merge_keys)
table_all.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4770 entries, 0 to 4769
Data columns (total 11 columns):
 #   Column                                                               Non-Null Count  Dtype  
---  ------                                                               --------------  -----  
 0   Country Code                                                         4770 non-null   int64  
 1   year                                                                 4770 non-null   int64  
 2   International migrant stock at mid-year                              4725 non-null   float64
 3   gender                                                               4770 non-null   string 
 4   Total population at mid-year                                         4386 non-null   float64
 5   International migrant stock as a percentage of the total population  4343 non-null   float64
 6   Migrants as a percentage of the international migrant stock          1575 non-null   float64
 7   Annual

In [16]:
# Display the merged table; the rendered output below is a truncated preview.
table_all

Unnamed: 0,Country Code,year,International migrant stock at mid-year,gender,Total population at mid-year,International migrant stock as a percentage of the total population,Migrants as a percentage of the international migrant stock,Annual rate of change of the migrant stock for last five years,Estimated refugee stock at mid-year,Refugees as a percentage of the international migrant stock,Annual rate of change of the refugee stock for last five years
0,900,1990,152563212.0,both sexes,5309667.699,2.873310,,,18836571.0,12.346732,
1,901,1990,82378628.0,both sexes,1144463.062,7.198015,,,2014564.0,2.445494,
2,902,1990,70184584.0,both sexes,4165204.637,1.685021,,,16822007.0,23.968236,
3,941,1990,11075966.0,both sexes,510057.629,2.171513,,,5048391.0,45.565880,
4,934,1990,59105261.0,both sexes,3655147.008,1.617042,,,11773616.0,19.919743,
...,...,...,...,...,...,...,...,...,...,...,...
4765,882,2015,2460.0,female,93.584,2.628654,49.908704,-0.545343,,,
4766,772,2015,254.0,female,,,52.156057,2.603250,,,
4767,776,2015,2604.0,female,52.931,4.919612,45.437096,2.526318,,,
4768,798,2015,63.0,female,,,44.680851,-1.819436,,,
