In [419]:
import pandas as pd
import numpy as np
import re
!pip install openpyxl

import warnings
warnings.filterwarnings("ignore")



In [420]:
#intialize a dictionary for all the tables as keys and information we extract from each 
#table as its value
#IMS is International migrant stock
#RMS refugee migrant stock

dict_ = {'Table 1' : 'IMS' , 
         'Table 2' : 'total_population', 
         'Table 3' : 'IMS/total_population_%age' , 
         'Table 4' : 'Female/IMS_%age', 
         'Table 5' : 'annual_rate_of_change_IMS' , 
         'Table 6' : ['RMS' , 'RMS/IMS_%age','annual_rate_of_change_RMS']}

In [423]:
#define a function to rename columns as it is a repetitive task
#all columns are named as 1,2,3.. when we load dataframe in python so 'i' is the starting value
#loop will start from column 'i'

def rename_columns(df1):
    #label read the value from df where the column header from excel file is stored
    label = df1.iloc[0][df1.columns[1]]
    
    #condition to check if label empty because table 4 does not have 'type of data' column so
    #it would have different index
    if(pd.isna(label)): 
        label = df1.iloc[0][df1.columns[1]]
        #decrement i as columns is 1 less than the other tables
    
    #loop starting from second column till end because we dont want to rename country code
    lst = list(df1.columns[1:len(df1.columns)])
    
    for each in lst:
        #condition to check if first index(column header) is still the same
        #when value at 0 index changes label will be updated with the new value
        if(pd.notnull(df1.iloc[0][each])):
            label = df1.iloc[0][each]
       
        #code to rename column
        df1.rename({each : str(df1.iloc[1][each])+str(label)} , axis = 'columns' , inplace = True)
        
    return df1


In [427]:
#funtion to clean table

def clean_table(df , key): 
    #remove first 13 rows as the contain the header of the table and lots of nan values
    df.drop(df.index[0:13] , inplace = True)
    
    #rename 3rd column as country code

    df.rename({'Unnamed: 3' : 'country_code' }, axis = 'columns' , inplace = True)
   
    #drop sort order, notes and type of data columns
    df.drop(columns=['Unnamed: 0' , 'Unnamed: 1' , 'Unnamed: 2'] , inplace = True)
    
    
    #condition because table 2 does not have type of data so we need to skip for table 2
    if(key!=dict_['Table 2']):
        df.drop(columns=['Unnamed: 4'] , inplace = True)
        
    #function call to rename rest of the columns
    df = rename_columns(df)
    
    #drop top 9 rows till sub-saharan africa as it is just summary data which can be easily
    #derived through simple code in python
    df.drop(df.index[0:7] , inplace = True)
    
    
    
    
    #use melt function to change orientation of data
    #two new columns will be formed tmp will contain all the column headers
    #key value that we stored in dict_ will form the other columns
   
    df_v1 = pd.melt(df, id_vars = 'country_code', value_vars = list(df.columns[1:len(df.columns)]) , var_name = "tmp" , value_name = key)
    

    #extract year from tmp column
    df_v1['year']  = df_v1.tmp.str.extract(pat = '([0-9-]+)')
    
    #extract sex from tmp column
    df_v1['sex']  = df_v1.tmp.str.extract(pat = '(?i)(both|male|female)')
    
    #drop tmp as useful info has been obtained
    df_v1.drop(columns = "tmp" , inplace = True)
    df_v1.reset_index()
    return df_v1


In [429]:
#main table
df2 = pd.DataFrame()

#rate of change dataframe
df3 = pd.DataFrame()
lst_ = []

for key in dict_:
    df = pd.read_excel(io="UN_MigrantStockTotal_2015.xlsx", sheet_name= key , index_col=False)
    
    if(key == 'Table 6'):
        tmp = clean_table(df[list(df.columns[0:11])].copy(), dict_[key][0])
        df1 = tmp
        tmp = clean_table(df[list(df.columns[[0,1,2,3,4,12,13,14,15,16,17]])].copy() , dict_[key][1])
        tmp = clean_table(df[list(df.columns[[0,1,2,3,4,18,19,20,21]])].copy() , dict_[key][2])
    else:
        df1 = clean_table(df , dict_[key])
    
    #list of all tables
    lst_.append(df1)
    
    #join tables
    if (key!='Table 5'):
        if(df2.empty):
            df2 = df1
        else:
            df2 = pd.concat([df2 , df1.drop(columns=['country_code' , 'year' , 'sex'])], axis=1, join="inner")
    else:
        if(df3.empty): 
            df3 = df1
        else:
            df3 = pd.concat([df3 , tmp.drop(columns=['country_code' , 'year' , 'sex'])], axis=1, join="inner")
        
#print(display(df2['sex']))
#print(display(lst_[5]["year"].unique()))

print(display(df2))


Unnamed: 0,country_code,IMS,year,sex,total_population,IMS/total_population_%age,Female/IMS_%age,RMS
0,947,14690319,1990,both,491497.691,2.988889,47.276121,5516042
1,903,15690623,1990,both,631614.304,2.48421,47.232408,5687352
2,910,5964031,1990,both,198231.687,3.008616,48.504812,3168001
3,108,333110,1990,both,5613.141,5.934467,50.987061,267929
4,174,14079,1990,both,415.144,3.391353,52.290646,0
...,...,...,...,...,...,...,...,...
1555,882,4929.0,2015,both,193.228,2.550873,49.908704,0.0
1556,772,487.0,2015,both,1.25,38.96,52.156057,0.0
1557,776,5731.0,2015,both,106.17,5.397947,45.437096,0.0
1558,798,141.0,2015,both,9.916,1.421944,44.680851,0.0


None


In [None]:
#Read notes
notes = pd.read_excel(io="UN_MigrantStockTotal_2015.xlsx", sheet_name= "NOTES" , index_col=False)
notes.drop(notes.index[0:14] , inplace = True)
notes.rename(columns={"Unnamed: 0": "code", "Unnamed: 1": "notes"} , inplace = True)
notes.set_index('code', inplace=True)
#print(display(notes))

In [291]:
#for each in lst_: 
    #print(display(each))

In [None]:
else:  
    #Table 6 only year is a value which needs to be converted into column
    #first col header is not a value, it is a variable unlike other tables
        df_v1 = pd.melt(df, id_vars = list(df.columns[[0,7,8,9,10,11,12,13,14,15,16,17]]), value_vars = list(df.columns[1:6]) , var_name = "tmp1" , value_name = dict_[key][0])
        df_v1 = pd.melt(df_v1, id_vars = list(df_v1.columns[[0,7,8,9,10,11,12,13]]) , value_vars = list(df_v1.columns[1:6]) , var_name = "tmp" , value_name = dict_[key][1])
        df_v1 = pd.melt(df_v1, id_vars =  ['country_code' , 'tmp1', dict_[key][0], dict_[key][1]], value_vars = list(df_v1.columns[1:5]) , var_name = "tmp" , value_name = dict_[key][2])