# LOADING AND CROSSING DATA PROCESSES

## Libraries utilized

In [2]:
#libraries utilized by loading and crossing data process
import numpy as np
import pandas as pd
from pandasgui import show
#libraries required for the crossing information process
import traceback
from threading import Thread
import threading
        

# The objective is to merge the age-range file into the main dataframe, turning its age-group rows into columns, by joining both files on their shared keys (location and year)
- WPP2022_Demographic_Indicators_Medium.csv
- WPP2022_PopulationExposureByAge5GroupSex_Medium.csv

### Read the United Nations main-attributes CSV (65 variables)

In [3]:
# Load the main demographic-indicators table into `df`, then print its
# column / dtype / non-null summary.
df = pd.read_csv(
    'datasets/WPP2022_Demographic_Indicators_Medium.csv',
    sep=",",
    low_memory=False,
)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43472 entries, 0 to 43471
Data columns (total 67 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   SortOrder               43472 non-null  int64  
 1   LocID                   43472 non-null  int64  
 2   Notes                   11552 non-null  object 
 3   ISO3_code               36024 non-null  object 
 4   ISO2_code               35872 non-null  object 
 5   SDMX_code               42864 non-null  float64
 6   LocTypeID               43472 non-null  int64  
 7   LocTypeName             43472 non-null  object 
 8   ParentID                43472 non-null  int64  
 9   Location                43472 non-null  object 
 10  VarID                   43472 non-null  int64  
 11  Variant                 43472 non-null  object 
 12  Time                    43472 non-null  int64  
 13  TPopulation1Jan         43472 non-null  float64
 14  TPopulation1July        43186 non-null

### Read the United Nations population statistics by 5-year age group

In [4]:
# Load the population-by-5-year-age-group table into `dfAgeGroups`, then print
# its column / dtype / non-null summary.
dfAgeGroups = pd.read_csv(
    'datasets/WPP2022_PopulationExposureByAge5GroupSex_Medium.csv',
    sep=",",
    low_memory=False,
)

dfAgeGroups.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 903735 entries, 0 to 903734
Data columns (total 20 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   SortOrder    903735 non-null  int64  
 1   LocID        903735 non-null  int64  
 2   Notes        237825 non-null  object 
 3   ISO3_code    748356 non-null  object 
 4   ISO2_code    745185 non-null  object 
 5   SDMX_code    891051 non-null  float64
 6   LocTypeID    903735 non-null  int64  
 7   LocTypeName  903735 non-null  object 
 8   ParentID     903735 non-null  int64  
 9   Location     903735 non-null  object 
 10  VarID        903735 non-null  int64  
 11  Variant      903735 non-null  object 
 12  Time         903735 non-null  int64  
 13  MidPeriod    903735 non-null  float64
 14  AgeGrp       903735 non-null  object 
 15  AgeGrpStart  903735 non-null  int64  
 16  AgeGrpSpan   903735 non-null  int64  
 17  PopMale      903735 non-null  float64
 18  PopFemale    903735 non-

### To speed up the process, both dataframes are indexed on the columns "Time" (year) and "LocID" (location ID)

In [5]:
# Set multi-column index on 'Time' and 'LocID' in df.
# Guarded so that re-running this cell is a no-op: once the keys have become
# the index they are no longer columns, and a second set_index call would
# raise KeyError.
if list(df.index.names) != ['Time', 'LocID']:
    df.set_index(['Time', 'LocID'], inplace=True)

# Set multi-column index on 'Time' and 'LocID' in dfAgeGroups (same guard).
if list(dfAgeGroups.index.names) != ['Time', 'LocID']:
    dfAgeGroups.set_index(['Time', 'LocID'], inplace=True)

## Crossing the two datasets: adding columns extracted from rows (transposition)

### The age-group rows are turned into columns for each location and year. The update runs in a separate thread and fills in the male/female column pair of each age group. The work is wrapped in try/except, and it is driven by a Python dictionary (`column_mapping`) that maps every age-group label to its pair of column names


In [6]:
# Map every age-group label to its [male, female] column names.
# The twenty closed 5-year bands ('0-4' ... '95-99') follow one naming pattern
# (zero-padded bounds joined by underscores); the open-ended '100+' band is
# added explicitly afterwards so that it stays the last key.
column_mapping = {
    f'{lo}-{hi}': [f'{lo:02d}_{hi:02d}_male', f'{lo:02d}_{hi:02d}_female']
    for lo, hi in ((start, start + 4) for start in range(0, 100, 5))
}
column_mapping['100+'] = ['100_male', '100_female']

# Shared event flag
process_completed = threading.Event()

# Function to handle the main process
def process_data(main_df=None, age_df=None, mapping=None):
    """Copy per-age-group male/female populations into columns of the main frame.

    For every age group listed in ``mapping``, the ``PopMale`` / ``PopFemale``
    values of the matching ``age_df`` rows are written into the two mapped
    columns of ``main_df``, on the rows that share the same index key.
    ``main_df`` is modified in place; rows without a match are left untouched
    (NaN when the column is newly created). Age groups that are not in
    ``mapping`` are ignored, and a column pair is only created when at least
    one row matches.

    The lookup is done once per age group with an index lookup instead of
    once per ``age_df`` row, which avoids scanning ``main_df`` for every row.

    Parameters
    ----------
    main_df : pandas.DataFrame, optional
        Frame to enrich. Defaults to the notebook-level ``df``.
    age_df : pandas.DataFrame, optional
        Frame with ``AgeGrp``, ``PopMale`` and ``PopFemale`` columns, indexed
        by the same keys as ``main_df``. Defaults to the notebook-level
        ``dfAgeGroups``.
    mapping : dict, optional
        ``{age_group_label: [male_column, female_column]}``. Defaults to the
        notebook-level ``column_mapping``.

    Returns
    -------
    bool
        True when the update finished, False when an exception was caught
        (its traceback is printed). ``process_completed`` is set in both cases.
    """
    main_df = df if main_df is None else main_df
    age_df = dfAgeGroups if age_df is None else age_df
    mapping = column_mapping if mapping is None else mapping

    succeeded = False
    try:
        for age_group, (male_col, female_col) in mapping.items():
            # Positional mask: age_df's index holds many rows per key.
            in_group = (age_df['AgeGrp'] == age_group).to_numpy()
            group_rows = age_df.loc[in_group, ['PopMale', 'PopFemale']]
            if group_rows.empty:
                continue

            # If a key occurs more than once, keep the last row: this is what
            # a sequential row-by-row overwrite would leave behind.
            group_rows = group_rows[~group_rows.index.duplicated(keep='last')]

            # Position of each main_df key inside group_rows (-1 = no match).
            positions = group_rows.index.get_indexer(main_df.index)
            matched = positions >= 0
            if not matched.any():
                continue

            source_rows = positions[matched]
            for column, source in ((male_col, 'PopMale'), (female_col, 'PopFemale')):
                if column not in main_df.columns:
                    main_df[column] = np.nan
                main_df.loc[matched, column] = group_rows[source].to_numpy()[source_rows]
        succeeded = True
    except Exception:
        # Best effort: report the failure but never leave waiters blocked.
        traceback.print_exc()
    finally:
        process_completed.set()

    return succeeded


# Run the crossing step on a dedicated worker thread
process_thread = Thread(target=process_data)
process_thread.start()

# Block this cell until the worker thread has finished
process_thread.join()

# Print a completion message
print('\nProcess completed.')




Process completed.


#### Commented-out lines used to export the processed dataframe

In [None]:
#df.to_csv('datasets/data_processed_ranges.csv')

In [None]:
# df.to_hdf('data_processed.h5', key='df')

In [None]:
# import pickle

# # Save DataFrame using Pickle
# with open('data.pkl', 'wb') as f:
#     pickle.dump(df, f)


In [None]:
# from pandasgui import show
# dfAgeGroups.head()
# show( dfAgeGroups)

In [None]:
# Open the enriched main dataframe in the pandasgui viewer
show(df)