In [1]:
import numpy as np
import pandas as pd

In [2]:
# Census profile CSV to load; the path is relative to the notebook's working directory.
file_path = "98-401-X2021001_English_CSV_data.csv"

# Try to decode the file as UTF-8 first. If pandas hits bytes that are not
# valid UTF-8 (UnicodeDecodeError), read the whole file again as Latin-1.
try:
    df = pd.read_csv(file_path, encoding='utf-8')
except UnicodeDecodeError:
    df = pd.read_csv(file_path, encoding='latin-1')

In [3]:
# Columns retained for the hierarchy analysis: the geography level, the
# (indented) characteristic label, and the total / men+ / women+ counts.
columns_to_keep = [
    'GEO_LEVEL',
    'CHARACTERISTIC_NAME',
    'C1_COUNT_TOTAL',
    'C2_COUNT_MEN+',
    'C3_COUNT_WOMEN+',
]

In [4]:
# Narrow the frame to the columns listed in columns_to_keep (all rows kept).
df = df.loc[:, columns_to_keep]

In [5]:
# Keep only the rows whose GEO_LEVEL is 'Country'.
# The explicit .copy() makes the result an independent frame: later cells add
# columns to df, and without the copy pandas can flag those writes as
# assignments to a slice of the unfiltered data (SettingWithCopyWarning).
df = df[df['GEO_LEVEL'] == 'Country'].copy()

In [6]:
# Preview the first 25 rows of the filtered frame.
df.head(25)

Unnamed: 0,GEO_LEVEL,CHARACTERISTIC_NAME,C1_COUNT_TOTAL,C2_COUNT_MEN+,C3_COUNT_WOMEN+
0,Country,"Population, 2021",36991981.0,,
1,Country,"Population, 2016",35151728.0,,
2,Country,"Population percentage change, 2016 to 2021",5.2,,
3,Country,Total private dwellings,16284235.0,,
4,Country,Private dwellings occupied by usual residents,14978941.0,,
5,Country,Population density per square kilometre,4.2,,
6,Country,Land area in square kilometres,8788702.8,,
7,Country,Total - Age groups of the population - 100% data,36991980.0,18226240.0,18765740.0
8,Country,0 to 14 years,6012795.0,3086510.0,2926285.0
9,Country,0 to 4 years,1831195.0,938790.0,892405.0


In [7]:
# Hierarchy depth of each characteristic, derived from how far its label is
# indented: the code treats every two leading whitespace characters as one level.
# Only *leading* whitespace is counted (lstrip), so trailing spaces in a label
# cannot inflate its level, and the .str accessors propagate missing labels as
# NaN instead of raising. The result stays a float column.
df['Level'] = (
    df['CHARACTERISTIC_NAME'].str.len()
    - df['CHARACTERISTIC_NAME'].str.lstrip().str.len()
) / 2

In [8]:
 # Deepest hierarchy level found in the data, as an int; it is the upper bound
 # used when the 'Lvl-N' column names are generated.
 # NOTE(review): this cell is indented by one stray space. IPython accepts that,
 # but it would be an IndentationError if the notebook is exported to a script.
 max_level = int(df['Level'].max())

In [9]:
 # Distinct values of the Level column (a NumPy array of floats).
 level_val=df['Level'].unique()

In [10]:
# Empty placeholder; level_list is reassigned with the real column names in the next cell.
level_list =[]

In [11]:
# One column name per hierarchy depth, 'Lvl-0' through 'Lvl-<max_level>' inclusive.
level_list = [f"Lvl-{depth}" for depth in range(max_level + 1)]

In [12]:
# Display the generated level column names.
level_list

['Lvl-0',
 'Lvl-1',
 'Lvl-2',
 'Lvl-3',
 'Lvl-4',
 'Lvl-5',
 'Lvl-6',
 'Lvl-7',
 'Lvl-8']

In [13]:
# Add one column per level name to df, filled with the column's own name.
# NOTE(review): the following cell assigns these same columns again, so the
# placeholder values written here do not persist.
for column_name in level_list:
        df[column_name] = column_name

In [14]:
# Start every level column empty (NaN).
# The columns are created with object dtype because later cells store string
# labels in them. A plain `np.nan` assignment would create float64 columns, and
# writing strings into those relies on implicit upcasting, which newer pandas
# versions warn about and are slated to reject.
for column_name in level_list:
    df[column_name] = pd.Series(np.nan, index=df.index, dtype=object)

In [15]:
# Display df with the Level column and the still-empty Lvl-N columns
# (pandas abbreviates the rendering of long frames).
df

Unnamed: 0,GEO_LEVEL,CHARACTERISTIC_NAME,C1_COUNT_TOTAL,C2_COUNT_MEN+,C3_COUNT_WOMEN+,Level,Lvl-0,Lvl-1,Lvl-2,Lvl-3,Lvl-4,Lvl-5,Lvl-6,Lvl-7,Lvl-8
0,Country,"Population, 2021",36991981.0,,,0.0,,,,,,,,,
1,Country,"Population, 2016",35151728.0,,,0.0,,,,,,,,,
2,Country,"Population percentage change, 2016 to 2021",5.2,,,0.0,,,,,,,,,
3,Country,Total private dwellings,16284235.0,,,0.0,,,,,,,,,
4,Country,Private dwellings occupied by usual residents,14978941.0,,,0.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2626,Country,Total - Eligibility and instruction in the min...,5372330.0,2761965.0,2610365.0,0.0,,,,,,,,,
2627,Country,Children eligible for instruction in the min...,682190.0,346420.0,335770.0,1.0,,,,,,,,,
2628,Country,Eligible children who have been instructed...,467685.0,234475.0,233215.0,2.0,,,,,,,,,
2629,Country,Eligible children who have not been instru...,214500.0,111950.0,102550.0,2.0,,,,,,,,,


In [16]:
# Work on a copy: the cells below fill the Lvl-N columns in df_play and leave df as it is.
df_play = df.copy()

In [17]:

# Put each row's own name (surrounding whitespace stripped) into the column that
# matches its depth, e.g. a row with Level == 2 gets its name in 'Lvl-2'.
# Cells of the other level columns are left as they are.
stripped_names = df_play['CHARACTERISTIC_NAME'].str.strip()
for level_column in level_list:
    depth = int(level_column.split('-')[1])  # 'Lvl-3' -> 3
    at_depth = df_play['Level'] == depth
    # The level columns may have been created as float NaN placeholders; switch
    # to object dtype first so storing strings does not depend on implicit upcasting.
    df_play[level_column] = df_play[level_column].astype(object)
    # One masked assignment per level replaces a Python-level pass over every
    # (row, level) pair; the right-hand side is aligned on the row index.
    df_play.loc[at_depth, level_column] = stripped_names[at_depth]
                


In [18]:
# Inspect the first 50 rows after each row's own level column has been filled.
df_play.head(50)

Unnamed: 0,GEO_LEVEL,CHARACTERISTIC_NAME,C1_COUNT_TOTAL,C2_COUNT_MEN+,C3_COUNT_WOMEN+,Level,Lvl-0,Lvl-1,Lvl-2,Lvl-3,Lvl-4,Lvl-5,Lvl-6,Lvl-7,Lvl-8
0,Country,"Population, 2021",36991981.0,,,0.0,"Population, 2021",,,,,,,,
1,Country,"Population, 2016",35151728.0,,,0.0,"Population, 2016",,,,,,,,
2,Country,"Population percentage change, 2016 to 2021",5.2,,,0.0,"Population percentage change, 2016 to 2021",,,,,,,,
3,Country,Total private dwellings,16284235.0,,,0.0,Total private dwellings,,,,,,,,
4,Country,Private dwellings occupied by usual residents,14978941.0,,,0.0,Private dwellings occupied by usual residents,,,,,,,,
5,Country,Population density per square kilometre,4.2,,,0.0,Population density per square kilometre,,,,,,,,
6,Country,Land area in square kilometres,8788702.8,,,0.0,Land area in square kilometres,,,,,,,,
7,Country,Total - Age groups of the population - 100% data,36991980.0,18226240.0,18765740.0,0.0,Total - Age groups of the population - 100% data,,,,,,,,
8,Country,0 to 14 years,6012795.0,3086510.0,2926285.0,1.0,,0 to 14 years,,,,,,,
9,Country,0 to 4 years,1831195.0,938790.0,892405.0,2.0,,,0 to 4 years,,,,,,


In [19]:
# Propagate ancestor names down the hierarchy: in every 'Lvl-N' column, each row
# deeper than N receives the name of the nearest preceding row at depth N.
for depth, level_column in enumerate(level_list):
    # A row shallower than `depth` closes the current branch. Numbering those
    # rows gives every branch its own id, so the forward fill below cannot carry
    # a name across a branch boundary. Without this, a row whose parent level is
    # missing would inherit a stale name from the previous branch.
    branch_id = (df_play['Level'] < depth).cumsum()
    inherited = df_play[level_column].groupby(branch_id).ffill()
    # Only rows at this depth or deeper carry a value in this column. Rows with
    # no ancestor at this depth inside their branch stay empty (NaN).
    df_play[level_column] = inherited.where(df_play['Level'] >= depth)

In [20]:
# Inspect the first 50 rows after ancestor names have been propagated into the level columns.
df_play.head(50)

Unnamed: 0,GEO_LEVEL,CHARACTERISTIC_NAME,C1_COUNT_TOTAL,C2_COUNT_MEN+,C3_COUNT_WOMEN+,Level,Lvl-0,Lvl-1,Lvl-2,Lvl-3,Lvl-4,Lvl-5,Lvl-6,Lvl-7,Lvl-8
0,Country,"Population, 2021",36991981.0,,,0.0,"Population, 2021",,,,,,,,
1,Country,"Population, 2016",35151728.0,,,0.0,"Population, 2016",,,,,,,,
2,Country,"Population percentage change, 2016 to 2021",5.2,,,0.0,"Population percentage change, 2016 to 2021",,,,,,,,
3,Country,Total private dwellings,16284235.0,,,0.0,Total private dwellings,,,,,,,,
4,Country,Private dwellings occupied by usual residents,14978941.0,,,0.0,Private dwellings occupied by usual residents,,,,,,,,
5,Country,Population density per square kilometre,4.2,,,0.0,Population density per square kilometre,,,,,,,,
6,Country,Land area in square kilometres,8788702.8,,,0.0,Land area in square kilometres,,,,,,,,
7,Country,Total - Age groups of the population - 100% data,36991980.0,18226240.0,18765740.0,0.0,Total - Age groups of the population - 100% data,,,,,,,,
8,Country,0 to 14 years,6012795.0,3086510.0,2926285.0,1.0,Total - Age groups of the population - 100% data,0 to 14 years,,,,,,,
9,Country,0 to 4 years,1831195.0,938790.0,892405.0,2.0,Total - Age groups of the population - 100% data,0 to 14 years,0 to 4 years,,,,,,


In [21]:
# Write the frame, including the Level and Lvl-N columns, to a CSV file in the
# working directory; index=False leaves the DataFrame index out of the file.
df_play.to_csv("census_data_with_levels.csv", index=False)


In [22]:
# Confirmation message printed after the export cell has run.
print("Export Done")

Export Done
