In [6]:
import os
import requests
import pandas as pd

def download_file(url, folder="data"):
    """Download a file to the given folder if not already present."""
    
    # Ensure the data folder exists
    os.makedirs(folder, exist_ok=True)

    # Extract filename and construct full local path
    filename = url.split("/")[-1]
    local_path = os.path.join(folder, filename)

    if os.path.exists(local_path):
        print(f"Using cached file: {local_path}")
        return local_path

    # Download the file
    print(f"Downloading: {url}")
    response = requests.get(url, stream=True)
    response.raise_for_status()

    with open(local_path, 'wb') as f:
        for chunk in response.iter_content(chunk_size=8192):
            f.write(chunk)

    print(f"Saved to: {local_path}")
    return local_path

def open_gz_with_pandas(file_path):
    """Open a gzip-compressed CSV file with pandas."""
    return pd.read_csv(file_path, compression='gzip')

# --- Usage ---

population_data_url = "https://population.un.org/wpp/assets/Excel%20Files/1_Indicator%20(Standard)/CSV_FILES/WPP2024_Demographic_Indicators_Medium.csv.gz"

file_path = download_file(population_data_url)
dataframe = open_gz_with_pandas(file_path)

# Preview the data
print(dataframe.head())


Using cached file: data\WPP2024_Demographic_Indicators_Medium.csv.gz
   SortOrder  LocID Notes ISO3_code ISO2_code  SDMX_code  LocTypeID  \
0        NaN   5507   NaN       NaN       NaN        NaN        NaN   
1        NaN   5507   NaN       NaN       NaN        NaN        NaN   
2        NaN   5507   NaN       NaN       NaN        NaN        NaN   
3        NaN   5507   NaN       NaN       NaN        NaN        NaN   
4        NaN   5507   NaN       NaN       NaN        NaN        NaN   

  LocTypeName  ParentID                           Location  ...  Q0060Male  \
0         NaN       NaN  ADB region: Central and West Asia  ...   654.5798   
1         NaN       NaN  ADB region: Central and West Asia  ...   642.5944   
2         NaN       NaN  ADB region: Central and West Asia  ...   630.8166   
3         NaN       NaN  ADB region: Central and West Asia  ...   619.9937   
4         NaN       NaN  ADB region: Central and West Asia  ...   610.3878   

  Q0060Female     Q1550  Q1550Male 

  return pd.read_csv(file_path, compression='gzip')


In [8]:
list(dataframe)

['SortOrder',
 'LocID',
 'Notes',
 'ISO3_code',
 'ISO2_code',
 'SDMX_code',
 'LocTypeID',
 'LocTypeName',
 'ParentID',
 'Location',
 'VarID',
 'Variant',
 'Time',
 'TPopulation1Jan',
 'TPopulation1July',
 'TPopulationMale1July',
 'TPopulationFemale1July',
 'PopDensity',
 'PopSexRatio',
 'MedianAgePop',
 'NatChange',
 'NatChangeRT',
 'PopChange',
 'PopGrowthRate',
 'DoublingTime',
 'Births',
 'Births1519',
 'CBR',
 'TFR',
 'NRR',
 'MAC',
 'SRB',
 'Deaths',
 'DeathsMale',
 'DeathsFemale',
 'CDR',
 'LEx',
 'LExMale',
 'LExFemale',
 'LE15',
 'LE15Male',
 'LE15Female',
 'LE65',
 'LE65Male',
 'LE65Female',
 'LE80',
 'LE80Male',
 'LE80Female',
 'InfantDeaths',
 'IMR',
 'LBsurvivingAge1',
 'Under5Deaths',
 'Q5',
 'Q0040',
 'Q0040Male',
 'Q0040Female',
 'Q0060',
 'Q0060Male',
 'Q0060Female',
 'Q1550',
 'Q1550Male',
 'Q1550Female',
 'Q1560',
 'Q1560Male',
 'Q1560Female',
 'NetMigrations',
 'CNMR']

In [9]:
filtered_data = dataframe[dataframe['LocTypeID'] == 4][['Location', 'TPopulation1Jan', 'Time']]
filtered_data

Unnamed: 0,Location,TPopulation1Jan,Time
44384,Burundi,2229.322,1950
44385,Burundi,2280.554,1951
44386,Burundi,2330.938,1952
44387,Burundi,2380.670,1953
44388,Burundi,2429.703,1954
...,...,...,...
84355,Wallis and Futuna Islands,7.161,2097
84356,Wallis and Futuna Islands,7.084,2098
84357,Wallis and Futuna Islands,7.002,2099
84358,Wallis and Futuna Islands,6.915,2100


In [None]:
filtered_data.to_csv("data/population_country_data.csv")