In [0]:
from pyspark.sql.functions import col,lit,explode, sum, round

# World population

## Read CSV

In [0]:
# Location of the unzipped population CSV inside the bronze volume.
volume_csv = '/Volumes/cat_covid/bronze/raw/unzip/API_SP.POP.TOTL_DS2_en_csv_v2_85220.csv'

# Reader settings, kept together so they are easy to tweak.
# NOTE(review): presumably skipRows=4 jumps over preamble rows that precede
# the header row — confirm against the raw file.
csv_options = {'skipRows': 4, 'header': True}

# No schema inference is requested, so every column is loaded as a string.
df = spark.read.format('csv').options(**csv_options).load(volume_csv)

display(df)

## Unpivot df

In [0]:
# Identifier columns that are repeated on every unpivoted row.
id_cols = ['Country_Name', 'Country_Code']

# Drop the indicator descriptor columns and give the identifier columns
# space-free names.
df = (df.drop('Indicator Name', 'Indicator Code')
        .withColumnsRenamed({'Country Name': 'Country_Name', 'Country Code': 'Country_Code'}))

# Unpivot only the columns whose header is a year. Slicing by position
# (df.columns[2:]) would also melt any other column that happens to follow the
# ids — e.g. an unnamed `_cNN` column produced by a trailing comma in the CSV —
# and those rows end up with a null Year once Year is cast to int.
years = [c for c in df.columns if c.isdigit()]

# An empty list is not a valid `values` argument for melt, so fail early with a
# clear message (this also catches re-running the cell on an already
# unpivoted df).
if not years:
    raise ValueError(f'No year columns found to unpivot; columns are: {df.columns}')

#unpivot
df = df.melt(ids=id_cols, values=years, variableColumnName='Year', valueColumnName='Population')


In [0]:
# Give the unpivoted columns numeric types in a single pass:
#   Year       -> int
#   Population -> parsed as double, rounded to 0 decimals, stored as bigint
# `round` here is pyspark.sql.functions.round (imported at the top), not the builtin.
df = df.withColumns({
    'Year': col('Year').cast('int'),
    'Population': round(col('Population').cast('double'), 0).cast('bigint'),
})

# Caribbean Netherlands

## Read all JSON files

In [0]:
# Glob matching every JSON file in the netherlands folder of the bronze container.
json_path = 'abfss://bronze@storagegeneral00001.dfs.core.windows.net/volumes/raw_population/netherlands/*.json'

# 'multiline' lets a single JSON document span several lines of a file.
df_cn_ = spark.read.option('multiline', True).json(json_path)

display(df_cn_)

In [0]:
# Dimension codes used to filter the exploded records.
# NOTE(review): presumably these are the "total" categories of each dimension
# (all sexes / all ages / all marital statuses) — confirm against the source
# code lists.
SEX_CODE = 'T001038'
AGE_CODE = '10000'
MARITAL_STATUS_CODE = 'T001019'

# Each file holds an array under `value`; explode it into one row per record
# and project the fields that are needed downstream.
df_cn = (df_cn_
         .select(explode('value').alias('value'))
         .select(
             # Keep the first four characters of Periods as the year.
             # Spark substring positions are 1-based, so start at 1
             # (a start of 0 only works because Spark treats it like 1).
             col('value.Periods').substr(1, 4).cast('int').alias('Year')
             ,col('value.populationonjanuary1st_1').cast('int').alias('Population')
             ,col('value.Sex').alias('Sex')
             ,col('value.Age').alias('Age')
             ,col('value.MaritalStatus').alias('MaritalStatus'))
         )

# Keep only the records matching all three dimension codes.
df_cn = df_cn.filter(
    (col('Sex') == SEX_CODE)
    & (col('Age') == AGE_CODE)
    & (col('MaritalStatus') == MARITAL_STATUS_CODE)
)

display(df_cn)

In [0]:
# Collapse the filtered records to one population figure per year.
# `sum` here is pyspark.sql.functions.sum (imported at the top), not the builtin.
population_by_year = df_cn.groupBy('Year').agg(sum('Population').alias('Population'))

# Attach constant country identifiers and put the columns in the same order
# as the world population frame (Country_Name, Country_Code, Year, Population).
df_cn = population_by_year.select(
    lit('Bonaire, Sint Eustatius and Saba').alias('Country_Name'),
    lit('BQ').alias('Country_Code'),
    'Year',
    'Population',
)

display(df_cn)

# Union tables

In [0]:
# Append the Caribbean Netherlands rows to the world population rows.
# unionByName matches columns by name instead of position, so a change in
# column order in either frame cannot silently mix up values.
# NOTE: df is reassigned in place, so re-running this cell appends df_cn again.
try:
    df = df.unionByName(df_cn)
except Exception as exc:
    # Keep the original best-effort behaviour (carry on with the world
    # population rows only) but report the failure instead of hiding it.
    # Catching Exception rather than using a bare `except` also lets
    # KeyboardInterrupt / SystemExit propagate.
    print(f'WARNING: Caribbean Netherlands rows were not appended: {exc!r}')

In [0]:
# Target column order and types of the silver table.
silver_schema = {
    'Country_Name': 'string',
    'Country_Code': 'string',
    'Year': 'int',
    'Population': 'bigint',
}

# Project exactly these columns, casting each one to its target type.
df = df.select([col(name).cast(dtype) for name, dtype in silver_schema.items()])

# Replace the contents of the silver table with the combined result.
df.write.saveAsTable('cat_covid.silver.population', mode='overwrite')