In [1]:
from pathlib import Path
import pandas as pd
from datetime import timedelta
import argparse
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql import types
from pyspark.conf import SparkConf
from pyspark.context import SparkContext

In [2]:
credentials_location = '/Users/admin/Downloads/swift-arcadia-387709-b04513fcbebe.json'

conf = SparkConf() \
    .setMaster('local[*]') \
    .setAppName('Monkey D.Luffy') \
    .set("spark.jars", "/Users/admin/Public/project/noaa-climate-datasets/lib/gcs-connector-hadoop3-latest.jar") \
    .set("spark.hadoop.google.cloud.auth.service.account.enable", "true") \
    .set("spark.hadoop.google.cloud.auth.service.account.json.keyfile", credentials_location)

sc = SparkContext(conf=conf)

hadoop_conf = sc._jsc.hadoopConfiguration()

hadoop_conf.set("fs.AbstractFileSystem.gs.impl",  "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")
hadoop_conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
hadoop_conf.set("fs.gs.auth.service.account.json.keyfile", credentials_location)
hadoop_conf.set("fs.gs.auth.service.account.enable", "true")

23/08/16 11:29:23 WARN Utils: Your hostname, ThienLes-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 172.16.20.61 instead (on interface en0)
23/08/16 11:29:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/08/16 11:29:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


In [3]:
spark = SparkSession.builder \
    .appName('Monkey D.Luffy') \
    .getOrCreate()

In [34]:
df_VE = spark.read.parquet("gs://noaa_ghcn_data_lake_swift-arcadia-387709/data/pq/climate/2022/countryCode=VE/*")

In [43]:
df_VE = df_VE.withColumn('date', F.to_date('DateStr', "yyyyMMdd"))

In [44]:
df_VE.take(5)

                                                                                

[Row(StationId='VE000002417', DateStr='20220101', Element='TAVG', Value=227.0, M-Flag='H', Q-Flag=None, S-Flag='S', Obs-Time=None, date=datetime.date(2022, 1, 1)),
 Row(StationId='VE000002417', DateStr='20220515', Element='TAVG', Value=264.0, M-Flag='H', Q-Flag=None, S-Flag='S', Obs-Time=None, date=datetime.date(2022, 5, 15)),
 Row(StationId='VE000005484', DateStr='20220101', Element='TMIN', Value=234.0, M-Flag=None, Q-Flag=None, S-Flag='S', Obs-Time=None, date=datetime.date(2022, 1, 1)),
 Row(StationId='VE000005484', DateStr='20220515', Element='TMIN', Value=248.0, M-Flag=None, Q-Flag=None, S-Flag='S', Obs-Time=None, date=datetime.date(2022, 5, 15)),
 Row(StationId='VE000005484', DateStr='20220101', Element='PRCP', Value=0.0, M-Flag=None, Q-Flag=None, S-Flag='S', Obs-Time=None, date=datetime.date(2022, 1, 1))]

In [21]:
df_VE.createOrReplaceTempView("VE")

In [22]:
df = spark.sql("""
select Element, count(*) from VE
group by Element
""")

In [25]:
df.take(20)

                                                                                

[Row(Element='TMIN', count(1)=228),
 Row(Element='TMAX', count(1)=15),
 Row(Element='TAVG', count(1)=3868),
 Row(Element='PRCP', count(1)=241)]

In [26]:
df_all = spark.read.parquet("gs://noaa_ghcn_data_lake_swift-arcadia-387709/data/pq/climate/2022/*")

                                                                                

In [27]:
df_all = df_all.withColumn('countryCode', F.substring("StationId", 0, 2))

In [28]:
df_all.createOrReplaceTempView("noaa_all")

In [31]:
count_data = spark.sql("""
select Element, countryCode, count(*) from noaa_all as num_records
group by Element, countryCode
order by 3 desc
""")

In [33]:
count_data.take(100)

                                                                                

[Row(Element='PRCP', countryCode='US', count(1)=7789214),
 Row(Element='SNOW', countryCode='US', count(1)=5199590),
 Row(Element='TMAX', countryCode='US', count(1)=2656927),
 Row(Element='TMIN', countryCode='US', count(1)=2653396),
 Row(Element='SNWD', countryCode='US', count(1)=2461821),
 Row(Element='TOBS', countryCode='US', count(1)=1616423),
 Row(Element='PRCP', countryCode='AS', count(1)=1391158),
 Row(Element='TAVG', countryCode='US', count(1)=833241),
 Row(Element='WESD', countryCode='US', count(1)=528067),
 Row(Element='PRCP', countryCode='CA', count(1)=484711),
 Row(Element='AWND', countryCode='US', count(1)=418447),
 Row(Element='WSF2', countryCode='US', count(1)=394645),
 Row(Element='WDF2', countryCode='US', count(1)=394547),
 Row(Element='WSF5', countryCode='US', count(1)=382282),
 Row(Element='WDF5', countryCode='US', count(1)=381795),
 Row(Element='TMAX', countryCode='CA', count(1)=335544),
 Row(Element='TMIN', countryCode='CA', count(1)=335122),
 Row(Element='WESF', cou