In [25]:
import plotly.express as px
import plotly.graph_objects as go

from pyspark.sql import SparkSession
from v_time import timeit

import constants as c

# Create the Spark session used by every cell below, or reuse one that is
# already running under the "weather" application name.
session_builder = SparkSession.builder.appName("weather")
spark = session_builder.getOrCreate()

# 1. Load data

In [2]:
# Wrap the CSV reader with timeit so the elapsed time of the call is printed.
# header=True takes the column names from the first line of the file.
timed_read_csv = timeit(spark.read.csv)
sdf = timed_read_csv(c.URI_CSV, header=True)

timeit: 'csv' in 3.68 s


<div class="alert alert-info" role="alert">
    Spark is lazy: it does no work until you ask for something that requires an action on the data. So reading a CSV only reads the schema, not the rows themselves.
</div>

# 2. Explore data

In [3]:
# Print the first rows as a text table; n=20 and truncate=True are the
# DataFrame.show defaults, written out here for the reader.
sdf.show(n=20, truncate=True)

+--------------+------------+-------------+--------------------+--------------------+-------------+----------+---------+
|            ID|CODI_ESTACIO|CODI_VARIABLE|        DATA_LECTURA|         DATA_EXTREM|VALOR_LECTURA|CODI_ESTAT|CODI_BASE|
+--------------+------------+-------------+--------------------+--------------------+-------------+----------+---------+
|WX422901150200|          WX|           42|29/01/2015 02:00:...|29/01/2015 02:00:...|          9.5|         V|       SH|
|WX442901150200|          WX|           44|29/01/2015 02:00:...|29/01/2015 02:00:...|           49|         V|       SH|
|WX462901150200|          WX|           46|29/01/2015 02:00:...|                null|          4.4|         V|       SH|
|WX472901150200|          WX|           47|29/01/2015 02:00:...|                null|          258|         V|       SH|
|WX562901150200|          WX|           56|29/01/2015 02:00:...|29/01/2015 02:00:...|          8.6|         V|       SH|
|WX572901150200|          WX|   

In [4]:
# Restrict to five rows on the Spark side before converting to pandas, so only
# that small sample is collected and displayed as the cell output.
preview_sdf = sdf.limit(5)
preview_sdf.toPandas()

Unnamed: 0,ID,CODI_ESTACIO,CODI_VARIABLE,DATA_LECTURA,DATA_EXTREM,VALOR_LECTURA,CODI_ESTAT,CODI_BASE
0,WX422901150200,WX,42,29/01/2015 02:00:00 PM,29/01/2015 02:00:00 PM,9.5,V,SH
1,WX442901150200,WX,44,29/01/2015 02:00:00 PM,29/01/2015 02:00:00 PM,49.0,V,SH
2,WX462901150200,WX,46,29/01/2015 02:00:00 PM,,4.4,V,SH
3,WX472901150200,WX,47,29/01/2015 02:00:00 PM,,258.0,V,SH
4,WX562901150200,WX,56,29/01/2015 02:00:00 PM,29/01/2015 02:00:00 PM,8.6,V,SH


In [5]:
# Disabled cell: the same distinct-station count is run further down, after
# sdf has been reloaded from the Parquet copy.
# NOTE(review): presumably commented out because it is slow against the raw
# CSV — confirm, and delete this cell if it is no longer needed.
# timeit(sdf.select("CODI_ESTACIO").distinct().count)()

# Using Parquet
Reading from a CSV is not very efficient. Let's convert the data to a more suitable file format.

In [6]:
# Export the whole DataFrame as Parquet, replacing any previous export at
# c.URI_DELTA. The save call is wrapped in timeit so its duration is printed.
parquet_writer = sdf.write.format("parquet").mode("overwrite")
timeit(parquet_writer.save)(c.URI_DELTA)

timeit: 'save' in 17.47 min


After exporting, it is better to read the data back from this more efficient format.

In [7]:
# Rebind sdf to the Parquet export so the following cells read from it
# instead of the original CSV.
parquet_reader = spark.read.format("parquet")
sdf = parquet_reader.load(c.URI_DELTA)

In [8]:
# Count the distinct CODI_ESTACIO values; timeit prints how long the count
# action took and the count itself is shown as the cell output.
distinct_stations_sdf = sdf.select("CODI_ESTACIO").distinct()
timeit(distinct_stations_sdf.count)()

timeit: 'count' in 39.15 s


212

In [31]:
# Aggregate on the Spark side (one row per CODI_ESTACIO with its row count),
# then bring the small result to pandas and order it by ascending count.
station_counts_sdf = sdf.groupby("CODI_ESTACIO").count()
df = station_counts_sdf.toPandas().sort_values("count")

print(f"There are {df.shape[0]} unique values")
df.head()

There are 212 unique values


Unnamed: 0,CODI_ESTACIO,count
125,U8,8302
191,YL,14047
150,YK,43971
95,V9,78980
173,KE,115589


In [32]:
# Bar chart of the row count per CODI_ESTACIO value held in df.
# A title and readable axis labels are set so the figure can be understood on
# its own when the notebook is skimmed.
fig = px.bar(
    df,
    x="CODI_ESTACIO",
    y="count",
    title="Rows per station (CODI_ESTACIO)",
    labels={"CODI_ESTACIO": "Station code", "count": "Rows"},
)
fig.show()

In [34]:
# Same aggregation as for stations, this time keyed on CODI_VARIABLE: count
# rows per value in Spark, collect to pandas, order by ascending count.
variable_counts_sdf = sdf.groupby("CODI_VARIABLE").count()
df = variable_counts_sdf.toPandas().sort_values("count")

print(f"There are {df.shape[0]} unique values")

There are 26 unique values


In [35]:
# Make the variable code a "C"-prefixed string so plotly receives a text
# label instead of a number-like value.
# The prefix is added only to values that do not already carry it, so running
# this cell more than once no longer produces "CC42"-style labels.
# NOTE(review): assumes raw CODI_VARIABLE values never start with "C" — confirm.
codes = df["CODI_VARIABLE"].astype(str)
df["CODI_VARIABLE"] = codes.where(codes.str.startswith("C"), "C" + codes)

In [36]:
# Bar chart of the row count per CODI_VARIABLE value held in df.
# A title and readable axis labels are set so the figure can be understood on
# its own when the notebook is skimmed.
fig = px.bar(
    df,
    x="CODI_VARIABLE",
    y="count",
    title="Rows per variable (CODI_VARIABLE)",
    labels={"CODI_VARIABLE": "Variable code", "count": "Rows"},
)
fig.show()

In [37]:
# Count rows for every (CODI_ESTACIO, CODI_VARIABLE) pair in Spark, then
# collect the aggregate to pandas ordered by ascending count.
pair_counts_sdf = sdf.groupby("CODI_ESTACIO", "CODI_VARIABLE").count()
df = pair_counts_sdf.toPandas().sort_values("count")

In [38]:
# Build one label per (station, variable) pair by concatenating the two code
# columns element-wise; the result is used as the x axis of the next chart.
station_codes = df["CODI_ESTACIO"]
variable_codes = df["CODI_VARIABLE"]
df["name"] = station_codes + variable_codes

In [39]:
# Bar chart of the row count per station/variable pair, using the combined
# "name" column as the category axis.
# A title and readable axis labels are set so the figure can be understood on
# its own when the notebook is skimmed.
fig = px.bar(
    df,
    x="name",
    y="count",
    title="Rows per station/variable pair",
    labels={"name": "Station + variable code", "count": "Rows"},
)
fig.show()

In [40]:
# Export sdf as Parquet partitioned by both CODI_ESTACIO and CODI_VARIABLE,
# overwriting any previous export at c.URI_DELTA_PART. The save call is
# wrapped in timeit so its duration is printed.
pair_partition_writer = (
    sdf.write
    .format("parquet")
    .mode("overwrite")
    .partitionBy(["CODI_ESTACIO", "CODI_VARIABLE"])
)
timeit(pair_partition_writer.save)(c.URI_DELTA_PART)

timeit: 'save' in 1.01 h


In [None]:
# Export sdf as Parquet partitioned by CODI_ESTACIO only, overwriting any
# previous export at the target path. The save call is wrapped in timeit so
# its duration is printed.
station_partition_writer = (
    sdf.write
    .format("parquet")
    .mode("overwrite")
    .partitionBy("CODI_ESTACIO")
)
timed_save = timeit(station_partition_writer.save)
# The target is built by plain concatenation onto c.PATH_DATA.
timed_save(f"{c.PATH_DATA}weather_data_partition2.delta")