In [1]:
!sudo apt install -y openjdk-8-jre-headless
!pip install pyspark

Reading package lists... Done
Building dependency tree       
Reading state information... Done
openjdk-8-jre-headless is already the newest version (8u275-b01-0ubuntu1~20.04).
0 upgraded, 0 newly installed, 0 to remove and 9 not upgraded.


In [2]:
from pyspark.sql.session import SparkSession
# getOrCreate() returns the session already running in this kernel, or starts a new one.
spark = SparkSession.builder.getOrCreate()
# Bare last expression: display the session object as the cell output.
spark

In [3]:
# Read every file matched by the glob as a single RDD with one element per text line.
rdd = spark.sparkContext.textFile('/home/ubuntu/raw/*')
# Peek at the first two raw lines to see the whitespace-delimited layout.
rdd.take(2)

['23583 20200101 0005 20191231 1505      3 -158.61   59.28   -17.5     0.0     30 0   -21.8 C 0    60 0 -99.000 -9999.0  1015 0   1.59 0',
 '23583 20200101 0010 20191231 1510      3 -158.61   59.28   -17.5     0.0     35 0   -22.1 C 0    60 0 -99.000 -9999.0  1015 0   1.35 0']

In [4]:
# Total number of lines read across all matched files.
rdd.count()

188256

In [7]:
from datetime import datetime, timezone
from pyspark.sql import Row

def parse_line(line):
    """Parse one whitespace-delimited record into a Row.

    Args:
        line: A raw text line. Field 0 is the wbanno identifier, fields 1 and 2
            are a date (YYYYMMDD) and time (HHMM), and field 8 is the temperature.

    Returns:
        Row(timestamp, wbanno, temperature) where ``timestamp`` is a
        timezone-aware UTC datetime and ``temperature`` is a float, or ``None``
        when the field holds the -9999 missing-value sentinel.

    Raises:
        IndexError: If the line has fewer than nine fields.
        ValueError: If the date/time or temperature field cannot be parsed.
    """
    fields = line.split()
    wbanno = fields[0]
    # The first date/time pair is interpreted as UTC and made timezone-aware.
    stamp = datetime.strptime(fields[1] + fields[2], '%Y%m%d%H%M')
    stamp = stamp.replace(tzinfo=timezone.utc)
    # Compare numerically so that '-9999', '-9999.0' and '-9999.00' are all
    # recognised as missing instead of leaking into the data as real readings.
    reading = float(fields[8])
    temperature = None if reading == -9999.0 else reading
    return Row(timestamp=stamp, wbanno=wbanno, temperature=temperature)

# Lazily apply the parser to every line; take(2) triggers evaluation of just enough
# data to show two parsed rows.
rows = rdd.map(parse_line)
rows.take(2)

[Row(timestamp=datetime.datetime(2020, 1, 1, 0, 5, tzinfo=datetime.timezone.utc), wbanno='23583', temperature=-17.5),
 Row(timestamp=datetime.datetime(2020, 1, 1, 0, 10, tzinfo=datetime.timezone.utc), wbanno='23583', temperature=-17.5)]

In [8]:
# Use UTC as the SQL session time zone so timestamps are shown and compared in UTC,
# matching the tz-aware UTC datetimes produced by parse_line.
spark.conf.set("spark.sql.session.timeZone", 'UTC')

In [9]:
# Convert the parsed Row RDD built above into a DataFrame; the schema is inferred
# from the Row fields.
df = rows.toDF()
df

DataFrame[timestamp: timestamp, wbanno: string, temperature: double]

In [10]:
# Print the first two rows as a text table.
df.show(2)

+-------------------+------+-----------+
|          timestamp|wbanno|temperature|
+-------------------+------+-----------+
|2020-01-01 00:05:00| 23583|      -17.5|
|2020-01-01 00:10:00| 23583|      -17.5|
+-------------------+------+-----------+
only showing top 2 rows



In [11]:
# Summary statistics (count, mean, stddev, min, max) for the string and numeric columns.
# The temperature count is lower than the row count because missing readings are null.
df.describe().show()

+-------+------------------+------------------+
|summary|            wbanno|       temperature|
+-------+------------------+------------------+
|  count|            188256|            188052|
|   mean|           25119.5|1.5413896156381783|
| stddev|1536.5040808954864| 11.66942920389126|
|    min|             23583|             -32.0|
|    max|             26656|              24.8|
+-------+------------------+------------------+



In [12]:
# Register the DataFrame under the name 'uscrn' so it can be queried with spark.sql().
df.createOrReplaceTempView('uscrn')

In [13]:
# Per wbanno: the lowest and highest temperature and the timestamp of each.
# min_by/max_by return the timestamp taken from the row holding the extreme temperature.
# Grouping by column name (not ordinal position) survives reordering of the select
# list, and ORDER BY makes the displayed row order deterministic.
query = '''
SELECT
  wbanno,
  min_by(timestamp, temperature) AS timestamp_min,
  min(temperature) AS t_min,
  max_by(timestamp, temperature) AS timestamp_max,
  max(temperature) AS t_max
FROM
  uscrn
GROUP BY
  wbanno
ORDER BY
  wbanno
'''
spark.sql(query).show()

+------+-------------------+-----+-------------------+-----+
|wbanno|      timestamp_min|t_min|      timestamp_max|t_max|
+------+-------------------+-----+-------------------+-----+
| 23583|2020-02-01 16:15:00|-32.0|2020-08-17 00:20:00| 24.8|
| 26656|2020-02-09 15:15:00|-30.8|2020-05-30 23:05:00| 23.3|
+------+-------------------+-----+-------------------+-----+



In [14]:
# Persist the DataFrame in Spark's default data source format.
# mode='overwrite' keeps this cell re-runnable: the default save mode raises an error
# when the target path already exists, which breaks Restart & Run All.
df.write.save('./uscrn-parquet', mode='overwrite')

In [15]:
# List the files Spark wrote into the output directory.
!ls ./uscrn-parquet

_SUCCESS
part-00000-5cb3cd43-c795-49ce-9d46-ead6520135d2-c000.snappy.parquet
part-00001-5cb3cd43-c795-49ce-9d46-ead6520135d2-c000.snappy.parquet


In [16]:
# Reload the saved dataset from disk (df now refers to the on-disk copy).
df = spark.read.load('./uscrn-parquet')
# Mean temperature per wbanno.
df.groupBy('wbanno').avg('temperature').show()

+------+------------------+
|wbanno|  avg(temperature)|
+------+------------------+
| 23583|2.5634793606257302|
| 26656|0.5178639846742709|
+------+------------------+



In [19]:
# Spark DataFrame

from pyspark.sql.session import SparkSession
# getOrCreate() hands back the already-running session if there is one.
spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.session.timeZone", 'UTC')

df = spark.read.load('./uscrn-parquet')
# avg() with no arguments averages every numeric column (only temperature here);
# toPandas() then collects the whole grouped result to the driver as a pandas DataFrame.
df1 = df.groupBy('timestamp').avg().toPandas()
# The two timestamps with the highest mean temperature.
df1.sort_values(by='avg(temperature)', ascending=False).head(2)

Unnamed: 0,timestamp,avg(temperature)
42528,2020-07-17 00:50:00,22.9
7317,2020-08-17 02:55:00,22.7


In [17]:
# Pandas DataFrame

In [18]:
!pip install pyarrow



In [19]:
import pandas as pd
# Read the whole parquet directory written by Spark into a single pandas DataFrame.
df = pd.read_parquet('./uscrn-parquet')

# numeric_only=True skips the string wbanno column. pandas >= 2.0 otherwise raises
# TypeError when asked to average it (older releases dropped it silently).
df1 = df.groupby('timestamp').mean(numeric_only=True)
# The two timestamps with the highest mean temperature.
df1.sort_values(by='temperature', ascending=False).head(2)

Unnamed: 0_level_0,temperature
timestamp,Unnamed: 1_level_1
2020-07-17 00:50:00,22.9
2020-08-17 02:55:00,22.7


In [23]:
# CSV export

In [20]:
from pyspark.sql.session import SparkSession
# Reuse the running session (or start one) and keep timestamp handling in UTC.
spark = SparkSession.builder.getOrCreate()
spark.conf.set("spark.sql.session.timeZone", 'UTC')

df = spark.read.load('./uscrn-parquet')
# Half-open interval: from 2020-01-01 inclusive up to, but excluding, 2020-04-01.
df1 = df.where("timestamp >= '2020-01-01' AND timestamp < '2020-04-01'")
# Number of rows that fall inside the interval.
df1.count()

52414

In [21]:
# coalesce(1) funnels the data into one partition so a single CSV part file is written.
# mode='overwrite' keeps this cell re-runnable: the default save mode raises an error
# when './export' already exists.
df1.coalesce(1).write.save('./export', format='csv', header=True, mode='overwrite')

In [22]:
# List the export directory to confirm the CSV part file was written.
!ls ./export

_SUCCESS  part-00000-b31b685e-f733-4007-b11c-b8a9ea6b2ee8-c000.csv


In [23]:
# Copy the exported CSV part file(s) out of the export directory.
# NOTE(review): '~/Home/Desktop/' is an unusual location — confirm it exists on the target machine.
!cp ./export/*.csv ~/Home/Desktop/