In [7]:
import warnings
warnings.filterwarnings("ignore")

In [8]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col
import pyspark.sql.functions as F
from pyspark.sql.functions import sum, mean

spark = SparkSession.builder.getOrCreate()
spark.conf.set('spark.sql.execution.arrow.pyspark.enabled', True)
spark.conf.set('spark.sql.repl.eagerEval.enabled', True)


In [9]:
# Make schema and create a spark data frame to read weather data in
columns = ["STATION","NAME","LATITUDE","LONGITUDE","ELEVATION","DATE",
    "PRCP","SNOW","SNWD","TAVG","TMAX","TMIN","TOBS"]
ints = ('TAVG', 'TMAX', 'TMIN', 'TOBS')
doubles = ('PRCP','SNOW', 'SNWD','LATITUDE', 'LONGITUDE', 'ELEVATION')
strings = ('STATION','NAME')
dtimes = ('DATE',)
dtypes = {column: IntegerType() for column in ints}
dtypes.update({column: DoubleType() for column in doubles})
dtypes.update({column: StringType() for column in strings})
dtypes.update({column: DateType() for column in dtimes})

schema = StructType()
for column in columns:
    schema.add(column, 
               dtypes[column],
               True )

wsdf = spark.read.csv('../raw_data/weather.csv', header=True)
wsdf.printSchema()

root
 |-- STATION: string (nullable = true)
 |-- NAME: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)
 |-- ELEVATION: string (nullable = true)
 |-- DATE: string (nullable = true)
 |-- PRCP: string (nullable = true)
 |-- SNOW: string (nullable = true)
 |-- SNWD: string (nullable = true)
 |-- TAVG: string (nullable = true)
 |-- TMAX: string (nullable = true)
 |-- TMIN: string (nullable = true)
 |-- TOBS: string (nullable = true)



In [10]:
# remove irrelevant attributes

wsdf = wsdf.withColumn("month", F.month("DATE")) \
    .withColumn("day", F.dayofmonth("DATE")) 
wsdf = wsdf.drop("DATE",'LATITUDE', 'LONGITUDE', 'ELEVATION','STATION','NAME')
# Handling missing data separately
data_TAVG = wsdf.na.drop(subset=["TAVG"])           
data_PRCP = wsdf.na.drop(subset=["PRCP"])           
data_TMAX = wsdf.na.drop(subset=["TMAX"])           
data_TMIN = wsdf.na.drop(subset=["TMIN"])           
data_SNOW = wsdf.na.drop(subset=["SNOW"])
data_SNWD = wsdf.na.drop(subset=["SNWD"])
# Aggregate the results
data_TAVG = data_TAVG.groupBy("month","day").agg(mean("TAVG").alias("TAVG")) \
                    .dropna() \
                    .sort(data_TAVG.month.asc(), data_TAVG.day.asc()) \
                    .toPandas()

data_TMIN = data_TMIN.groupBy("month","day").agg(mean("TMIN").alias("TMIN")) \
                    .sort(data_TMIN.month.asc(), data_TMIN.day.asc()) \
                    .toPandas()["TMIN"]
data_TMAX = data_TMAX.groupBy("month","day").agg(mean("TMAX").alias("TMAX")) \
                    .sort(data_TMAX.month.asc(), data_TMAX.day.asc()) \
                    .toPandas()["TMAX"]
data_PRCP = data_PRCP.groupBy("month","day").agg(mean("PRCP").alias("PRCP")) \
                    .sort(data_PRCP.month.asc(), data_PRCP.day.asc()) \
                    .toPandas()["PRCP"]
data_SNOW = data_SNOW.groupBy("month","day").agg(mean("SNOW").alias("SNOW")) \
                    .sort(data_SNOW.month.asc(), data_SNOW.day.asc()) \
                    .toPandas()["SNOW"]
data_SNWD = data_SNWD.groupBy("month","day").agg(mean("SNWD").alias("SNWD")) \
                    .sort(data_SNWD.month.asc(), data_SNWD.day.asc()) \
                    .toPandas()["SNWD"]

data = data_TAVG
data["TMIN"] = data_TMIN
data["TMAX"] = data_TMAX
data["PRCP"] = data_PRCP
data["SNOW"] = data_SNOW
data["SNWD"] = data_SNWD
data.tail()




Unnamed: 0,month,day,TAVG,TMIN,TMAX,PRCP,SNOW,SNWD
360,12,27,48.333333,37.538462,49.461538,0.000137,0.0,0.0
361,12,28,47.666667,36.928571,51.928571,0.000658,0.0,0.0
362,12,29,41.666667,31.714286,46.142857,0.047922,0.0,0.0
363,12,30,41.666667,35.357143,43.357143,0.672405,0.0,0.0
364,12,31,40.0,37.0,43.428571,0.395696,0.0,0.0


In [11]:
# save the results
data.to_csv('../data/weather_p.csv')