### Notebook for preparing/gathering the weather datasets

**Original Author:** Ema Vargova.<br/>
**Additional Info:** Working with Open Weather Map API - One Call API for historical data. https://openweathermap.org/api/one-call-api#history<br/>
**Last Modified:** By Ema Vargova on the 03.07.2021

In [2]:
# Import the findspark module 
import findspark

# Initialize via the full spark path
findspark.init("/usr/local/spark/")

# Import the SparkSession module
from pyspark.sql import SparkSession

# Import the collections module
import collections

from pyspark import SparkContext
from pyspark.sql import functions as F
from urllib.request import Request, urlopen
from datetime import date, timezone, datetime, timedelta

# Configure the session step by step, then fetch the already-running
# SparkSession if one exists or start a new one with these options.
_session_builder = SparkSession.builder
_session_builder = _session_builder.master("local[8]")  # local mode, 8 worker threads
_session_builder = _session_builder.appName("weather-api")
_session_builder = _session_builder.config("spark.executor.memory", "1gb")
spark = _session_builder.getOrCreate()

# The SparkContext behind the session; used below to turn each raw API
# response into an RDD.
sc = spark.sparkContext

# Locations to query, one row per place: (name, latitude, longitude).
# Coordinates are kept as strings because they are concatenated directly
# into the API request URL (one API call per location).
# NOTE(review): the "Dallas" coordinates look like Dallas, Oregon rather than
# Dallas, Texas - confirm which city is intended.
_locations = [
    ("London", "51.509865", "-0.118092"),
    ("Dublin", "53.350140", "-6.266155"),
    ("Belfast", "54.607868", "-5.926437"),
    ("Manchester", "53.483959", "-2.244644"),
    ("Liverpool", "53.400002", "-2.983333"),
    ("Miami", "25.761681", "-80.191788"),
    ("LA", "34.052235", "-118.243683"),
    ("Dallas", "44.919285", "-123.317047"),
]

# Index-aligned lists derived from the table above: name, latitude and
# longitude of location i are cities[i], lat[i] and lon[i].
cities = [name for name, _, _ in _locations]
lat = [latitude for _, latitude, _ in _locations]
lon = [longitude for _, _, longitude in _locations]

# Query parameters shared by every request.
units = "metric"
# NOTE(review): the API key is committed in source - consider loading it from
# the environment or a secrets store instead.
appid = "709b1913c287bae0885131e57405bacc"

## How many days to subtract from today.
## The maximum is 5, since the API plan only serves historical data from the
## last 5 days.
days = 2  # 2 because the code was run on 03.07.2021 and we need the weather data from 01.07.2021

# Today's date at 00:00:00 as a naive (timezone-less) datetime.
_midnight = datetime.min.time()
d = datetime.combine(date.today(), _midnight)

# The requested day is identical for every city, so compute it once up front:
# subtract `days` from today's midnight (this gave 01.07.2021 for the original run).
dt = d - timedelta(days=days)
# Unix timestamp (whole seconds since 1970-01-01) as a string for the URL.
# `dt` is naive and is measured against a naive epoch, i.e. it is treated as UTC.
UTX = str(round((dt - datetime(1970, 1, 1)).total_seconds()))

# The three location lists are index-aligned; fail before any request is made
# if they ever drift apart, instead of silently pairing the wrong coordinates.
if not len(cities) == len(lat) == len(lon):
    raise ValueError("cities, lat and lon must all have the same length")

# One API call and one CSV export per location.
for city, city_lat, city_lon in zip(cities, lat, lon):
    # Request URL for this city's historical hourly data.
    # NOTE(review): the request goes over plain http with the API key in the
    # query string - consider switching to https.
    call = (
        "http://api.openweathermap.org/data/2.5/onecall/timemachine"
        "?lat={}&lon={}&units={}&dt={}&appid={}"
    ).format(city_lat, city_lon, units, UTX, appid)

    # Download the JSON payload; the context manager closes the connection
    # even if reading fails, and the timeout keeps a stalled request from
    # hanging the notebook indefinitely.
    with urlopen(call, timeout=30) as http_response:
        httpData = http_response.read().decode('utf-8')

    # Wrap the single JSON document in an RDD so Spark can infer its schema.
    rdd = sc.parallelize([httpData])
    response = spark.read.json(rdd)

    # Flatten to a 2-dimensional dataframe: one row per entry of the `hourly`
    # array, then keep only the non-nested fields.
    hourly = response.withColumn('results', F.explode('hourly')).select('lat','lon','timezone','timezone_offset','results.*')
    hourly = hourly.select('lat','lon','timezone','timezone_offset','clouds','dew_point','dt','feels_like','humidity','pressure','temp','visibility','wind_deg','wind_speed')

    # Output folder per city, e.g. "<yymmdd>_weather_<city>".
    folder_name = dt.strftime("%y%m%d") + "_weather_" + city
    file_path = "weather_data/" + folder_name

    # coalesce(1) collapses the data to a single partition so each city ends
    # up in one CSV file; an existing folder is overwritten.
    hourly.coalesce(1).write.mode('overwrite').save(file_path, format="csv", header='true')