# General Set-Up

To accomplish this task, I used a Windows machine with Pyspark set-up. And since it contains multiple questions, I would rely on Jupyter Notebook to quickly prototype a solution with some comments. 

In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import when, lit, col, expr
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import col, udf
from pyspark.sql import functions as F


spark = SparkSession.builder.appName("victor_paytm").getOrCreate()
cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
spark

# Step 1
### 1. Load the global weather data into Spark

In [2]:
partitioned =spark.read.option("header", "true").csv('data/2019/')

In [3]:
partitioned = partitioned.withColumn("PRCP",expr("substring(PRCP, 1, length(PRCP)-1)"))
partitioned = partitioned.withColumn("MAX",partitioned.MAX.substr(1, 4))
partitioned = partitioned.withColumn("MIN",partitioned.MAX.substr(1, 4))
partitioned =partitioned.withColumnRenamed("STN---", "STN")
partitioned.show(5)

+------+-----+--------+----+----+------+------+-----+----+-----+----+----+----+----+-----+------+
|   STN| WBAN|YEARMODA|TEMP|DEWP|   SLP|   STP|VISIB|WDSP|MXSPD|GUST| MAX| MIN|PRCP| SNDP|FRSHTT|
+------+-----+--------+----+----+------+------+-----+----+-----+----+----+----+----+-----+------+
|010260|99999|20190101|26.1|21.2|1001.9| 987.5| 20.6| 9.0| 15.9|29.7|29.8|29.8|0.02| 18.5|001000|
|010260|99999|20190102|24.9|22.1|1020.1|1005.5|  5.4| 5.6| 13.6|22.1|27.1|27.1|0.48| 22.8|001000|
|010260|99999|20190103|31.7|29.1|1008.9| 994.7| 13.6|11.6| 21.4|49.5|37.4|37.4|0.25|999.9|011000|
|010260|99999|20190104|32.9|30.3|1011.4| 997.1| 15.8| 4.9|  7.8|10.9|36.1|36.1|0.52|999.9|001000|
|010260|99999|20190105|35.5|33.0|1015.7|1001.4| 12.0|10.4| 13.6|21.0|38.5|38.5|0.02| 23.6|010000|
+------+-----+--------+----+----+------+------+-----+----+-----+----+----+----+----+-----+------+
only showing top 5 rows



### Then, I would handle the missing value in this case

In [4]:
def replace(column, value):
    return when(column != value, column).otherwise(lit(None))

nullDict = {'TEMP': 9999.9,
            'DEWP': 9999.9,
            'SLP': 9999.9,
            'STP': 9999.9,
            'VISIB': 999.9,
            'WDSP': 999.9,
            'MXSPD': 999.9,
            'GUST':999.9,
            'MAX':9999.9,
            'MIN':9999.9,
            'PRCP':99.9,
            'SNDP':999.9}

for key, val in nullDict.items():
    partitioned = partitioned.withColumn(key,replace(col(key), val))

In [5]:
partitioned.show(5)

+------+-----+--------+----+----+------+------+-----+----+-----+----+----+----+----+----+------+
|   STN| WBAN|YEARMODA|TEMP|DEWP|   SLP|   STP|VISIB|WDSP|MXSPD|GUST| MAX| MIN|PRCP|SNDP|FRSHTT|
+------+-----+--------+----+----+------+------+-----+----+-----+----+----+----+----+----+------+
|010260|99999|20190101|26.1|21.2|1001.9| 987.5| 20.6| 9.0| 15.9|29.7|29.8|29.8|0.02|18.5|001000|
|010260|99999|20190102|24.9|22.1|1020.1|1005.5|  5.4| 5.6| 13.6|22.1|27.1|27.1|0.48|22.8|001000|
|010260|99999|20190103|31.7|29.1|1008.9| 994.7| 13.6|11.6| 21.4|49.5|37.4|37.4|0.25|null|011000|
|010260|99999|20190104|32.9|30.3|1011.4| 997.1| 15.8| 4.9|  7.8|10.9|36.1|36.1|0.52|null|001000|
|010260|99999|20190105|35.5|33.0|1015.7|1001.4| 12.0|10.4| 13.6|21.0|38.5|38.5|0.02|23.6|010000|
+------+-----+--------+----+----+------+------+-----+----+-----+----+----+----+----+----+------+
only showing top 5 rows



###  2. Join the stationlist.csv with the countrylist.csv to get the full country name for each station number.

In [6]:
stationlist = spark.read.option("header", "true").csv('stationlist.csv')
stationlist.show(5)

+------+------------+
|STN_NO|COUNTRY_ABBR|
+------+------------+
|012240|          NO|
|020690|          SW|
|020870|          SW|
|021190|          SW|
|032690|          UK|
+------+------------+
only showing top 5 rows



In [7]:
countrylist = spark.read.option("header", "true").csv('countrylist.csv')
countrylist.show(5)

+------------+-------------------+
|COUNTRY_ABBR|       COUNTRY_FULL|
+------------+-------------------+
|          AA|              ARUBA|
|          AC|ANTIGUA AND BARBUDA|
|          AF|        AFGHANISTAN|
|          AG|            ALGERIA|
|          AI|   ASCENSION ISLAND|
+------------+-------------------+
only showing top 5 rows



In [8]:
stationWithCountry = stationlist.join(countrylist,stationlist.COUNTRY_ABBR ==  countrylist.COUNTRY_ABBR,"left")

### 3. Join the global weather data with the full country names by station number

In [9]:
globWeatherCountry = partitioned.join(stationWithCountry, partitioned.STN == stationWithCountry.STN_NO,"left")
globWeatherCountry.show(5)

+------+-----+--------+----+----+------+------+-----+----+-----+----+----+----+----+----+------+------+------------+------------+------------+
|   STN| WBAN|YEARMODA|TEMP|DEWP|   SLP|   STP|VISIB|WDSP|MXSPD|GUST| MAX| MIN|PRCP|SNDP|FRSHTT|STN_NO|COUNTRY_ABBR|COUNTRY_ABBR|COUNTRY_FULL|
+------+-----+--------+----+----+------+------+-----+----+-----+----+----+----+----+----+------+------+------------+------------+------------+
|010260|99999|20190101|26.1|21.2|1001.9| 987.5| 20.6| 9.0| 15.9|29.7|29.8|29.8|0.02|18.5|001000|010260|          NO|          NO|      NORWAY|
|010260|99999|20190102|24.9|22.1|1020.1|1005.5|  5.4| 5.6| 13.6|22.1|27.1|27.1|0.48|22.8|001000|010260|          NO|          NO|      NORWAY|
|010260|99999|20190103|31.7|29.1|1008.9| 994.7| 13.6|11.6| 21.4|49.5|37.4|37.4|0.25|null|011000|010260|          NO|          NO|      NORWAY|
|010260|99999|20190104|32.9|30.3|1011.4| 997.1| 15.8| 4.9|  7.8|10.9|36.1|36.1|0.52|null|001000|010260|          NO|          NO|      NORWAY|