# Warm-up 

In [1]:
from datetime import datetime

from pyspark.sql.types import StructType, StructField, FloatType, LongType, StringType

# Build the DataFrame schema from features.txt, which lists one column name
# per line: line 0 becomes a long column, line 1 a string column, and every
# remaining line a float column. All columns are nullable.
feats = []
# The `with` block closes the file handle as soon as the names are read,
# even if an exception is raised part-way through.
with open('features.txt') as f:
    for line_num, line in enumerate(f):
        column_name = line.strip()
        if line_num == 0:
            # Timestamp
            feats.append(StructField(column_name, LongType(), True))
        elif line_num == 1:
            # Geohash
            feats.append(StructField(column_name, StringType(), True))
        else:
            # Other features
            feats.append(StructField(column_name, FloatType(), True))

schema = StructType(feats)

print(schema)

StructType(List(StructField(Timestamp,LongType,true),StructField(Geohash,StringType,true),StructField(geopotential_height_lltw,FloatType,true),StructField(water_equiv_of_accum_snow_depth_surface,FloatType,true),StructField(drag_coefficient_surface,FloatType,true),StructField(sensible_heat_net_flux_surface,FloatType,true),StructField(categorical_ice_pellets_yes1_no0_surface,FloatType,true),StructField(visibility_surface,FloatType,true),StructField(number_of_soil_layers_in_root_zone_surface,FloatType,true),StructField(categorical_freezing_rain_yes1_no0_surface,FloatType,true),StructField(pressure_reduced_to_msl_msl,FloatType,true),StructField(upward_short_wave_rad_flux_surface,FloatType,true),StructField(relative_humidity_zerodegc_isotherm,FloatType,true),StructField(categorical_snow_yes1_no0_surface,FloatType,true),StructField(u-component_of_wind_tropopause,FloatType,true),StructField(surface_wind_gust_surface,FloatType,true),StructField(total_cloud_cover_entire_atmosphere,FloatType,tru

In [2]:
# Alternative local copy of the same dataset, kept for reference:
#   /Volumes/evo/Datasets/NAM_2015_S/*
# Configure a tab-separated CSV reader that applies the explicit schema,
# then point it at the HDFS dataset directory.
tsv_reader = spark.read.format('csv').option('sep', '\t').schema(schema)
df = tsv_reader.load('hdfs://orion11:20910/datasets/*')

# Show a single row as a sanity check that the schema lines up with the data.
df.take(1)

[Row(Timestamp=1430438400000, Geohash='dndf9tz5r8eb', geopotential_height_lltw=1915.593994140625, water_equiv_of_accum_snow_depth_surface=0.0, drag_coefficient_surface=0.0, sensible_heat_net_flux_surface=-12.571273803710938, categorical_ice_pellets_yes1_no0_surface=0.0, visibility_surface=24220.529296875, number_of_soil_layers_in_root_zone_surface=3.0, categorical_freezing_rain_yes1_no0_surface=0.0, pressure_reduced_to_msl_msl=101235.0, upward_short_wave_rad_flux_surface=4.25, relative_humidity_zerodegc_isotherm=95.0, categorical_snow_yes1_no0_surface=0.0, u-component_of_wind_tropopause=20.28228759765625, surface_wind_gust_surface=3.9325132369995117, total_cloud_cover_entire_atmosphere=98.0, upward_long_wave_rad_flux_surface=371.25927734375, land_cover_land1_sea0_surface=1.0, vegitation_type_as_in_sib_surface=10.0, v-component_of_wind_pblri=-3.47259521484375, albedo_surface=17.25, lightning_surface=0.0, ice_cover_ice1_no_ice0_surface=0.0, convective_inhibition_surface=-12.582763671875,

In [3]:
# Time the summary-statistics job (count, mean, stddev, min, max) for one column.
started_at = datetime.now()

df.describe('precipitable_water_entire_atmosphere').show()

# total_seconds() covers the whole duration; timedelta.seconds is only the
# seconds component and silently drops any full days.
elapsed_seconds = int((datetime.now() - started_at).total_seconds())
print("Finished. it's been " + str(elapsed_seconds) + " seconds")

+-------+------------------------------------+
|summary|precipitable_water_entire_atmosphere|
+-------+------------------------------------+
|  count|                           323759844|
|   mean|                  22.161301063122153|
| stddev|                  14.066149350632585|
|    min|                          0.36906433|
|    max|                            99.31735|
+-------+------------------------------------+

Finished. it's been 194 seconds


## Q1: Unknown Feature: Choose a feature from the data dictionary above that you have never heard of before. Inspect some of the values for the feature (such as its average, min, max, etc.) and try to guess what it measures. Was your hypothesis correct? (Note: if you are a professional meteorologist, you can skip this question ;-))



As shown, the maximum precipitable water value (99.31735) is roughly 4.5 times the mean (22.16). Google says:
The amount of precipitation that falls around the world may range from less than 0.1 inch per year in some deserts to more than 900 inches per year in the tropics.
Compared with those figures, the amount of water measured by this feature is really small.

## Q2: Hot hot hot: When and where was the hottest temperature observed in the dataset? Is it an anomaly?

In [3]:
# Summarise both temperature features. The timer is started once, so each
# message reports the time elapsed since the start of the cell (cumulative).
started_at = datetime.now()

for column_name in ('temperature_surface', 'temperature_tropopause'):
    df.describe(column_name).show()

    # total_seconds() covers the whole duration; timedelta.seconds is only
    # the seconds component and silently drops any full days.
    elapsed_seconds = int((datetime.now() - started_at).total_seconds())
    print("Finished. it's been " + str(elapsed_seconds) + " seconds")


+-------+-------------------+
|summary|temperature_surface|
+-------+-------------------+
|  count|          323759844|
|   mean|  287.8572096833751|
| stddev| 13.716833944537786|
|    min|          218.99284|
|    max|          331.39062|
+-------+-------------------+

Finished. it's been 299 seconds
+-------+----------------------+
|summary|temperature_tropopause|
+-------+----------------------+
|  count|             323759844|
|   mean|     209.9530500232586|
| stddev|      9.63668344469262|
|    min|             180.93707|
|    max|             261.67834|
+-------+----------------------+

Finished. it's been 474 seconds


Two features relate to temperature. I will analyze the surface temperature first, then do the same analysis for the tropopause temperature.

In [4]:
# Keep only rows just below the maximum surface temperature reported by
# describe() (331.39062), then pull at most ten of them to the driver.
near_max_surface = df.where(df['temperature_surface'] > 331.3)
surf_top_list = near_max_surface.take(10)

for row in surf_top_list:
    print(row)

Row(Timestamp=1440266400000, Geohash='d5dpds10m55b', geopotential_height_lltw=3781.90185546875, water_equiv_of_accum_snow_depth_surface=0.0, drag_coefficient_surface=0.0, sensible_heat_net_flux_surface=557.9180908203125, categorical_ice_pellets_yes1_no0_surface=0.0, visibility_surface=24224.66015625, number_of_soil_layers_in_root_zone_surface=4.0, categorical_freezing_rain_yes1_no0_surface=0.0, pressure_reduced_to_msl_msl=101332.0, upward_short_wave_rad_flux_surface=111.56889343261719, relative_humidity_zerodegc_isotherm=12.0, categorical_snow_yes1_no0_surface=0.0, u-component_of_wind_tropopause=-2.6672048568725586, surface_wind_gust_surface=2.5530638694763184, total_cloud_cover_entire_atmosphere=12.0, upward_long_wave_rad_flux_surface=682.604736328125, land_cover_land1_sea0_surface=1.0, vegitation_type_as_in_sib_surface=2.0, v-component_of_wind_pblri=0.5142669677734375, albedo_surface=11.0, lightning_surface=0.0, ice_cover_ice1_no_ice0_surface=0.0, convective_inhibition_surface=0.1108

I use a tricky method to get the row containing the max temperature. 

First, use the describe method and get the max value. 

Then, use the filter to get the rows whose value is greater than (max - a_tiny_number). 

Then take the 10 elements. 

If the number of returned elements is 10, shrink a_tiny_number so that the boundary moves closer to the max, and repeat. After the last round, only the row(s) containing the max value will be left.



In [6]:
# The filtered rows are never explicitly sorted, so select the hottest row
# by value instead of assuming it sits at index 0.
max_ele = max(surf_top_list, key=lambda row: row.temperature_surface)

# Timestamp is divided by 1000 before conversion (it is stored in
# milliseconds). fromtimestamp() without a tz argument renders the result in
# the local timezone of the machine running the kernel.
print('Timestamp: ', datetime.fromtimestamp(max_ele.Timestamp/1000).isoformat())
print('Geohash: ', max_ele.Geohash)

Timestamp:  2015-08-22T11:00:00
Geohash:  d5dpds10m55b


In [7]:
# Keep only rows just below the maximum tropopause temperature reported by
# describe() (261.67834), then pull at most ten of them to the driver.
near_max_tropopause = df.where(df['temperature_tropopause'] > 261.6)
trop_top_list = near_max_tropopause.take(10)

for row in trop_top_list:
    print(row)

Row(Timestamp=1451217600000, Geohash='9sgfcsv9d3s0', geopotential_height_lltw=944.68359375, water_equiv_of_accum_snow_depth_surface=1.0, drag_coefficient_surface=0.0, sensible_heat_net_flux_surface=-5.949005126953125, categorical_ice_pellets_yes1_no0_surface=0.0, visibility_surface=24223.779296875, number_of_soil_layers_in_root_zone_surface=3.0, categorical_freezing_rain_yes1_no0_surface=0.0, pressure_reduced_to_msl_msl=102038.0, upward_short_wave_rad_flux_surface=0.0, relative_humidity_zerodegc_isotherm=81.0, categorical_snow_yes1_no0_surface=0.0, u-component_of_wind_tropopause=43.98924255371094, surface_wind_gust_surface=4.927008628845215, total_cloud_cover_entire_atmosphere=0.0, upward_long_wave_rad_flux_surface=276.87347412109375, land_cover_land1_sea0_surface=1.0, vegitation_type_as_in_sib_surface=7.0, v-component_of_wind_pblri=-3.43695068359375, albedo_surface=26.5, lightning_surface=0.0, ice_cover_ice1_no_ice0_surface=0.0, convective_inhibition_surface=-1.0068359375, pressure_su

In [8]:
import time

# Deliberately no `import datetime` here: it would rebind `datetime` from the
# class (imported in the first cell via `from datetime import datetime`) to
# the module, and every later `datetime.now()` call would then fail.

# The filtered rows are never explicitly sorted, so select the hottest row
# by value instead of assuming it sits at index 0.
max_ele = max(trop_top_list, key=lambda row: row.temperature_tropopause)

# fromtimestamp() without a tz argument renders the result in the local
# timezone of the machine running the kernel.
print('Timestamp: ', datetime.fromtimestamp(max_ele.Timestamp/1000).isoformat())
print('Geohash: ', max_ele.Geohash)

Timestamp:  2015-12-27T04:00:00
Geohash:  9sgfcsv9d3s0


There are two elements in the result list and, evidently, the larger one is at position 0.
So we have the timestamp and geohash.
Based on this observation, there is nothing special about the data: one maximum is at 2015-08-22T11:00:00 and the other is at 2015-12-27T04:00:00.
Both fall near the end of a month.

## Q3: So Snowy: Find a location that is snowy all year (there are several). Locate a nearby town/city and provide a small writeup about it. Include pictures if you’d like.

In [3]:
# Register df as the temporary view TEMP_DF so that spark.sql() queries can
# refer to it by name. An existing view with the same name is replaced.
df.createOrReplaceTempView("TEMP_DF")

In [4]:
started_at = datetime.now()

# Count, per geohash, the rows flagged as snow and sort the geohashes by that
# count in descending order. collect() brings every result row to the driver.
snow = spark.sql("SELECT Geohash, count(Geohash) FROM TEMP_DF Where categorical_snow_yes1_no0_surface=1 group by Geohash order by count(Geohash) desc").collect()

# total_seconds() covers the whole duration; timedelta.seconds is only the
# seconds component and silently drops any full days.
elapsed_seconds = int((datetime.now() - started_at).total_seconds())
print("Finished. it's been " + str(elapsed_seconds) + " seconds")


Finished. it's been 556 seconds


In [7]:
# Show the ten geohashes with the most snow-flagged rows. Slicing (rather
# than indexing 0..9) avoids an IndexError when fewer than ten rows exist.
for row in snow[:10]:
    print(row)





Row(Geohash='c43kcu3t702p', count(Geohash)=436)
Row(Geohash='c43k6uu1egxb', count(Geohash)=436)
Row(Geohash='c41uhb4r5n00', count(Geohash)=434)
Row(Geohash='c41ueb1jyypb', count(Geohash)=434)
Row(Geohash='c41v48pupf00', count(Geohash)=432)
Row(Geohash='c438x5esgf00', count(Geohash)=422)
Row(Geohash='c41v98n9w0xb', count(Geohash)=421)
Row(Geohash='c43b05v7222p', count(Geohash)=421)
Row(Geohash='c438fqgmsm00', count(Geohash)=417)
Row(Geohash='c439n53vsxzz', count(Geohash)=417)


I found that each of the first ten locations has more than 365 snowy observations. Let's check the first one.

<img src="warm_up_3.png"
     alt="location"
     style="float: left; margin-right: 10px;" />

As shown in the screenshot, the location is on the mountain, close to Sheslay and Juneau and within the Atlin Provincial Park and Recreation area. 