### Visualizing the distribution of the observations

### Load the required libraries

In [None]:
import pandas as pd
import numpy as np
import sklearn as sk
import urllib
import math
%pylab inline

# Make the local Spark installation importable before touching pyspark.
import findspark
findspark.init()

from pyspark import SparkContext
#sc.stop()
# Local Spark master with 3 worker threads; the helper modules listed in
# pyFiles are shipped to the executors so RDD lambdas can import them.
sc = SparkContext(master="local[3]",pyFiles=['lib/numpy_pack.py','lib/spark_PCA.py','lib/computeStats.py'])

from pyspark import SparkContext
from pyspark.sql import *
# SQL entry point used for every query / parquet read in this notebook.
sqlContext = SQLContext(sc)

import sys
# The same helper modules must also be importable on the driver.
sys.path.append('./lib')

import numpy as np
from numpy_pack import packArray,unpackArray
from spark_PCA import computeCov
from computeStats import computeOverAllDist, STAT_Descriptions

### Read the data frame from pickle file

data_dir='../../Data/Weather'
file_index='BBSBSBSB'

from pickle import load

#read statistics
filename=data_dir+'/STAT_%s.pickle'%file_index
STAT,STAT_Descriptions = load(open(filename,'rb'))
print 'keys from STAT=',STAT.keys()

#read data
filename=data_dir+'/US_Weather_%s.parquet'%file_index

df=sqlContext.read.parquet(filename)
print df.count()
df.show(5)

In [None]:
STAT_Descriptions

In [None]:
import seaborn as sns

# The data is only 12,493 rows. That's small enough to fit into a pandas dataframe.

In [None]:
dfData = df.toPandas()

## The data is from weather stations in the United States. It appears that our data is from a rectangular region that covers Minnesota and a lower portion of Ontario, Canada.

In [None]:
#!pip install geopy
from geopy.geocoders import Nominatim
geolocator = Nominatim()

# Bounding box of the station coordinates, computed once and reused below.
lat_south, lat_north = dfData['latitude'].min(), dfData['latitude'].max()
lon_west, lon_east = dfData['longitude'].min(), dfData['longitude'].max()

# Reverse-geocode the median station position as the "center" of the region.
location = geolocator.reverse('{}, {}'.format(dfData['latitude'].median(), dfData['longitude'].median()))

print('The geographical center of the stations is: {}'.format(location.address))

# One (message, latitude, longitude) entry per corner of the bounding box.
corner_queries = [
    ('Northwest geographical corner is: {}', lat_north, lon_west),
    ('Northeast geographical corner is: {}', lat_north, lon_east),
    ('Southwest geographical corner is: {}', lat_south, lon_west),
    ('Southeast geographical corner is: {}', lat_south, lon_east),
]
for message, corner_lat, corner_lon in corner_queries:
    corner = geolocator.reverse('{}, {}'.format(corner_lat, corner_lon))
    print (message.format(corner.address))

from geopy import distance
from geopy import Point

# East-West extent, measured along the northern edge of the bounding box
# (both points share the maximum latitude, only the longitude varies).
p1 = Point('{}, {}'.format(dfData['latitude'].max(), dfData['longitude'].min()))
p2 = Point('{}, {}'.format(dfData['latitude'].max(), dfData['longitude'].max()))
eastWest = distance.distance(p1,p2).kilometers
# North-South extent, measured along the western edge of the bounding box
# (both points share the minimum longitude, only the latitude varies).
# Varying the longitude here would yield a second East-West distance.
p1 = Point('{}, {}'.format(dfData['latitude'].max(), dfData['longitude'].min()))
p2 = Point('{}, {}'.format(dfData['latitude'].min(), dfData['longitude'].min()))
northSouth = distance.distance(p1,p2).kilometers

print('The East-West distance is {:.3f} kilometers'.format(eastWest))
print('The North-South distance is {:.3f} kilometers'.format(northSouth))
# Rough estimate only: treats the region as a flat rectangle.
print('The area covered by the data is {:,.3f} square kilometers'.format(northSouth*eastWest))


In [None]:
#!pip install folium
import folium
from folium import plugins, IFrame

In [None]:
dfData.describe()

## The minimum elevation is -999.9. It's a safe bet that the station is not 1000 meters below sea level. My guess is that this is a proxy meaning "no value". We should check this and replace -999.9 with NaN if so.

In [None]:
# Plot every elevation value, then overlay the suspicious -999.9 entries
# as red stars so they stand out.
elevations = dfData['elevation']
suspect = elevations[elevations == -999.9]

plt.plot(elevations)
plt.title('Elevation Sanity Check')
plt.ylabel('Elevation [m]')
plt.xlabel('Occurrence')
plt.plot(suspect, 'r*');

## As the graph shows, there are about 3 places (red stars) where the elevation was listed as -999.9. So we should just change this to NaN.

In [None]:
# Keep every elevation that differs from the -999.9 placeholder; entries
# equal to the placeholder become NaN (the default fill value of `where`).
elevation = dfData['elevation'].where(dfData['elevation'] != -999.9)
dfData['elevation'] = elevation

In [None]:
dfData['undefs'].describe()

In [None]:
# Histogram of the cleaned elevation column.
elevation_ax = dfData['elevation'].hist()
elevation_ax.set_ylabel('Number of records at this elevation')
elevation_ax.set_xlabel('Elevation [m]');

## Now the elevation data seems reasonable. Note that the area we are surveying is at a fairly similar elevation. The mean elevation is 384 meters with a standard deviation of about 32 meters (a third of a football field). Therefore, we don't expect to see much influence from elevation. For comparison, the highest elevation in Minnesota is [Eagle Mountain](https://en.wikipedia.org/wiki/Eagle_Mountain_%28Minnesota%29) at 701 meters. The lowest elevation is [Lake Superior](https://en.wikipedia.org/wiki/Lake_Superior) at 183 meters. The southeast section of our geographical area includes part of Lake Superior. We'll view the terrain when we plot our map in Leaflet. There are several lakes and rivers within our region. It would be interesting to compare the precipitation patterns against a similar region without as many bodies of water.

In [None]:
dfData['elevation'].describe()

In [None]:
# Gather the summary figures first, then report them.
station_ids = dfData['station'].unique()
first_year, last_year = dfData['year'].min(), dfData['year'].max()

print ('There are {} unique stations in the data.'.format(len(station_ids)))
print ('The records start in the year {:.0f} and end in {:.0f}.'.format(first_year, last_year))

## In real data, we often are confronted with missing or undefined values (NaN). Let's see the average number of undefined values per row in our dataset.

In [None]:
# Normalized histogram of the 'undefs' column (missing values per row).
undefs_ax = dfData['undefs'].hist(bins=50, normed=True)
undefs_ax.set_xlabel('Missing measurements')
undefs_ax.set_ylabel('Percent of observations');

## So the histogram shows that all rows have fewer than 50 missing values. Twenty percent of the rows have no missing values. So there should be enough data present to make some inference.

## The number of observations increased dramatically around 1950 and has been fairly constant since then. This is directly related to the number of stations in operation as seen from the graph below.

In [None]:
# Histogram of how many records fall into each span of years.
year_ax = dfData['year'].hist(bins=30)
year_ax.set_xlabel('Year')
year_ax.set_ylabel('Number of observations');

In [None]:
# Count the DISTINCT stations reporting in each year. The table holds one row
# per station, measurement type and year, so a plain count() would count
# records and overstate the number of active stations.
stations_per_year = dfData.groupby('year')['station'].nunique()
plt.plot(stations_per_year.index, stations_per_year.values, 'o-');
xlabel('Year');
ylabel('Number of active stations');
#title('More stations came on line each year');
plt.savefig('STATIONS.svg')

## Let's check the measurements to make sure they seem to contain reasonable values. We'll just plot the histograms and make sure that they don't have too many outliers.

In [None]:
STAT.keys()

In [None]:
# Data sanity check: plot a normalized histogram for every measurement type.
sqlContext.registerDataFrameAsTable(df,'weather')

measurements = list(STAT.keys())
ncols = 3
# Size the grid from the number of measurements (2 x 3 for six of them) so
# that extra measurement types cannot run past the available axes.
nrows = max(1, (len(measurements) + ncols - 1) // ncols)

fig, axs = plt.subplots(nrows=nrows, ncols=ncols, figsize=(12, 4 * nrows), squeeze=False)
axs = axs.ravel()    # Flatten the axes grid so it can be paired with the measurements

for ax, meas in zip(axs, measurements):

    Query="SELECT * FROM weather WHERE measurement='%s'"%(meas)
    dfMeasure = sqlContext.sql(Query)
    # Each row stores its values as a packed float16 vector.
    rows=dfMeasure.rdd.map(lambda row:unpackArray(row['vector'],np.float16)).collect()
    ax.set_title(meas + ' Histogram', fontsize=14, fontweight='bold')
    if not rows:
        continue    # nothing recorded for this measurement; leave the panel empty
    D=np.vstack(rows)
    if meas in ('TMIN', 'TOBS', 'TMAX'):
        D /= 10   # temperatures are stored in tenths of a degree C; convert to degrees C

    # Drop NaNs (missing values) before building the histogram.
    ax.hist(D[~np.isnan(D)].ravel(), bins=20, normed=True)

plt.savefig('ALLMEASURES.svg')

## These values seem reasonable. The temperature range is -20 to +20 C. Most days have no precipitation. The snow ranges between 0 and 1 m (40 inches). Other precipitation ranges between 0 and 40 mm (1.6 inches). These are within normal weather values throughout the year in the northern United States.

### Select data for a particular station and measurement type

In [None]:
# Select all PRCP (precipitation) records of a single station.
sqlContext.registerDataFrameAsTable(df,'weather')
Query="SELECT * FROM weather\n\tWHERE measurement='%s' and station='%s'"%('PRCP','USC00219059')
print Query
df1 = sqlContext.sql(Query)
print df1.count(),'rows'
df1.show(2)
# Unpack the packed float16 vector of every selected row and stack them
# into one 2-D array (one row per record).
rows=df1.rdd.map(lambda row:unpackArray(row['vector'],np.float16)).collect()
T=np.vstack(rows)
T=T/10.  # NOTE: this is PRCP, not temperature; presumably stored in tenths of mm, so this yields mm -- confirm against the README
shape(T)

# Let's take a look at how the average minimum and maximum daily temperatures change from year to year

In [None]:
# Pull every TMIN record, ordered by year, into a pandas frame.
Query="SELECT * FROM weather\n\tWHERE measurement='TMIN' ORDER BY year"
df1 = sqlContext.sql(Query)
dfTMIN = df1.toPandas()

# Step 1: unpack each packed float16 vector; step 2: divide by 10 to get
# the values in the units used for plotting below.
unpacked = dfTMIN['vector'].map(lambda packed: unpackArray(packed, np.float16))
dfTMIN['temps'] = unpacked.map(lambda tenths: tenths/10.0)
# Per-record mean that ignores NaN entries.
dfTMIN['TMIN_mean_year'] = dfTMIN['temps'].map(np.nanmean)

#plt.plot(dfTMIN['year'].unique(), dfTMIN[['year','TMIN_mean_year']].groupby('year').mean());
plt.scatter(dfTMIN['year'], dfTMIN['TMIN_mean_year'])
plt.xlabel('Year')
plt.ylabel(r'Average ($^o$C)')
plt.title('Minimum Daily Temperature');

In [None]:
#!pip install folium
import folium
from folium import plugins, IFrame

In [None]:
# Get the minimum and maximum latitude and longitudes for our geographical area
geoArea = sqlContext.sql('select min(latitude), max(latitude), min(longitude), max(longitude) from weather').collect()[0]
min_lat = np.floor(geoArea[0])
max_lat = np.ceil(geoArea[1])
min_lon = np.floor(geoArea[2])
max_lon = np.ceil(geoArea[3])

box = (min_lat, max_lat, min_lon, max_lon)

center = [(min_lat+max_lat)/2, (min_lon+max_lon)/2]
zoom = 6

In [None]:
# Map of the mean minimum temperature for records before 1950.
m2 = folium.Map(center, zoom_start=zoom, tiles='Stamen Terrain')

# Outline the surveyed region.
region_outline = folium.features.RectangleMarker(
    bounds=[[min_lat,min_lon],[max_lat,max_lon]],
    color='blue',
    fill_color='red', fill_opacity=0.1,
    popup='Weather Service Data')
region_outline.add_to(m2)


import branca.colormap as cm
# Color scale covering -4 .. 4, shown as a legend on the map.
colormap = cm.linear.Paired.scale(-4,4)
colormap.caption = 'Average daily minimum temperature (C)'
m2.add_child(colormap)

data = dfTMIN[dfTMIN['year'] < 1950]
#plugins.HeatMap(data[['latitude', 'longitude', 'TMIN_mean_year']].values, \
#               radius=10).add_to(m2)

# One circle per record, colored by that record's mean minimum temperature.
marker_inputs = zip(data['latitude'].values,
                    data['longitude'].values,
                    data['TMIN_mean_year'].values)
for marker_lat, marker_lon, tmin_mean in marker_inputs:
    marker = folium.CircleMarker([marker_lat, marker_lon], radius=20,
                                 fill_color=colormap(tmin_mean))
    marker.add_to(m2)

fullscreen_button = plugins.Fullscreen(
    position='topright',
    title='Enter fullscreen mode',
    titleCancel='Exit fullscreen mode dear Triton',
    forceSeparateButton=True)
fullscreen_button.add_to(m2)

m2

In [None]:
# Map of the mean minimum temperature for records from 1950 onward.
m3 = folium.Map(center, zoom_start=zoom, tiles='Stamen Terrain')

# Outline the surveyed region.
folium.features.RectangleMarker(
    bounds=[[min_lat,min_lon],[max_lat,max_lon]],
    color='blue',
    fill_color='red', fill_opacity=0.1,
    popup='Weather Service Data').add_to(m3)


import branca.colormap as cm
# Color scale covering -4 .. 4, shown as a legend on the map.
colormap = cm.linear.Paired.scale(-4,4)
colormap.caption = 'Average daily minimum temperature (C)'
m3.add_child(colormap)

data = dfTMIN[dfTMIN['year'] >= 1950]
#plugins.HeatMap(data[['latitude', 'longitude', 'TMIN_mean_year']].values, \
#               radius=10).add_to(m3)

# One circle per record, colored by that record's mean minimum temperature.
for lat, lon, tmin_mean in zip(data['latitude'].values,
                               data['longitude'].values,
                               data['TMIN_mean_year'].values):
    folium.CircleMarker([lat, lon], radius=20,
                        fill_color=colormap(tmin_mean)).add_to(m3)

plugins.Fullscreen(
    position='topright',
    title='Enter fullscreen mode',
    titleCancel='Exit fullscreen mode dear Triton',
    forceSeparateButton=True).add_to(m3)

# Display the map built in this cell (m3), not the pre-1950 map.
m3

### Script for plotting yearly plots

In [None]:
from YearPlotter import YearPlotter
fig, ax = plt.subplots(figsize=(10,7))
# YP is reused by the plotting helpers defined further down.
YP=YearPlotter()
# Plot the first two rows of T, transposed so that each row becomes a column.
first_two_rows = T[:2,:].T
YP.plot(first_two_rows,fig,ax,title='PRCP')
#title('A sample of graphs');
#title('A sample of graphs');

### Distribution of missing observations
The distribution of missing observations is not uniform throughout the year. We visualize it below.

In [None]:
def plot_pair(pair,func):
    """Draw one panel per entry of `pair` on a 1x2 figure.

    pair -- sequence of keys; each key is handed to `func` as its first argument
    func -- callable invoked as func(key, fig, axis) to draw a single panel
    """
    fig, grid = subplots(1,2,figsize=(16,6))
    panels = grid.reshape(2)
    for position, key in enumerate(pair):
        func(key, fig, panels[position])
        
def plot_valid(m,fig,axis):
    """Plot the 'NE' entry of STAT[m] on `axis`, titled as valid counts.

    m    -- measurement key into STAT
    fig  -- figure passed through to YP.plot
    axis -- axes passed through to YP.plot
    """
    YP.plot(STAT[m]['NE'],fig,axis,title='valid-counts '+m)
    

In [None]:
plot_pair(['TMIN','TMAX'],plot_valid)

In [None]:
plot_pair(['TOBS','PRCP'],plot_valid)

In [None]:
plot_pair(['SNOW', 'SNWD'],plot_valid)

### Plots of mean and std of observations

In [None]:
def plot_mean_std(m,fig,axis):
    """Plot the mean of measurement `m` with a one-standard-deviation band.

    Three curves are drawn: mean - std, mean and mean + std, where std is the
    square root of STAT[m]['Var'].
    """
    stats = STAT[m]
    center = stats['Mean']
    spread = np.sqrt(stats['Var'])
    # Stack the three curves as rows, then flip so each curve is a column.
    bands = np.vstack([center-spread, center, center+spread]).T
    YP.plot(bands,fig,axis,title='Mean+-std   '+m)

In [None]:
plot_pair(['TMIN','TMAX'],plot_mean_std)

In [None]:
plot_pair(['TOBS','PRCP'],plot_mean_std)

In [None]:
plot_pair(['SNOW', 'SNWD'],plot_mean_std)

### plotting top 3 eigenvectors

In [None]:
def plot_eigen(m,fig,axis):
    """Plot the first three columns of STAT[m]['eigvec'] on `axis`."""
    top_three = STAT[m]['eigvec'][:,:3]
    YP.plot(top_three,fig,axis,title='Top Eigenvectors '+m)

In [None]:
plot_pair(['TMIN','TMAX'],plot_eigen)

In [None]:
plot_pair(['TOBS','PRCP'],plot_eigen)

In [None]:
plot_pair(['SNOW', 'SNWD'],plot_eigen)

### Script for plotting percentage of variance explained

In [None]:
def pltVarExplained(j, meas=None):
    """Plot the cumulative share of variance explained by the top eigenvalues.

    j    -- 1-based position of the panel within a 1x3 subplot grid
    meas -- measurement key into STAT; when omitted, the notebook-level
            variable `m` is used, which keeps existing calls
            `for m in [...]: pltVarExplained(j)` working unchanged
    """
    if meas is None:
        meas = m   # fall back to the loop variable of the calling cell
    subplot(1,3,j)
    EV=STAT[meas]['eigval']
    k=5   # number of leading eigenvalues to show
    # Start at 0, then the running sum of the first k eigenvalues, each
    # expressed as a fraction of the sum of ALL eigenvalues.
    explained = np.concatenate(([0], np.cumsum(EV[:k]))) / np.sum(EV)
    plot(explained, 'o-')
    title('Percentage of Variance Explained for '+ meas)
    ylabel('Percentage of Variance')
    xlabel('# Eigenvector')
    grid()
    

In [None]:
f=plt.figure(figsize=(15,4))
# The loop variable must be named `m`: pltVarExplained reads it as a global.
for j, m in enumerate(['TMIN', 'TOBS', 'TMAX'], 1):
    pltVarExplained(j)

In [None]:
f=plt.figure(figsize=(15,4))
# The loop variable must be named `m`: pltVarExplained reads it as a global.
for j, m in enumerate(['SNOW', 'SNWD', 'PRCP'], 1):
    pltVarExplained(j)

In [None]:
#sc.stop()