In [None]:
#setup
# Directory that holds the statistics pickle and the parquet data files read below.
data_dir='../../Data/Weather'
# Identifier embedded in the data file names (STAT_<file_index>.pickle, US_Weather_<file_index>.parquet).
file_index='BBSBSBSB'
# Measurement type analysed by this notebook; the dataframe is filtered on it ('SNWD' = snow depth).
m='SNWD'

## Reconstruction using top eigen-vectors
For measurement = {{m}}

## Load the required libraries

In [None]:
# Enable automatic reload of libraries
#%load_ext autoreload
#%autoreload 2 # means that all modules are reloaded before every command

In [None]:
#%matplotlib inline
%pylab inline
import numpy as np

import findspark
findspark.init()

import sys
sys.path.append('./lib')

from numpy_pack import packArray,unpackArray

from Eigen_decomp import Eigen_decomp
from YearPlotter import YearPlotter
from recon_plot import recon_plot

from import_modules import import_modules,modules
import_modules(modules)

from ipywidgets import interactive,widgets

import seaborn as sns

In [None]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import *

# Project modules that must be shipped to the Spark workers.
worker_files=['lib/numpy_pack.py','lib/spark_PCA.py','lib/computeStats.py','lib/recon_plot.py','lib/Eigen_decomp.py']

# SparkContext(...) raises if a context is already alive, so re-running this
# cell used to require a manual sc.stop(). getOrCreate() reuses the live
# context (the master setting is then ignored) or builds a new local one.
sc = SparkContext.getOrCreate(SparkConf().setMaster("local[3]"))
for worker_file in worker_files:
    # same call the SparkContext constructor makes for each entry of pyFiles
    sc.addPyFile(worker_file)

sqlContext = SQLContext(sc)


## Read Statistics File

In [None]:
from pickle import load

#read statistics
filename=data_dir+'/STAT_%s.pickle'%file_index
# NOTE: unpickling can execute arbitrary code -- only load statistics files you trust.
# The context manager closes the file handle, which the bare open() call leaked.
with open(filename,'rb') as stat_file:
    STAT,STAT_Descriptions = load(stat_file)
measurements=STAT.keys()
# single-argument print(...) produces the same text under Python 2 and 3
print('keys from STAT= %s'%(measurements,))

## Read data file into a spark DataFrame
We focus on the snow-depth records, because the eigen-vectors for them make sense.

In [None]:
# Load the full weather table, then keep only the rows for measurement `m`.
filename=data_dir+'/US_Weather_%s.parquet'%file_index
df_in=sqlContext.read.parquet(filename)
df=df_in.where(df_in.measurement==m)   # where() is an alias of filter()
df.show(5)

In [None]:
from scipy.stats import ttest_1samp

### Plot Reconstructions

Construct approximations of a time series using the mean and the $k$ top eigen-vectors.
First, we plot the mean and the top $k$ eigen-vectors.

In [None]:
import pylab as plt

# Number of leading eigen-vectors used for the reconstructions.
k=3
Mean=STAT[m]['Mean']
EigVec=np.matrix(STAT[m]['eigvec'][:,:k])
eig_labels=['eig'+str(i+1) for i in range(k)]

# Top panel: the mean vector; bottom panel: the k leading eigen-vectors.
fig,axes=plt.subplots(2,1, sharex='col', sharey='row',figsize=(10,6));
YearPlotter().plot(Mean,fig,axes[0],label='Mean',title=m+' Mean')
YearPlotter().plot(EigVec,fig,axes[1],title=m+' Eigs',labels=eig_labels)

In [None]:
# Split the eigen-vector matrix into a list of flat 1-D arrays, one per column.
v=[]
for i in range(EigVec.shape[1]):
    v.append(np.array(EigVec[:,i]).flatten())

### Plot the cumulative percent of variance explained by the top eigen-vectors

In [None]:
#  x=0 is the starting point (0%, no eigen-vector used);
#  x=1,2,3 give the cumulative percent of variance explained by eig1, eig1+eig2, eig1+eig2+eig3
fig,ax=plt.subplots(1,1);
# Work on a float copy: the previous in-place `eigvals/=sum(eigvals)` rescaled
# STAT[m]['eigval'] itself, silently altering the loaded statistics.
eigvals=np.array(STAT[m]['eigval'],dtype=np.float64)
eigvals=eigvals/np.sum(eigvals)
cumvar=100*np.insert(np.cumsum(eigvals),0,0)
ax.plot(cumvar[:4], 'o-'); 
ax.grid(True); 
ax.set_ylabel('Percent of variance explained')
ax.set_xlabel('number of eigenvectors')
ax.set_title('Percent of variance explained');

## Process whole dataframe to find best and worst residuals

### Add residual values to each row in the dataframe
Residuals are computed after subtracting, in sequence: the mean, the projection on the first eigen-vector, the projection on the second eigen-vector, etc.

`decompose(row)` extracts the series from the row, computes the residuals and constructs a new row that is reassembled into a dataframe.


In [None]:
def decompose(row):
    """Attach the eigen-decomposition residuals and coefficients to a row.

    :param row: SparkSQL Row that contains the measurements for a particular station, year and measurement.
    :returns: a new Row holding every field of the input plus ``total_var``,
              ``res_mean`` and, for each eigen-vector ``i``, ``res_<i>`` and ``coeff_<i>``.
    :rtype: SparkSQL Row

    The function is meant to be used inside a spark "map()" command; the
    globals ``Mean`` and ``v`` travel to the workers with it.
    """
    series=np.array(unpackArray(row.vector,np.float16),dtype=np.float64)
    decomposition=Eigen_decomp(None,series,Mean,v)
    total_var,residuals,reductions,coeff=decomposition.compute_var_explained()

    # keep plain python floats so that Spark can infer the column types
    res_values=[float(r) for r in residuals[1]]
    coeff_values=[float(c) for c in coeff[1]]

    fields=row.asDict()
    fields['total_var']=float(total_var[1])
    fields['res_mean']=res_values[0]
    # res_values[i] pairs with coeff_values[i-1] (there is no coefficient for the mean)
    for i,res in enumerate(res_values[1:],1):
        fields['res_'+str(i)]=res
        fields['coeff_'+str(i)]=coeff_values[i-1]
    return Row(**fields)


In [None]:
rdd2=df.rdd.map(decompose)
# df2 feeds many separate Spark actions below (counts, per-decade and per-year
# collects). Without cache() each of them re-reads the parquet file and re-runs
# decompose() on every row.
df2=sqlContext.createDataFrame(rdd2).cache()
# sanity check: unpacking fails unless take(1) returns exactly one decomposed row
row,=df2.take(1)

#filter out vectors for which the mean is a worse approximation than zero.
# single-argument print(...) produces the same text under Python 2 and 3
print('before filter %d'%df2.count())
df3=df2.filter(df2.res_mean<1)
print('after filter %d'%df3.count())

In [None]:
def customMean(x):
    '''
    Column-wise mean of x that ignores nan entries.

    nanmean sometimes gives infinity when
    there are nan and the other values are float16.
    so convert first to float64 seems to solve this.
    http://stackoverflow.com/questions/24313649/why-does-numpy-mean-return-inf
    '''
    widened = np.array(x, dtype=np.float64)
    return np.nanmean(widened, axis=0)

In [None]:
fig, ax = plt.subplots(figsize=(6,4));
YP=YearPlotter()

interval = 10    # number of years averaged into each curve
startYear = 1949

# First curve: mean daily snow depth over every record before startYear+interval.
l = ['{}-{}'.format(startYear, startYear+interval)]
dfSnow=df2.filter(df2.measurement==m).filter(df2.year < (startYear+interval))
rows=dfSnow.rdd.map(lambda row:unpackArray(row['vector'],np.float16)).collect()
Tbaseline=customMean(np.vstack(rows))
YP.plot(Tbaseline.transpose(),fig,ax,title=m)

# One more curve per later period [i, i+interval).
for i in range(startYear+interval, 2000, interval):
    inPeriod=(df2.year >= i) & (df2.year < (i+interval))
    dfSnow=df2.filter(df2.measurement==m).filter(inPeriod)
    rows=dfSnow.rdd.map(lambda row:unpackArray(row['vector'],np.float16)).collect()
    T=customMean(np.vstack(rows))
    YP.plot(T.transpose(),fig,ax,title=m);
    l.append('{}-{}'.format(i, i+interval))

plt.legend(l);
plt.grid();
ylabel('Snow Depth (mm)');
plt.savefig('SNWD.svg')

In [None]:
fig, ax = plt.subplots(figsize=(15,10));
YP=YearPlotter()

# `interval` and `startYear` keep the values assigned in an earlier cell.

# Baseline: mean daily snow depth over every record before startYear+interval.
dfSnow=df2.filter(df2.measurement==m).filter(df2.year < (startYear+interval)) 
rows=dfSnow.rdd.map(lambda row:unpackArray(row['vector'],np.float16)).collect()
Tbaseline=customMean(np.vstack(rows))

l = []   # legend entries; the baseline itself is not drawn in this figure

for i in range(startYear+interval, 2000, interval):

    dfSnow=df2.filter(df2.measurement==m).filter(df2.year >= i).filter(df2.year < (i+interval))

    rows=dfSnow.rdd.map(lambda row:unpackArray(row['vector'],np.float16)).collect()

    # difference between this period's mean curve and the baseline curve
    T=customMean(np.vstack(rows))  - Tbaseline
    YP.plot(T.transpose(),fig,ax,title=m);
    l += ['{}-{}'.format(i, i+interval)]
    
plt.legend(l);
plt.title('Change in SNWD compared to {}'.format('{}-{}'.format(startYear, startYear+interval)));
plt.ylabel('Change in SNWD (mm)');
# A bare plt.grid() toggles the grid, and the cell used to call it twice,
# which switched the grid back off. Request it explicitly, once.
plt.grid(True);

# Let's scale each SNWD graph by the total snow depth for the year (the sum over all days). This way we can compare the build-up and melt-down of the snow between years (rather than the absolute snow change)

In [None]:
fig, ax = plt.subplots(figsize=(15,10));
YP=YearPlotter()

# `interval` and `startYear` keep the values assigned in an earlier cell.

# Baseline: mean daily snow depth over every record before startYear+interval,
# rescaled so that the curve sums to 1.
dfSnow=df2.filter(df2.measurement==m).filter(df2.year < (startYear+interval)) 
rows=dfSnow.rdd.map(lambda row:unpackArray(row['vector'],np.float16)).collect()
Tbaseline=customMean(np.vstack(rows))
# nansum: a day with no valid record in the period is nan in the mean curve and
# would otherwise turn the whole normalised curve into nan.
Tbaseline /= np.nansum(Tbaseline)
YP.plot(Tbaseline.transpose(),fig,ax,title=m)

l = ['{}-{}'.format(startYear, startYear+interval)]

for i in range(startYear+interval, 2000, interval):

    dfSnow=df2.filter(df2.measurement==m).filter(df2.year >= i).filter(df2.year < (i+interval))

    rows=dfSnow.rdd.map(lambda row:unpackArray(row['vector'],np.float16)).collect()

    T=customMean(np.vstack(rows))  
    T /= np.nansum(T)
    YP.plot(T.transpose(),fig,ax,title=m);
    l += ['{}-{}'.format(i, i+interval)]
    
plt.legend(l);
plt.title('Normalized SNWD compared to {}'.format('{}-{}'.format(startYear, startYear+interval)));
# the curves are divided by their sum, so the values are fractions, not percentages
plt.ylabel('Normalized SNWD (fraction of total snow for year)');
# A bare plt.grid() toggles the grid, and the cell used to call it twice,
# which switched the grid back off. Request it explicitly, once.
plt.grid(True);
plt.savefig('NormalizedSnow.svg')

In [None]:
fig, ax = plt.subplots(figsize=(10,8));
YP=YearPlotter()

interval = 10 # years in each averaged period
# `startYear` keeps the value assigned in an earlier cell.

# Baseline: mean daily snow depth over every record before startYear+interval,
# rescaled so that the curve sums to 1.
dfSnow=df2.filter(df2.measurement==m).filter(df2.year < (startYear+interval)) 
rows=dfSnow.rdd.map(lambda row:unpackArray(row['vector'],np.float16)).collect()
Tbaseline=customMean(np.vstack(rows))
# nansum: a day with no valid record in the period is nan in the mean curve and
# would otherwise turn the whole normalised curve into nan.
Tbaseline /= np.nansum(Tbaseline)

l = []   # legend entries; the baseline itself is not drawn in this figure

for i in range(startYear+interval, 2000, interval):

    dfSnow=df2.filter(df2.measurement==m).filter(df2.year >= i).filter(df2.year < (i+interval))

    rows=dfSnow.rdd.map(lambda row:unpackArray(row['vector'],np.float16)).collect()

    T=customMean(np.vstack(rows))  
    T /= np.nansum(T)
    # change of the normalised curve relative to the normalised baseline
    T -= Tbaseline
    YP.plot(T.transpose(),fig,ax,title=m);
    l += ['{}-{}'.format(i, i+interval)]
    
plt.legend(l);
plt.title('Change in normalized SNWD compared to {}'.format('{}-{}'.format(startYear, startYear+interval)));
plt.ylabel('Change in normalized SNWD');
# A bare plt.grid() toggles the grid, and the cell used to call it twice,
# which switched the grid back off. Request it explicitly, once.
plt.grid(True);
plt.savefig('NormalizedSnowChange.svg')

## Let's see how the "No Snow" part of the year changes

In [None]:
def findLongestRun(x):
    '''
    Locate the longest stretch of consecutive values (each element exactly
    one more than its predecessor, e.g. 13,14,15,16) inside a vector.

    Returns [start, length]: the index in x where the stretch begins and the
    number of elements it holds. When several stretches tie for the longest,
    the earliest one wins.

    e.g.
    findLongestRun([4, 5, 6, 10,11,12,13, 123, 124, 125, 126])
    returns [3, 4] because 10-13 is the longest sequence

    findLongestRun([4, 5, 6, 7, 8, 10,11,12,13, 123, 124, 125, 126])
    returns [0, 5] because 4-8 is the longest sequence

    findLongestRun([-3,-2, -4, 123, 124, 125, 126, 127, 128, 10,11,12,13, 9123, 9124, 8125, 8126])
    returns [3,6] because 123-128 is the longest sequence
    '''
    # a new run starts right after every step that is not +1
    steps = np.diff(x)
    breaks = np.flatnonzero(steps != 1) + 1

    # run boundaries: start of the vector, every break, end of the vector
    bounds = np.concatenate(([0], breaks, [len(x)]))
    lengths = np.diff(bounds)

    best = np.argmax(lengths)   # argmax keeps the first of equal maxima
    return [bounds[best], lengths[best]]

In [None]:
# Build two parallel sequences with one entry per accepted (station, year) record:
#   noSnowPeriod[n] = [first day, last day] of the longest run of "no snow" days
#   yearsArray[n]   = year of that record
#
# The earlier version handled 1949 separately and overwrote noSnowPeriod for
# every 1949 record while still appending to yearsArray, so the two sequences
# went out of step; it also seeded yearsArray with 1949 even when the first
# record was rejected. A single loop over all years keeps them aligned.

periods = []
yearsArray = []

for i in range(1949,2012):

    dfSnow=df2.filter(df2.measurement==m).filter(df2.year == i)

    rows=dfSnow.rdd.map(lambda row:unpackArray(row['vector'],np.float16)).collect()

    for T in rows:

        noSnowDays = np.where((T < 2) | (np.isnan(T)))[0] # "Snow melted if < 2 mm depth"
        if len(noSnowDays) == 0:
            # snow on every day of the record: there is no "no snow" period to index
            continue
        [noSnowStart, noSnowLength] = findLongestRun(noSnowDays)

        ## Sanity check
        # I've found 8 occurrences between 1998 and 2010 where SNWD is reported 0 or nan for all 365 days
        # This is most likely a problem with the station or the data. I doubt that the station
        # actually had 365 days without a single mm of snowfall in northern Minnesota.
        # So a record whose longest "no snow" run is 340 days or more is left out
        # of the analysis as it is probably an error.
        if (noSnowLength < 340):
            periods.append([noSnowDays[noSnowStart], noSnowDays[(noSnowStart + noSnowLength - 1)]])
            yearsArray.append(i)

# reshape keeps the (n, 2) layout even when no record was accepted
noSnowPeriod = np.array(periods).reshape(-1,2)
assert len(noSnowPeriod) == len(yearsArray)

In [None]:
def noSnowLengths(year):
    """Span (last day minus first day) of the "no snow" period of every record of a year.

    :param year: the year whose records are selected from noSnowPeriod via yearsArray
    :returns: 1-D array with one value per selected record
    """
    # compute the selection once instead of once per column
    selected = noSnowPeriod[np.where(np.array(yearsArray)==year)[0]]
    return selected[:,1] - selected[:,0]

# Same eight variables as before, now produced by one helper instead of eight copy-pasted cells.
a1949 = noSnowLengths(1949)
a1955 = noSnowLengths(1955)
a1989 = noSnowLengths(1989)
a1999 = noSnowLengths(1999)
a2009 = noSnowLengths(2009)
a1979 = noSnowLengths(1979)
a1969 = noSnowLengths(1969)
a1959 = noSnowLengths(1959)

In [None]:
# Normalised histograms of the "no snow" span for the last and the first year compared.
histStyle=dict(alpha=0.75, normed=True)
plt.hist(a2009, label='2009', **histStyle);
plt.hist(a1949, color='r', label='1949', **histStyle);

plt.legend();

In [None]:
# 2009 against each of the years 1949-1954 (raw counts).
plt.hist(a2009, alpha=0.75, label='2009');

recordYears = np.array(yearsArray)
for i in range(1949,1955):
    yearPeriods = noSnowPeriod[np.where(recordYears==i)[0]]
    s1 = yearPeriods[:,1] - yearPeriods[:,0]
    plt.hist(s1, alpha=0.75, label=str(i));

plt.legend();

In [None]:

# Per-year summary of the "no snow" period, one row per year 1949..2010:
#   column 0: mean span (last day - first day)     column 1: variance of the span
#   column 2: mean first day                       column 3: variance of the first day
seasonYears = range(1949,2011)
season = np.zeros([len(seasonYears),4])
recordYears = np.array(yearsArray)
for j,i in enumerate(seasonYears):
    yearPeriods = noSnowPeriod[np.where(recordYears==i)[0]]
    firstDays = yearPeriods[:,0]
    lengths = yearPeriods[:,1] - firstDays
    season[j,0] = np.mean(lengths)
    season[j,1] = np.var(lengths)
    season[j,2] = np.mean(firstDays)
    season[j,3] = np.var(firstDays)

In [None]:
from scipy.stats import linregress

In [None]:
# Linear trend over the years of every column of `season`. One loop replaces
# four copy-pasted cells; the printed text is unchanged.
seasonLabels = ['Mean Length of "no snow" season',
                'Mean Length Variance of "no snow" season',
                'Mean Start Day of "no snow" season',
                'Mean Start Day Variance of "no snow" season']

for column, label in enumerate(seasonLabels):
    # x is the row index of `season` (0 = first year), y the yearly statistic
    slope, intercept, r_value, p_value, std_err = linregress(range(len(season[:,column])),season[:,column])
    print ('{}, p_value = {:.5f}, r2 = {:.3f}'.format(label, p_value, r_value**2))
    if (p_value < 0.05):
        print('Statistically significant')

In [None]:
# Yearly mean span of the "no snow" period; the error bars show its variance.
plotYears = range(1949, 2011)
plt.errorbar(plotYears, season[:,0], yerr=season[:,1]);
plt.title('"No snow" Season Variance Over Time');
plt.xlabel('Year');
plt.ylabel('"No snow" season mean and variance');
plt.savefig('SNWDVarianceOverTime.svg');

In [None]:
# Yearly mean first day of the "no snow" period (season[:,2]); error bars are its variance (season[:,3]).
plt.errorbar(range(1949, 2011), season[:,2], yerr=season[:,3]);

In [None]:
# Variance of the "no snow" span across records, one point per year.
plt.plot(range(1949, 2011), season[:,1], 'o');

In [None]:
# Distribution, over the years, of the yearly mean first day.
plt.hist(season[:,2]);  # First day of "no snow" season

In [None]:
# Variance of the first "no snow" day across records, one point per year.
plt.plot(range(1949, 2011), season[:,3], 'o');

In [None]:
# Sort entries by increasing values of res_3
df3=df3.sort(df3.res_3,ascending=True)

In [None]:
def plot_decomp(row,Mean,v,fig=None,ax=None,Title=None,interactive=False):
    """Draw the reconstruction of one row, titled with station, year and measurement.

    :param row: SparkSQL Row that contains the measurements for a particular station, year and measurement.
    :param Mean: The mean vector of all measurements of a given type
    :param v: eigen-vectors for the distribution of measurements.
    :param fig: a matplotlib figure in which to place the plot
    :param ax: a matplotlib axis in which to place the plot
    :param Title: text that replaces the default "station / year    measurement" title.
    :param interactive: A flag that indicates whether or not this is an interactive plot (widget-driven)
    :returns: the plotter created by recon_plot
    :rtype: recon_plot

    """
    if Title is None:
        Title='%s / %d    %s'%(row['station'],row['year'],row['measurement'])
    # the stored vector is float16; the decomposition works on float64
    series=np.array(unpackArray(row.vector,np.float16),dtype=np.float64)
    days=range(1,366)
    decomposition=Eigen_decomp(days,series,Mean,v)
    return recon_plot(decomposition,year_axis=True,fig=fig,ax=ax,interactive=interactive,Title=Title)

def plot_recon_grid(rows,column_n=4, row_n=3, figsize=(15,10)):
    """plot a grid of reconstruction plots

    The grid is filled row by row. If there are fewer rows than grid cells the
    remaining axes stay empty; rows beyond row_n*column_n are ignored.

    :param rows: Data rows (as extracted from the measurements data-frame)
    :param column_n: number of columns
    :param row_n:  number of rows
    :param figsize: Size of figure
    :returns: None
    :rtype: 

    """
    # squeeze=False keeps `axes` two-dimensional even for a single row or
    # column, a case in which the former axes[i,j] indexing failed.
    fig,axes=plt.subplots(row_n,column_n, sharex='col', sharey='row',figsize=figsize,squeeze=False);
    # zip stops at the shorter sequence, so a short `rows` list (e.g. a take(12)
    # that returned fewer rows) no longer raises IndexError. axes.flat walks
    # the grid row by row, the same order the nested loops used.
    for row,ax in zip(rows,axes.flat):
        plot_decomp(row,Mean,v,fig=fig,ax=ax,interactive=False)
    return None


#### Different things to try
The best/worst rows in terms of res_mean,res_1, res_2, res_3

The rows with the highest/lowest levels of coeff1, coeff2, coeff3, when the corresponding residue is small.

In [None]:
# Rows that the mean and the first two eigen-vectors approximate well (res_2 < 0.4),
# ordered by increasing coeff_2.
wellApproximated=df3.filter(df3.res_2<0.4)
df4=wellApproximated.sort(df3.coeff_2)
rows=df4.take(12)
df4.select('coeff_2','res_2').show(4)

In [None]:
# Reconstructions of the 12 rows taken above (smallest coeff_2), on the default 3x4 grid.
plot_recon_grid(rows)

In [None]:
# Same selection (res_2 < 0.4), this time ordered by decreasing coeff_2.
wellApproximated=df3.filter(df3.res_2<0.4)
df5=wellApproximated.sort(df3.coeff_2,ascending=False)
rows=df5.take(12)
df5.select('coeff_2','res_2').show(4)

In [None]:
# Reconstructions of the 12 rows taken above (largest coeff_2), on the default 3x4 grid.
plot_recon_grid(rows)

## Interactive plot of reconstruction

Following is an interactive widget which lets you change the coefficients of the eigen-vectors to see the effect on the approximation.
The initial state of the sliders (in the middle) corresponds to the optimal setting. You can zero a positive coefficient by moving the slider all the way down, zero a negative coefficient by moving it all the way up.

In [None]:
# Use the first of the rows selected by the most recent take(12).
row=rows[0]
# The stored vector is float16; widen it to float64 for the decomposition.
target=np.array(unpackArray(row.vector,np.float16),dtype=np.float64)
eigen_decomp=Eigen_decomp(None,target,Mean,v)
total_var,residuals,reductions,coeff=eigen_decomp.compute_var_explained()
# res[0] is reported as the residual after the mean, res[1:] as the residuals
# after adding the top eigen-vectors (see the two prints below).
res=residuals[1]
print 'residual normalized norm  after mean:',res[0]
print 'residual normalized norm  after mean + top eigs:',res[1:]

# Interactive version of the reconstruction plot, shown as a widget.
plotter=recon_plot(eigen_decomp,year_axis=True,interactive=True)
display(plotter.get_Interactive())

### What is the distribution of the residuals and the coefficients?

To answer this question we extract all of the values of `res_3` which is the residual variance after the Mean and the 
first three Eigen-vectors have been subtracted out. We rely here on the fact that `df3` is already sorted according to `res_3`

In [None]:
# A function for plotting the CDF of a given feature
def plot_CDF(feat):
    """Plot the empirical cumulative distribution of one column of df3.

    :param feat: name of the df3 column to plot (e.g. 'res_2', 'coeff_2')
    :returns: None; the curve is drawn on the current pylab axes. Nothing is
              drawn when the column holds no values.
    """
    rows=df3.select(feat).sort(feat).collect()
    vals=[r[feat] for r in rows]
    n=len(vals)
    if n==0:
        # the former 1./len(vals) raised ZeroDivisionError on an empty selection
        print('no values to plot for '+feat)
        return
    # Levels 0, 1/n, ..., (n-1)/n. np.arange(0,1,1./n) is not used because a
    # fractional step can yield n+1 points through rounding, and plot() then
    # rejects the length mismatch with vals.
    P=np.arange(n)/float(n)
    plot(vals,P)
    title('cumulative distribution of '+feat)
    # the y axis runs from 0 to 1: it is a fraction, not a count
    ylabel('fraction of instances')
    xlabel(feat)
    grid()

In [None]:
# Distribution of the residual left after the mean and the first two eigen-vectors.
plot_CDF('res_2')

In [None]:
# Distribution of the coefficient of the second eigen-vector.
plot_CDF('coeff_2')

In [None]:
filename=data_dir+'/decon_'+file_index+'_'+m+'.parquet'
# Let Spark replace any previous output itself. This removes the shell
# `rm -rf $filename` on an interpolated path, and the save succeeds or fails
# as one step.
df3.write.mode('overwrite').parquet(filename)

In [None]:
# Shell call: report the on-disk size of every parquet dataset in data_dir.
!du -sh $data_dir/*.parquet