# Reconstruction using top eigen-vectors

## Load the required libraries

In [1]:
# Enable automiatic reload of libraries
%load_ext autoreload
%autoreload 2 # means that all modules are reloaded before every command

In [2]:
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [3]:
import sys
sys.path.append('./lib')
from import_modules import import_modules,modules
import_modules(modules)

from numpy_pack import packArray,unpackArray
from spark_PCA import computeCov
from computeStats import computeOverAllDist, STAT_Descriptions
from recon_plot import recon_plot
from Eigen_decomp import Eigen_decomp
from YearPlotter import YearPlotter

    pandas as    pd 	version=0.19.2 	required version>=0.19.2
     numpy as    np 	version=1.12.0 	required version>=1.12.0
   sklearn as    sk 	version=0.18.1 	required version>=0.18.1
    urllib as urllib 	version=1.17 	required version>=1.17
module pyspark has no version


In [4]:
from pyspark import SparkContext
#sc.stop()
# Start a local Spark context; the library files listed in pyFiles are handed
# to Spark together with the context.
sc = SparkContext(
    master="local[3]",
    pyFiles=[
        'lib/numpy_pack.py',
        'lib/spark_PCA.py',
        'lib/computeStats.py',
        'lib/recon_plot.py',
        'lib/Eigen_decomp.py'
    ]
)

from pyspark.sql import *
# SQL entry point built on top of the context created above.
sqlContext = SQLContext(sc)

KeyboardInterrupt: 

## Read Statistics File

In [None]:
data_dir='../../Data/Weather'
file_index='BBBSBBBB'

In [None]:
from pickle import load

#read statistics
filename=data_dir+'/STAT_%s.pickle'%file_index
STAT,STAT_Descriptions = load(open(filename,'rb'))
measurements=STAT.keys()
print 'keys from STAT=',measurements

## Read data file into a spark DataFrame

In [None]:
#read data
filename=data_dir+'/US_Weather_BBBSBBBB.csv'
List=load(open(filename,'rb'))
print 'length of List=',len(List)

df=sqlContext.createDataFrame(List)
print df.count()
df.show(5)

## Look at some examples

In [None]:
# Pick one measurement type and register the DataFrame as the SQL table `weather`.
m='SNWD'
sqlContext.registerDataFrameAsTable(df,'weather')
# `m` is a constant set in this notebook, so the query is built with plain string formatting.
Query="SELECT * FROM weather\n\tWHERE measurement='%s'"%(m)
print Query
# df1 holds only the rows whose measurement equals `m`.
df1 = sqlContext.sql(Query)
print df1.count(),'rows'
df1.show(2)

### Create a matrix with all of the series

In [None]:
# Unpack the packed `vector` field of every row and stack the results into one
# 2-D array with one series per row. No rescaling is applied to the values.
rows=df1.rdd.map(lambda row:unpackArray(row['vector'],np.float16)).collect()
T=np.vstack(rows)
shape(T)

### Plot two time series
`SNWD` stands for `snow-depth`, which explains why it is zero during the summer

In [None]:
from YearPlotter import YearPlotter
fig, ax = plt.subplots(figsize=(6,4));
YP=YearPlotter()
# Rows 16 and 17 of T, transposed so that the days run along the first axis,
# which is the layout YearPlotter.plot checks for.
YP.plot(T[16:18].transpose(),fig,ax,title=m)

### Plot Reconstructions

Construct approximations of a time series using the mean and the top $k$ eigen-vectors.
First, we plot the mean and the top $k$ eigen-vectors.

In [None]:
k=3
# Keep only the first k columns of the stored eigen-vector matrix.
EigVec=np.matrix(STAT[m]['eigvec'][:,:k])
Mean=STAT[m]['Mean']
fig=plt.figure(figsize=(6,4))
# add_axes takes [left, bottom, width, height] in figure fractions:
# the mean goes into the lower half of the figure ...
ax=fig.add_axes([0,0,1,.5])
YearPlotter().plot(Mean,fig,ax,label='Mean',title=m)
# ... and the k eigen-vectors into the upper half.
ax=fig.add_axes([0,.5,1,.5])
YearPlotter().plot(EigVec,fig,ax,title=m,labels=['eig'+str(i+1) for i in range(k)])

### plot the percent of residual variance on average

In [None]:
#  x=0 in the graphs below corresponds to the fraction of the variance explained by the mean alone
#  x=1,2,3,... are the residuals for eig1, eig1+eig2, eig1+eig2+eig3 ...
#  NOTE(review): the left curve starts at 1-cumvar[0], i.e. already after eig1 -- confirm that the
#  x=0 description above is meant for the right-hand plot only.
figure(figsize=[10,4]); subplot(121)
# Normalize into a new array. An in-place `/=` would rescale STAT[m]['eigval'] itself
# and leave the stored statistics modified for every later cell.
eigvals=STAT[m]['eigval']; eigvals=eigvals/sum(eigvals); cumvar=cumsum(eigvals)
plot(1-cumvar[:10]); grid(); title('average residuals')
subplot(122)
# One flat 1-D array per eigen-vector column.
v=[np.array(EigVec[:,i]).flatten() for i in range(shape(EigVec)[1])]
target=T[16,:]
eigen_decomp=Eigen_decomp(None,target,Mean,v)
total_var,residuals,reductions,_coeff=eigen_decomp.compute_var_explained()
plot(list(residuals[1])); grid(); title('single series residuals');

## Interactive plot of reconstruction

Following is an interactive widget which lets you change the coefficients of the eigen-vectors to see the effect on the approximation.
The initial state of the sliders (in the middle) corresponds to the optimal setting. You can zero a positive coefficient by moving the slider all the way down, zero a negative coefficient by moving it all the way up.

In [None]:
# %load lib/YearPlotter.py
from datetime import date
from numpy import shape
from matplotlib.dates import MonthLocator, DateFormatter
class YearPlotter:
    """Plot series that have 365 samples along their first axis against a
    date axis labelled with month names."""

    def __init__(self):
        # 365 consecutive calendar dates used as the x coordinates of every plot.
        start=365*1+1
        self.dates=[date.fromordinal(i) for i in range(start,start+365)]
        # "%b" formats a tick as the abbreviated month name.
        self.monthsFmt = DateFormatter("%b")
        self.months = MonthLocator(range(1, 13), bymonthday=1, interval=10)

    def plot(self,T,fig,ax,label='',labels=None,title=None):
        """Draw T on `ax` using the stored dates as x coordinates.

        :param T: array-like whose first dimension is 365. A 1-D input is drawn
                  as a single line; otherwise each column T[:,i] is drawn as
                  its own line.
        :param fig: figure that owns `ax`; used to format the date tick labels.
        :param ax: axes to draw on.
        :param label: legend label used for a 1-D input.
        :param labels: legend labels, one per column, for a multi-column input;
                       defaults to the column indices as strings.
        :param title: optional axes title; left untouched when None.
        :raises ValueError: if the first dimension of T is not 365.
        """
        shp=shape(T)
        if shp[0] != 365:
            raise ValueError("First dimension of T should be 365. Shape(T)="+str(shape(T)))
        if len(shp)==1:
            ax.plot(self.dates,T,label=label)
        else:
            if labels is None:
                labels=[str(i) for i in range(shp[1])]
            for i in range(shp[1]):
                ax.plot(self.dates,T[:,i],label=labels[i])
        ax.xaxis.set_major_locator(self.months)
        ax.xaxis.set_major_formatter(self.monthsFmt)
        if title is not None:
            ax.set_title(title)
        # rotate and align the tick labels so they look better
        fig.autofmt_xdate()
        # Explicitly switch the grid on: a bare grid() call toggles it, so
        # plotting several times on the same axes would flip it off again.
        ax.grid(True)
        ax.legend()


In [None]:
%%writefile lib/recon_plot.py
import numpy as np
from YearPlotter import YearPlotter
from Eigen_decomp import Eigen_decomp
import matplotlib.pyplot as plt
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

class recon_plot:
    """A class for creating an interactive demonstration of approximating
    a function with an orthonormal set of functions.

    The eigen_decomp object handed to the constructor must provide the
    attributes read by this class: mean, v, f, x, n, C and coeff.
    """
    def __init__(self,eigen_decomp,year_axis=False,fig=None,ax=None,interactive=True,Title=None):
        """
        Initialize the plot widget and draw the initial reconstruction.

        :param eigen_decomp: An Eigen_decomp object
        :param year_axis: set to True if the X axis should correspond to the months of the year
                          (plotting then goes through a YearPlotter).
        :param fig: figure to draw into. Only used when `interactive` is False;
                    otherwise plot_combination creates a new figure on each call.
        :param ax: axes to draw into; same condition as `fig`.
        :param interactive: when True, every call to plot_combination starts a new figure.
        :param Title: optional title for the axes.
        """
        self.eigen_decomp=eigen_decomp
        self.interactive=interactive
        self.fig=fig
        self.ax=ax
        self.Title=Title

        self.year_axis=year_axis
        self.yearPlotter=None
        if year_axis:
            self.yearPlotter=YearPlotter()
        # eigen_decomp.coeff is unpacked into keyword arguments, so it has to
        # map the names 'c0', 'c1', ... to the coefficient values.
        self.plot_combination(**self.eigen_decomp.coeff)

    def get_widgets(self):
        """return the slider widgets that are to be used

        Each slider starts at the coefficient value and spans from zero to
        twice that value, so that the initial position is in the middle.

        :returns: widget_list: the list of widgets in order
                  widget_dict: a dictionary of the widgets, keyed 'c0', 'c1', ...,
                               to be used in `interact`

        :todo: make the sliders smaller: http://ipywidgets.readthedocs.io/en/latest/examples/Widget%20Styling.html
        """
        coeff=self.eigen_decomp.C
        widge_dict={}
        widge_list=[]
        for i in range(self.eigen_decomp.n):
            c=coeff[i]
            if c>0:
                r=[0,c*2]
            elif c<0:
                r=[c*2,0]
            else:
                # A zero coefficient would give an empty range and a zero
                # step; fall back to a unit range so the slider stays usable.
                r=[-1.,1.]
            # One twentieth of the range. This equals |c|/10, so the step is
            # positive for negative coefficients as well.
            step=(r[1]-r[0])/20.
            slider=widgets.FloatSlider(min=r[0],max=r[1],step=step,
                                       value=c,orientation='vertical',description='v'+str(i))
            widge_list.append(slider)
            widge_dict['c'+str(i)]=slider

        return widge_list,widge_dict

    def plot(self,y,label=''):
        """Draw one curve, on the year axis when enabled and against eigen_decomp.x otherwise."""
        if self.year_axis:
            self.yearPlotter.plot(y,self.fig,self.ax,label=label)
        else:
            self.ax.plot(self.eigen_decomp.x,y,label=label)

    def plot_combination(self,**coeff):
        """the plotting function that is called by `interactive`
           generates the plot according to the parameters set by the sliders

        Draws the mean, then the running sum after adding each scaled
        eigen-vector, and finally the target function.

        :param coeff: keyword arguments 'c0', 'c1', ... giving the coefficient
                      of each eigen-vector.
        :returns: None
        """

        if self.interactive or self.fig is None:
            self.fig=plt.figure(figsize=(8,6))
            self.ax=self.fig.add_axes([0,0,1,1])

        A=self.eigen_decomp.mean
        self.plot(A,label='mean')

        for i in range(self.eigen_decomp.n):
            g=self.eigen_decomp.v[i]*coeff['c'+str(i)]
            A=A+g
            self.plot(A,label='c'+str(i))
        self.plot(self.eigen_decomp.f,label='target')
        # Passing a keyword argument makes grid() switch the grid on instead of toggling it.
        self.ax.grid(figure=self.fig)
        self.ax.legend()
        # Skip the title when none was given so that no "None" text is drawn.
        if self.Title is not None:
            self.ax.set_title(self.Title)
        return None
    


In [None]:
import numpy as np
from ipywidgets import interactive,widgets
# NOTE(review): `x` and `f` are not assigned in any earlier cell of this notebook --
# confirm where they are supposed to come from before running this cell.
# A zero vector is passed in the position where other cells pass `Mean`.
eigen_decomp=Eigen_decomp(x,f,np.zeros(len(f)),v)
plotter=recon_plot(eigen_decomp,year_axis=True);
widge_list,widge_dict = plotter.get_widgets()
# Connect the sliders to plot_combination, then show the sliders side by side.
interactive(plotter.plot_combination, **widge_dict)
widgets.VBox([widgets.HBox(widge_list)])


In [None]:
from recon_plot import recon_plot
from Eigen_decomp import Eigen_decomp
from YearPlotter import YearPlotter
import numpy as np
from ipywidgets import interactive,widgets
#from recon_plot import recon_plot
# Build the plot for the current `eigen_decomp` object on a year axis.
plotter=recon_plot(eigen_decomp,year_axis=True)#,interactive=True,Title='something')
widge_list,widge_dict = plotter.get_widgets()
# Connect the sliders to plot_combination, then show the sliders side by side.
interactive(plotter.plot_combination, **widge_dict)
widgets.VBox([widgets.HBox(widge_list)])

## Process whole dataframe to find best and worst residuals

In [None]:
# Run the query for measurement `m` again and rebuild the matrix T of unpacked series.
Query="SELECT * FROM weather\n\tWHERE measurement='%s'"%(m)
print Query
df1 = sqlContext.sql(Query)
print df1.count(),'rows'
df1.show(2)
# Unpack every row's `vector` field and stack the results, one series per row of T.
rows=df1.rdd.map(lambda row:unpackArray(row['vector'],np.float16)).collect()
T=np.vstack(rows)
shape(T)

### Add residual values to each row in the dataframe
The residuals are what remains after subtracting, in sequence: the mean, the projection on the first eigen-vector, the projection on the second eigen-vector, etc.

`decompose(row)` extracts the series from the row, computes the residuals and constructs a new row that is reassembled into a dataframe.

A more efficient solution would be to use UDFs (user defined functions) but I could not make it work.

In [None]:
def decompose(row):
    """Return a copy of `row` extended with its decomposition statistics.

    The row's packed `vector` field is unpacked into a float64 series and
    decomposed with the notebook-level `Mean` and `v`. The new row carries
    `total_var`, `res_mean`, and one `res_<i>` / `coeff_<i>` pair for each
    further residual value.
    """
    series = np.array(unpackArray(row.vector, np.float16), dtype=np.float64)
    decomposition = Eigen_decomp(None, series, Mean, v)
    total_var, raw_residuals, _reductions, raw_coeff = decomposition.compute_var_explained()

    # Convert to plain Python floats before the values go into the new Row.
    residual_values = [float(value) for value in raw_residuals[1]]
    coeff_values = [float(value) for value in raw_coeff[1]]

    fields = row.asDict()
    fields['total_var'] = float(total_var[1])
    fields['res_mean'] = residual_values[0]
    # residual_values[0] belongs to the mean, so residual i pairs with coefficient i-1.
    for idx in range(1, len(residual_values)):
        suffix = str(idx)
        fields['res_' + suffix] = residual_values[idx]
        fields['coeff_' + suffix] = coeff_values[idx - 1]
    return Row(**fields)

In [None]:
#import numpy
# Apply decompose to every row, then turn the resulting rows back into a DataFrame.
rdd2=df1.rdd.map(decompose)
df2=sqlContext.createDataFrame(rdd2)
# Show the columns added by decompose for the first three rows.
df2.select('res_mean','res_1','res_2','res_3','total_var','coeff_1','coeff_2','coeff_3').show(3)

In [None]:
def plot_decomp(row,Mean,v,fig=None,ax=None,Title=None):
    """Draw the non-interactive, year-axis reconstruction plot for one row.

    The row's packed `vector` field is unpacked into a float64 series, which
    is decomposed against `Mean` and `v` and handed to recon_plot together
    with the given figure, axes and title.
    """
    unpacked = unpackArray(row.vector, np.float16)
    series = np.array(unpacked, dtype=np.float64)
    recon_plot(
        Eigen_decomp(None, series, Mean, v),
        year_axis=True, fig=fig, ax=ax, interactive=False, Title=Title
    )


In [None]:
# Plot the decomposition of the first row of df2.
row=df2.first()
plot_decomp(row,Mean,v,Title='title44')

In [None]:
# Drop the rows whose total_var is exactly zero and print the row count before and after.
print 'before filter',df2.count()
df3=df2.filter(df2.total_var!=0)
print 'after filter',df3.count()

In [None]:
def plot_recon_grid(rows,column_n=4, row_n=3, figsize=(15,10)):
    """Plot the reconstructions of `rows` on a row_n x column_n grid of axes.

    Rows are placed left to right, top to bottom, and each subplot is titled
    with the row's `res_3` value. At most row_n*column_n rows are drawn; if
    fewer rows are given, the remaining axes stay empty.

    :param rows: sequence of rows that have `vector` and `res_3` fields.
    :param column_n: number of grid columns.
    :param row_n: number of grid rows.
    :param figsize: figure size passed to plt.subplots.
    """
    # squeeze=False keeps `axes` two-dimensional even for a single row or column.
    fig,axes=plt.subplots(row_n,column_n, sharex='col', sharey='row',figsize=figsize,squeeze=False)
    # axes.flat walks the grid row by row; zip stops at the shorter input, so
    # a short `rows` no longer raises IndexError.
    for ax,row in zip(axes.flat,rows):
        plot_decomp(row,Mean,v,fig=fig,ax=ax,Title=str(row['res_3']))

In [None]:
# Smallest res_3 first: plot the 12 rows with the lowest res_3 values.
df3=df3.sort(df3.res_3,ascending=True)
rows=df3.take(12)
plot_recon_grid(rows)

In [None]:
# Largest res_3 first: plot the 12 rows with the highest res_3 values.
df3=df3.sort(df3.res_3,ascending=False)
rows=df3.take(12)
plot_recon_grid(rows)

In [None]:
# Bring the res_3 column to the driver as a list of rows.
res3=df3.select('res_3').collect()

In [None]:
# Pull the plain res_3 value out of every collected row.
R3=[r['res_3'] for r in res3]

In [None]:
# Number of collected res_3 values.
len(R3)

In [None]:
# Plot the res_3 values, leaving out the first 10 entries of the list.
plot(R3[10:])

In [None]:
# Recompute the decomposition for the first row in `rows` and display the raw result.
row=rows[0]
Series=np.array(unpackArray(row.vector,np.float16),dtype=np.float64)
recon=Eigen_decomp(None,Series,Mean,v);
recon.compute_var_explained()

In [None]:
# Keep only the rows with res_3 below 0.5. The trailing comma on the first
# print keeps both counts on the same output line.
print df3.count(),
df4=df3.filter(df3.res_3<0.5)
print '->',df4.count()

In [None]:
# Fetch both coefficient columns with a single collect, so that c1[i] and
# c2[i] always come from the same row and only one Spark job is run.
c=df4.select('coeff_1','coeff_2').collect()
c1=[x['coeff_1'] for x in c]
c2=[x['coeff_2'] for x in c]
c1[:4],c2[:4]

In [None]:
# Scatter plot of the two coefficient lists c1 and c2 against each other.
figure(figsize=(8,8))
scatter(c1,c2,marker='.')
xlabel('coeff 1')
ylabel('coeff 2')
grid()

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType,NumericType,DataType,FloatType,DoubleType

In [None]:
# Rebuild the mean and the top-k eigen-vectors for measurement `m`.
k=3
m='SNWD'
EigVec=np.matrix(STAT[m]['eigvec'][:,:k])
# One flat 1-D array per eigen-vector column.
v=[np.array(EigVec[:,i]).flatten() for i in range(shape(EigVec)[1])]
Mean=STAT[m]['Mean']
fig=plt.figure(figsize=(8,6))
# A single axes that fills the whole figure.
ax=fig.add_axes([0,0,1,1])
YearPlotter().plot(EigVec,fig,ax,title='snow depth',labels=['eig'+str(i+1) for i in range(k)])

In [None]:
from ipywidgets import interactive,widgets
from recon_plot import recon_plot

# Last 20 rows of df3 in its current order. DataFrame.tail only exists in
# PySpark 3.0 and later, so the collected rows are sliced instead; this gives
# the same rows on older versions too, at the cost of bringing all of df3 to the driver.
rows=df3.collect()[-20:]

In [None]:
# Interactive reconstruction for the third row in `rows`.
row=rows[2]
target=np.array(unpackArray(row.vector,np.float16),dtype=np.float64)
eigen_decomp=Eigen_decomp(None,target,Mean,v)
plotter=recon_plot(eigen_decomp,year_axis=True)
widge_list,widge_dict = plotter.get_widgets()
# Connect the sliders to plot_combination, then show the sliders side by side.
interactive(plotter.plot_combination, **widge_dict)
widgets.VBox([widgets.HBox(widge_list)])

In [None]:
# NOTE(review): create_reconstructions is not defined in any cell of this notebook --
# confirm where it is supposed to come from before running this cell.
# The loop variable rebinds the notebook-level `m`, which is 'PRCP' afterwards.
for m in ['TMIN', 'TOBS', 'TMAX', 'SNOW', 'SNWD', 'PRCP']:
    print 'Reconstruction Plots for '+ m
    create_reconstructions(m)