### Visualizing weather data

In [1]:
%%time
import pandas as pd
import numpy as np
import sklearn as sk
import urllib
import math
%pylab inline

#import findspark
#findspark.init()

from pyspark import SparkContext
#sc.stop()
sc = SparkContext(pyFiles=['lib/numpy_pack.py','lib/spark_PCA.py','lib/computeStatistics.py'])

from pyspark import SparkContext
from pyspark.sql import *
import pyspark.sql
sqlContext = SQLContext(sc)

Populating the interactive namespace from numpy and matplotlib
CPU times: user 812 ms, sys: 72 ms, total: 884 ms
Wall time: 14.1 s


In [46]:
from pyspark.sql import Row

In [3]:
import numpy as np
from lib.numpy_pack import packArray,unpackArray
from lib.spark_PCA import computeCov
from lib.computeStatistics import computeOverAllDist, STAT_Descriptions

### Read data through open bucket

In [4]:
# Configuration: `state` selects which per-state files are fetched below.
# With EMR=True the whole download branch is skipped (the data is then read
# from HDFS in the "Read data when on EMR" cell).
state='NY'
EMR=True
if not EMR:
    # Local directory that receives every downloaded file.
    data_dir='../../Data/Weather'

    tarname=state+'.tgz'
    parquet=state+'.parquet'

    # Remove any previously downloaded archive before fetching a fresh copy.
    !rm -rf $data_dir/$tarname

    # Download the per-state archive from the public bucket into data_dir.
    command="curl https://mas-dse-open.s3.amazonaws.com/Weather/by_state/%s > %s/%s"%(tarname,data_dir,tarname)
    print(command)
    !$command
    !ls -lh $data_dir/$tarname
    # Remember the current directory (single-element unpacking of the `pwd`
    # output), extract the archive inside data_dir, then change back.
    cur_dir,=!pwd
    %cd $data_dir
    !tar -xzf $tarname
    !du ./$parquet
    %cd $cur_dir

    # Download the gzipped per-state statistics pickle.
    # It is only fetched here; this cell neither unzips nor loads it.
    filename='STAT_%s.pickle'%state
    command="curl https://mas-dse-open.s3.amazonaws.com/Weather/by_state/%s.gz > %s/%s.gz"%(filename,data_dir,filename)
    print(command)
    !$command
    
    # Download the gzipped station table and unzip it in place
    # (gunzip -f overwrites an existing unzipped file).
    filename='US_stations.tsv.gz'
    command="curl https://mas-dse-open.s3.amazonaws.com/Weather/Info/%s > %s/%s"%(filename,data_dir,filename)
    print(command)
    !$command
    # NOTE(review): filename_no_gz is assigned but not used anywhere in this cell.
    filename_no_gz = filename[:-3]
    !gunzip -f $data_dir/$filename
    !ls -lh $data_dir/US_stations*

### Read data when on EMR

In [6]:
%%time
# On EMR the tables are read directly from parquet files stored on HDFS.
# stations_df and weather_df are only defined when EMR is True.
if EMR:
    # List what is available under /weather/ on HDFS.
    !hdfs dfs -ls /weather/

    # Station table; show(3) prints a 3-row preview.
    stations_df=sqlContext.read.parquet('/weather/US_stations.parquet')
    stations_df.show(3)

    # Weather measurements table; show(2) prints a 2-row preview.
    weather_df=sqlContext.read.parquet('/weather/US_weather.parquet')
    weather_df.show(2)
    

Found 2 items
drwxr-xr-x   - hadoop hadoop          0 2018-04-09 04:42 /weather/US_stations.parquet
drwxr-xr-x   - hadoop hadoop          0 2018-04-09 04:42 /weather/US_weather.parquet
+-----------+----------+--------+---------+---------+-----+----------------+
|    Station|dist_coast|latitude|longitude|elevation|state|            name|
+-----------+----------+--------+---------+---------+-----+----------------+
|USC00341900|   739.956|    36.3| -96.4667|    242.3|   OK|       CLEVELAND|
|USC00428114|    908.22|    40.1|-111.6667|   1409.1|   UT|SPANISH FORK 1 S|
|USC00165926|   23.8801| 29.7853| -90.1158|      0.9|   LA|   MARRERO 9 SSW|
+-----------+----------+--------+---------+---------+-----+----------------+
only showing top 3 rows

+-----------+-----------+----+--------------------+
|    Station|Measurement|Year|              Values|
+-----------+-----------+----+--------------------+
|CA001126150|       PRCP|1941|[00 7E 00 7E 00 7...|
|CA001126150|       PRCP|1942|[00 00 80 4A 

In [8]:
%%time

# Left join on 'Station': every weather row is kept and gets the columns of
# the matching station row attached.
jdf=weather_df.join(stations_df,on='Station',how='left')
# count() triggers the actual computation of the join.
print(jdf.count())
jdf.show(2)

3259494
+-----------+-----------+----+--------------------+----------+--------+---------+---------+-----+-----------------+
|    Station|Measurement|Year|              Values|dist_coast|latitude|longitude|elevation|state|             name|
+-----------+-----------+----+--------------------+----------+--------+---------+---------+-----+-----------------+
|CA001126150|       PRCP|1941|[00 7E 00 7E 00 7...|   226.659|  49.467|   -119.6|    344.0|  NaN|PENTICTON AIRPORT|
|CA001126150|       PRCP|1942|[00 00 80 4A 00 0...|   226.659|  49.467|   -119.6|    344.0|  NaN|PENTICTON AIRPORT|
+-----------+-----------+----+--------------------+----------+--------+---------+---------+-----+-----------------+
only showing top 2 rows

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 2.3 s


### Smoothing by convolving with gaussian window

In [69]:
# %load lib/numpy_pack.py
import numpy as np
"""Code for packing and unpacking a numpy array into a byte array.
   The array is flattened if it is not 1D.

   This code is intended to be used to store numpy arrays as fields in a dataframe and then store the
   dataframes in a parquet file.
"""

def packArray(a):
    """
    Serialize a numpy array into raw bytes so that it can be stored as a
    single field of a spark DataFrame.

    :param a: the array to pack; its type must be exactly numpy.ndarray
    :returns: the bytes produced by ``a.tobytes()``
    :rtype: bytearray
    :raises Exception: if ``a`` is not a numpy.ndarray

    """
    received = type(a)
    if received is np.ndarray:
        return bytearray(a.tobytes())
    # Anything else (lists, scalars, ndarray subclasses, ...) is rejected.
    raise Exception("input to packArray should be numpy.ndarray. It is instead "+str(received))


def unpackArray(x,data_type=np.int16):
    """
    Rebuild a numpy array from a byte buffer (the inverse of packing an
    array into bytes).

    :param x: a bytearray holding the raw bytes of the array
    :param data_type: dtype used to interpret the bytes. This matters because it determines how many bytes make up each entry of the array.
    :returns: the array read from ``x``
    :rtype: a numpy ndarray of dtype data_type.

    """
    unpacked = np.frombuffer(x, dtype=data_type)
    return unpacked

In [70]:
from astropy.convolution import convolve
from scipy import signal
from copy import deepcopy
#using astropy.convolution.convolve and not scipy.signal.convolve because the former can handle NaNs.

# Gaussian smoothing kernel used by Smoother.
#   order: number of points in the window (odd, so it has a single centre point)
#   std:   standard deviation passed to the gaussian window; it is also used
#          by Smoother to build the name of the smoothed measurement.
order=101
std=20
# scipy.signal.gaussian was only an alias and is no longer available in
# recent SciPy releases; scipy.signal.windows.gaussian is the same function
# at its supported location.
window = signal.windows.gaussian(order, std=std)
# Normalise to unit sum so that smoothing preserves the scale of the data.
# Using the array's own .sum() avoids depending on %pylab replacing the
# builtin sum with numpy's.
window/=window.sum()

def Smoother(item):
    """
    Smooth all the yearly vectors that share one key with the gaussian
    ``window`` defined at notebook level.

    The rows are sorted by 'Year', their unpacked 'Values' are stacked into a
    matrix, the matrix is flattened into one long series, convolved with
    ``window`` and reshaped back to one row per year.

    :param item: a pair ``(key, List)``. ``List`` is a list of Rows, each with
                 the fields 'Year', 'Measurement' and 'Values' ('Values' is
                 unpacked as float16). ``key`` itself is not used.
    :returns: a list of new Rows in increasing 'Year' order. Each is a copy of
              the corresponding input Row with 'Measurement' replaced by
              ``<Measurement>_s<std>`` and 'Values' replaced by the packed
              smoothed vector. An empty ``List`` yields an empty list.
    :rtype: list

    """
    key,List = item
    if len(List)==0:
        # Nothing to smooth; np.stack would raise on an empty sequence.
        return []

    # Sort chronologically so that consecutive years are adjacent in the
    # flattened series.
    sorted_List=sorted(List,key=lambda row:row['Year'])

    orig=np.stack([unpackArray(row['Values'],np.float16) for row in sorted_List])
    # Debug trace of the matrix shape: (number of rows, values per row).
    print('orig.shape=',orig.shape)
    orig_shape=orig.shape
    orig=orig.flatten()
    # astropy's convolve is used because it can handle NaNs.
    smoothed = convolve(orig, window)
    smoothed=np.reshape(smoothed,orig_shape)

    #create a list of Rows with the smoothed values
    new_L = []
    new_name = List[0]['Measurement']+'_s%d'%std
    # Row i of `smoothed` belongs to sorted_List[i], NOT to List[i]: taking
    # the metadata from the unsorted list would attach smoothed values to
    # the wrong year whenever List does not arrive already sorted.
    for source_row,smoothed_values in zip(sorted_List,smoothed):
        new_row = source_row.asDict()
        new_row['Measurement']=new_name
        new_row['Values']=packArray(smoothed_values)
        new_L.append(Row(**new_row))

    return new_L


In [71]:
new_L = Smoother(item)
new_L[0]

orig.shape= (39, 365)


Row(Measurement='TMAX_s20', Station='CA007016902', Values=bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~

In [72]:
%%time
# Key every joined row by (Station, Measurement). The value is a one-element
# list holding the row, so that the lists of equal keys can be concatenated.
keyVal=jdf.rdd.map(lambda row:((row['Station'],row['Measurement']),[row]))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 36.5 µs


In [73]:
%%time
# Concatenate the per-row lists so each key ends up with one list of all its rows.
Reduced=keyVal.reduceByKey(lambda x,y:x+y)
# Smoother returns a list of Rows for each (key, list) pair; flatMap merges
# those lists into a single RDD of Rows.
Smoothed=Reduced.flatMap(Smoother)

CPU times: user 8 ms, sys: 0 ns, total: 8 ms
Wall time: 21.9 ms


In [65]:
from time import time

In [74]:
#t0=time()
X=Smoothed.first()
#t1=time()
#X

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 33.0 failed 4 times, most recent failure: Lost task 0.3 in stage 33.0 (TID 155, ip-10-129-239-229.ec2.internal, executor 10): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/mnt/yarn/usercache/hadoop/appcache/application_1523247921717_0004/container_1523247921717_0004_01_000011/pyspark.zip/pyspark/worker.py", line 216, in main
    func, profiler, deserializer, serializer = read_command(pickleSer, infile)
  File "/mnt/yarn/usercache/hadoop/appcache/application_1523247921717_0004/container_1523247921717_0004_01_000011/pyspark.zip/pyspark/worker.py", line 58, in read_command
    command = serializer._read_with_length(file)
  File "/mnt/yarn/usercache/hadoop/appcache/application_1523247921717_0004/container_1523247921717_0004_01_000011/pyspark.zip/pyspark/serializers.py", line 170, in _read_with_length
    return self.loads(obj)
  File "/mnt/yarn/usercache/hadoop/appcache/application_1523247921717_0004/container_1523247921717_0004_01_000011/pyspark.zip/pyspark/serializers.py", line 559, in loads
    return pickle.loads(obj, encoding=encoding)
ImportError: No module named 'astropy'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$$anonfun$1.apply(PythonRDD.scala:141)
	at org.apache.spark.api.python.PythonRDD$$anonfun$1.apply(PythonRDD.scala:141)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1750)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1738)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1737)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1737)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:871)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:871)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:871)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1971)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1920)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1909)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:682)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:141)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/mnt/yarn/usercache/hadoop/appcache/application_1523247921717_0004/container_1523247921717_0004_01_000011/pyspark.zip/pyspark/worker.py", line 216, in main
    func, profiler, deserializer, serializer = read_command(pickleSer, infile)
  File "/mnt/yarn/usercache/hadoop/appcache/application_1523247921717_0004/container_1523247921717_0004_01_000011/pyspark.zip/pyspark/worker.py", line 58, in read_command
    command = serializer._read_with_length(file)
  File "/mnt/yarn/usercache/hadoop/appcache/application_1523247921717_0004/container_1523247921717_0004_01_000011/pyspark.zip/pyspark/serializers.py", line 170, in _read_with_length
    return self.loads(obj)
  File "/mnt/yarn/usercache/hadoop/appcache/application_1523247921717_0004/container_1523247921717_0004_01_000011/pyspark.zip/pyspark/serializers.py", line 559, in loads
    return pickle.loads(obj, encoding=encoding)
ImportError: No module named 'astropy'

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$class.foreach(Iterator.scala:893)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable$class.$plus$plus$eq(Growable.scala:59)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:104)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:48)
	at scala.collection.TraversableOnce$class.to(TraversableOnce.scala:310)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toBuffer(TraversableOnce.scala:302)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce$class.toArray(TraversableOnce.scala:289)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.api.python.PythonRDD$$anonfun$1.apply(PythonRDD.scala:141)
	at org.apache.spark.api.python.PythonRDD$$anonfun$1.apply(PythonRDD.scala:141)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:2067)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [13]:
len(item[1])

19

In [14]:
item[0]

('USC00330195', 'TMAX')

In [14]:
from time import time
# For each station: fetch all rows of one measurement type (every year,
# ordered by year), smooth them, and append the result to jdf.
# NOTE(review): this cell relies on state that is not created in the visible
# notebook: `stations` is never defined, and the SQL query needs `jdf` to be
# registered as a table/view named "jdf". Confirm before re-running.
measurement='PRCP'
Query_template="""
SELECT *
FROM jdf 
WHERE Measurement='%s'
AND Station='%s'
ORDER BY YEAR"""

for station in stations[:2]:
    t0=time()
    Query=Query_template%(measurement,station)

    pdf=sqlContext.sql(Query).toPandas()
    t1=time()
    # NOTE(review): the Smoother defined earlier in this notebook unpacks a
    # (key, rows) pair, whereas here it receives a pandas DataFrame;
    # presumably this loop was written against an older Smoother. Confirm.
    smoothed_pdf=Smoother(pdf)
    t2=time()
    smoothed_df= sqlContext.createDataFrame(smoothed_pdf)
    jdf=jdf.union(smoothed_df)
    t3=time()
    # Timing report. The last two specifiers used to be '%f5.2', which prints
    # the float followed by a literal "5.2" (e.g. "0.0764225.2").
    print('Station=%s, rows=%d, prep=%5.2f,compute=%5.2f,cleanup=%5.2f,total=%5.2f'
          %(station,pdf.shape[0],t1-t0,t2-t1,t3-t2,t3-t0))

Station=CA006149625, rows=144, prep=28.42,compute= 0.03,cleanup=0.0764225.2,total=28.5226245.2
Station=USC00144559, rows=141, prep= 8.84,compute= 0.03,cleanup=0.0378165.2,total=8.9030835.2


In [None]:
jdf.count()

In [None]:
!ls ../../Data/Weather/

In [None]:
outfilename='../../Data/Weather/Joined_smoothed_PRCP.parquet'
jdf.write.save(outfilename)

In [None]:
!du -sh $outfilename

In [None]:
print(pdf.columns)
sdf = sqlContext.createDataFrame(pdf)
sdf.schema

In [None]:
# 'smoothed_%s'%(station),
sdf.count()

In [None]:
jdf.count()

In [None]:
jdf=jdf.union(sdf)
jdf.count()

## BinaryType not supported  by pandas_udf
Running the following code: 
```python
import pyspark.sql.functions as sqlf
import pyspark
import pyarrow
pyspark.__version__  (2.3.0)

from pyspark.sql.functions import pandas_udf, PandasUDFType
def Smoother(orig_pdf):
    return orig_pdf

### Offending command
smoother_udf=pandas_udf(Smoother,df.select(['Station','Year','Values']).schema, PandasUDFType.GROUPED_MAP) 

X=df.groupby("Station").apply(smoother_udf)
X.show()
```
Generates the following error message
```
NotImplementedError: Invalid returnType with grouped map Pandas UDFs: StructType(List(StructField(Station,StringType,true),StructField(Year,IntegerType,true),StructField(Values,BinaryType,true))) is not supported
```

Works fine if only ('Station','Year') are used.

In [None]:
orig_df.schema

In [None]:
from lib.YearPlotter import YearPlotter
fig, ax = plt.subplots(figsize=(10,7));
YP=YearPlotter()
YP.plot(smoothed[110:120,:].transpose(),fig,ax,title='smoothed %s for %s'%(measurement,stat));
plt.savefig('percipitation.png')
#title('A sample of graphs');

In [None]:
fig, ax = plt.subplots(figsize=(10,7));
YP=YearPlotter()
i=85
factor=5
pair=np.stack([orig[i,:],smoothed[i,:]*factor])
pair.shape

YP.plot(pair.transpose(),fig,ax,title='smoothed %s for %s'%(measurement,stat));

In [None]:
from scipy import signal
from astropy.convolution import convolve
# 81-point gaussian kernel (std=20) for the smoothing experiment.
# scipy.signal.gaussian was only an alias and is no longer available in
# recent SciPy releases; scipy.signal.windows.gaussian is the same function
# at its supported location.
window = signal.windows.gaussian(81, std=20)

# Normalise to unit sum; the array's own .sum() does not depend on %pylab
# replacing the builtin sum.
window/=window.sum()

In [None]:
# Demonstrate smoothing across a gap of missing values on one row of T.
# Work on a copy: T[3,:] on its own is a view, so writing NaNs through it
# would silently overwrite T and make this cell give different results when
# run again.
# NOTE(review): assumes T is a 2-D numpy float array - confirm.
P=T[3,:].copy()
P[10:30]=np.nan
# astropy's convolve is used because it can handle the NaNs injected above.
f=filtered = convolve(P, window)
print(len(f))
plot(f)
plot(P)

### Distribution of missing observations
The distribution of missing observations is not uniform throughout the year. We visualize it below.

In [None]:
from MultiPlot import *                
def plot_valid(m,fig,axis):
    """
    Plot the 'NE' statistic of one measurement with the notebook-level
    YearPlotter ``YP``.

    :param m: measurement name; used as the key into ``STAT`` and appended to the plot title
    :param fig: figure handed through to ``YP.plot``
    :param axis: axis handed through to ``YP.plot``
    """
    # Presumably 'NE' holds the per-day count of valid observations, as the
    # title suggests; verify against the code that computes STAT.
    counts=STAT[m]['NE']
    plot_title='valid-counts '+m
    YP.plot(counts,fig,axis,title=plot_title)
    

In [None]:
plot_pair(['TMIN','TMAX'],plot_valid)

In [None]:
plot_pair(['TOBS','PRCP'],plot_valid)

In [None]:
plot_pair(['SNOW', 'SNWD'],plot_valid)

### Plots of mean and std of observations

In [None]:
def plot_mean_std(m,fig,axis):
    """
    Plot three curves for one measurement: mean+std, mean and mean-std,
    taken from ``STAT[m]['Mean']`` and the square root of ``STAT[m]['Var']``.

    :param m: measurement name; key into ``STAT`` and part of the title
    :param fig: figure handed through to ``YP.plot``
    :param axis: axis handed through to ``YP.plot``; also receives the y label
    """
    temperature_names=('TMIN','TMAX','TOBS')
    water_names=('PRCP','SNOW','SNWD')

    # Temperatures and PRCP are divided by 10 before plotting; everything
    # else is plotted as stored.
    # NOTE(review): presumably these are stored in tenths of a unit - confirm.
    divisor=10. if (m in temperature_names or m=='PRCP') else 1.

    center=STAT[m]['Mean']/divisor
    spread=np.sqrt(STAT[m]['Var'])/divisor
    # One column per curve, in the same order as the labels below.
    curves=np.vstack([center+spread,center,center-spread]).transpose()
    YP.plot(curves,fig,axis,labels=['mean+std','mean','mean-std'],title='Mean+-std   '+m)

    if m in temperature_names:
        axis.set_ylabel('Degrees Celsius')
    if m in water_names:
        axis.set_ylabel('millimeter')



In [None]:
plot_pair(['TMIN','TMAX'],plot_mean_std)

In [None]:
plot_pair(['TOBS','PRCP'],plot_mean_std)

In [None]:
plot_single('TOBS',plot_mean_std,'r_figures/TOBS.png')

In [None]:
plot_pair(['SNOW', 'SNWD'],plot_mean_std)

In [None]:
plot_single('SNOW',plot_mean_std,'r_figures/SNOW.png')

In [None]:
plot_single('SNWD',plot_mean_std,'r_figures/SNWD.png')

### plotting top 3 eigenvectors

In [None]:
def plot_eigen(m,fig,axis):
    """
    Plot the first three columns of ``STAT[m]['eigvec']`` with the
    notebook-level YearPlotter ``YP``.

    :param m: measurement name; key into ``STAT`` and part of the title
    :param fig: figure handed through to ``YP.plot``
    :param axis: axis handed through to ``YP.plot``
    """
    eigenvectors=STAT[m]['eigvec']
    leading=eigenvectors[:,:3]
    YP.plot(leading,fig,axis,title='Top Eigenvectors '+m)

In [None]:
plot_pair(['TMIN','TMAX'],plot_eigen)

In [None]:
plot_pair(['TOBS','PRCP'],plot_eigen)

In [None]:
plot_pair(['SNOW', 'SNWD'],plot_eigen)

### Script for plotting percentage of variance explained

In [None]:
def pltVarExplained(j):
    """
    Draw, in panel ``j`` of a 1x3 subplot grid, the cumulative share of
    variance covered by the first five eigenvalues of a measurement.

    The measurement name is NOT a parameter: it is read from the global
    variable ``m``, which the calling cell sets as its loop variable.
    ``subplot``, ``cumsum``, ``sum``, ``plot``, ``title``, ``ylabel``,
    ``xlabel`` and ``grid`` are the names injected by ``%pylab inline``.

    :param j: 1-based index of the subplot panel to draw into
    """
    n_top=5
    subplot(1,3,j)
    eigvals=STAT[m]['eigval']
    # Leading 0 so the curve starts at "no eigenvectors, nothing explained".
    running_total=[0,]+list(cumsum(eigvals[:n_top]))
    # Dividing by the total turns the cumulative sums into fractions.
    explained=running_total/sum(eigvals)
    plot(explained)
    title('Percentage of Variance Explained for '+ m)
    ylabel('Percentage of Variance')
    xlabel('# Eigenvector')
    grid()
    

In [None]:
f=plt.figure(figsize=(15,4))
j=1
for m in ['TMIN', 'TOBS', 'TMAX']: #,
    pltVarExplained(j)
    j+=1
f.savefig('r_figures/VarExplained1.png')

In [None]:
f=plt.figure(figsize=(15,4))
j=1
for m in ['SNOW', 'SNWD', 'PRCP']:
    pltVarExplained(j)
    j+=1 
f.savefig('r_figures/VarExplained2.png')

In [None]:
#sc.stop()