### Smoothing Measurement sequences

In [2]:
%%time
import pandas as pd
import numpy as np
import sklearn as sk
import urllib
import math
%pylab inline

#import findspark
#findspark.init()

from pyspark import SparkContext
#sc.stop()
sc = SparkContext(pyFiles=['lib/numpy_pack.py','lib/spark_PCA.py','lib/computeStatistics.py'])

from pyspark import SparkContext
from pyspark.sql import *
import pyspark.sql
sqlContext = SQLContext(sc)

Populating the interactive namespace from numpy and matplotlib
CPU times: user 772 ms, sys: 88 ms, total: 860 ms
Wall time: 15.1 s


In [3]:
from pyspark.sql import Row

In [4]:
import numpy as np
from lib.numpy_pack import packArray,unpackArray
from lib.spark_PCA import computeCov
from lib.computeStatistics import computeOverAllDist, STAT_Descriptions

### Read data through open bucket

In [149]:
state='NY'
EMR=True
if not EMR:
    data_dir='../../Data/Weather'

    tarname=state+'.tgz'
    parquet=state+'.parquet'

    !rm -rf $data_dir/$tarname

    command="curl https://mas-dse-open.s3.amazonaws.com/Weather/by_state/%s > %s/%s"%(tarname,data_dir,tarname)
    print(command)
    !$command
    !ls -lh $data_dir/$tarname
    cur_dir,=!pwd
    %cd $data_dir
    !tar -xzf $tarname
    !du ./$parquet
    %cd $cur_dir

    #read statistics
    filename='STAT_%s.pickle'%state
    command="curl https://mas-dse-open.s3.amazonaws.com/Weather/by_state/%s.gz > %s/%s.gz"%(filename,data_dir,filename)
    print(command)
    !$command
    
    filename='US_stations.tsv.gz'
    command="curl https://mas-dse-open.s3.amazonaws.com/Weather/Info/%s > %s/%s"%(filename,data_dir,filename)
    print(command)
    !$command
    filename_no_gz = filename[:-3]
    !gunzip -f $data_dir/$filename
    !ls -lh $data_dir/US_stations*

### Read data when on EMR

In [6]:
%%time
if EMR:
    !hdfs dfs -ls /weather/

    stations_df=sqlContext.read.parquet('/weather/US_stations.parquet')
    stations_df.show(3)

    weather_df=sqlContext.read.parquet('/weather/US_weather.parquet')
    weather_df.show(2)
    

Found 2 items
drwxr-xr-x   - hadoop hadoop          0 2018-04-18 18:36 /weather/US_stations.parquet
drwxr-xr-x   - hadoop hadoop          0 2018-04-18 18:36 /weather/US_weather.parquet
+-----------+----------+--------+---------+---------+-----+----------------+
|    Station|dist_coast|latitude|longitude|elevation|state|            name|
+-----------+----------+--------+---------+---------+-----+----------------+
|USC00341900|   739.956|    36.3| -96.4667|    242.3|   OK|       CLEVELAND|
|USC00428114|    908.22|    40.1|-111.6667|   1409.1|   UT|SPANISH FORK 1 S|
|USC00165926|   23.8801| 29.7853| -90.1158|      0.9|   LA|   MARRERO 9 SSW|
+-----------+----------+--------+---------+---------+-----+----------------+
only showing top 3 rows

+-----------+-----------+----+--------------------+
|    Station|Measurement|Year|              Values|
+-----------+-----------+----+--------------------+
|CA001126150|       PRCP|1941|[00 7E 00 7E 00 7...|
|CA001126150|       PRCP|1942|[00 00 80 4A 

In [154]:
weather_df.count()

3259494

In [7]:
%%time

jdf=weather_df.join(stations_df,on='Station',how='left')
print(jdf.count())
jdf.show(2)

3259494
+-----------+-----------+----+--------------------+----------+--------+---------+---------+-----+-----------------+
|    Station|Measurement|Year|              Values|dist_coast|latitude|longitude|elevation|state|             name|
+-----------+-----------+----+--------------------+----------+--------+---------+---------+-----+-----------------+
|CA001126150|       PRCP|1941|[00 7E 00 7E 00 7...|   226.659|  49.467|   -119.6|    344.0|  NaN|PENTICTON AIRPORT|
|CA001126150|       PRCP|1942|[00 00 80 4A 00 0...|   226.659|  49.467|   -119.6|    344.0|  NaN|PENTICTON AIRPORT|
+-----------+-----------+----+--------------------+----------+--------+---------+---------+-----+-----------------+
only showing top 2 rows

CPU times: user 4 ms, sys: 4 ms, total: 8 ms
Wall time: 15.7 s


### Smoothing by convolving with gaussian window

In [8]:
# %load lib/numpy_pack.py
import numpy as np
"""Code for packing and unpacking a numpy array into a byte array.
   the array is flattened if it is not 1D.
   This is intended to be used as the interface for storing 
   
   This code is intended to be used to store numpy array as fields in a dataframe and then store the 
   dataframes in a parquet file.
"""

def packArray(a):
    """
    pack a numpy array into a bytearray that can be stored as a single 
    field in a spark DataFrame

    :param a: a numpy ndarray 
    :returns: a bytearray
    :rtype:

    """
    if type(a)!=np.ndarray:
        raise Exception("input to packArray should be numpy.ndarray. It is instead "+str(type(a)))
    return bytearray(a.tobytes())


def unpackArray(x,data_type=np.int16):
    """
    unpack a bytearray into a numpy.ndarray

    :param x: a bytearray
    :param data_type: The dtype of the array. This is important because if determines how many bytes go into each entry in the array.
    :returns: a numpy array
    :rtype: a numpy ndarray of dtype data_type.

    """
    return np.frombuffer(x,dtype=data_type)

In [89]:
from astropy.convolution import convolve
from scipy import signal
from copy import deepcopy
#using astrophy.convolution.convolve and not scipy.signal.convolve because the first can handle nans.

order=101
std=20
window = signal.gaussian(order, std=std)
window/=sum(window)

def Smoother(item):
    key,List = item
    
    sorted_List=sorted(List,key=lambda row:row['Year'])
    L=[(Row['Year'],unpackArray(Row['Values'],np.float16)) for Row in sorted_List]
    
    orig=np.stack([V[1] for V in L])
    print('orig.shape=',orig.shape)
    orig_shape=orig.shape
    orig=orig.flatten()
    smoothed = convolve(orig, window)
    smoothed=np.reshape(smoothed,orig_shape)
    print('smoothed.shape=',smoothed.shape)

    #create a list of Rows with the smoothed 
    new_L = []
    new_name = List[0]['Measurement']+'_s%d'%std
    for i in range(len(List)):
        new_row = deepcopy(List[i].asDict())
        new_row['Measurement']=new_name
        new_row['Values']=packArray(smoothed[i,:])
        new_L.append(Row(**new_row))

    return new_L


In [90]:
%%time
keyVal=jdf.rdd.map(lambda row:((row['Station'],row['Measurement']),[row]))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 39.3 µs


In [91]:
%%time
Reduced=keyVal.reduceByKey(lambda x,y:x+y)
item = Reduced.first()

CPU times: user 12 ms, sys: 4 ms, total: 16 ms
Wall time: 14.6 s


In [92]:
new_L = Smoother(item)
len(new_L[0]['Values'])

orig.shape= (39, 365)
smoothed.shape= (39, 365)


730

In [93]:
len(new_L),len(item[1])

(39, 39)

In [94]:
Smoothed=Reduced.flatMap(Smoother)

In [95]:
%%time
X=Smoothed.first()

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 3.09 s


In [16]:
%%time
Smoothed.cache().count()

CPU times: user 20 ms, sys: 0 ns, total: 20 ms
Wall time: 1min 7s


3259494

In [17]:
%%time
jdf.count()

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 3.23 s


3259494

### Changing order of dataframe columns

It turns out that to take the union of two dataframes, the order of the columns needs to be the same, it is not enough if the columns match by name

In [99]:
fieldNames=[X.name for X in jdf.schema.fields]
fieldNames

['Station',
 'Measurement',
 'Year',
 'Values',
 'dist_coast',
 'latitude',
 'longitude',
 'elevation',
 'state',
 'name']

In [None]:
%%time 
#Using the schema from JDF causes an error, it works without specifying a schema explicitly (the rows have the type info)
smoothed_jdf=sqlContext.createDataFrame(Smoothed,verifySchema=True)

In [150]:
smoothed_jdf=smoothed_jdf.select(fieldNames)

jdf.printSchema()

root
 |-- Station: string (nullable = true)
 |-- Measurement: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Values: binary (nullable = true)
 |-- dist_coast: float (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- elevation: float (nullable = true)
 |-- state: string (nullable = true)
 |-- name: string (nullable = true)



In [129]:
smoothed_jdf.printSchema()

root
 |-- Station: string (nullable = true)
 |-- Measurement: string (nullable = true)
 |-- Year: long (nullable = true)
 |-- Values: binary (nullable = true)
 |-- dist_coast: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- elevation: double (nullable = true)
 |-- state: string (nullable = true)
 |-- name: string (nullable = true)



## Combine original and smoothed rows into a single dataframe

In [151]:
%%time
Combined_jdf=jdf.union(smoothed_jdf)
Combined_jdf.count()

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 1min 15s


In [155]:
Combined_jdf.count()

6518988

In [156]:
%%time
filename='US_Weather_with_smoothed.parquet'
outfilename='/weather/'+filename
s3filename='s3://dse-weather/'+filename
!hdfs dfs -rm -r $outfilename   #remove old copy
Combined_jdf.write.save(outfilename)

Deleted /weather/US_Weather_with_smoothed.parquet
CPU times: user 60 ms, sys: 16 ms, total: 76 ms
Wall time: 1min 24s


In [157]:
!hdfs dfs -du -h /weather/

2.6 G    /weather/US_Weather_with_smoothed.parquet
827.9 K  /weather/US_stations.parquet
1.9 G    /weather/US_weather.parquet


In [158]:
%%time
!s3-dist-cp --src $outfilename --dest $s3filename

18/04/19 00:06:17 INFO s3distcp.S3DistCp: Running with args: -libjars /usr/share/aws/emr/s3-dist-cp/lib/commons-httpclient-3.1.jar,/usr/share/aws/emr/s3-dist-cp/lib/commons-logging-1.0.4.jar,/usr/share/aws/emr/s3-dist-cp/lib/guava-18.0.jar,/usr/share/aws/emr/s3-dist-cp/lib/s3-dist-cp-2.10.0.jar,/usr/share/aws/emr/s3-dist-cp/lib/s3-dist-cp.jar --src /weather/US_Weather_with_smoothed.parquet --dest s3://dse-weather/US_Weather_with_smoothed.parquet 
18/04/19 00:06:18 INFO s3distcp.S3DistCp: S3DistCp args: --src /weather/US_Weather_with_smoothed.parquet --dest s3://dse-weather/US_Weather_with_smoothed.parquet 
18/04/19 00:06:18 INFO s3distcp.S3DistCp: Using output path 'hdfs:/tmp/b0136120-fe18-4f2d-9cc4-d15f60fd3009/output'
18/04/19 00:06:18 INFO s3distcp.S3DistCp: GET http://169.254.169.254/latest/meta-data/placement/availability-zone result: us-east-1a
18/04/19 00:06:19 INFO s3distcp.FileInfoListing: Opening new file: hdfs:/tmp/b0136120-fe18-4f2d-9cc4-d15f60fd3009/files/1
18/04/19 00:06:

In [166]:
!aws s3 ls s3://dse-weather/US_Weather_with_smoothed.parquet/

2018-04-19 00:06:40          0 _SUCCESS
2018-04-19 00:06:42   79711937 part-00000-e97f4502-af58-4cb8-a949-14e7c6d9e6b3-c000.snappy.parquet
2018-04-19 00:06:42   80341446 part-00001-e97f4502-af58-4cb8-a949-14e7c6d9e6b3-c000.snappy.parquet
2018-04-19 00:06:47   77394575 part-00002-e97f4502-af58-4cb8-a949-14e7c6d9e6b3-c000.snappy.parquet
2018-04-19 00:06:39   74556551 part-00003-e97f4502-af58-4cb8-a949-14e7c6d9e6b3-c000.snappy.parquet
2018-04-19 00:06:44   77102526 part-00004-e97f4502-af58-4cb8-a949-14e7c6d9e6b3-c000.snappy.parquet
2018-04-19 00:06:46   74678869 part-00005-e97f4502-af58-4cb8-a949-14e7c6d9e6b3-c000.snappy.parquet
2018-04-19 00:06:40   72640052 part-00006-e97f4502-af58-4cb8-a949-14e7c6d9e6b3-c000.snappy.parquet
2018-04-19 00:06:40   70800754 part-00007-e97f4502-af58-4cb8-a949-14e7c6d9e6b3-c000.snappy.parquet
2018-04-19 00:06:43   68902426 part-00008-e97f4502-af58-4cb8-a949-14e7c6d9e6b3-c000.snappy.parquet
2018-04-19 00:06:45   71010476 part-00009-e97f4502-af58-4cb

's3://dse-weather/US_Weather_with_smoothed.parquet'