### Smoothing Measurement sequences

In [5]:
%%time
import pandas as pd
import numpy as np
import sklearn as sk
import urllib
import math
%pylab inline

#import findspark
#findspark.init()

from pyspark import SparkContext
#sc.stop()
sc = SparkContext(pyFiles=['lib/numpy_pack.py','lib/spark_PCA.py','lib/computeStatistics.py'])

from pyspark import SparkContext
from pyspark.sql import *
import pyspark.sql
sqlContext = SQLContext(sc)

Populating the interactive namespace from numpy and matplotlib
CPU times: user 1.59 s, sys: 64 ms, total: 1.65 s
Wall time: 15.6 s


In [6]:
from pyspark.sql import Row

In [7]:
import numpy as np
from lib.numpy_pack import packArray,unpackArray
from lib.spark_PCA import computeCov
from lib.computeStatistics import computeOverAllDist, STAT_Descriptions

### Read data through open bucket

In [8]:
state='NY'
EMR=True
if not EMR:
    data_dir='../../Data/Weather'

    tarname=state+'.tgz'
    parquet=state+'.parquet'

    !rm -rf $data_dir/$tarname

    command="curl https://mas-dse-open.s3.amazonaws.com/Weather/by_state/%s > %s/%s"%(tarname,data_dir,tarname)
    print(command)
    !$command
    !ls -lh $data_dir/$tarname
    cur_dir,=!pwd
    %cd $data_dir
    !tar -xzf $tarname
    !du ./$parquet
    %cd $cur_dir

    #read statistics
    filename='STAT_%s.pickle'%state
    command="curl https://mas-dse-open.s3.amazonaws.com/Weather/by_state/%s.gz > %s/%s.gz"%(filename,data_dir,filename)
    print(command)
    !$command
    
    filename='US_stations.tsv.gz'
    command="curl https://mas-dse-open.s3.amazonaws.com/Weather/Info/%s > %s/%s"%(filename,data_dir,filename)
    print(command)
    !$command
    filename_no_gz = filename[:-3]
    !gunzip -f $data_dir/$filename
    !ls -lh $data_dir/US_stations*

### Read data when on EMR

In [9]:
%%time
if EMR:
    !hdfs dfs -ls /weather/

    stations_df=sqlContext.read.parquet('/weather/US_stations.parquet')
    stations_df.show(3)

    weather_df=sqlContext.read.parquet('/weather/US_weather.parquet')
    weather_df.show(2)
    

Found 3 items
drwxr-xr-x   - hadoop hadoop          0 2018-04-22 15:31 /weather/US_Weather_with_smoothed.parquet
drwxr-xr-x   - hadoop hadoop          0 2018-04-22 15:20 /weather/US_stations.parquet
drwxr-xr-x   - hadoop hadoop          0 2018-04-22 15:20 /weather/US_weather.parquet
+-----------+----------+--------+---------+---------+-----+----------------+
|    Station|dist_coast|latitude|longitude|elevation|state|            name|
+-----------+----------+--------+---------+---------+-----+----------------+
|USC00341900|   739.956|    36.3| -96.4667|    242.3|   OK|       CLEVELAND|
|USC00428114|    908.22|    40.1|-111.6667|   1409.1|   UT|SPANISH FORK 1 S|
|USC00165926|   23.8801| 29.7853| -90.1158|      0.9|   LA|   MARRERO 9 SSW|
+-----------+----------+--------+---------+---------+-----+----------------+
only showing top 3 rows

+-----------+-----------+----+--------------------+
|    Station|Measurement|Year|              Values|
+-----------+-----------+----+------------------

In [10]:
weather_df.count()

3259494

In [11]:
%%time

jdf=weather_df.join(stations_df,on='Station',how='left')
print(jdf.count())
jdf.show(2)

3259494
+-----------+-----------+----+--------------------+----------+--------+---------+---------+-----+-----------------+
|    Station|Measurement|Year|              Values|dist_coast|latitude|longitude|elevation|state|             name|
+-----------+-----------+----+--------------------+----------+--------+---------+---------+-----+-----------------+
|CA001126150|       PRCP|1941|[00 7E 00 7E 00 7...|   226.659|  49.467|   -119.6|    344.0|  NaN|PENTICTON AIRPORT|
|CA001126150|       PRCP|1942|[00 00 80 4A 00 0...|   226.659|  49.467|   -119.6|    344.0|  NaN|PENTICTON AIRPORT|
+-----------+-----------+----+--------------------+----------+--------+---------+---------+-----+-----------------+
only showing top 2 rows

CPU times: user 0 ns, sys: 4 ms, total: 4 ms
Wall time: 13.1 s


### Smoothing by convolving with gaussian window

In [12]:
# %load lib/numpy_pack.py
import numpy as np
"""Code for packing and unpacking a numpy array into a byte array.
   the array is flattened if it is not 1D.
   This is intended to be used as the interface for storing 
   
   This code is intended to be used to store numpy array as fields in a dataframe and then store the 
   dataframes in a parquet file.
"""

def packArray(a):
    """
    pack a numpy array into a bytearray that can be stored as a single 
    field in a spark DataFrame

    :param a: a numpy ndarray 
    :returns: a bytearray
    :rtype:

    """
    if type(a)!=np.ndarray:
        raise Exception("input to packArray should be numpy.ndarray. It is instead "+str(type(a)))
    return bytearray(a.tobytes())


def unpackArray(x,data_type=np.int16):
    """
    unpack a bytearray into a numpy.ndarray

    :param x: a bytearray
    :param data_type: The dtype of the array. This is important because if determines how many bytes go into each entry in the array.
    :returns: a numpy array
    :rtype: a numpy ndarray of dtype data_type.

    """
    return np.frombuffer(x,dtype=data_type)

In [197]:
from astropy.convolution import convolve
from scipy import signal
from copy import deepcopy
#using astrophy.convolution.convolve and not scipy.signal.convolve because the first can handle nans.

order=101
std=20
window = signal.gaussian(order, std=std)
window/=sum(window)

def Smoother(item):
    key,List = item
    
    sorted_List=sorted(List,key=lambda row:row['Year'])
    L=[(Row['Year'],unpackArray(Row['Values'],np.float16)) for Row in sorted_List]
    
    orig=np.stack([V[1] for V in L])
    print('orig.shape=',orig.shape)
    orig_shape=orig.shape
    orig=orig.flatten()
    smoothed = convolve(orig, window)
    smoothed=np.reshape(smoothed,orig_shape)
    smoothed=np.array(smoothed,dtype=np.float16)
    print('smoothed.shape=',smoothed.shape)

    #create a list of Rows with the smoothed 
    new_L = []
    new_name = List[0]['Measurement']+'_s%d'%std
    for i in range(len(List)):
        row=List[i]
        new_row=Row(Station = row['Station'],\
                    Measurement=new_name,\
                    Year = int(row['Year']),\
                    Values = packArray(smoothed[i,:]),\
                    dist_coast = float(row['dist_coast']),\
                    latitude = float(row['latitude']),\
                    longitude = float(row['longitude']),\
                    elevation = float(row['elevation']),\
                    state = row['state'],\
                    name = row['name'])

        new_L.append(new_row)
    return new_L

In [198]:
Smoother(item)

orig.shape= (11, 365)
smoothed.shape= (11, 365)


[Row(Measurement='SNOW_s20', Station='USC00413644', Values=bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\

In [169]:
for i in range(len(jdf.schema.fields)):
    s=jdf.schema.fields[i]
    print("%s = row['%s'],\\"%(s.name,s.name))

Station = row['Station'],\
Measurement = row['Measurement'],\
Year = row['Year'],\
Values = row['Values'],\
dist_coast = row['dist_coast'],\
latitude = row['latitude'],\
longitude = row['longitude'],\
elevation = row['elevation'],\
state = row['state'],\
name = row['name'],\


In [170]:
from pyspark.sql import Row

In [199]:
%%time
keyVal=jdf.rdd.map(lambda row:((row['Station'],row['Measurement']),[row]))

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 39.6 µs


In [200]:
%%time
Reduced=keyVal.reduceByKey(lambda x,y:x+y)
item = Reduced.first()

CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 14.5 s


In [201]:
new_L = Smoother(item)
len(new_L[0]['Values'])

orig.shape= (11, 365)
smoothed.shape= (11, 365)


730

In [202]:
len(new_L),len(item[1])

(11, 11)

In [203]:
Smoothed=Reduced.flatMap(Smoother)

In [204]:
%%time
X=Smoothed.first()

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 3.1 s


In [207]:
%%time
Smoothed.cache().count()

CPU times: user 12 ms, sys: 0 ns, total: 12 ms
Wall time: 1.7 s


3259494

In [191]:
X=Smoothed.first()
Y=jdf.first()

In [196]:
type(X['Year']),type(Y['Year'])

(numpy.int16, int)

In [179]:
%%time
jdf.count()

CPU times: user 4 ms, sys: 0 ns, total: 4 ms
Wall time: 7.58 s


3259494

In [208]:
row,=Smoothed.take(1)
D=row.asDict()
[(name,type(D[name])) for name in D.keys()]

[('elevation', float),
 ('Values', bytearray),
 ('Year', int),
 ('name', str),
 ('dist_coast', float),
 ('longitude', float),
 ('Measurement', str),
 ('latitude', float),
 ('Station', str),
 ('state', str)]

In [181]:
row['Station']

'USC00413644'

### Changing order of dataframe columns

It turns out that to take the union of two dataframes, the order of the columns needs to be the same, it is not enough if the columns match by name

In [182]:
len(jdf.schema.fields)

10

In [183]:
from pyspark.sql.types import StringType, FloatType, BinaryType, IntegerType
typeMap={StringType:str,
         FloatType:float,
         IntegerType:int,
         BinaryType:bytearray}
typeMap

{pyspark.sql.types.StringType: str,
 pyspark.sql.types.IntegerType: int,
 pyspark.sql.types.FloatType: float,
 pyspark.sql.types.BinaryType: bytearray}

In [184]:
for i in range(len(jdf.schema.fields)):
    s=jdf.schema.fields[i]
    print(s.name,s.dataType)

Station StringType
Measurement StringType
Year IntegerType
Values BinaryType
dist_coast FloatType
latitude FloatType
longitude FloatType
elevation FloatType
state StringType
name StringType


In [209]:
%%time 
#Using the schema from JDF causes an error, it works without specifying a schema explicitly (the rows have the type info)
smoothed_jdf=sqlContext.createDataFrame(Smoothed,verifySchema=True)
smoothed_jdf.printSchema()

root
 |-- Measurement: string (nullable = true)
 |-- Station: string (nullable = true)
 |-- Values: binary (nullable = true)
 |-- Year: long (nullable = true)
 |-- dist_coast: double (nullable = true)
 |-- elevation: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- name: string (nullable = true)
 |-- state: string (nullable = true)

CPU times: user 16 ms, sys: 0 ns, total: 16 ms
Wall time: 50.2 ms


In [210]:
smoothed_jdf.show(1)

+-----------+-----------+--------------------+----+------------------+------------------+--------+-------------------+--------------+-----+
|Measurement|    Station|              Values|Year|        dist_coast|         elevation|latitude|          longitude|          name|state|
+-----------+-----------+--------------------+----+------------------+------------------+--------+-------------------+--------------+-----+
|   PRCP_s20|USC00051745|[FE 40 86 41 38 4...|1968|1005.4199829101562|1969.9000244140625|   39.25|-107.93329620361328|COLLBRAN 3 ENE|   CO|
+-----------+-----------+--------------------+----+------------------+------------------+--------+-------------------+--------------+-----+
only showing top 1 row



In [211]:
jdf.show(1)

+-----------+-----------+----+--------------------+----------+--------+---------+---------+-----+-----------------+
|    Station|Measurement|Year|              Values|dist_coast|latitude|longitude|elevation|state|             name|
+-----------+-----------+----+--------------------+----------+--------+---------+---------+-----+-----------------+
|CA001126150|       PRCP|1941|[00 7E 00 7E 00 7...|   226.659|  49.467|   -119.6|    344.0|  NaN|PENTICTON AIRPORT|
+-----------+-----------+----+--------------------+----------+--------+---------+---------+-----+-----------------+
only showing top 1 row



In [None]:
fieldNames=jdf.schema.fieldNames()

In [212]:
fieldNames

['Station',
 'Measurement',
 'Year',
 'Values',
 'dist_coast',
 'latitude',
 'longitude',
 'elevation',
 'state',
 'name']

In [213]:
smoothed_jdf=smoothed_jdf.select(fieldNames)
smoothed_jdf.show(1)

+-----------+-----------+----+--------------------+------------------+--------+-------------------+------------------+-----+--------------+
|    Station|Measurement|Year|              Values|        dist_coast|latitude|          longitude|         elevation|state|          name|
+-----------+-----------+----+--------------------+------------------+--------+-------------------+------------------+-----+--------------+
|USC00051745|   PRCP_s20|1968|[FE 40 86 41 38 4...|1005.4199829101562|   39.25|-107.93329620361328|1969.9000244140625|   CO|COLLBRAN 3 ENE|
+-----------+-----------+----+--------------------+------------------+--------+-------------------+------------------+-----+--------------+
only showing top 1 row



In [214]:
jdf.printSchema()

root
 |-- Station: string (nullable = true)
 |-- Measurement: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Values: binary (nullable = true)
 |-- dist_coast: float (nullable = true)
 |-- latitude: float (nullable = true)
 |-- longitude: float (nullable = true)
 |-- elevation: float (nullable = true)
 |-- state: string (nullable = true)
 |-- name: string (nullable = true)



In [215]:
smoothed_jdf.printSchema()

root
 |-- Station: string (nullable = true)
 |-- Measurement: string (nullable = true)
 |-- Year: long (nullable = true)
 |-- Values: binary (nullable = true)
 |-- dist_coast: double (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- elevation: double (nullable = true)
 |-- state: string (nullable = true)
 |-- name: string (nullable = true)



# Stuck here
Managed to get the two dataframes to have the same coumn order. However, there are differences in the types: float vs double and integer vs. long, that I dont know how to fix.

## Combine original and smoothed rows into a single dataframe

In [29]:
%%time
Combined_jdf=jdf.union(smoothed_jdf)
Combined_jdf.count()

Py4JJavaError: An error occurred while calling o189.count.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 25 in stage 27.0 failed 4 times, most recent failure: Lost task 25.3 in stage 27.0 (TID 159, ip-10-129-250-3.ec2.internal, executor 10): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/mnt/yarn/usercache/hadoop/appcache/application_1524409064425_0004/container_1524409064425_0004_01_000012/pyspark.zip/pyspark/worker.py", line 229, in main
    process()
  File "/mnt/yarn/usercache/hadoop/appcache/application_1524409064425_0004/container_1524409064425_0004_01_000012/pyspark.zip/pyspark/worker.py", line 224, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/mnt/yarn/usercache/hadoop/appcache/application_1524409064425_0004/container_1524409064425_0004_01_000012/pyspark.zip/pyspark/serializers.py", line 372, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/lib/spark/python/pyspark/sql/session.py", line 671, in prepare
  File "/usr/lib/spark/python/pyspark/sql/types.py", line 1421, in verify
  File "/usr/lib/spark/python/pyspark/sql/types.py", line 1402, in verify_struct
  File "/usr/lib/spark/python/pyspark/sql/types.py", line 1421, in verify
  File "/usr/lib/spark/python/pyspark/sql/types.py", line 1347, in verify_integer
  File "/usr/lib/spark/python/pyspark/sql/types.py", line 1310, in verify_acceptable_types
TypeError: field Year: IntegerType can not accept object bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\xdc\\\xbc\\d\\a\\m\\{\\\x86\\\x83\\\x80\\s\\p\\p\\p\\o\\l\\h\\f\\f\\f\\e\\c\\c\\b\\\\\\Y\\X\\W\\U\\S\\Q\\O\\M\\K\\G\\E\\B\\>\\;\\8\\6\\3\\/\\+\\(\\$\\ \\\x1d\\\x19\\\x14\\\x10\\\x0c\\\x07\\\x02\\\xfa[\xf0[\xe4[\xda[\xcf[\xc4[\xb9[\xad[\xa1[\x95[\x88[{[m[_[P[B[3[$[\x15[\x05[\xf5Z\xe4Z\xd3Z\xc2Z\xb0Z\x9fZ\x8dZ|ZjZYZGZ5Z#Z\x11Z\xfeY\xecY\xdaY\xc8Y\xb7Y\xa5Y\x94Y\x83YqY`YOY>Y.Y\x1dY\x0cY\xfbX\xeaX\xdaX\xcbX\xbbX\xacX\x9dX\x8eX\x80XqXbXSXEX7X)X\x1cX\x0eX\x01X\xe8W\xcfW\xb6W\x9eW\x88WrWZWBW-W\x18W\x04W\xf1V\xdfV\xceV\xbfV\xb0V\xa2V\x95V\x8bV\x80VvVmVfV`V]VZVWVUVTVVVXVZV_VfVnVxV') in type <class 'bytearray'>

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.agg_doAggregateWithoutKey$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1750)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1738)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1737)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1737)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:871)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:871)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:871)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1971)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1920)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1909)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:682)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2027)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2048)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2067)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2092)
	at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:939)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:363)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:938)
	at org.apache.spark.sql.execution.SparkPlan.executeCollect(SparkPlan.scala:297)
	at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2770)
	at org.apache.spark.sql.Dataset$$anonfun$count$1.apply(Dataset.scala:2769)
	at org.apache.spark.sql.Dataset$$anonfun$52.apply(Dataset.scala:3253)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
	at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3252)
	at org.apache.spark.sql.Dataset.count(Dataset.scala:2769)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/mnt/yarn/usercache/hadoop/appcache/application_1524409064425_0004/container_1524409064425_0004_01_000012/pyspark.zip/pyspark/worker.py", line 229, in main
    process()
  File "/mnt/yarn/usercache/hadoop/appcache/application_1524409064425_0004/container_1524409064425_0004_01_000012/pyspark.zip/pyspark/worker.py", line 224, in process
    serializer.dump_stream(func(split_index, iterator), outfile)
  File "/mnt/yarn/usercache/hadoop/appcache/application_1524409064425_0004/container_1524409064425_0004_01_000012/pyspark.zip/pyspark/serializers.py", line 372, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/usr/lib/spark/python/pyspark/sql/session.py", line 671, in prepare
  File "/usr/lib/spark/python/pyspark/sql/types.py", line 1421, in verify
  File "/usr/lib/spark/python/pyspark/sql/types.py", line 1402, in verify_struct
  File "/usr/lib/spark/python/pyspark/sql/types.py", line 1421, in verify
  File "/usr/lib/spark/python/pyspark/sql/types.py", line 1347, in verify_integer
  File "/usr/lib/spark/python/pyspark/sql/types.py", line 1310, in verify_acceptable_types
TypeError: field Year: IntegerType can not accept object bytearray(b'\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\x00~\xdc\\\xbc\\d\\a\\m\\{\\\x86\\\x83\\\x80\\s\\p\\p\\p\\o\\l\\h\\f\\f\\f\\e\\c\\c\\b\\\\\\Y\\X\\W\\U\\S\\Q\\O\\M\\K\\G\\E\\B\\>\\;\\8\\6\\3\\/\\+\\(\\$\\ \\\x1d\\\x19\\\x14\\\x10\\\x0c\\\x07\\\x02\\\xfa[\xf0[\xe4[\xda[\xcf[\xc4[\xb9[\xad[\xa1[\x95[\x88[{[m[_[P[B[3[$[\x15[\x05[\xf5Z\xe4Z\xd3Z\xc2Z\xb0Z\x9fZ\x8dZ|ZjZYZGZ5Z#Z\x11Z\xfeY\xecY\xdaY\xc8Y\xb7Y\xa5Y\x94Y\x83YqY`YOY>Y.Y\x1dY\x0cY\xfbX\xeaX\xdaX\xcbX\xbbX\xacX\x9dX\x8eX\x80XqXbXSXEX7X)X\x1cX\x0eX\x01X\xe8W\xcfW\xb6W\x9eW\x88WrWZWBW-W\x18W\x04W\xf1V\xdfV\xceV\xbfV\xb0V\xa2V\x95V\x8bV\x80VvVmVfV`V]VZVWVUVTVVVXVZV_VfVnVxV') in type <class 'bytearray'>

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:298)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:438)
	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRunner.scala:421)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:252)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator$$anon$12.hasNext(Iterator.scala:439)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage3.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.agg_doAggregateWithoutKey$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage4.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
	at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:408)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:125)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [155]:
Combined_jdf.count()

6518988

In [1]:
%%time
filename='US_Weather_with_smoothed.parquet'
outfilename='/weather/'+filename
s3filename='s3://dse-weather/'+filename

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 9.3 µs


In [156]:
!hdfs dfs -rm -r $outfilename   #remove old copy
Combined_jdf.write.save(outfilename)

Deleted /weather/US_Weather_with_smoothed.parquet
CPU times: user 60 ms, sys: 16 ms, total: 76 ms
Wall time: 1min 24s


In [2]:
!hdfs dfs -du -h /weather/

827.9 K  /weather/US_stations.parquet
1.9 G    /weather/US_weather.parquet


In [3]:
%%time
#!s3-dist-cp --src $outfilename --dest $s3filename
!s3-dist-cp --src $s3filename --dest $outfilename 

18/04/22 15:31:26 INFO s3distcp.S3DistCp: Running with args: -libjars /usr/share/aws/emr/s3-dist-cp/lib/commons-httpclient-3.1.jar,/usr/share/aws/emr/s3-dist-cp/lib/commons-logging-1.0.4.jar,/usr/share/aws/emr/s3-dist-cp/lib/guava-18.0.jar,/usr/share/aws/emr/s3-dist-cp/lib/s3-dist-cp-2.10.0.jar,/usr/share/aws/emr/s3-dist-cp/lib/s3-dist-cp.jar --src s3://dse-weather/US_Weather_with_smoothed.parquet --dest /weather/US_Weather_with_smoothed.parquet 
18/04/22 15:31:27 INFO s3distcp.S3DistCp: S3DistCp args: --src s3://dse-weather/US_Weather_with_smoothed.parquet --dest /weather/US_Weather_with_smoothed.parquet 
18/04/22 15:31:27 INFO s3distcp.S3DistCp: Using output path 'hdfs:/tmp/6862ae49-728f-4267-afb4-4d12e6ee7eb3/output'
18/04/22 15:31:27 INFO s3distcp.S3DistCp: GET http://169.254.169.254/latest/meta-data/placement/availability-zone result: us-east-1a
18/04/22 15:31:30 WARN cred.CredentialsLegacyConfigLocationProvider: Found the legacy config profiles file at [/home/hadoop/.aws/config].

In [4]:
!aws s3 ls s3://dse-weather/US_Weather_with_smoothed.parquet/

2018-04-19 00:06:40          0 _SUCCESS
2018-04-19 00:06:42   79711937 part-00000-e97f4502-af58-4cb8-a949-14e7c6d9e6b3-c000.snappy.parquet
2018-04-19 00:06:42   80341446 part-00001-e97f4502-af58-4cb8-a949-14e7c6d9e6b3-c000.snappy.parquet
2018-04-19 00:06:47   77394575 part-00002-e97f4502-af58-4cb8-a949-14e7c6d9e6b3-c000.snappy.parquet
2018-04-19 00:06:39   74556551 part-00003-e97f4502-af58-4cb8-a949-14e7c6d9e6b3-c000.snappy.parquet
2018-04-19 00:06:44   77102526 part-00004-e97f4502-af58-4cb8-a949-14e7c6d9e6b3-c000.snappy.parquet
2018-04-19 00:06:46   74678869 part-00005-e97f4502-af58-4cb8-a949-14e7c6d9e6b3-c000.snappy.parquet
2018-04-19 00:06:40   72640052 part-00006-e97f4502-af58-4cb8-a949-14e7c6d9e6b3-c000.snappy.parquet
2018-04-19 00:06:40   70800754 part-00007-e97f4502-af58-4cb8-a949-14e7c6d9e6b3-c000.snappy.parquet
2018-04-19 00:06:43   68902426 part-00008-e97f4502-af58-4cb8-a949-14e7c6d9e6b3-c000.snappy.parquet
2018-04-19 00:06:45   71010476 part-00009-e97f4502-af58-4cb