# Splitting the US dataset into smaller datasets
We partition the parquet file along the dimensions of longitude, latitude and time.
The split is done in a KD-tree manner - splitting the longer dimension on the median.

The median is weighted by the number of valid measurement years.

In [26]:
sc.stop()

In [28]:
from pyspark import SparkContext

sc = SparkContext(master=master_url,pyFiles=['lib/numpy_pack.py'])

In [29]:
bucket='mas-dse-public'
dir='/home/ec2-user/spark/'
s3helper.open_bucket(bucket)
from time import time

In [7]:
t=time()
files = s3helper.s3_to_hdfs('/Weather/US_Weather.parquet', '/US_Weather.parquet')
print 'copying took',time()-t,'seconds'
## copying took 62.695912838, 74.9509780407

Found 190 items
-rw-r--r--   3 ec2-user supergroup          0 2017-03-14 00:40 /US_Weather.parquet/_SUCCESS
-rw-r--r--   3 ec2-user supergroup      26440 2017-03-14 00:40 /US_Weather.parquet/_common_metadata
-rw-r--r--   3 ec2-user supergroup   13106337 2017-03-14 00:40 /US_Weather.parquet/_metadata
-rw-r--r--   3 ec2-user supergroup   27983129 2017-03-14 00:40 /US_Weather.parquet/part-r-00000-0f4998c0-b27b-4f60-ad45-ed3212ddb46f.gz.parquet
-rw-r--r--   3 ec2-user supergroup    6570243 2017-03-14 00:40 /US_Weather.parquet/part-r-00001-0f4998c0-b27b-4f60-ad45-ed3212ddb46f.gz.parquet
-rw-r--r--   3 ec2-user supergroup   27923948 2017-03-14 00:40 /US_Weather.parquet/part-r-00002-0f4998c0-b27b-4f60-ad45-ed3212ddb46f.gz.parquet
-rw-r--r--   3 ec2-user supergroup    5822418 2017-03-14 00:40 /US_Weather.parquet/part-r-00003-0f4998c0-b27b-4f60-ad45-ed3212ddb46f.gz.parquet
-rw-r--r--   3 ec2-user supergroup   30277393 2017-03-14 00:40 /US_Weather.parquet/part-r-00004-0f4998c0-b27b-4f60-ad45-ed3

In [30]:
print s3helper.ls_s3()  # By default, list all files in the root directory of the bucket
print s3helper.ls_s3('Weather')

print 'HDFS:',s3helper.ls_hdfs()

[u'Scripts', u'Scripts_$folder$', u'Spark-Data', u'Spark-Data_$folder$', u'Weather', u'Weather_$folder$', u'moby10b.txt']
[u'Weather/PCA_PRCP.pickle', u'Weather/PCA_SNOW.pickle', u'Weather/PCA_TMAX.pickle', u'Weather/STAT1.pickle', u'Weather/SampleStations.pickle', u'Weather/SampleStations_copy.pickle', u'Weather/US_Weather.parquet', u'Weather/US_Weather_Cleaned.parquet', u'Weather/US_Weather_Cleaned.parquet_$folder$', u'Weather/US_Weather_SSBBSBBS.csv.gz', u'Weather/US_Weather_cleaned_labeled.parquet', u'Weather/US_Weather_cleaned_labeled.parquet_$folder$', u'Weather/US_counts.pickle', u'Weather/Weather.parquet', u'Weather/Weather_Stations.parquet', u'Weather/test.json', u'Weather_$folder$']
HDFS: Found 4 items
drwxr-xr-x   - ec2-user supergroup          0 2017-03-14 00:41 /US_Weather.parquet
drwxr-xr-x   - ec2-user supergroup          0 2017-03-13 22:25 /US_Weather_Cleaned.parquet
drwxr-xr-x   - ec2-user supergroup          0 2017-03-13 22:26 /US_Weather_cleaned_labeled.parquet
drwxr

In [31]:
from pyspark import SparkContext
from pyspark.sql import SQLContext,Row


sqlContext = SQLContext(sc)

In [32]:
US_Weather_parquet='/US_Weather.parquet'
measurements=['TMAX','TMIN','TOBS','SNOW','SNWD','PRCP']
Query="SELECT * FROM parquet.`%s`\n\tWHERE "%US_Weather_parquet+"\n\tor ".join(["measurement='%s'"%m for m in measurements])
print Query

SELECT * FROM parquet.`/US_Weather.parquet`
	WHERE measurement='TMAX'
	or measurement='TMIN'
	or measurement='TOBS'
	or measurement='SNOW'
	or measurement='SNWD'
	or measurement='PRCP'


In [33]:
t=time()
df = sqlContext.sql(Query).cache()
print df.count()
print 'took',time()-t,'seconds'

4351091
took 30.3726480007 seconds


In [7]:
t=time()
print df.count()
print 'took',time()-t,'seconds'

4351091
took 0.424700975418 seconds


In [8]:
import numpy as np
t=time()

N=sc.defaultParallelism
print 'Number of executors=',N
rdd0=df.rdd.map(lambda row:(str(row['station']),((str(row['measurement'])\
                        ,row['year']),np.array([np.float64(row[str(i)]) for i in range(1,366)])))).cache().repartition(N)
print 'took',time()-t,'seconds'

# Number of executors= 80
# took 0.0209968090057 seconds

Number of executors= 80
took 0.0609848499298 seconds


In [9]:
t=time()
print rdd0.count()
print 'took',time()-t,'seconds'

# 4351091
# took 304.655529022 seconds

4351091
took 304.655529022 seconds


In [10]:
t=time()
print rdd0.count()
print 'took',time()-t,'seconds'

# 4351091
# took 2.08423686028 seconds

4351091
took 2.08423686028 seconds


In [11]:
t=time()
print rdd0.repartition(N).count()
print 'took',time()-t,'seconds'
# 4351091
# took 5.36106610298 seconds

4351091
took 4.99785280228 seconds


In [12]:
# Compute the overall distribution of values and the distribution of the number of nan per year
def find_percentiles(SortedVals,percentile):
  L=len(SortedVals)/percentile
  return SortedVals[L],SortedVals[-L]
  
def computeOverAllDist(rdd0):
  UnDef=np.array(rdd0.map(lambda row:sum(np.isnan(row))).sample(False,0.01).collect())
  flat=rdd0.flatMap(lambda v:list(v)).filter(lambda x: not np.isnan(x)).cache()
  count,S1,S2=flat.map(lambda x: np.float64([1,x,x**2]))\
                  .reduce(lambda x,y: x+y)
  mean=S1/count
  std=np.sqrt(S2/count-mean**2)
  Vals=flat.sample(False,0.0001).collect()
  SortedVals=np.array(sorted(Vals))
  low100,high100=find_percentiles(SortedVals,100)
  low1000,high1000=find_percentiles(SortedVals,1000)
  return {'UnDef':UnDef,\
          'mean':mean,\
          'std':std,\
          'SortedVals':SortedVals,\
          'low100':low100,\
          'high100':high100,\
          'low1000':low100,\
          'high1000':high1000
          }

In [24]:
from numpy import linalg as LA

STAT={}  # dictionary storing the statistics for each measurement
df={}
for meas in measurements:
    t=time()
    Query="SELECT * FROM parquet.`%s`\n\tWHERE measurement = '%s'"%(US_Weather_parquet,meas)
    print Query
    df[meas] = sqlContext.sql(Query)
    

    print 'size of dataframe for ',meas,'before filtering =',df[meas].count()
    rdd0=df[meas].rdd.map(lambda row:(row['station'],((row['measurement'],row['year']),np.array([np.float64(row[str(i)]) for i in range(1,366)])))).cache()

    rdd1=rdd0.sample(False,1)\
           .map(lambda (key,val): val[1])\
           .cache()\
           .repartition(N)

    STAT[meas]=computeOverAllDist(rdd1)   # Compute the statistics 
    low1000 = STAT[meas]['low1000']  # unpack the extreme values statistics
    high1000 = STAT[meas]['high1000']

    RDD=df[meas].rdd
    Cleaned=RDD.map(lambda row:repack_array(row))
    df[meas]=sqlContext.createDataFrame(Cleaned)
    df[meas]=df[meas].filter(lambda row:row['undef No']<50)
    print 'size of dataframe for ',meas,'after filtering =',df[meas].count()
 
    print 'time for ',meas,'was',time()-t
    

SELECT * FROM parquet.`/US_Weather.parquet`
	WHERE measurement = 'TMAX'
size of dataframe for  TMAX before filtering = 662767


Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 32.0 failed 4 times, most recent failure: Lost task 0.3 in stage 32.0 (TID 1440, 172.31.29.244, executor 0): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/ec2-user/spark/python/lib/pyspark.zip/pyspark/worker.py", line 163, in main
    func, profiler, deserializer, serializer = read_command(pickleSer, infile)
  File "/home/ec2-user/spark/python/lib/pyspark.zip/pyspark/worker.py", line 54, in read_command
    command = serializer._read_with_length(file)
  File "/home/ec2-user/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 169, in _read_with_length
    return self.loads(obj)
  File "/home/ec2-user/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 434, in loads
    return pickle.loads(obj)
ImportError: No module named numpy_pack

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1435)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1423)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1422)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1422)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:802)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1650)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1605)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1594)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:628)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1918)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1931)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1944)
	at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:441)
	at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/ec2-user/spark/python/lib/pyspark.zip/pyspark/worker.py", line 163, in main
    func, profiler, deserializer, serializer = read_command(pickleSer, infile)
  File "/home/ec2-user/spark/python/lib/pyspark.zip/pyspark/worker.py", line 54, in read_command
    command = serializer._read_with_length(file)
  File "/home/ec2-user/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 169, in _read_with_length
    return self.loads(obj)
  File "/home/ec2-user/spark/python/lib/pyspark.zip/pyspark/serializers.py", line 434, in loads
    return pickle.loads(obj)
ImportError: No module named numpy_pack

	at org.apache.spark.api.python.PythonRunner$$anon$1.read(PythonRDD.scala:193)
	at org.apache.spark.api.python.PythonRunner$$anon$1.<init>(PythonRDD.scala:234)
	at org.apache.spark.api.python.PythonRunner.compute(PythonRDD.scala:152)
	at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:63)
	at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:323)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:287)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:282)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	... 1 more


In [None]:

    #clean up table from extreme values and from rows with too many undefinde entries.
    rdd2=rdd1.map(lambda V: np.array([x if (x>low1000-1) and (x<high1000+1) else np.nan for x in V]))

    df[meas].withColumn('undef No',rdd2.map(lambda row: sum(np.isnan(row))))
    df[meas]=df[meas].filter(lambda row:row['undef No']<50)
    print 'size of dataframe for ',meas,'after filtering =',df[meas].count()
    break

In [25]:
import sys
sys.path.append('./lib')
from numpy_pack import packArray,unpackArray

In [14]:
# %load lib/numpy_pack.py
import numpy as np
"""Code for packing and unpacking a numpy array into a byte array.
   the array is flattened if it is not 1D.
   This is intended to be used as the interface for storing 
   
   This code is intended to be used to store numpy array as fields in a dataframe and then store the 
   dataframes in a parquet file.
"""

def packArray(a):
    if type(a)!=np.ndarray:
        raise Exception("input to packArray should be numpy.ndarray. It is instead "+str(type(a)))
    return bytearray(a.tobytes())
def unpackArray(x,data_type=np.int16):
    return np.frombuffer(x,dtype=data_type)

In [23]:
import sys
sys.path.append('./lib')
from numpy_pack import packArray,unpackArray

def repack_array(row):
    arr=(np.array([np.float16(row[str(i)]) for i in range(1,366)]))
    #trim extremes
    trimmed=np.array([x if (x>low1000-1) and (x<high1000+1) else np.nan for x in arr],dtype=np.float16)
    #count nan
    count=np.sum(np.isnan(trimmed))
    #return row with 365 entries replaced with a bytestream representing an array.

    New={}
    keep_keys=['longitude', 'station', 'latitude', 'year', 'elevation', 'measurement']
    for k in keep_keys:
        New[k]=row[k]
    New['vector']=packArray(trimmed)
    New['undefs']=count
    return Row(**New)


In [15]:
RDD=df[meas].rdd
Cleaned=RDD.map(lambda row:repack_array(row))
Cleaned.first()

NameError: name 'meas' is not defined

In [189]:
df_clean=sqlContext.createDataFrame(Cleaned)

TypeError: not supported type: <type 'numpy.int64'>

In [19]:
# %load lib/numpy_pack.py
import numpy as np
"""Code for packing and unpacking a numpy array into a byte array.
   the array is flattened if it is not 1D.
   This is intended to be used as the interface for storing 
   
   This code is intended to be used to store numpy array as fields in a dataframe and then store the 
   dataframes in a parquet file.
"""

def packArray(a):
    if type(a)!=np.ndarray:
        raise Exception("input to packArray should be numpy.ndarray. It is instead "+str(type(a)))
    return bytearray(a.tobytes())
def unpackArray(x,data_type=np.int16):
    return np.frombuffer(x,dtype=data_type)

In [87]:
df2=df[meas].map(lambda V: np.array([x if (x>low1000-1) and (x<high1000+1) else np.nan for x in V]))

In [88]:
df[meas].withColumn('undef No',rdd2.map(lambda row: sum(np.isnan(row))))

AssertionError: col should be Column

In [None]:
  low1000 = STAT[meas]['low1000']  # unpack the extreme values statistics
  high1000 = STAT[meas]['high1000']

  #clean up table from extreme values and from rows with too many undefinde entries.
  rdd2=rdd0.map(lambda key,V: np.array([x if (x>low1000-1) and (x<high1000+1) else np.nan for x in V]))
  Clean_com
  #rdd3=rdd2.filter(lambda row:sum(np.isnan(row))<50)
  Clean_Tables[meas]=rdd3.cache().repartition(N)
  C=Clean_Tables[meas].count()
  print 'for measurement %s, we get %d clean rows'%(meas,C)


In [37]:
df.filter(lambda row: )

['TMIN', 'TOBS', 'TMAX', 'SNOW', 'SNWD', 'PRCP']

In [52]:
df.take(1)

[Row(station=u'USC00305618', measurement=u'PRCP', year=1914.0, 1=nan, 2=nan, 3=nan, 4=nan, 5=nan, 6=nan, 7=nan, 8=nan, 9=nan, 10=nan, 11=nan, 12=nan, 13=nan, 14=nan, 15=nan, 16=nan, 17=nan, 18=nan, 19=nan, 20=nan, 21=nan, 22=nan, 23=nan, 24=nan, 25=nan, 26=nan, 27=nan, 28=nan, 29=nan, 30=nan, 31=nan, 32=nan, 33=nan, 34=nan, 35=nan, 36=nan, 37=nan, 38=nan, 39=nan, 40=nan, 41=nan, 42=nan, 43=nan, 44=nan, 45=nan, 46=nan, 47=nan, 48=nan, 49=nan, 50=nan, 51=nan, 52=nan, 53=nan, 54=nan, 55=nan, 56=nan, 57=nan, 58=nan, 59=nan, 60=nan, 61=nan, 62=nan, 63=nan, 64=nan, 65=nan, 66=nan, 67=nan, 68=nan, 69=nan, 70=nan, 71=nan, 72=nan, 73=nan, 74=nan, 75=nan, 76=nan, 77=nan, 78=nan, 79=nan, 80=nan, 81=nan, 82=nan, 83=nan, 84=nan, 85=nan, 86=nan, 87=nan, 88=nan, 89=nan, 90=nan, 91=nan, 92=nan, 93=nan, 94=nan, 95=nan, 96=nan, 97=nan, 98=nan, 99=nan, 100=nan, 101=nan, 102=nan, 103=nan, 104=nan, 105=nan, 106=nan, 107=nan, 108=nan, 109=nan, 110=nan, 111=nan, 112=nan, 113=nan, 114=nan, 115=nan, 116=nan, 1

In [43]:
from pyspark.sql import DataFrame

In [55]:
t=time()
df_clean = df['TMIN']
s=df.count()
print 'TMIN',s
for key in ['TOBS', 'TMAX', 'SNOW', 'SNWD', 'PRCP']:
    df_clean=df_clean.union(df[key])
    s1= df[key].count()
    s+= s1
    print key,s1,s
print 'total size=',df_clean.count()
print 'took',time()-t,'seconds'

KeyError: 'TMIN'

In [47]:
Clean_combined.take(1)

[array([ -20.,   20.,  -60.,  -20.,  -20.,    5.,   20.,  -20.,  -20.,
          30.,  -50.,  -60.,  -20.,  -10.,  -10.,  -10.,  -20.,  -40.,
         -30.,  -10.,  -50.,   nan,  -10.,  -60.,  -70.,  -50.,  -50.,
         -60.,  -10.,  -20.,   30.,  -40.,  -30.,  -30.,   40.,   50.,
          20.,   50.,   60.,  -40.,  -60.,  -40.,  -30.,  -30.,  -30.,
         -40.,   40.,  -20.,  -40.,   10.,   30.,   30.,  -20.,  -50.,
         -20.,  -10.,  -10.,  -20.,  -20.,  -10.,  -20.,   nan,   10.,
         -20.,  -20.,  -40.,  -10.,  -20.,  -20.,  -20.,  -30.,  -10.,
         -20.,  -10.,  -60.,  -10.,  -10.,   60.,   10.,  -20.,  -20.,
          20.,  -30.,  -50.,  -20.,  -40.,  -30.,  -20.,  -20.,  -20.,
         -10.,  -10.,  -70.,  -60.,  -60.,  -60.,  -30.,  -20.,  -30.,
         -20.,  -20.,   20.,   60.,   70.,   70.,  -20.,  -10.,  -10.,
         -10.,   40.,   40.,   60.,   20.,  -10.,  -10.,   30.,   30.,
          30.,   40.,  100.,   30.,   40.,   50.,   80.,   90.,   90.,
      

### Sample Stations
Generate a sample of stations, for each one store all available year X measurement pairs.

In [54]:
rdd0.take(10) # test output

[(u'USC00305618',
  ((u'PRCP', 1914.0),
   array([  nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
            nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
            nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
            nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
            nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
            nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
            nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
            nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
            nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
            nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
            nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
            nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
            nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,   nan,
           

In [73]:
groups=rdd0.groupByKey().cache()
print 'number of stations=',groups.count()

groups1=groups.sample(False,0.01).collect()
groups2=[(g[0],[e for e in g[1]]) for g in groups1]

dumpS3(groups2,'/Weather/','SampleStations.pickle')

number of stations= 45833
-rw-rw-r-- 1 ec2-user ec2-user 110449557 Mar  9 06:10 SampleStations.pickle


In [19]:
group_Sample=rdd0.sample(False,0.001).groupByKey().mapValues(list).cache()
group_Sample.first()

In [76]:
Means=rdd0.aggregateByKey((np.zeros(365),1),\
                          lambda S,D: sumWithNan(S,(D[1],1)),\
                          lambda S1,S2: sumWithNanithNan(S1,S2))\
.cache()#.repartition(N)

In [77]:
Means.take(5)

[(u'USC00213727',
  (array([[   67.,   511.,   326.,   285.,   122.,   321.,   193.,   155.,
             398.,   224.,   421.,   390.,   260.,   254.,   484.,   342.,
             249.,   124.,   133.,   194.,   232.,   228.,   267.,   546.,
             635.,   270.,    92.,   430.,   328.,   239.,   324.,    71.,
             171.,    73.,   244.,   421.,   133.,    54.,    81.,   218.,
             317.,   181.,    95.,    91.,   211.,   184.,   302.,   153.,
             175.,   244.,   188.,   272.,   115.,   330.,   452.,   159.,
             203.,   420.,   393.,   254.,   449.,   412.,   533.,   369.,
              51.,   315.,   125.,   317.,   113.,   176.,   730.,   215.,
             203.,   663.,   307.,   208.,   448.,   525.,   405.,   544.,
             310.,   295.,   365.,   206.,   585.,   391.,   570.,   331.,
             528.,   289.,   571.,   507.,   379.,   581.,   336.,   622.,
             530.,   473.,   899.,   645.,   242.,   407.,   683.,   547.,
       

In [78]:
groups=rdd0.groupByKey().cache()
print 'number of stations=',groups.count()


number of stations= 45833
