In [3]:
# Toggle: True when this notebook runs on AWS/EMR; when False, findspark is used to locate Spark.
EMR = True

In [4]:
if not EMR:
    import findspark
    findspark.init()
from pyspark import SparkContext,SparkConf

def create_sc(pyFiles):
    """Build a SparkContext for the "Weather_PCA" application.

    The assembled configuration is printed (via ``getAll``) before the
    context is created.

    Args:
        pyFiles: passed through unchanged as the ``pyFiles`` argument of
            ``SparkContext``.

    Returns:
        The newly created ``SparkContext``.
    """
    # (option, value) pairs applied to the configuration, in this order.
    settings = (
        ('spark.executor.memory', '3g'),
        ('spark.executor.cores', '1'),
        ('spark.cores.max', '4'),
        ('spark.default.parallelism', '10'),
        ('spark.logConf', True),
    )

    conf = SparkConf()
    conf.setAppName("Weather_PCA")
    for option, value in settings:
        conf.set(option, value)
    print(conf.getAll())

    return SparkContext(conf=conf, pyFiles=pyFiles)

# Create the SparkContext; the listed lib/ modules are handed to it as pyFiles.
sc = create_sc(pyFiles=['lib/numpy_pack.py','lib/spark_PCA.py','lib/computeStatistics.py'])

dict_items([('spark.app.name', 'Weather_PCA'), ('spark.executor.memory', '3g'), ('spark.executor.cores', '1'), ('spark.cores.max', '4'), ('spark.default.parallelism', '10'), ('spark.logConf', 'True')])


In [5]:
from pyspark.sql.types import ArrayType,FloatType, DoubleType, StringType, IntegerType
from pyspark.sql.functions import udf
from pyspark.sql import *
sqlContext = SQLContext(sc)

import numpy as np
from lib.computeStatistics import *

In [10]:
# Copy and untar all of the state.tgz files into parquet files on hdfs.

data_dir='/mnt/workspace/Data'
!hdfs dfs -mkdir /weather/
!hdfs dfs -CopyFromLocal $data_dir/$parquet /weather/$parquet

#!aws s3 cp --recursive --quiet /mnt/workspace/Data/NY.parquet s3://dse-weather/NY.parquet

!aws s3 ls s3://dse-weather/

local_path=data_dir+'/'+parquet
hdfs_path='/weather/'+parquet
local_path,hdfs_path

!hdfs dfs -copyFromLocal $local_path $hdfs_path

!hdfs dfs -du /weather/
parquet_path=hdfs_path

In [20]:
# On EMR read the copy placed in HDFS (hdfs_path); otherwise read the local
# parquet directory. Previously the HDFS path was always overwritten by the
# local one, even with EMR=True.
parquet_path = hdfs_path if EMR else data_dir+'/'+parquet

df=sqlContext.read.parquet(parquet_path)
# cache() keeps the loaded rows around for the later cells; count() forces the load.
print(df.cache().count())

 76M	../../../Data/Weather/NY.parquet


In [24]:
# Register df under the name 'table' so it can be referenced from sqlContext.sql queries.
sqlContext.registerDataFrameAsTable(df,'table')

In [25]:
# Number of rows per measurement type, ordered by measurement name.
Query="""
SELECT Measurement,count(Measurement) as count 
FROM table 
GROUP BY Measurement
ORDER BY Measurement
"""
# Run the query on Spark, then bring the (small) result to the driver as pandas.
query_result = sqlContext.sql(Query)
counts = query_result.toPandas()
# Plain Python list of the measurement names found in the data.
measurements = list(counts['Measurement'])
print(counts)

In [43]:
def Count_nan(V):
    """Count the NaN entries in one packed array of values.

    Args:
        V: packed array, decoded with ``unpackArray(V, data_type=np.float16)``.

    Returns:
        The number of NaN entries, as a built-in ``int``.
    """
    A=unpackArray(V,data_type=np.float16)
    # Vectorised count (no element-by-element iteration with the built-in sum()).
    # The int() is important, sparksql does not accept numpy ints.
    return int(np.count_nonzero(np.isnan(A)))
# Wrap Count_nan as a Spark SQL UDF whose declared return type is IntegerType.
Count_nan_udf = udf(Count_nan,IntegerType())

# Add a "nan_no" column obtained by applying the UDF to the Values column.
df=df.withColumn("nan_no", Count_nan_udf(df.Values))

%%time
count_nans=df.select('nan_no').toPandas()

count_nans.plot.hist('nan_no',bins=100);

In [47]:
# Row count before the filter.
rows_before = df.count()
print('before removing >= 50 nans',rows_before)

# Keep only the rows whose nan_no is below 50.
df = df.filter(df['nan_no'] < 50)

# Row count after the filter.
rows_after = df.count()
print('after removing >= 50 nans',rows_after)

138882
138882


In [48]:
%%time 
### This is the main cell, where all of the statistics are computed.
# The result is kept in STAT, which later cells index by measurement name (e.g. STAT['TMAX']).
STAT=computeStatistics(sqlContext,df)

SELECT * FROM weather
	WHERE measurement = 'SNWD'
SNWD : shape of mdf is  8496
time for SNWD is 21.034867763519287
SELECT * FROM weather
	WHERE measurement = 'TOBS'
TOBS : shape of mdf is  9299
time for TOBS is 24.098997116088867
SELECT * FROM weather
	WHERE measurement = 'SNOW'
SNOW : shape of mdf is  9748
time for SNOW is 24.903972864151
SELECT * FROM weather
	WHERE measurement = 'TOBS_s20'
TOBS_s20 : shape of mdf is  10121
time for TOBS_s20 is 28.332376956939697
SELECT * FROM weather
	WHERE measurement = 'SNWD_s20'
SNWD_s20 : shape of mdf is  10932
time for SNWD_s20 is 27.96428894996643
SELECT * FROM weather
	WHERE measurement = 'TMAX'
TMAX : shape of mdf is  11806
time for TMAX is 30.742621898651123
SELECT * FROM weather
	WHERE measurement = 'TMIN'
TMIN : shape of mdf is  11810
time for TMIN is 29.904306888580322
SELECT * FROM weather
	WHERE measurement = 'SNOW_s20'
SNOW_s20 : shape of mdf is  12155
time for SNOW_s20 is 29.573682069778442
SELECT * FROM weather
	WHERE measurement = 

In [49]:
STAT.keys()

dict_keys(['SNWD', 'TOBS', 'SNOW', 'TOBS_s20', 'SNWD_s20', 'TMAX', 'TMIN', 'SNOW_s20', 'TMAX_s20', 'TMIN_s20', 'PRCP', 'PRCP_s20'])

In [50]:
#STAT['PRCP_s20']

{'Cov': array([[152.54139297, 151.99358305, 151.6271789 , ...,  43.59008177,
          43.62247552,  43.62069043],
        [151.99358305, 151.53111559, 151.26183084, ...,  43.52699552,
          43.56279163,  43.56541584],
        [151.6271789 , 151.26183084, 151.04850899, ...,  43.56854256,
          43.60571094,  43.61080896],
        ...,
        [ 43.59008177,  43.52699552,  43.56854256, ..., 165.83235664,
         165.36886665, 164.78998932],
        [ 43.62247552,  43.56279163,  43.60571094, ..., 165.36886665,
         165.02201903, 164.5457432 ],
        [ 43.62069043,  43.56541584,  43.61080896, ..., 164.78998932,
         164.5457432 , 164.18267445]]),
 'E': array([381981.27807617, 380665.18933105, 379449.90112305, 378288.28643799,
        377060.58642578, 375812.69482422, 374726.16235352, 373652.63305664,
        372573.05078125, 372291.34716797, 371091.51081848, 370141.54455566,
        369156.47302246, 368409.22125244, 367582.43328857, 366795.56811523,
        366087.139770

In [13]:
# Column headings followed by an 80-character rule.
header = "   Name  \t                 Description             \t  Size"
print(header)
print("-"*80)

# One formatted line per entry of STAT_Descriptions, built from its first three fields.
description_rows = []
for s in STAT_Descriptions:
    description_rows.append("%10s\t%40s\t%s"%(s[0],s[1],str(s[2])))
print('\n'.join(description_rows))

   Name  	                 Description             	  Size
--------------------------------------------------------------------------------
SortedVals	                        Sample of values	vector whose length varies between measurements
     UnDef	      sample of number of undefs per row	vector whose length varies between measurements
      mean	                              mean value	()
       std	                                     std	()
    low100	                               bottom 1%	()
   high100	                                  top 1%	()
   low1000	                             bottom 0.1%	()
  high1000	                                top 0.1%	()
         E	                   Sum of values per day	(365,)
        NE	                 count of values per day	(365,)
      Mean	                                    E/NE	(365,)
         O	                   Sum of outer products	(365, 365)
        NO	               counts for outer products	(365, 365)
       Cov	                

In [51]:
## Dump STAT and STST_Descriptions into a pickle file.
from pickle import dump

filename=data_dir+'/STAT_%s.pickle'%state
dump((STAT,STAT_Descriptions),open(filename,'wb'))
!ls -l $data_dir

total 283296
drwxr-xr-x  31 yoavfreund  staff       992 Apr 18 18:12 [34mNY.parquet[m[m
-rw-r--r--   1 yoavfreund  staff  66288146 Apr 25 18:59 NY.tgz
-rw-r--r--   1 yoavfreund  staff  51365694 Apr 25 19:11 STAT_NY.pickle
-rw-r--r--   1 yoavfreund  staff  25684524 Apr  1 14:31 WeatherSTAT_NY.pickle
drwxr-xr-x   7 yoavfreund  staff       224 Apr 10 14:38 [34mWeather_Stations.parquet[m[m
-rw-r--r--   1 yoavfreund  staff   1605162 Apr 10 14:38 Weather_stations.tgz
drwxr-xr-x  10 yoavfreund  staff       320 Apr 16 14:23 [34mdecomp_NY_SNWD.parquet[m[m
drwxr-xr-x  10 yoavfreund  staff       320 Apr 16 14:22 [34mdecon_NY_SNWD.parquet[m[m
drwxr-xr-x  10 yoavfreund  staff       320 Apr 13 12:45 [34mrecon_NY_SNWD.parquet[m[m
drwxr-xr-x  10 yoavfreund  staff       320 Apr 10 14:22 [34mstations.parquet[m[m


In [27]:
# Compare every measurement's 'Var' entry against the one stored for TMAX:
# print the summed absolute difference per measurement.
X=STAT['TMAX']['Var']
for key, stats in STAT.items():
    Y=stats['Var']
    print(key,sum(abs(X-Y)))

TOBS 0.0
TOBS_s20 0.0
TMAX 0.0
TMAX_s20 0.0
TMIN_s20 0.0
TMIN 0.0
SNWD_s20 0.0
SNWD 0.0
SNOW_s20 0.0
SNOW 0.0
PRCP_s20 0.0
PRCP 0.0


In [29]:
!ls -l ../../Data/Weather/STAT*

-rw-r--r--  1 yoavfreund  staff  25684524 Apr  1 14:31 ../../Data/Weather/STAT_NY.pickle


In [30]:
!gzip -f -k ../../Data/Weather/STAT*.pickle
!ls -l ../../Data/Weather/STAT*

-rw-r--r--  1 yoavfreund  staff  14948011 Apr  1 14:31 ../../Data/Weather/STAT_NY.pickle.gz


In [31]:
# Upload the gzipped statistics pickle of each listed state to S3.
for state in ['NY']:
    # Both %s placeholders receive the state code (local file name and S3 key).
    command="aws s3  cp ../../Data/Weather/STAT_%s.pickle.gz s3://mas-dse-open/Weather/by_state/STAT_%s.pickle.gz"%(state,state)
    # Echo the exact command before running it through the shell.
    print(command)
    !$command

aws s3  cp ../../Data/Weather/STAT_NY.pickle.gz s3://mas-dse-open/Weather/by_state/STAT_NY.pickle.gz
upload: ../../Data/Weather/STAT_NY.pickle.gz to s3://mas-dse-open/Weather/by_state/STAT_NY.pickle.gz


In [32]:
!aws s3  ls s3://mas-dse-open/Weather/by_state/ | grep STAT

2018-04-01 14:32:56   14948011 STAT_NY.pickle.gz
2018-03-18 20:33:54   11717259 STAT_RI.pickle.gz


### Summary
* We discussed how to compute the covariance matrix and the expectation (mean) vector when there are `nan` entries.
* The details are all in `computeStatistics`, which is defined in the Python files in the `lib` directory.