## Computing the statistics

In [1]:
%%time
%pwd
from pyspark import SparkContext
sc = SparkContext(pyFiles=['/mnt/workspace/edX-Micro-Master-in-Data-Science/big-data-analytics-using-spark/notebooks/Section2-PCA/PCA/data_preparation/lib/numpy_pack.py'])

from pyspark import SparkContext
from pyspark.sql import SQLContext

sqlContext = SQLContext(sc)

CPU times: user 164 ms, sys: 24 ms, total: 188 ms
Wall time: 13.2 s


In [26]:
STAT_Descriptions=[
('SortedVals', 'Sample of values', 'vector whose length varies between measurements'),
 ('UnDef', 'sample of number of undefs per row', 'vector whose length varies between measurements'),
 ('mean', 'mean value', ()),
 ('std', 'std', ()),
 ('low100', 'bottom 1%', ()),
 ('high100', 'top 1%', ()),
 ('low1000', 'bottom 0.1%', ()),
 ('high1000', 'top 0.1%', ()),
 ('E', 'Sum of values per day', (365,)),
 ('NE', 'count of values per day', (365,)),
 ('Mean', 'E/NE', (365,)),
 ('O', 'Sum of outer products', (365, 365)),
 ('NO', 'counts for outer products', (365, 365)),
 ('Cov', 'O/NO', (365, 365)),
 ('Var', 'The variance per day = diagonal of Cov', (365,)),
 ('eigval', 'PCA eigen-values', (365,)),
 ('eigvec', 'PCA eigen-vectors', (365, 365))
  ]

In [27]:
US_Weather_parquet='/weather.parquet'
measurements=['TMAX','TMIN','TOBS','SNOW','SNWD','PRCP']
Query="SELECT * FROM parquet.`%s`\n\tWHERE "%US_Weather_parquet+"\n\tor ".join(["measurement='%s'"%m for m in measurements])
print Query

SELECT * FROM parquet.`/weather.parquet`
	WHERE measurement='TMAX'
	or measurement='TMIN'
	or measurement='TOBS'
	or measurement='SNOW'
	or measurement='SNWD'
	or measurement='PRCP'


In [28]:
t=time()
df = sqlContext.sql(Query).cache()
print(df.count()
print 'took',time()-t,'seconds'

TypeError: 'module' object is not callable

In [None]:
print df.count()

In [None]:

t=time()

N=sc.defaultParallelism
print 'Number of executors=',N
rdd0=df.rdd.map(lambda row:(str(row['station']),((str(row['measurement'])\
                        ,row['year']),np.array([np.float64(row[str(i)]) for i in range(1,366)])))).cache()#.repartition(N)
print 'took',time()-t,'seconds'

In [None]:
import numpy as np
def F(row):
    return (str(row['station']),((str(row['measurement'])\
                        ,row['year']),np.array([np.float64(row[str(i)]) for i in range(1,366)])))
row,=df.take(1)
#print F(row)

In [None]:
t=time()
print rdd0.count()
print 'took',time()-t,'seconds'

In [None]:
t=time()
print rdd0.count()
print 'took',time()-t,'seconds'

In [None]:
t=time()
print rdd0.repartition(N).count()
print 'took',time()-t,'seconds'

In [None]:
# Compute the overall distribution of values and the distribution of the number of nan per year
def find_percentiles(SortedVals,percentile):
  L=len(SortedVals)/percentile
  return SortedVals[L],SortedVals[-L]
  
def computeOverAllDist(rdd0):
  UnDef=np.array(rdd0.map(lambda row:sum(np.isnan(row))).sample(False,0.01).collect())
  flat=rdd0.flatMap(lambda v:list(v)).filter(lambda x: not np.isnan(x)).cache()
  count,S1,S2=flat.map(lambda x: np.float64([1,x,x**2]))\
                  .reduce(lambda x,y: x+y)
  mean=S1/count
  std=np.sqrt(S2/count-mean**2)
  Vals=flat.sample(False,0.0001).collect()
  SortedVals=np.array(sorted(Vals))
  low100,high100=find_percentiles(SortedVals,100)
  low1000,high1000=find_percentiles(SortedVals,1000)
  return {'UnDef':UnDef,\
          'mean':mean,\
          'std':std,\
          'SortedVals':SortedVals,\
          'low100':low100,\
          'high100':high100,\
          'low1000':low100,\
          'high1000':high1000
          }

In [None]:
from numpy import linalg as LA

STAT={}  # dictionary storing the statistics for each measurement
Clean_Tables={}

for meas in measurements:
  t=time()
  Query="SELECT * FROM parquet.`%s`\n\tWHERE measurement = '%s'"%(US_Weather_parquet,meas)
  print Query
  df = sqlContext.sql(Query)
  rdd0=df.rdd.map(lambda row:(row['station'],((row['measurement'],row['year']),np.array([np.float64(row[str(i)]) for i in range(1,366)])))).cache()

  rdd1=rdd0.sample(False,1)\
           .map(lambda (key,val): val[1])\
           .cache()\
           .repartition(N)
  print rdd1.count()

  #get basic statistics
  STAT[meas]=computeOverAllDist(rdd1)   # Compute the statistics 
  low1000 = STAT[meas]['low1000']  # unpack the extreme values statistics
  high1000 = STAT[meas]['high1000']

  #clean up table from extreme values and from rows with too many undefinde entries.
  rdd2=rdd1.map(lambda V: np.array([x if (x>low1000-1) and (x<high1000+1) else np.nan for x in V]))
  rdd3=rdd2.filter(lambda row:sum(np.isnan(row))<50)
  Clean_Tables[meas]=rdd3.cache().repartition(N)
  C=Clean_Tables[meas].count()
  print 'for measurement %s, we get %d clean rows'%(meas,C)

  # compute covariance matrix
  OUT=computeCov(Clean_Tables[meas])

  #find PCA decomposition
  eigval,eigvec=LA.eig(OUT['Cov'])

  # collect all of the statistics in STAT[meas]
  STAT[meas]['eigval']=eigval
  STAT[meas]['eigvec']=eigvec
  STAT[meas].update(OUT)

  # print summary of statistics
  print 'the statistics for %s consists of:'%meas
  for key in STAT[meas].keys():
    e=STAT[meas][key]
    if type(e)==list:
      print key,'list',len(e)
    elif type(e)==np.ndarray:
      print key,'ndarray',e.shape
    elif type(e)==np.float64:
      print key,'scalar'
    else:
      print key,'Error type=',type(e)
  print 'time for',meas,'is',time()-t

In [None]:
from pickle import dump
def dumpS3(object,S3dir,filename):
    dump(object,open(filename,'wb'))
    !ls -l $filename
    s3helper.local_to_s3(filename, S3dir+filename)
dumpS3((STAT,STAT_Descriptions),'/Weather/','STAT1.pickle')


In [None]:
STAT.keys()

### Sample Stations
Generate a sample of stations, for each one store all available year X measurement pairs.

In [None]:
rdd0.take(10) # test output

In [None]:
groups=rdd0.groupByKey().cache()
print 'number of stations=',groups.count()

groups1=groups.sample(False,0.01).collect()
groups2=[(g[0],[e for e in g[1]]) for g in groups1]

dumpS3(groups2,'/Weather/','SampleStations.pickle')

In [None]:
group_Sample=rdd0.sample(False,0.001).groupByKey().mapValues(list).cache()
group_Sample.first()

In [None]:
Means=rdd0.aggregateByKey((np.zeros(365),1),\
                          lambda S,D: sumWithNan(S,(D[1],1)),\
                          lambda S1,S2: sumWithNanithNan(S1,S2))\
.cache()#.repartition(N)

In [None]:
Means.take(5)

In [None]:
groups=rdd0.groupByKey().cache()
print 'number of stations=',groups.count()
