## Computing PCA using RDDs

In [1]:
# findspark.init() is called before pyspark is imported; keep this order.
import findspark
findspark.init()
from pyspark import SparkContext

# If a SparkContext from an earlier run is still alive, call sc.stop() first.
sc = SparkContext(
    master="local[3]",
    pyFiles=[
        'lib/numpy_pack.py',
        'lib/spark_PCA.py',
        'lib/computeStats.py',
    ],
)

# SQL entry point built on top of the context created above.
from pyspark.sql import *
sqlContext = SQLContext(sc)

In [2]:
import sys

# The helper modules live under ./lib; put it on the path before importing them.
sys.path.append('./lib')

import numpy as np
from numpy_pack import (
    packArray,
    unpackArray,
)
from spark_PCA import computeCov
from computeStats import (
    computeOverAllDist,
    STAT_Descriptions,
)

In [3]:
file_index='BBBBSSBB'
data_dir='../../Data/Weather'

filebase='US_Weather_%s'%file_index
u_filename=filebase+'.csv'
!ls -lh $data_dir/$u_filename

curl https://mas-dse-open.s3.amazonaws.com/Weather/small/US_Weather_BBBBSSBB.csv.gz > ../../Data/Weather/US_Weather_BBBBSSBB.csv.gz
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 3857k  100 3857k    0     0  3022k      0  0:00:01  0:00:01 --:--:-- 3255k
-rw-r--r--  1 jc  staff   3.8M May 15 15:51 ../../Data/Weather/US_Weather_BBBBSSBB.csv.gz


In [4]:
import pickle

# NOTE(review): despite its .csv extension, this file is read as a pickle.
# pickle.load can execute arbitrary code, so only load files from a trusted source.
# The `with` block closes the file handle as soon as loading finishes.
with open(data_dir+'/'+u_filename,'rb') as pickle_file:
    List=pickle.load(pickle_file)

# Number of records loaded (displayed as the cell's output).
len(List)

12884

In [5]:
# Build a Spark DataFrame from the loaded records, then report its row count
# and preview the first five rows.
df = sqlContext.createDataFrame(List)
print(df.count())
df.show(5)

12884
+---------+--------+---------+-----------+-----------+------+--------------------+------+--------+
|elevation|latitude|longitude|measurement|    station|undefs|              vector|  year|   label|
+---------+--------+---------+-----------+-----------+------+--------------------+------+--------+
|    106.7| 43.4622| -76.4933|       TMAX|USC00306314|     0|[20 50 40 4C 80 4...|1926.0|BBBBSSBB|
|    106.7| 43.4622| -76.4933|       TMAX|USC00306314|     0|[00 00 A0 D3 00 0...|1927.0|BBBBSSBB|
|    106.7| 43.4622| -76.4933|       TMAX|USC00306314|     1|[80 4D 80 D4 E0 D...|1928.0|BBBBSSBB|
|    106.7| 43.4622| -76.4933|       TMAX|USC00306314|     1|[00 53 00 00 00 C...|1929.0|BBBBSSBB|
|    106.7| 43.4622| -76.4933|       TMAX|USC00306314|     2|[E0 54 E0 55 00 4...|1930.0|BBBBSSBB|
+---------+--------+---------+-----------+-----------+------+--------------------+------+--------+
only showing top 5 rows



In [7]:
# Compare on-disk sizes of every file whose name starts with the data file's
# base name (the trailing * matches all extensions of $filebase).
!du -sh $data_dir/$filebase*

 13M	../../Data/Weather/US_Weather_BBBBSSBB.csv
3.8M	../../Data/Weather/US_Weather_BBBBSSBB.csv.gz
4.8M	../../Data/Weather/US_Weather_BBBBSSBB.parquet


In [9]:
# Measurement codes processed by the analysis loops below.
measurements = [
    'TMAX',
    'SNOW',
    'SNWD',
    'TMIN',
    'PRCP',
    'TOBS',
]

In [10]:
# Make df queryable from Spark SQL under the table name 'weather'.
# This uses the older sqlContext instead of the newer (V2.0) sparkSession.
sqlContext.registerDataFrameAsTable(df, 'weather')

In [11]:
from numpy import linalg as LA
STAT={}  # dictionary storing the statistics for each measurement
Clean_Tables={}

for meas in measurements:
    t=time()
    Query="SELECT * FROM weather\n\tWHERE measurement = '%s'"%(meas)
    print Query
    df = sqlContext.sql(Query)
    data=df.rdd.map(lambda row: unpackArray(row['vector'],np.float16))
    #get very basic statistics
    STAT[meas]=computeOverAllDist(data)   # Compute the statistics 

    # compute covariance matrix
    OUT=computeCov(data)

    #find PCA decomposition
    eigval,eigvec=LA.eig(OUT['Cov'])

    # collect all of the statistics in STAT[meas]
    STAT[meas]['eigval']=eigval
    STAT[meas]['eigvec']=eigvec
    STAT[meas].update(OUT)

    print 'time for',meas,'is',time()-t

SELECT * FROM weather
	WHERE measurement = 'TMAX'
shape of E= (365,) shape of NE= (365,)
time for TMAX is 23.4791741371
SELECT * FROM weather
	WHERE measurement = 'SNOW'
shape of E= (365,) shape of NE= (365,)
time for SNOW is 24.409812212
SELECT * FROM weather
	WHERE measurement = 'SNWD'
shape of E= (365,) shape of NE= (365,)
time for SNWD is 16.1829910278
SELECT * FROM weather
	WHERE measurement = 'TMIN'
shape of E= (365,) shape of NE= (365,)
time for TMIN is 21.6037950516
SELECT * FROM weather
	WHERE measurement = 'PRCP'
shape of E= (365,) shape of NE= (365,)
time for PRCP is 33.3207120895
SELECT * FROM weather
	WHERE measurement = 'TOBS'
shape of E= (365,) shape of NE= (365,)
time for TOBS is 12.7809820175


In [13]:
STAT2 = {}   # statistics for TOBS, one entry per elevation subset
elev = 300   # elevation threshold used to split the rows
# (key in STAT2, comparison operator applied as "elevation <op> elev")
tvars = (('UTOBS', '>'), ('LTOBS', '<='))

for key, op in tvars:
    Query = ("select * from weather where measurement = '{0}' "
             "and elevation {1} {2}").format('TOBS', op, elev)
    df2 = sqlContext.sql(Query)
    data2 = df2.rdd.map(lambda row: unpackArray(row['vector'], np.float16))

    # basic statistics, stored right away under this subset's key
    stats = computeOverAllDist(data2)
    STAT2[key] = stats

    # covariance matrix and its eigen-decomposition (PCA)
    OUT2 = computeCov(data2)
    eigval, eigvec = LA.eig(OUT2['Cov'])

    # gather everything for this subset in STAT2[key]
    stats['eigval'] = eigval
    stats['eigvec'] = eigvec
    stats.update(OUT2)

shape of E= (365,) shape of NE= (365,)
shape of E= (365,) shape of NE= (365,)


In [14]:
from pickle import dump

# Save the per-measurement statistics together with their descriptions.
# Writing inside `with` closes the file when the block ends, so the pickle is
# fully flushed to disk.
filename=data_dir+'/STAT_%s.pickle'%file_index
with open(filename,'wb') as stat_file:
    dump((STAT,STAT_Descriptions),stat_file)

# for upper and lower TOBS
filename2=data_dir+'/STAT2_%s.pickle'%file_index
with open(filename2,'wb') as stat2_file:
    dump((STAT2,STAT_Descriptions),stat2_file)