In [None]:
# Identifier of the weather data subset; it is interpolated into the STAT pickle
# and parquet file names read further down.
file_index = 'SBBSSBSS'
# Directory (relative to this notebook) holding the STAT pickle and parquet data.
data_dir = '../../Data/Weather'
# Measurement types analysed below; used as keys of STAT and compared against
# the `measurement` column of the weather DataFrame.
measurements = ['TMIN', 'TMAX', 'TOBS', 'PRCP', 'SNOW', 'SNWD']

In [None]:
import sys
sys.path.append('./lib')
import math
import pickle
import numpy as np
import pandas as pd
from pickle import load
from numpy import linalg as LA
from scipy.special import gammaln, factorial
from sklearn.decomposition import PCA
%pylab inline
from ipyleaflet import (
    Map,
    Marker,
    TileLayer, ImageOverlay,
    Polyline, Polygon, Rectangle, Circle, CircleMarker,
    GeoJSON,
    DrawControl
)

from numpy_pack import packArray, unpackArray
from spark_PCA import computeCov
from computeStats import computeOverAllDist, STAT_Descriptions
from YearPlotter import YearPlotter
from Eigen_decomp import Eigen_decomp
from recon_plot import recon_plot

In [None]:
# findspark.init() is called before pyspark is imported; keep this order.
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import *
# NOTE(review): presumably uncommented to stop an existing context before re-running this cell.
#sc.stop()
# Local Spark context (master "local[3]"); pyFiles lists the helper modules under lib/
# that the RDD functions in this notebook call.
sc = SparkContext(master="local[3]",pyFiles=['lib/numpy_pack.py','lib/spark_PCA.py','lib/computeStats.py','lib/recon_plot.py','lib/Eigen_decomp.py'])
sqlContext = SQLContext(sc)

In [None]:
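# One-time data preparation, kept commented out: it downloads the raw file, writes the
# parquet file, computes the per-measurement statistics and dumps the STAT pickle that
# the next cell reads back.
# from pickle import dump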

# c_filename = 'US_Weather_%s.csv.gz'%file_index
# u_filename = 'US_Weather_%s.csv'%file_index

# url = "https://mas-dse-open.s3.amazonaws.com/Weather/small"
# !curl $url/$c_filename > $data_dir/$c_filename 
# !gunzip -c $data_dir/$c_filename > $data_dir/$u_filename

# List = pickle.load(open(data_dir+'/'+u_filename,'rb'))
# df = sqlContext.createDataFrame(List)
# print df.count()
# df.write.save(data_dir+'/US_Weather_'+file_index+'.parquet')

# sqlContext.registerDataFrameAsTable(df,'weather')
# STAT = {}
# for meas in measurements:
#     Query = "SELECT * FROM weather\n\tWHERE measurement = '%s'"%meas
#     print Query
#     df = sqlContext.sql(Query)
#     data = df.rdd.map(lambda row: unpackArray(row['vector'],np.float16))
#     STAT[meas] = computeOverAllDist(data) 
#     OUT = computeCov(data)
#     eigval,eigvec = LA.eig(OUT['Cov'])
#     STAT[meas]['eigval'] = eigval
#     STAT[meas]['eigvec'] = eigvec
#     STAT[meas].update(OUT)

# filename = data_dir+'/STAT_%s.pickle'%file_index
# dump((STAT,STAT_Descriptions),open(data_dir+'/STAT_'+file_index+'.pickle','wb'))


In [None]:
# read data
filename = data_dir+'/STAT_%s.pickle'%file_index
STAT,STAT_Descriptions = load(open(filename,'rb'))
filename = data_dir+'/US_Weather_%s.parquet'%file_index
df = sqlContext.read.parquet(filename)
print df.count()

In [None]:
# sanity check
months = [(0,31), (31,59), (59,90), (90,120), (120,151), (151,181), \
          (181,212), (212,243), (243,273), (273,304), (304,334), (334,365)]
for i in months:
    print '%.2f\t%.2f\t%.2f\t%.2f'%(sum(STAT['TMIN']['Mean'][i[0]:i[1]])/(i[1]-i[0])/10, \
                                    sum(STAT['TOBS']['Mean'][i[0]:i[1]])/(i[1]-i[0])/10, \
                                    sum(STAT['TMAX']['Mean'][i[0]:i[1]])/(i[1]-i[0])/10, \
                                    sum(STAT['PRCP']['Mean'][i[0]:i[1]])/10)
print sum(STAT['TMIN']['Mean'])/3650, sum(STAT['TMAX']['Mean'])/3650, sum(STAT['TOBS']['Mean'])/3650, \
      sum(STAT['PRCP']['Mean'])/10, sum(STAT['SNOW']['Mean'])/10

In [None]:
# plot mean+-std: one panel per measurement on a 3x2 grid, each showing the
# curves mean-std, mean and mean+std (all divided by 10).
fig,axes = plt.subplots(3, 2, figsize=(16,18))
for i, meas in enumerate(measurements[:6]):
    grid_row, grid_col = divmod(i, 2)
    mean = STAT[meas]['Mean']
    std = np.sqrt(STAT[meas]['Var'])
    lower, upper = (mean-std)/10, (mean+std)/10
    # one column per curve
    graph = np.vstack([lower, mean/10, upper]).transpose()
    YearPlotter().plot(graph, fig, axes[grid_row,grid_col],
                       title='mean+-std '+meas,
                       labels=['mean-std','mean','mean+std'])

In [None]:
# plot eigenvalues
fig,axes = plt.subplots(2, 3, sharex=True, figsize=(16,10))
for i in range(6):
    meas = measurements[i]
    e_value = STAT[meas]['eigval']
    print sum(e_value[:10])/sum(e_value)
    subplot(2, 3, i+1)
    plot(([0,]+list(cumsum(e_value[:10])))/sum(e_value))
    title('percentage of variance explained for '+ meas)
    ylabel('percentage of variance')
    xlabel('# eigenvector')
    grid()

In [None]:
# plot mean and top 3 eigenvectors of SNWD, one panel each
meas = 'SNWD'
mean = STAT[meas]['Mean']
# first three columns of the eigenvector matrix
e_vec = np.matrix(STAT[meas]['eigvec'][:,:3])
fig,axes = plt.subplots(2, 1, figsize=(10,6))
mean_ax, eig_ax = axes[0], axes[1]
YearPlotter().plot(mean/10, fig, mean_ax, label='mean', title='mean '+meas)
YearPlotter().plot(e_vec, fig, eig_ax,
                   labels=['eig1','eig2','eig3'],
                   title='top 3 eigenvectors '+meas)

In [None]:
# Reconstruction of single (station, year) vectors, listed by descending/ascending
# coefficient. `meas`, `mean` and `e_vec` are module-level names read by decompose()
# and plot_recon_grid() below.
meas = 'SNWD'
mean = STAT[meas]['Mean']
# columns 0..2 of the eigenvector matrix, as a list of vectors
e_vec = [STAT[meas]['eigvec'][:,col] for col in (0, 1, 2)]

def decompose(row):
    """Decompose one row's vector on the module-level `mean` and `e_vec`.

    The packed `row.vector` is unpacked as float16, converted to float64 and
    divided by 10 (as is `mean`) before being handed to Eigen_decomp.

    Returns a new Row with all original fields plus `total_var`, `res_mean`,
    and `res_k` / `coeff_k` for k = 1 .. len(residuals)-1.
    NOTE(review): the meaning of index [1] in the values returned by
    compute_var_explained() is defined in Eigen_decomp -- confirm there.
    """
    series = np.array(unpackArray(row.vector,np.float16), dtype=np.float64)
    recon = Eigen_decomp(None, series/10, mean/10, e_vec)
    total_var, residuals, reductions, coeff = recon.compute_var_explained()
    res_values = [float(v) for v in residuals[1]]
    coeff_values = [float(v) for v in coeff[1]]

    fields = row.asDict()
    fields['total_var'] = float(total_var[1])
    # entry 0 is stored as res_mean; entries 1.. pair up with the coefficients
    fields['res_mean'] = res_values[0]
    for k, res_k in enumerate(res_values[1:], 1):
        fields['res_%d'%k] = res_k
        fields['coeff_%d'%k] = coeff_values[k-1]
    return Row(**fields)

# Decompose every row of the selected measurement, then keep only rows whose
# residuals (res_mean, res_1..res_3) are all below 0.99.
rdd2 = df.rdd.filter(lambda row: row['measurement']==meas).map(decompose)
df2 = sqlContext.createDataFrame(rdd2)
residuals_ok = (df2['res_mean'] < 0.99) & (df2['res_1'] < 0.99) \
             & (df2['res_2'] < 0.99) & (df2['res_3'] < 0.99)
df2 = df2.filter(residuals_ok) \
         .select('coeff_1','coeff_2','coeff_3','res_1','res_2','res_3','res_mean','vector')

def plot_recon_grid(which, ascending):
    """Plot the 4 rows of df2 with the most extreme `coeff_<which>`.

    which     -- coefficient number (1..3) used for sorting and in the panel title
    ascending -- sort direction passed to DataFrame.sort

    Each panel shows the target vector, the mean, and the cumulative
    reconstructions obtained by adding coeff_1..coeff_3 times the matching
    eigenvector. Reads the module-level `df2`, `mean` and `e_vec`.
    """
    coeff_col = 'coeff_%d'%which
    res_col = 'res_%d'%which
    extreme_rows = df2.sort(coeff_col, ascending=ascending).take(4)
    fig,axes = plt.subplots(1, 4, figsize=(16,4))
    for panel in range(4):
        row = extreme_rows[panel]
        target = np.array(unpackArray(row.vector,np.float16), dtype=np.float64)/10
        curves = [target, mean/10]
        # each step adds one more eigenvector term to the previous curve
        for k in range(3):
            curves.append(curves[-1] + row['coeff_%d'%(k+1)] * e_vec[k])
        graph = np.vstack(curves).transpose()
        caption = 'c%d=%.2f r%d=%.2f'%(which, row[coeff_col], which, row[res_col])
        YearPlotter().plot(graph, fig, axes[panel],
                           labels=['target','mean','c1','c2','c3'], title=caption)

In [None]:
# For each of the three coefficients: largest values first, then smallest values.
for which in (1, 2, 3):
    for ascending in (False, True):
        plot_recon_grid(which, ascending)

In [None]:
# plot the CDF of coeff/res
def plot_CDF(feature):
    """Plot the empirical CDF of `<feature>_1` .. `<feature>_3` from df2.

    feature -- column prefix, e.g. 'coeff' or 'res'

    One panel per column: x is the sorted column values, y the cumulative
    fraction k/n of rows (k = 0 .. n-1).
    """
    # count() is a Spark action, so run it once instead of twice per call
    n_rows = df2.count()
    # k/n for k = 0..n-1; built from an integer range so the length is exactly n
    P = np.arange(n_rows) / float(n_rows)
    fig,axes = plt.subplots(1, 3, figsize=(16,5))
    for i in range(3):
        feat = feature+'_'+str(i+1)
        rows = df2.select(feat).sort(feat).collect()
        vals = [r[feat] for r in rows]
        ax = axes[i]
        ax.plot(vals, P)
        ax.set_title('cumulative distribution of '+feat)
        # P is a fraction in [0, 1), not a count
        ax.set_ylabel('fraction of instances')
        ax.set_xlabel(feat)
        ax.grid(True)

# CDFs of the coefficients first, then of the residuals.
for feature in ('coeff', 'res'):
    plot_CDF(feature)

In [None]:
# Mean-square (MS) analysis of the SNWD coefficients: decompose with the first 4
# eigenvectors and keep rows whose res_mean and res_1 are both below 0.999.
meas = 'SNWD'
mean = STAT[meas]['Mean']
e_vec = [STAT[meas]['eigvec'][:,col] for col in (0, 1, 2, 3)]

rdd6 = df.rdd.filter(lambda row: row['measurement']==meas).map(decompose)
df6 = sqlContext.createDataFrame(rdd6)
df6 = df6.filter((df6['res_mean'] < 0.999) & (df6['res_1'] < 0.999))
    
def MS(Mat):
    """Return the mean of the squared entries of `Mat`, ignoring NaNs."""
    squared = Mat**2
    return np.nanmean(squared)

for i in range(4):
    coef = 'coeff_%d'%(i+1)
    year_station_table = df6.select('station', 'year', coef).toPandas() \
                            .pivot(index='year', columns='station', values=coef)
    mean_by_year = np.nanmean(year_station_table,axis=1)
    mean_by_station = np.nanmean(year_station_table,axis=0)
    tbl_minus_year = (year_station_table.transpose()-mean_by_year).transpose()
    tbl_minus_station = year_station_table-mean_by_station
    print 'total MS                    = ', MS(year_station_table)
    var_e = 1 - MS(tbl_minus_station) / MS(year_station_table)
    print 'MS removing mean-by-station = ', MS(tbl_minus_station), 'variance explained = ', var_e
    var_e = 1 - MS(tbl_minus_year) / MS(year_station_table)
    print 'MS removing mean-by-year    = ', MS(tbl_minus_year), 'variance explained = ', var_e

In [None]:
# plot P_norm
def fillnans(l):
    """Return a new float array of shape `l` with every entry set to NaN."""
    return np.full(l, np.nan)

# Collect every PRCP (station, year, vector) triple and lay the yearly vectors out on a
# common time axis: one NaN-initialised array per station, 365 slots per year.
meas = 'PRCP'
rows = df.rdd.filter(lambda row: row['measurement']==meas) \
             .map(lambda row: (row.station, row.year, unpackArray(row['vector'],np.float16))) \
             .collect()
years = set(r[1] for r in rows)
first_year = min(years)
days = int((max(years)-first_year+1)*365)
stations = sorted(set(r[0] for r in rows))
A_dict = {st: fillnans(days) for st in stations}
for station, year, vector in rows:
    # offset of this year's block inside the station's array
    loc = int((year-first_year)*365)
    A_dict[station][loc:loc+365] = vector
A_list = [A_dict[st] for st in stations]
A = np.hstack([A_list])

def G(n):
    """Return gammaln(n+1), i.e. log(n!) for non-negative integer n."""
    log_factorial = gammaln(n+1)
    return log_factorial

def LogProb(m,l,n1,n2):
    """Combine G() of the given counts into a log-probability and divide it by m.

    NOTE(review): the terms look like the log-factorials of a 2x2 co-occurrence
    table (m items, n1 and n2 marginals, l joint count) -- confirm the intended
    model. The terms are added in the original left-to-right order.
    """
    signed_terms = [
        -G(l), -G(n1-l), -G(n2-l), -G(m-n1-n2+l), -G(m),
        G(n1), G(m-n1), G(n2), G(m-n2),
    ]
    logP = signed_terms[0]
    for term in signed_terms[1:]:
        logP = logP + term
    return logP / m

def computeLogProb(X, Y):
    """Score the co-occurrence of positive values in two equally long arrays.

    Only positions where both X and Y are non-NaN are used. With
      m  = number of such positions,
      n1 = positions where X > 0, n2 = positions where Y > 0,
      l  = positions where both are > 0,
    the function returns (LogProb(m, l, n1, n2), m).

    X and Y are not modified (the earlier version wrote NaNs into both
    arguments, which forced callers to pass copies).
    """
    # positions observed in both series
    both_valid = ~(np.isnan(X) | np.isnan(Y))
    m = np.sum(both_valid)
    x_positive = X[both_valid] > 0
    y_positive = Y[both_valid] > 0
    n1 = np.sum(x_positive)
    n2 = np.sum(y_positive)
    l = np.sum(x_positive & y_positive)
    logprob = LogProb(m,l,n1,n2)
    return logprob, m

# Pairwise station scores: P_norm[i,j] is the computeLogProb score of stations i and j,
# Length[i,j] the number of positions they share. Pairs sharing fewer than 200 positions
# are set to NaN.
l_stations = len(stations)
Length = np.zeros([l_stations,l_stations])
P_norm = np.zeros([l_stations,l_stations])
for i, station_i in enumerate(stations):
    for j, station_j in enumerate(stations):
        if i == j:
            # fixed placeholder on the diagonal (a station is not compared with itself)
            P_norm[i,j] = -0.4
        else:
            X = A_dict[station_i].copy()
            Y = A_dict[station_j].copy()
            P_norm[i,j], Length[i,j] = computeLogProb(X,Y)
            if Length[i,j] < 200:
                P_norm[i,j] = np.nan

# Histogram of -P_norm over all station pairs, NaN entries dropped.
P_norm_flat = P_norm.flatten()
is_number = ~np.isnan(P_norm_flat)
P_norm_flat_nn = P_norm_flat[is_number]
plt.hist(-P_norm_flat_nn, bins=100);
plt.xlabel('significance')

# PCA of the station-by-station score matrix (NaNs replaced by 0), then show the matrix
# with rows and columns reordered by each of the top 3 components.
P_norm0 = np.nan_to_num(P_norm)
pca = PCA(n_components=3, svd_solver='full')
pca.fit(P_norm0)
fig,axes = plt.subplots(2, 2, figsize=(16,16))
axes[0,0].imshow(P_norm, cmap=plt.cm.gray)
axes[0,0].set_title('P_norm original')
for i in range(3):
    order = np.argsort(pca.components_[i,:])
    P_norm_reord = P_norm0[order,:]
    P_norm_reord = P_norm_reord[:,order]
    # Panels 1..3 in row-major order, i.e. [0,1], [1,0], [1,1]. The previous
    # axes[i>0, ...] indexed with a Python bool, which NumPy interprets as a
    # boolean mask instead of 0/1.
    ax = axes.flat[i+1]
    ax.matshow(P_norm_reord)
    ax.set_title('P_norm reordered by eig%d'%(i+1))

# Plot the 3 PCA components with the stations sorted by component 1, 2 and 3 in turn.
fig,axes = plt.subplots(1, 3, figsize=(16,5))
# one entry per station: its loadings on the 3 components
P_norm_eig = list(pca.components_.transpose())
for i in range(3):
    def loading_on_component(loadings, component=i):
        return loadings[component]
    P_norm_eig_order = sorted(P_norm_eig, key=loading_on_component)
    plt.subplot(1, 3, i+1)
    plt.plot(P_norm_eig_order)
    plt.title('top 3 eigenvectors ordered by eig%d'%(i+1))

In [None]:
# plot map of temperature: per-station mean of the first 4 TOBS coefficients
meas = 'TOBS'
mean = STAT[meas]['Mean']
e_vec = [STAT[meas]['eigvec'][:,i] for i in range(4)]
rdd3 = df.rdd.filter(lambda row: row['measurement']==meas).map(decompose)
df3 = sqlContext.createDataFrame(rdd3)
coeff_cols = ['coeff_1','coeff_2','coeff_3','coeff_4']
# GroupedData.mean(*cols) emits the averages in the order given here. The earlier
# dict-based agg() gives no guarantee on output column order, while the positional
# access row[3]..row[6] below needs coeff_1..coeff_4 in exactly that order.
df3 = df3.filter(df3.res_1<0.99).filter(df3.res_2<0.99).filter(df3.res_3<0.99).filter(df3.res_4<0.99) \
         .select('station','longitude','latitude',*coeff_cols) \
         .groupby(['station','longitude','latitude']) \
         .mean(*coeff_cols)

# {(station, longitude, latitude): (mean coeff_1, ..., mean coeff_4)}
st_dict = df3.rdd.map(lambda row: ((row[0],row[1],row[2]),(row[3],row[4],row[5],row[6]))).collectAsMap()

In [None]:
# Draw, for every station, one small triangle per coefficient on a leaflet map.
# Triangle size is |coeff| relative to the largest |coeff| of that component;
# non-negative values are filled, negative values are outlined only.
prop_cycle = plt.rcParams['axes.prop_cycle']
colors = prop_cycle.by_key()['color']

coeffs = [st_dict[st] for st in st_dict]
# nanmax, not max: NaN coefficients are skipped below, and with np.max a single NaN
# would turn the scale of the whole component (and hence every triangle) into NaN.
coeffs_absmax = np.nanmax(np.abs(coeffs), axis=0)

longitudes = [st[1] for st in st_dict]
min_long, max_long = min(longitudes)-0.1, max(longitudes)+0.1
latitudes = [st[2] for st in st_dict]
min_lat, max_lat = min(latitudes)-0.1, max(latitudes)+0.1
center = [(min_lat+max_lat)/2, (min_long+max_long)/2]
myMap = Map(default_tiles=TileLayer(opacity=0.6), center=center, zoom=7)
# bounding box of all stations, padded by 0.1 degree
myRect = Rectangle(bounds=[[min_lat,min_long],[max_lat,max_long]], weight=5, fill_opacity=0.0)
myMap += myRect

# (longitude sign, latitude sign) per coefficient: each coefficient gets its own quadrant
signs = [[-1,+1],[+1,+1],[-1,-1],[+1,-1]]
for st in st_dict:
    _long = st[1]
    _lat = st[2]
    for i in range(4):
        _coeff = st_dict[st][i]
        if np.isnan(_coeff):
            continue
        # r keeps the sign of _coeff
        r = _coeff / coeffs_absmax[i] / 5
        color = colors[i]
        long_sign, lat_sign = signs[i]
        if _coeff >= 0:
            triangle = [(_lat,_long),(_lat+lat_sign*r,_long),(_lat,_long+long_sign*r),(_lat,_long)]
            myPoly = Polygon(locations=triangle, weight=0, color=color, opacity=0, fill_opacity=0.7, fill_color=color)
        else:
            # r is negative here, so subtracting it points the triangle into the same
            # quadrant as for positive values
            triangle = [(_lat,_long),(_lat-lat_sign*r,_long),(_lat,_long-long_sign*r),(_lat,_long)]
            myPoly = Polygon(locations=triangle, weight=2, color=color, opacity=0.8, fill_opacity=0, fill_color=color)
        myMap += myPoly
myMap

In [None]:
# monthly PRCP
# `m` names the measurement for this cell: it is used in the row filter that builds
# rdd4 and in the plot titles below.
m = 'PRCP'
def sum_over_month(row):
    """Return a 12-element array with the per-month sums of `row.vector`.

    The packed vector is unpacked as float16 and converted to float64. Months
    are the day ranges of a 365-day year (Jan = days 0..30, ..., Dec = 334..364).
    A NaN inside a month makes that month's sum NaN.
    """
    # first day index of each month, plus the end of the year
    month_edges = [0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 365]
    vec = np.array(unpackArray(row.vector, np.float16), dtype=np.float64)
    monthly = [vec[start:stop].sum()
               for start, stop in zip(month_edges[:-1], month_edges[1:])]
    return np.array(monthly)

# Monthly sums for every row of measurement `m`, stacked into an (n_rows, 12) matrix.
rdd4 = df.rdd.filter(lambda row: row['measurement']==m).map(sum_over_month)
prcp_m = np.vstack(rdd4.collect())

# NaN-aware per-month statistics; the PCA itself is fitted on the matrix with NaNs set to 0.
mean = np.nanmean(prcp_m, axis=0)
std = np.nanstd(prcp_m, axis=0)
pca_prcp = PCA(n_components=12, svd_solver='full')
pca_prcp.fit(np.nan_to_num(prcp_m))
eigvec_prcp, eigval_prcp = pca_prcp.components_, pca_prcp.explained_variance_

# Month numbers 1..12 and their tick labels ('Dec' was misspelled 'Des').
P = range(1,13)
Months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']

# mean-std / mean / mean+std of the monthly sums, divided by 10
plt.plot(P, (mean-std)/10, label='mean-std')
plt.plot(P, mean/10, label='mean')
plt.plot(P, (mean+std)/10, label='mean+std')
plt.title('mean+-std of monthly '+ m)
plt.xticks(P, Months)
plt.legend()
plt.grid()

fig,axes = plt.subplots(1, 2, figsize=(16,7))

# left panel: cumulative share of variance, starting from 0 eigenvectors
plt.subplot(1,2,1)
# list(P) keeps the concatenation valid even when P is not a plain list
n_eigs = [0] + list(P)
cum_share = np.array([0] + list(np.cumsum(eigval_prcp))) / np.sum(eigval_prcp)
plt.plot(n_eigs, cum_share)
plt.title('% of variance explained for monthly '+ m)
plt.ylabel('percentage of variance')
plt.xlabel('# eigenvector')
plt.grid()

# right panel: the first 8 principal components over the months
plt.subplot(1,2,2)
for i in range(8):
    plt.plot(P, eigvec_prcp[i], label='eig%d'%(i+1))
plt.title('top 8 eigenvectors of monthly ' + m)
plt.xticks(P, Months)
plt.legend()
plt.grid()

In [None]:
def decompose2(Series):
    """Decompose one monthly vector on the module-level `mean` and `eigvec_prcp`.

    Both `Series` and `mean` are divided by 10 before being handed to
    Eigen_decomp. Returns a Row with `res_k` / `coeff_k` for
    k = 1 .. len(residuals)-1.
    NOTE(review): the meaning of index [1] in the values returned by
    compute_var_explained() is defined in Eigen_decomp -- confirm there.
    """
    recon = Eigen_decomp(None, Series/10, mean/10, eigvec_prcp)
    total_var, residuals, reductions, coeff = recon.compute_var_explained()
    res_values = [float(v) for v in residuals[1]]
    coeff_values = [float(v) for v in coeff[1]]
    fields = {}
    # entry 0 of the residuals is skipped; entries 1.. pair up with the coefficients
    for k, res_k in enumerate(res_values[1:], 1):
        fields['res_%d'%k] = res_k
        fields['coeff_%d'%k] = coeff_values[k-1]
    return Row(**fields)

# Decompose every monthly vector and keep rows whose res_1 .. res_8 are all below 0.99.
rdd5 = rdd4.map(decompose2)
df5 = sqlContext.createDataFrame(rdd5)
for k in range(1, 9):
    df5 = df5.filter(df5['res_%d'%k] < 0.99)

def plot_CDF2(feature):
    """Plot the empirical CDFs of `<feature>_1` .. `<feature>_8` from df5 on one axes.

    feature -- column prefix, 'res' or 'coeff'

    x is the sorted column values, y the cumulative fraction k/n of rows
    (k = 0 .. n-1). The legend entries are the actual column names.
    """
    # count() is a Spark action, so run it once instead of twice per call
    n_rows = df5.count()
    # k/n for k = 0..n-1; built from an integer range so the length is exactly n
    P = np.arange(n_rows) / float(n_rows)
    for i in range(8):
        feat = feature+'_'+str(i+1)
        rows = df5.select(feat).sort(feat).collect()
        vals = [r[feat] for r in rows]
        # label with the plotted column (was hard-coded to 'res_%d')
        plt.plot(vals, P, label=feat)
    if feature == 'res':
        plt.title('cumulative distribution of residual variance')
    else:
        plt.title('cumulative distribution of '+feature)
    # P is a fraction in [0, 1), not a count
    plt.ylabel('fraction of instances')
    plt.grid()
    plt.legend()

# Only the residual CDFs are drawn; the coefficient variant is left disabled.
# plot_CDF2('coeff')
plot_CDF2('res')