# Spatial visualization of eigenvalues

Below we plot, for each station, the coefficients of the top eigenvectors (`coeff_1`, `coeff_2`, `coeff_3`) and the `total_var` column on a map, to see whether any spatial patterns emerge.

In [None]:
import os.path as path
import sys
import numpy as np
from pyspark import SparkContext, SparkConf
from lib import *
from pyspark.sql import *
# from utils import *
from lib.utils import *
import findspark
findspark.init()

%pylab inline


In [None]:
# Spark entry points: a SparkContext built by the project helper (the lib
# modules are handed over through pyFiles) and an SQLContext on top of it.
sc = create_sc(
    pyFiles=[
        'lib/numpy_pack.py',
        'lib/spark_PCA.py',
        'lib/computeStatistics.py',
    ]
)
sqlContext = SQLContext(sc)

# Directory that holds the parquet inputs used by this notebook
data_dir = '../DataHW3'

# Fetch the station table only if it is not on disk yet
if not path.exists('/'.join((data_dir, 'stations.parquet'))):
    getStations()

states = ['ND', 'SD', 'MN', 'IA', 'NE', 'TX', 'OK', 'KS']

for s in states:
    parquet, tarname = s + '.parquet', s + '.tgz'

    # This state's parquet is already present: nothing to download
    if path.exists('/'.join((data_dir, parquet))):
        continue

    # pull the weather data for a particular state from the MAS-DSE S3 bucket
    getStateData(s, data_dir, tarname, parquet)

In [None]:
# Load the decomposed snow-depth table and expose it to Spark SQL as `table`.
# NOTE(review): the path is relative to the working directory and does not
# use data_dir — confirm that is intended.
parquet_path = parquet = 'decon_midwest_SNWD.parquet'
df = sqlContext.read.parquet(parquet_path)
sqlContext.registerDataFrameAsTable(df, 'table')

# Sanity check: select the coeff_1 rows for North Dakota only.
Query = """
SELECT Station, Year, coeff_1, longitude, latitude
FROM table
WHERE state == 'ND'
"""
dframe = sqlContext.sql(Query)

# Last expression of the cell, so the row count is shown as its output
dframe.count()

## Coefficient 1

In [None]:
# Map of coeff_1 over every row in df (no year filter).
featureStr = 'coeff_1'
plotter = leaflet_eig(sqlContext, featureStr)

# Expose the feature under the generic column name `Values`; any existing
# `Values` column is dropped first so the rename cannot produce a duplicate.
table_name = f'table_{featureStr}'
values_df = df.drop('Values').withColumnRenamed(featureStr, 'Values')
sqlContext.registerDataFrameAsTable(values_df, table_name)

Query = f"""
SELECT Station, Year, Values, longitude, latitude
FROM {table_name}
"""
dframe = sqlContext.sql(Query)

plotter.add(dframe)
plotter.plot_all()

# Last expression of the cell: displays the plotter's `m` attribute
plotter.m

### Filtered by year

In [None]:
# Same coeff_1 map, restricted to the years 1970 through 2009 inclusive.
featureStr = 'coeff_1'
plotter = leaflet_eig(sqlContext, featureStr)

# Expose the feature under the generic column name `Values`; any existing
# `Values` column is dropped first so the rename cannot produce a duplicate.
table_name = f'table_{featureStr}'
values_df = df.drop('Values').withColumnRenamed(featureStr, 'Values')
sqlContext.registerDataFrameAsTable(values_df, table_name)

Query = f"""
SELECT Station, Year, Values, longitude, latitude
FROM {table_name}
WHERE Year>1969 and Year<2010
"""
dframe = sqlContext.sql(Query)

plotter.add(dframe)
plotter.plot_all()

# Last expression of the cell: displays the plotter's `m` attribute
plotter.m

In [None]:
# Calls color_legend() on the plotter left by the previous cell (the
# year-filtered coeff_1 map when cells are run top to bottom).
# NOTE(review): what the legend shows is defined by leaflet_eig in lib —
# presumably the value-to-color scale; confirm there.
plotter.color_legend()

## Coefficient 2

In [None]:
# Map of coeff_2 over every row in df (no year filter).
featureStr = 'coeff_2'
plotter = leaflet_eig(sqlContext, featureStr)

# Expose the feature under the generic column name `Values`; any existing
# `Values` column is dropped first so the rename cannot produce a duplicate.
table_name = f'table_{featureStr}'
values_df = df.drop('Values').withColumnRenamed(featureStr, 'Values')
sqlContext.registerDataFrameAsTable(values_df, table_name)

Query = f"""
SELECT Station, Year, Values, longitude, latitude
FROM {table_name}
"""
dframe = sqlContext.sql(Query)

plotter.add(dframe)
plotter.plot_all()

# Last expression of the cell: displays the plotter's `m` attribute
plotter.m

In [None]:
# Calls color_legend() on the plotter left by the previous cell (the coeff_2
# map when cells are run top to bottom).
# NOTE(review): what the legend shows is defined by leaflet_eig in lib —
# presumably the value-to-color scale; confirm there.
plotter.color_legend()

## Coefficient 3

In [None]:
# Map of coeff_3 over every row in df (no year filter).
featureStr = 'coeff_3'
plotter = leaflet_eig(sqlContext, featureStr)

# Expose the feature under the generic column name `Values`; any existing
# `Values` column is dropped first so the rename cannot produce a duplicate.
table_name = f'table_{featureStr}'
values_df = df.drop('Values').withColumnRenamed(featureStr, 'Values')
sqlContext.registerDataFrameAsTable(values_df, table_name)

Query = f"""
SELECT Station, Year, Values, longitude, latitude
FROM {table_name}
"""
dframe = sqlContext.sql(Query)

plotter.add(dframe)
plotter.plot_all()

# Last expression of the cell: displays the plotter's `m` attribute
plotter.m

In [None]:
# Calls color_legend() on the plotter left by the previous cell (the coeff_3
# map when cells are run top to bottom).
# NOTE(review): what the legend shows is defined by leaflet_eig in lib —
# presumably the value-to-color scale; confirm there.
plotter.color_legend()

## Total Var

In [None]:
# Map of the total_var column over every row in df (no year filter).
featureStr = 'total_var'
plotter = leaflet_eig(sqlContext, featureStr)

# Expose the feature under the generic column name `Values`; any existing
# `Values` column is dropped first so the rename cannot produce a duplicate.
table_name = f'table_{featureStr}'
values_df = df.drop('Values').withColumnRenamed(featureStr, 'Values')
sqlContext.registerDataFrameAsTable(values_df, table_name)

Query = f"""
SELECT Station, Year, Values, longitude, latitude
FROM {table_name}
"""
dframe = sqlContext.sql(Query)

plotter.add(dframe)
plotter.plot_all()

# Last expression of the cell: displays the plotter's `m` attribute
plotter.m

In [None]:
# Calls color_legend() on the plotter left by the previous cell (the
# total_var map when cells are run top to bottom).
# NOTE(review): what the legend shows is defined by leaflet_eig in lib —
# presumably the value-to-color scale; confirm there.
plotter.color_legend()

In [None]:
# Number of rows in the full DataFrame read from decon_midwest_SNWD.parquet
# (no state or year filter applied); shown as the cell's output.
df.count()