In [1]:
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = (12.0, 6.0)

import numpy as np
import math
import sys
import subprocess
import seaborn as sns
from fbprophet import Prophet

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

def _five_decimals(value):
    """Format a number with exactly five digits after the decimal point."""
    return '%.5f' % value


# pandas: fixed five-decimal floats, up to 100 columns / 100 rows per table
pd.options.display.float_format = _five_decimals
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

# numpy: precision 5 with scientific notation suppressed
np.set_printoptions(suppress=True, precision=5)

# seaborn plotting style: apply the 'ticks' style and the 'poster' context
# to every plot drawn after this cell
sns.set(style='ticks', context='poster')

In [2]:
# Per-station cluster assignments: station name, cluster id, coordinates and
# distance to the cluster centroid.
STATION_CLUSTER_CSV = 'station_cluster.csv'
station_cls = pd.read_csv(STATION_CLUSTER_CSV)
station_cls.head()

Unnamed: 0.1,Unnamed: 0,Station,Cluster,Lat,Long,Centroid_Dist
0,0,CARLSBAD/PALOMAR,11,33.133,-117.283,2.54477
1,1,SACRAMENTO EXECUTIVE ARPT,3,38.5,-121.5,3.01334
2,2,TUSTIN MCAF,2,33.7,-117.833,4.67815
3,3,MATHER FIELD,13,38.55,-121.3,2.03201
4,4,REDDING MUNICIPAL ARPT,9,40.517,-122.317,2.85041


In [3]:
# California weather-station metadata: USAF id, station name, CLASS and
# coordinates.
CA_STATIONS_CSV = 'CA_weather_stations.csv'
ca_meta = pd.read_csv(CA_STATIONS_CSV)
ca_meta.head()

Unnamed: 0.1,Unnamed: 0,USAF,STATION,CLASS,latitude,longitude
0,0,690140,EL TORO MCAS,3,33.667,-117.733
1,1,690150,TWENTYNINE PALMS,2,34.3,-116.167
2,2,690160,TUSTIN MCAF,3,33.7,-117.833
3,363,722810,NAF,3,32.817,-115.683
4,364,722860,MARCH AFB,2,33.9,-117.25


In [4]:
# Join the cluster table to the station metadata on the station name
# (default inner join), which attaches USAF and CLASS to each clustered station.
stations = station_cls.merge(ca_meta, left_on='Station', right_on='STATION')

In [5]:
# Preview the merged table; columns that exist in both inputs under the same
# name carry the _x / _y suffixes added by the merge.
stations.head()

Unnamed: 0,Unnamed: 0_x,Station,Cluster,Lat,Long,Centroid_Dist,Unnamed: 0_y,USAF,STATION,CLASS,latitude,longitude
0,0,CARLSBAD/PALOMAR,11,33.133,-117.283,2.54477,386,722927,CARLSBAD/PALOMAR,2,33.133,-117.283
1,1,SACRAMENTO EXECUTIVE ARPT,3,38.5,-121.5,3.01334,757,724830,SACRAMENTO EXECUTIVE ARPT,1,38.5,-121.5
2,2,TUSTIN MCAF,2,33.7,-117.833,4.67815,2,690160,TUSTIN MCAF,3,33.7,-117.833
3,3,MATHER FIELD,13,38.55,-121.3,2.03201,758,724833,MATHER FIELD,3,38.55,-121.3
4,4,REDDING MUNICIPAL ARPT,9,40.517,-122.317,2.85041,1045,725920,REDDING MUNICIPAL ARPT,1,40.517,-122.317


In [6]:
# Discard the duplicated name/coordinate columns from the metadata side and the
# two 'Unnamed: 0' columns carried in from the CSVs. The frame is rebound rather
# than mutated in place, and errors='ignore' skips columns that are already
# gone, so re-running this cell no longer raises a KeyError.
stations = stations.drop(
    columns=['Unnamed: 0_x', 'STATION', 'latitude', 'longitude', 'Unnamed: 0_y'],
    errors='ignore'
)
stations.head()

Unnamed: 0,Station,Cluster,Lat,Long,Centroid_Dist,USAF,CLASS
0,CARLSBAD/PALOMAR,11,33.133,-117.283,2.54477,722927,2
1,SACRAMENTO EXECUTIVE ARPT,3,38.5,-121.5,3.01334,724830,1
2,TUSTIN MCAF,2,33.7,-117.833,4.67815,690160,3
3,MATHER FIELD,13,38.55,-121.3,2.03201,724833,3
4,REDDING MUNICIPAL ARPT,9,40.517,-122.317,2.85041,725920,1


In [46]:
# NOTE(review): this cell's execution count (46) is out of sequence, and the
# Cluster / Centroid_Dist values in its output differ from the In [6] preview,
# so the output below reflects a different kernel state. Re-run the notebook
# top to bottom before trusting it.
stations

Unnamed: 0,Station,Cluster,Lat,Long,Centroid_Dist,USAF,CLASS
0,CARLSBAD/PALOMAR,2,33.13300,-117.28300,3.59682,722927,2
1,SACRAMENTO EXECUTIVE ARPT,3,38.50000,-121.50000,5.34634,724830,1
2,TUSTIN MCAF,2,33.70000,-117.83300,5.09995,690160,3
3,MATHER FIELD,3,38.55000,-121.30000,3.16284,724833,3
4,REDDING MUNICIPAL ARPT,5,40.51700,-122.31700,12.86466,725920,1
5,HANFORD MUNI ARPT [ISIS],3,36.31000,-119.63000,4.77649,723898,3
6,CHINO AIRPORT,0,33.96700,-117.63300,7.09038,722899,3
7,LEMOORE REEVES NAS,3,36.33300,-119.95000,3.48593,747020,2
8,PASO ROBLES MUNICIPAL ARPT,3,35.66700,-120.63300,6.49092,723965,2
9,FRESNO YOSEMITE INTL AP,3,36.78300,-119.71700,3.64849,723890,1


In [7]:
# Order the stations by cluster, then CLASS, then distance to the cluster
# centroid (all ascending), so the first row of every cluster is the one
# picked when a single station per cluster is selected.
sort_keys = ['Cluster', 'CLASS', 'Centroid_Dist']
df = stations.sort_values(by=sort_keys)

In [8]:
# Inspect the stations sorted by Cluster, CLASS and Centroid_Dist.
df

Unnamed: 0,Station,Cluster,Lat,Long,Centroid_Dist,USAF,CLASS
27,ARCATA AIRPORT,0,40.98300,-124.10000,1.99010,725945,1
95,CRESCENT CITY FAA AI,0,41.78300,-124.23300,2.83127,725946,2
101,MONTEREY NAF,0,36.60000,-121.86700,4.69712,724915,2
88,DAGGETT BARSTOW-DAGGETT AP,1,34.85000,-116.80000,4.63014,723815,1
45,PALMDALE AIRPORT,1,34.63300,-118.08300,2.05264,723820,2
50,EDWARDS AFB,1,34.90000,-117.86700,2.14745,723810,2
49,LANCASTER GEN WM FOX FIELD,1,34.73300,-118.21700,2.36621,723816,2
15,MOJAVE,1,35.06700,-118.15000,3.22195,722953,3
31,BICYCLE LAKE (AAF),1,35.28300,-116.61700,5.33533,746110,3
20,SANDBERG,1,34.75000,-118.71700,6.45239,723830,3


In [9]:
# Pick one station per cluster: the first row of each cluster in `df`, which is
# ordered by Cluster, CLASS and Centroid_Dist. groupby().head(1) returns those
# rows in `df` order (ascending cluster) in a single pass, without growing a
# frame through DataFrame.append, which is quadratic and was removed in
# pandas 2.0.
# .copy() detaches the result from `df` so later in-place edits do not trigger
# SettingWithCopy warnings.
selected_station = df.groupby('Cluster').head(1).copy()

In [10]:
# Use the same column names as the station metadata table
# (STATION / latitude / longitude).
column_aliases = {'Station': 'STATION', 'Lat': 'latitude', 'Long': 'longitude'}
selected_station = selected_station.rename(columns=column_aliases)
selected_station

Unnamed: 0,STATION,Cluster,latitude,longitude,Centroid_Dist,USAF,CLASS
27,ARCATA AIRPORT,0,40.983,-124.1,1.9901,725945,1
88,DAGGETT BARSTOW-DAGGETT AP,1,34.85,-116.8,4.63014,723815,1
18,LONG BEACH DAUGHERTY FLD,2,33.833,-118.167,1.90124,722970,1
1,SACRAMENTO EXECUTIVE ARPT,3,38.5,-121.5,3.01334,724830,1
76,IMPERIAL,4,32.833,-115.583,2.10691,747185,2
99,RIVERSIDE MUNI,5,33.95,-117.45,1.14486,722869,2
24,SOUTH LAKE TAHOE,6,38.9,-120.0,1.03368,725847,2
25,SAN FRANCISCO INTL AP,7,37.617,-122.4,3.99773,724940,1
80,CHINA LAKE NAF,8,35.683,-117.683,0.74062,746120,2
4,REDDING MUNICIPAL ARPT,9,40.517,-122.317,2.85041,725920,1


In [11]:
# Renumber the rows 0..n-1 and discard the old row labels inherited from `df`.
selected_station = selected_station.reset_index(drop=True)

In [12]:
# 'CZ' label for each selected station (presumably the climate-zone reference
# city; TODO confirm). The labels are positional: entry k is assigned to row k,
# so the list must stay in step with the row order of selected_station.
# NOTE(review): 'Santa Barbaba' looks like a typo for 'Santa Barbara'. It is
# left unchanged because the label is written to selected_stations.csv;
# confirm nothing matches on it before fixing.
cz_labels = [
    'Arcata',
    'Palmdale',
    'Torrance',
    'Sacramento',
    'Palm Spring-Intl',
    'Burbank-Glendale',
    'Blue Canyon',
    'Oakland',
    'China Lake',
    'Santa Rosa',
    'Santa Maria',
    'San Diego-Lindbergh',
    'Santa Barbaba',
    'Fresno',
    'Palm Spring',
    'Red Bluff',
]
selected_station['CZ'] = pd.Series(cz_labels, index=selected_station.index)

In [13]:
# Show the selected stations together with their CZ labels.
selected_station

Unnamed: 0,STATION,Cluster,latitude,longitude,Centroid_Dist,USAF,CLASS,CZ
0,ARCATA AIRPORT,0,40.983,-124.1,1.9901,725945,1,Arcata
1,DAGGETT BARSTOW-DAGGETT AP,1,34.85,-116.8,4.63014,723815,1,Palmdale
2,LONG BEACH DAUGHERTY FLD,2,33.833,-118.167,1.90124,722970,1,Torrance
3,SACRAMENTO EXECUTIVE ARPT,3,38.5,-121.5,3.01334,724830,1,Sacramento
4,IMPERIAL,4,32.833,-115.583,2.10691,747185,2,Palm Spring-Intl
5,RIVERSIDE MUNI,5,33.95,-117.45,1.14486,722869,2,Burbank-Glendale
6,SOUTH LAKE TAHOE,6,38.9,-120.0,1.03368,725847,2,Blue Canyon
7,SAN FRANCISCO INTL AP,7,37.617,-122.4,3.99773,724940,1,Oakland
8,CHINA LAKE NAF,8,35.683,-117.683,0.74062,746120,2,China Lake
9,REDDING MUNICIPAL ARPT,9,40.517,-122.317,2.85041,725920,1,Santa Rosa


In [14]:
# Persist the selected stations for later use.
# NOTE(review): to_csv writes the row index by default, which comes back as an
# 'Unnamed: 0' column when the file is re-read (as seen in the CSVs loaded
# above). Consider index=False if nothing downstream relies on that column.
selected_station.to_csv('selected_stations.csv')

In [15]:
# USAF ids of the selected stations; these name the per-station folders and
# output files in the concatenation loop below.
list(selected_station.USAF)

[725945,
 723815,
 722970,
 724830,
 747185,
 722869,
 725847,
 724940,
 746120,
 725920,
 723940,
 722900,
 723925,
 723890,
 690150,
 725955]

In [16]:
# Number of selected stations (one USAF id per row).
len(selected_station.USAF)

16

In [17]:
import glob

# For every selected station, stack all CSV files found under
# nsrdb_solar/<USAF>/ into one frame and write it to '<USAF>.csv' in the
# working directory.
for i in list(selected_station.USAF):
    # Single parenthesised string: prints the same text on Python 2 and 3.
    print("process station  %s" % i)
    out = str(i) + '.csv'
    # glob returns paths in arbitrary order; sort them so the concatenated rows
    # come out in the same order on every run and machine.
    allFiles = sorted(glob.glob('nsrdb_solar/' + str(i) + '/*.csv'))
    if not allFiles:
        # pd.concat([]) would fail with an opaque "No objects to concatenate".
        raise ValueError('no CSV files found under nsrdb_solar/%s/' % i)
    # Keep the per-file frames out of `df`, which holds the sorted station
    # table used when picking one station per cluster.
    station_frames = [pd.read_csv(file_, index_col=None, header=0)
                      for file_ in allFiles]
    data_all = pd.concat(station_frames)
    data_all.to_csv(out)

process station  725945
process station  723815
process station  722970
process station  724830
process station  747185
process station  722869
process station  725847
process station  724940
process station  746120
process station  725920
process station  723940
process station  722900
process station  723925
process station  723890
process station  690150
process station  725955


In [18]:
# List the column names of the final selected-station table.
selected_station.columns

Index([u'STATION', u'Cluster', u'latitude', u'longitude', u'Centroid_Dist',
       u'USAF', u'CLASS', u'CZ'],
      dtype='object')