In [None]:
# Import measurement data from weatherlink files, modify, and export to new tables for each station

# This page is very useful for time series manipulations:
# https://chrisalbon.com/python/data_wrangling/pandas_time_series_basics/

In [35]:
import pandas as pd
import numpy as np
import plotly.plotly as py
import plotly.figure_factory as ff
import plotly.graph_objs as go
import os.path
from datetime import datetime

In [117]:
# Input/output locations, all resolved relative to the parent of the current
# working directory. Every path component is passed to os.path.join separately
# so the correct separator is used on each platform (a hard-coded backslash
# inside a component only resolves on Windows).
parent_dir = os.path.join(os.getcwd(), '..')
insample = os.path.join(parent_dir, 'samples', 'IS203_EM.csv')
indata = os.path.join(parent_dir, 'input_data', 'Staten_Island_-_East_Pumps_1-1-18_12-00_AM_1_Year_1546631222.csv')
outname = os.path.join(parent_dir, 'DataValues.csv')

In [92]:
# Read the header block: skip the first two lines of the file, then take the
# next three rows (site, meter description, port) without treating any of
# them as a pandas header.
headers = pd.read_csv(
    indata,
    header=None,
    skiprows=2,
    nrows=3,
)
headers

Unnamed: 0,0,1,2,3,4,5,6,7,8
0,,Siphon 7 SI,Siphon 7 SI,Siphon 7 SI,Siphon 30,Siphon 39,Siphon 40,Siphon 11,Siphon 15
1,,InnovaSonic 203 (100 Gallons per Pulse),AG2000 -1200 -11/* -PxH GPM G (100 Gal per Pulse),InnovaSonic 203 (100 Gallons per Pulse),InnovaSonic 203 (100 Gallons per Pulse),InnovaSonic 203 (100 Gallons per Pulse),InnovaSonic 203 (100 Gallons per Pulse),InnovaSonic 203 (100 Gallons per Pulse),InnovaSonic 203 (100 Gallons per Pulse)
2,,Port 1,Port 2,Port 3,Port 1,Port 1,Port 1,Port 1,Port 1


In [93]:
# The first three data columns all carry the same site label in the header row,
# so they get hand-written names that tell the meters apart.
columns = ["Siphon7_IS_1", "Siphon7_AG_2", "Siphon7_IS_3"]

# Remaining column names come from the first header row with spaces stripped.
# The first header cell belongs to the timestamp column and is skipped.
headers_row0 = headers.iloc[0].tolist()
k = [label.replace(' ', '') for label in headers_row0[1:]]
columns += k[3:]
columns

['Siphon7_IS_1',
 'Siphon7_AG_2',
 'Siphon7_IS_3',
 'Siphon30',
 'Siphon39',
 'Siphon40',
 'Siphon11',
 'Siphon15']

In [94]:
# Load the measurements, skipping the first six lines of the file.
# The first column becomes the index and "--" is read as a missing value.
df = pd.read_csv(
    indata,
    skiprows=6,
    header=None,
    index_col=0,
    na_values="--",
)
df.columns = columns

# Turn the index into real timestamps (takes about 15 seconds)
df.index = pd.to_datetime(df.index)
df.index.name = "LocalDateTime"
df.head()

Unnamed: 0_level_0,Siphon7_IS_1,Siphon7_AG_2,Siphon7_IS_3,Siphon30,Siphon39,Siphon40,Siphon11,Siphon15
LocalDateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2018-01-01 00:00:00,0.0,0.0,,,,,,
2018-01-01 00:05:00,0.0,0.0,,,,,,
2018-01-01 00:10:00,0.0,0.0,,,,,,
2018-01-01 00:15:00,0.0,0.0,,,,,,
2018-01-01 00:20:00,0.0,0.0,,,,,,


In [95]:
# Convert all data to gallons per minute: rows are 5 minutes apart, so each
# value is divided by 5.
# NOTE(review): assumes the raw values are gallons per 5-minute interval - confirm.
# Re-running this cell divides the values again.
df = df.div(5)

In [42]:
# Check for daylight savings time issues
# Start: 2am, March 11, 2018 - 2am is converted to 3am
# End:   2am, November 4, 2018 - 2am data is missing
df.loc['3-11-2018 01:00':'3-11-2018 04:00']
#df.loc['11-4-2018 01:00':'11-4-2018 04:00']

Unnamed: 0_level_0,Siphon7_IS_1,Siphon7_AG_2,Siphon7_IS_3,Siphon30,Siphon39,Siphon40,Siphon11,Siphon15,DateTimeUTC
LocalDateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2018-03-11 01:00:00,0.0,0.0,0.0,,,,,,2018-03-11 09:00:00+00:00
2018-03-11 01:05:00,0.0,0.0,0.0,,,,,,2018-03-11 09:05:00+00:00
2018-03-11 01:10:00,0.0,0.0,0.0,,,,,,2018-03-11 09:10:00+00:00
2018-03-11 01:15:00,0.0,0.0,0.0,,,,,,2018-03-11 09:15:00+00:00
2018-03-11 01:20:00,0.0,0.0,0.0,,,,,,2018-03-11 09:20:00+00:00
2018-03-11 01:25:00,0.0,0.0,0.0,,,,,,2018-03-11 09:25:00+00:00
2018-03-11 01:30:00,0.0,0.0,0.0,,,,,,2018-03-11 09:30:00+00:00
2018-03-11 01:35:00,0.0,0.0,0.0,,,,,,2018-03-11 09:35:00+00:00
2018-03-11 01:40:00,0.0,0.0,0.0,,,,,,2018-03-11 09:40:00+00:00
2018-03-11 01:45:00,0.0,0.0,0.0,,,,,,2018-03-11 09:45:00+00:00


In [96]:
# Create UTC column: treat the naive index as US/Pacific local time, then
# convert to UTC. Ambiguous local times are set to NaT rather than guessed.
local_index = df.index.tz_localize('US/Pacific', ambiguous='NaT')
df['DateTimeUTC'] = local_index.tz_convert('UTC')

In [97]:
# Create UTCOffset column: local time minus UTC, in hours.
# The naive local index is labelled as UTC only so that it can be subtracted
# from the true UTC timestamps; the difference is (UTC - local).
# total_seconds() is used rather than .dt.seconds because .dt.seconds is only
# the non-negative seconds component of a timedelta and would give a wrong
# offset whenever the difference is negative.
utc_minus_local = df['DateTimeUTC'] - df.index.tz_localize('UTC')
df['UTCOffset'] = utc_minus_local.dt.total_seconds() / (60 * 60) * -1

In [98]:
# Create the additional columns; each holds the same value on every row
constant_columns = [
    ('QualityControlLevelCode', 0),  # 0 is for raw data
    ('SourceCode', "TNC"),
    ('CensorCode', "nc"),
    ('VariableCode', "GPM_5min"),
]
for column_name, constant in constant_columns:
    df[column_name] = constant

In [99]:
# Move the LocalDateTime index back into an ordinary column; the frame gets
# a default integer index in its place.
df = df.reset_index(drop=False)

In [100]:
# Drop 'Siphon7_IS_3' because it is the same as 'Siphon7_IS_1'
df = df.drop(columns=['Siphon7_IS_3'])


In [101]:
# Convert wide to long format: one row per (timestamp, meter) pair.
# id_vars are repeated on every output row; value_vars are the meter columns,
# whose names go to 'variable' and readings to 'value'.
id_vars = [
    'LocalDateTime', 'UTCOffset', 'DateTimeUTC', 'VariableCode',
    'SourceCode', 'QualityControlLevelCode', 'CensorCode',
]
value_vars = [
    'Siphon7_IS_1', 'Siphon7_AG_2',
    'Siphon30', 'Siphon39', 'Siphon40', 'Siphon11', 'Siphon15',
]

df = df.melt(id_vars=id_vars, value_vars=value_vars)

In [104]:
# Update the MethodCode field: every meter defaults to IS203_EM, and the one
# AG meter column is overridden afterwards.
# NOTE(review): the file's header row describes this meter as "AG2000 ..."
# while the code written here is "AG3000_EM" - confirm which is intended.
df['MethodCode'] = "IS203_EM"
is_ag_meter = df['variable'].eq("Siphon7_AG_2")
df.loc[is_ag_meter, 'MethodCode'] = "AG3000_EM"

In [107]:
# Update the Site field: both Siphon 7 meter columns collapse to the single
# site name "Siphon7", then the melted 'variable' column is renamed SiteCode.
siphon7_meters = ["Siphon7_AG_2", "Siphon7_IS_1"]
df['variable'] = df['variable'].mask(df['variable'].isin(siphon7_meters), "Siphon7")
df = df.rename(columns={'variable': 'SiteCode'})

In [109]:
# Update DataValue field: the melted 'value' column holds the readings
df = df.rename(columns={'value': 'DataValue'})

In [118]:
# Update Qualifier field: every row gets 0 for now.
# More work needed to screen for outliers and missing values.
df = df.assign(QualifierCode=0)

In [110]:
# Index the rows by local timestamp while keeping LocalDateTime as a column
df = df.set_index('LocalDateTime', drop=False)

In [114]:
# Preview one month of readings for a single site and method
month = '7-2018'
site = 'Siphon7'
method = 'IS203_EM'

# Select the month's rows once with .loc: selecting rows by a partial date
# string through df[...] is deprecated and removed in pandas 2.0.
month_rows = df.loc[month]
month_rows.loc[(month_rows['SiteCode'] == site) & (month_rows['MethodCode'] == method)]

Unnamed: 0_level_0,LocalDateTime,UTCOffset,DateTimeUTC,VariableCode,SourceCode,QualityControlLevelCode,CensorCode,SiteCode,DataValue,MethodCode
LocalDateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2018-07-01 00:00:00,2018-07-01 00:00:00,-7.0,2018-07-01 07:00:00+00:00,GPM_5min,TNC,0,nc,Siphon7,1880.0,IS203_EM
2018-07-01 00:05:00,2018-07-01 00:05:00,-7.0,2018-07-01 07:05:00+00:00,GPM_5min,TNC,0,nc,Siphon7,1900.0,IS203_EM
2018-07-01 00:10:00,2018-07-01 00:10:00,-7.0,2018-07-01 07:10:00+00:00,GPM_5min,TNC,0,nc,Siphon7,1880.0,IS203_EM
2018-07-01 00:15:00,2018-07-01 00:15:00,-7.0,2018-07-01 07:15:00+00:00,GPM_5min,TNC,0,nc,Siphon7,1880.0,IS203_EM
2018-07-01 00:20:00,2018-07-01 00:20:00,-7.0,2018-07-01 07:20:00+00:00,GPM_5min,TNC,0,nc,Siphon7,1860.0,IS203_EM
2018-07-01 00:25:00,2018-07-01 00:25:00,-7.0,2018-07-01 07:25:00+00:00,GPM_5min,TNC,0,nc,Siphon7,1880.0,IS203_EM
2018-07-01 00:30:00,2018-07-01 00:30:00,-7.0,2018-07-01 07:30:00+00:00,GPM_5min,TNC,0,nc,Siphon7,1860.0,IS203_EM
2018-07-01 00:35:00,2018-07-01 00:35:00,-7.0,2018-07-01 07:35:00+00:00,GPM_5min,TNC,0,nc,Siphon7,1880.0,IS203_EM
2018-07-01 00:40:00,2018-07-01 00:40:00,-7.0,2018-07-01 07:40:00+00:00,GPM_5min,TNC,0,nc,Siphon7,1860.0,IS203_EM
2018-07-01 00:45:00,2018-07-01 00:45:00,-7.0,2018-07-01 07:45:00+00:00,GPM_5min,TNC,0,nc,Siphon7,1880.0,IS203_EM


In [116]:
# Plot one month of readings for a single site and method
month = '7-2018'
site = 'Siphon7'
method = 'IS203_EM'

# Select the month's rows once with .loc: selecting rows by a partial date
# string through df[...] is deprecated and removed in pandas 2.0.
month_rows = df.loc[month]
dfchart = month_rows.loc[(month_rows['SiteCode'] == site) &
                         (month_rows['MethodCode'] == method)
                        ]

# Single line-and-marker trace of DataValue against local time
trace = go.Scatter(
    x = dfchart.LocalDateTime,
    y = dfchart.DataValue,
    mode = "lines+markers"
)
data = [trace]
py.iplot(data,filename="jupyter-basic_line")

In [119]:
# Write the long-format table to a UTF-8 CSV, leaving the index out
df.to_csv(
    outname,
    index=False,
    encoding='utf-8',
)