In [1]:
# Import measurement data from weatherlink files, modify, and export to new tables for each station

# This page is very useful for time series manipulations:
# https://chrisalbon.com/python/data_wrangling/pandas_time_series_basics/

In [1]:
import pandas as pd
import numpy as np
import os.path
from datetime import datetime

In [2]:
# Locate the input and output files for the year being processed.
# Every path component is handed to os.path.join separately so the platform's
# own separator is used; a literal backslash is only a separator on Windows.
year = '2019'
year_dir = os.path.join( os.getcwd(), '..', year )
indata = os.path.join( year_dir, 'input_data', 'Staten_Island_-_East_Pumps_1-1-19_12-00_AM_1_Year_1580151855_v2.csv' )
indata2 = os.path.join( year_dir, 'input_data', 'Staten_Island_-_South_Pumps_1-1-19_12-00_AM_1_Year_1580172497_v2.csv' )

outname = os.path.join( year_dir, 'DataValues.csv' )

In [3]:
# Read the header block of the first input file: three rows, starting with
# the third row of the file.
headers = pd.read_csv(
    indata,
    header=None,
    skiprows=2,
    nrows=3,
)
headers

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,,Siphon 10,Siphon 36,Siphon 37,Siphon 38,Siphon 4,Siphon 5,Siphon 6,Siphon 7 SI,Siphon 7 SI,Siphon 9
1,,InnovaSonic 203 (100 Gallons per Pulse),InnovaSonic 203 (100 Gallons per Pulse),InnovaSonic 203 (100 Gallons per Pulse),InnovaSonic 203 (100 Gallons per Pulse),InnovaSonic 203 (100 Gallons per Pulse),InnovaSonic 203 (100 Gallons per Pulse),InnovaSonic 203 (100 Gallons per Pulse),InnovaSonic 203 (100 Gallons per Pulse),AG2000 -1200 -11/* -PxH GPM G (100 Gal per Pulse),InnovaSonic 203 (100 Gallons per Pulse)
2,,Port 1,Port 1,Port 1,Port 1,Port 1,Port 1,Port 1,Port 1,Port 2,Port 1


In [4]:
# Build the column names from the first header row with all spaces removed.
# The first entry of that row sits above the timestamp column, so it is skipped.
headers_row0 = headers.iloc[0].tolist()
columns = [site_name.replace(' ', '') for site_name in headers_row0[1:]]
columns

['Siphon10',
 'Siphon36',
 'Siphon37',
 'Siphon38',
 'Siphon4',
 'Siphon5',
 'Siphon6',
 'Siphon7SI',
 'Siphon7SI',
 'Siphon9']

In [5]:
# Siphon 7 has two meters, so its two identically named columns (positions 7
# and 8) are replaced with names that tell the meters apart.
columns[7:9] = ['Siphon7_IS', 'Siphon7_AG']
columns

['Siphon10',
 'Siphon36',
 'Siphon37',
 'Siphon38',
 'Siphon4',
 'Siphon5',
 'Siphon6',
 'Siphon7_IS',
 'Siphon7_AG',
 'Siphon9']

In [6]:
# Load the measurements below the header block; the first column becomes the
# index and "--" entries are read as missing values.
df = pd.read_csv(
    indata,
    header=None,
    skiprows=6,
    index_col=0,
    na_values="--",
)

# Parse the index into timestamps (takes about 15 seconds) and label it.
local_times = pd.to_datetime(df.index)
local_times.name = "LocalDateTime"
df.index = local_times

df.columns = columns
df.head()

Unnamed: 0_level_0,Siphon10,Siphon36,Siphon37,Siphon38,Siphon4,Siphon5,Siphon6,Siphon7_IS,Siphon7_AG,Siphon9
LocalDateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2019-01-01 00:00:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,
2019-01-01 00:05:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,
2019-01-01 00:10:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,
2019-01-01 00:15:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,
2019-01-01 00:20:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,


## Import any additional datafiles

In [7]:
# Read the header block of the second input file: three rows, starting with
# the third row of the file.
headers2 = pd.read_csv(
    indata2,
    header=None,
    skiprows=2,
    nrows=3,
)
headers2

Unnamed: 0,0,1
0,,Siphon 21
1,,InnovaSonic 203 (100 Gallons per Pulse)
2,,Port 1


In [8]:
# Load the second file's measurements the same way as the first: first column
# as the index, "--" entries read as missing values.
df2 = pd.read_csv(
    indata2,
    header=None,
    skiprows=6,
    index_col=0,
    na_values="--",
)

# Parse the index into timestamps (takes about 15 seconds) and label it.
local_times2 = pd.to_datetime(df2.index)
local_times2.name = "LocalDateTime"
df2.index = local_times2

# This file holds a single site.
df2.columns = ['Siphon21']
df2.head()

Unnamed: 0_level_0,Siphon21
LocalDateTime,Unnamed: 1_level_1
2019-08-15 11:25:00,0
2019-08-15 11:30:00,0
2019-08-15 11:35:00,0
2019-08-15 11:40:00,0
2019-08-15 11:45:00,0


In [9]:
# Attach the second file's column to the first frame, matching rows on the
# LocalDateTime index. A left join keeps every row of the first frame.
df = df.merge(df2, how='left', on='LocalDateTime')
df.head()

Unnamed: 0_level_0,Siphon10,Siphon36,Siphon37,Siphon38,Siphon4,Siphon5,Siphon6,Siphon7_IS,Siphon7_AG,Siphon9,Siphon21
LocalDateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-01-01 00:00:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,,
2019-01-01 00:05:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,,
2019-01-01 00:10:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,,
2019-01-01 00:15:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,,
2019-01-01 00:20:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,,


In [15]:
# Refresh the list of site columns now that the frames have been combined.
columns = df.columns.tolist()

## Update dataframe to standard format

In [16]:
# Convert all data to gallons per minute by dividing each reading by the
# 5-minute spacing of the records.
# NOTE(review): assumes each raw value is the volume counted over one
# 5-minute interval - confirm against the logger export settings.
minutes_per_reading = 5
df = df / minutes_per_reading

In [20]:
# OPTIONAL
# Inspect the rows around the daylight saving time transitions.
# Start: 2am, March 10, 2019. 2am is converted to 3am.
# End: 2am, November 3, 2019. 2am data is missing.
df.loc['3-10-2019 01:00':'3-10-2019 04:00']
#df.loc['11-3-2019 01:00':'11-3-2019 04:00']

Unnamed: 0_level_0,Siphon10,Siphon36,Siphon37,Siphon38,Siphon4,Siphon5,Siphon6,Siphon7_IS,Siphon7_AG,Siphon9,Siphon21
LocalDateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2019-03-10 01:00:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,,
2019-03-10 01:05:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,,
2019-03-10 01:10:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,,
2019-03-10 01:15:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,,
2019-03-10 01:20:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,,
2019-03-10 01:25:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,,
2019-03-10 01:30:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,,
2019-03-10 01:35:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,,
2019-03-10 01:40:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,,
2019-03-10 01:45:00,0.0,0.0,0.0,0.0,,,,0.0,0.0,,


In [21]:
# Create the UTC column: interpret the local timestamps as US/Pacific, then
# convert them to UTC. Ambiguous local times become NaT.
pacific_times = df.index.tz_localize('US/Pacific', ambiguous='NaT')
df['DateTimeUTC'] = pacific_times.tz_convert('UTC')

In [22]:
# Create the UTCOffset column, in hours (negative when local time is behind UTC).
# The local timestamps are labelled as UTC only so they can be subtracted from
# the true UTC timestamps; the difference is the offset with its sign flipped.
# total_seconds() is used rather than .seconds because .seconds holds only the
# non-negative seconds component of a timedelta and would give wrong offsets
# whenever the difference is negative (local time ahead of UTC).
local_as_utc = df.index.tz_localize('UTC')
df['UTCOffset'] = (df['DateTimeUTC'] - local_as_utc).dt.total_seconds()/(60*60)*-1

In [23]:
# Add the descriptor columns that hold the same value on every row.
constant_fields = {
    'QualityControlLevelCode': 0,  # 0 is for raw data
    'SourceCode': "TNC2020",
    'CensorCode': "nc",
    'VariableCode': "GPM_5min",
}
for field_name, field_value in constant_fields.items():
    df[field_name] = field_value

In [24]:
# Move the LocalDateTime index into an ordinary column.
df = df.reset_index(drop=False)

In [25]:
# Convert wide to long format: one row per timestamp and site column.
# The descriptor columns are repeated on every row; the site columns are
# stacked into the 'variable' / 'value' pair.
descriptor_fields = [
    'LocalDateTime',
    'UTCOffset',
    'DateTimeUTC',
    'VariableCode',
    'SourceCode',
    'QualityControlLevelCode',
    'CensorCode',
]

df = df.melt(id_vars=descriptor_fields, value_vars=columns)

In [26]:
# Fill in the MethodCode field.
# All sites use the same method except the Siphon7_AG meter.
# NOTE(review): the header row of the input file names that meter "AG2000 ...",
# while the code assigned here is "AG3000_EM" - confirm which is intended.
is_ag_meter = df['variable'] == "Siphon7_AG"
df['MethodCode'] = "IS203_EM"
df.loc[is_ag_meter, 'MethodCode'] = "AG3000_EM"

In [27]:
# Fill in the Site field: both Siphon 7 meters belong to the same site, so
# their rows share one site name before the column is renamed to SiteCode.
siphon7_meters = ["Siphon7_AG", "Siphon7_IS"]
df.loc[df['variable'].isin(siphon7_meters), 'variable'] = "Siphon7"
df = df.rename(columns={'variable': 'SiteCode'})

In [28]:
# Rename the melted 'value' column to DataValue.
df = df.rename(columns={'value': 'DataValue'})

In [29]:
# Add the QualifierCode field, set to 0 on every row.
# More work needed to screen for outliers and missing values.
df = df.assign(QualifierCode=0)

In [30]:
# Remove the localization from the UTC column: render the timestamps as text
# and drop the last six characters (presumably the "+00:00" offset suffix).
utc_text = df['DateTimeUTC'].astype(str)
df['DateTimeUTC'] = utc_text.str.slice(stop=-6)

In [31]:
# Write the long-format table to the output CSV, without the row index.
df.to_csv(outname, index=False, encoding='utf-8')