In [1]:
import numpy as np
import pandas as pd

In [2]:
# Input files.
# One hydro-portal CSV per measured series; the list order fixes the column
# order of the merged frame built from these files.
hydroportal_files_paths = [
    'Discharge_3_hour_mean_inflow.csv',
    'Lake_Height.csv',
    'PercentFull_Active_Lake_Storage.csv',
    'Snow_Volume_Opuha_Catchment.csv',
    'Turbidity_Buoy.csv',
    'Turbidity_Platform.csv',
    'Water_Temp_Buoy.csv',
    'Water_Temp_Platform.csv',
]

# Single Cliflo export file.
cliflo_files = 'Cliflo_39255_Fairlie_Env_Data.csv'

In [3]:
# Read every hydro-portal CSV into its own frame, then join them on timestamp.

dataframe_list = []

for csv_name in hydroportal_files_paths:
    # Prefix each column with the file stem so names stay unique after merging.
    column_prefix = csv_name.split('.')[0] + '_'
    # The first line of each file is skipped; the first column becomes the index.
    frame = pd.read_csv(csv_name, skiprows=1, index_col=0).add_prefix(column_prefix)
    frame.index = pd.to_datetime(frame.index)
    dataframe_list.append(frame)

# Outer join keeps every timestamp that appears in any of the files.
merged_hydro_portal_data = pd.concat(dataframe_list, axis=1, join='outer')

merged_hydro_portal_data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 782231 entries, 1981-04-02 00:00:00 to 2024-06-10 12:30:00
Data columns (total 8 columns):
 #   Column                                      Non-Null Count   Dtype  
---  ------                                      --------------   -----  
 0   Discharge_3_hour_mean_inflow_Value (m^3/s)  28349 non-null   float64
 1   Lake_Height_Value (m)                       699906 non-null  float64
 2   PercentFull_Active_Lake_Storage_Value (%)   691883 non-null  float64
 3   Snow_Volume_Opuha_Catchment_Value (Mm^3)    15766 non-null   float64
 4   Turbidity_Buoy_Value (NTU)                  217615 non-null  float64
 5   Turbidity_Platform_Value (NTU)              213917 non-null  float64
 6   Water_Temp_Buoy_Value (°C)                  197546 non-null  float64
 7   Water_Temp_Platform_Value (°C)              214817 non-null  float64
dtypes: float64(8)
memory usage: 53.7 MB


In [4]:
# Preview the first rows of the merged hydro-portal frame.
merged_hydro_portal_data.head()

Unnamed: 0_level_0,Discharge_3_hour_mean_inflow_Value (m^3/s),Lake_Height_Value (m),PercentFull_Active_Lake_Storage_Value (%),Snow_Volume_Opuha_Catchment_Value (Mm^3),Turbidity_Buoy_Value (NTU),Turbidity_Platform_Value (NTU),Water_Temp_Buoy_Value (°C),Water_Temp_Platform_Value (°C)
Timestamp (UTC+12:00),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1981-04-02,,,,0.0,,,,
1981-04-03,,,,0.0,,,,
1981-04-04,,,,0.0,,,,
1981-04-05,,,,0.0,,,,
1981-04-06,,,,0.0,,,,


In [5]:
# Rename the columns of the dataframe to make it more readable.
#
# The new names are applied by position, so they must stay in the same order
# as the files listed in hydroportal_files_paths.
# NOTE(review): the loader output reports the snow-volume column in Mm^3, but
# its readable name says "(mm)". The label is kept unchanged because it ends up
# in the exported CSV headers — confirm the intended unit before changing it.

new_column_names = [
    'Discharge_(m^3/s)',
    'Lake_Height_(m)',
    'PercentFull_Active_Lake_Storage_(%)',
    'Snow_Volume_Opuha_Catchment_(mm)',
    'Turbidity_Buoy_(NTU)',
    'Turbidity_Platform_(NTU)',
    'Water_Temp_Buoy_(degC)',
    'Water_Temp_Platform_(degC)']

current_column_names = list(merged_hydro_portal_data.columns)

# zip() stops silently at the shorter input, which would leave some columns
# un-renamed without any error; fail loudly instead.
if len(current_column_names) != len(new_column_names):
    raise ValueError(
        f'Expected {len(new_column_names)} columns to rename, '
        f'found {len(current_column_names)}: {current_column_names}')

# Every readable name begins with the same stem as the column it replaces
# (e.g. 'Lake_Height'), so a stem mismatch means the positional pairing is off.
# The check also passes when the cell is re-run on already renamed columns.
mismatched_pairs = [
    (old_name, new_name)
    for old_name, new_name in zip(current_column_names, new_column_names)
    if not str(old_name).startswith(new_name.rsplit('_(', 1)[0])
]
if mismatched_pairs:
    raise ValueError(f'Column order does not match new_column_names: {mismatched_pairs}')

rename_dic = dict(zip(current_column_names, new_column_names))

# Rename the columns of the dataframe (reassign instead of mutating in place)

merged_hydro_portal_data = merged_hydro_portal_data.rename(columns=rename_dic)

In [6]:
# Display the merged hydro-portal frame with its renamed columns.
merged_hydro_portal_data

Unnamed: 0_level_0,Discharge_(m^3/s),Lake_Height_(m),PercentFull_Active_Lake_Storage_(%),Snow_Volume_Opuha_Catchment_(mm),Turbidity_Buoy_(NTU),Turbidity_Platform_(NTU),Water_Temp_Buoy_(degC),Water_Temp_Platform_(degC)
Timestamp (UTC+12:00),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1981-04-02 00:00:00,,,,0.0,,,,
1981-04-03 00:00:00,,,,0.0,,,,
1981-04-04 00:00:00,,,,0.0,,,,
1981-04-05 00:00:00,,,,0.0,,,,
1981-04-06 00:00:00,,,,0.0,,,,
...,...,...,...,...,...,...,...,...
2024-06-10 12:00:00,0.170833,376.7,13.815221,,2.06,3.79,6.48,6.47
2024-06-10 12:05:00,,376.7,13.815221,,,,,
2024-06-10 12:10:00,,376.7,13.815221,,,,,
2024-06-10 12:15:00,,376.7,13.815221,,2.26,4.24,6.48,6.46


In [7]:
# Reading the Cliflo data

# The first 10 lines of the file are skipped before the header row is parsed.
cliflo_raw = pd.read_csv(cliflo_files, skiprows=10)

# Discard the last 6 rows and the first column (selected by position).
leading_column = cliflo_raw.columns[0]
cliflo_data = cliflo_raw.iloc[:-6].drop(columns=leading_column)

In [8]:
# Parse 'Day(Local_Date)' (format YYYYMMDD:HHMM) into datetimes, use it as the
# index, and give the index the same name as the hydro-portal frame's index.
local_timestamps = pd.to_datetime(cliflo_data['Day(Local_Date)'], format='%Y%m%d:%H%M')
cliflo_data = (
    cliflo_data
    .assign(**{'Day(Local_Date)': local_timestamps})
    .set_index('Day(Local_Date)')
    .rename_axis('Timestamp (UTC+12:00)')
)



In [9]:
# Combine the hydro-portal and Cliflo frames side by side on their timestamps;
# the outer join keeps timestamps present in either frame.

ground_frames = [merged_hydro_portal_data, cliflo_data]
Raw_Ground_Data = pd.concat(ground_frames, axis='columns', join='outer')

In [10]:
# Save the merged, not yet cleaned frame to CSV.
Raw_Ground_Data.to_csv('Raw_Ground_Data.csv')

In [28]:
# Display the merged frame: hydro-portal columns followed by the Cliflo columns.
Raw_Ground_Data

Unnamed: 0_level_0,Discharge_(m^3/s),Lake_Height_(m),PercentFull_Active_Lake_Storage_(%),Snow_Volume_Opuha_Catchment_(mm),Turbidity_Buoy_(NTU),Turbidity_Platform_(NTU),Water_Temp_Buoy_(degC),Water_Temp_Platform_(degC),WDir(Deg),WSpd(m/s),...,Tgmin_Rel,ET05_Rel,ET10_Rel,ET20_Rel,ET30_Rel,ET100_Rel,Pmsl_Rel,Pstn_Rel,Sun_Rel,Rad_Rel
Timestamp (UTC+12:00),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1981-04-02 00:00:00,,,,0.0,,,,,,,...,,,,,,,,,,
1981-04-03 00:00:00,,,,0.0,,,,,,,...,,,,,,,,,,
1981-04-04 00:00:00,,,,0.0,,,,,,,...,,,,,,,,,,
1981-04-05 00:00:00,,,,0.0,,,,,,,...,,,,,,,,,,
1981-04-06 00:00:00,,,,0.0,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-06-10 12:00:00,0.170833,376.7,13.815221,,2.06,3.79,6.48,6.47,,,...,,,,,,,,,,
2024-06-10 12:05:00,,376.7,13.815221,,,,,,,,...,,,,,,,,,,
2024-06-10 12:10:00,,376.7,13.815221,,,,,,,,...,,,,,,,,,,
2024-06-10 12:15:00,,376.7,13.815221,,2.26,4.24,6.48,6.46,,,...,,,,,,,,,,


# Ground data clean up

In [11]:
# Goal of this section: remove the columns that only contain '-'.
# Start by inspecting the dtypes and non-null counts of every column.
Raw_Ground_Data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 782231 entries, 1981-04-02 00:00:00 to 2024-06-10 12:30:00
Data columns (total 47 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   Discharge_(m^3/s)                    28349 non-null   float64
 1   Lake_Height_(m)                      699906 non-null  float64
 2   PercentFull_Active_Lake_Storage_(%)  691883 non-null  float64
 3   Snow_Volume_Opuha_Catchment_(mm)     15766 non-null   float64
 4   Turbidity_Buoy_(NTU)                 217615 non-null  float64
 5   Turbidity_Platform_(NTU)             213917 non-null  float64
 6   Water_Temp_Buoy_(degC)               197546 non-null  float64
 7   Water_Temp_Platform_(degC)           214817 non-null  float64
 8   WDir(Deg)                            2707 non-null    object 
 9   WSpd(m/s)                            2707 non-null    object 
 10  GustDir(Deg)                         2707 non-

In [12]:
# Print, for every column, how often each distinct value occurs.
for column_name, column_values in Raw_Ground_Data.items():
    print(column_name, column_values.value_counts())

Discharge_(m^3/s) Discharge_(m^3/s)
 0.000000    764
 1.000000    253
 9.505572     23
 9.505572     22
 7.300000     22
            ... 
 1.303593      1
 1.662160      1
 1.437162      1
-0.303540      1
 7.661111      1
Name: count, Length: 25310, dtype: int64
Lake_Height_(m) Lake_Height_(m)
371.2000    1500
371.4000    1480
389.9500    1415
389.9680    1140
371.3000    1097
            ... 
370.6380       1
370.6330       1
388.4745       1
370.6250       1
375.9370       1
Name: count, Length: 22223, dtype: int64
PercentFull_Active_Lake_Storage_(%) PercentFull_Active_Lake_Storage_(%)
1.709687     1497
2.018780     1478
88.708004    1414
88.864661    1132
1.863269     1094
             ... 
13.028279       1
13.039960       1
13.107125       1
13.101285       1
6.922409        1
Name: count, Length: 30937, dtype: int64
Snow_Volume_Opuha_Catchment_(mm) Snow_Volume_Opuha_Catchment_(mm)
0.0      2619
0.1       512
0.2       273
0.3       224
0.4       142
         ... 
268.0       1
2

In [13]:
# Drop every column whose non-NaN values are all placeholders ('-' or '*').
# A column with no non-NaN values at all also satisfies this test (the empty
# set is a subset of anything) and is dropped as well.
placeholder_only_columns = [
    col
    for col in Raw_Ground_Data.columns
    if set(Raw_Ground_Data[col].dropna().unique()).issubset({'-', '*'})
]

# Drop all matching columns in one call; dropping inside the loop would copy
# the whole frame once per removed column.
Raw_Ground_Data = Raw_Ground_Data.drop(columns=placeholder_only_columns)

In [14]:
# Check which columns remain after dropping the placeholder-only ones.
Raw_Ground_Data.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 782231 entries, 1981-04-02 00:00:00 to 2024-06-10 12:30:00
Data columns (total 21 columns):
 #   Column                               Non-Null Count   Dtype  
---  ------                               --------------   -----  
 0   Discharge_(m^3/s)                    28349 non-null   float64
 1   Lake_Height_(m)                      699906 non-null  float64
 2   PercentFull_Active_Lake_Storage_(%)  691883 non-null  float64
 3   Snow_Volume_Opuha_Catchment_(mm)     15766 non-null   float64
 4   Turbidity_Buoy_(NTU)                 217615 non-null  float64
 5   Turbidity_Platform_(NTU)             213917 non-null  float64
 6   Water_Temp_Buoy_(degC)               197546 non-null  float64
 7   Water_Temp_Platform_(degC)           214817 non-null  float64
 8   WDir(Deg)                            2707 non-null    object 
 9   WSpd(m/s)                            2707 non-null    object 
 10  GustDir(Deg)                         2707 non-

In [15]:
# Save the cleaned frame to CSV. The variable is still named Raw_Ground_Data,
# but at this point the placeholder-only columns have already been dropped.
Raw_Ground_Data.to_csv('Ground_Data.csv')

In [34]:
# Count the missing (NaN) entries in each remaining column.
Raw_Ground_Data.isna().sum()

Discharge_(m^3/s)                      753882
Lake_Height_(m)                         82325
PercentFull_Active_Lake_Storage_(%)     90348
Snow_Volume_Opuha_Catchment_(mm)       766465
Turbidity_Buoy_(NTU)                   564616
Turbidity_Platform_(NTU)               568314
Water_Temp_Buoy_(degC)                 584685
Water_Temp_Platform_(degC)             567414
WDir(Deg)                              779524
WSpd(m/s)                              779524
GustDir(Deg)                           779524
GustSpd(m/s)                           779524
WindRun(Km)                            779524
Rain(mm)                               779524
Tdry(C)                                779524
TWet(C)                                779524
RH(%)                                  779524
Tmax(C)                                779524
Tmin(C)                                779524
Pmsl(hPa)                              779524
Pstn(hPa)                              779524
dtype: int64