# Working code

In [2]:
import pandas as pd
from datetime import timedelta
pd.set_option('display.max_columns', None)

# Load the data
sentinel_data_path = 'Sentinel2_Data.csv'
ground_data_path = 'Ground_Data.csv'

# Read the CSV files
sentinel_df = pd.read_csv(sentinel_data_path, parse_dates=['timestamp'])
ground_df = pd.read_csv(ground_data_path, parse_dates=['Timestamp (UTC+12:00)'], low_memory=False)

# Rename timestamp columns
sentinel_df.rename(columns={'timestamp': 'timestamp_sentinel2'}, inplace=True)
ground_df.rename(columns={'Timestamp (UTC+12:00)': 'timestamp_ground'}, inplace=True)

# Convert columns to appropriate data types
numeric_cols = [
    'Discharge_(m^3/s)', 'Lake_Height_(m)', 'PercentFull_Active_Lake_Storage_(%)',
    'Snow_Volume_Opuha_Catchment_(mm)', 'Turbidity_Buoy_(NTU)', 'Turbidity_Platform_(NTU)',
    'Water_Temp_Buoy_(degC)', 'Water_Temp_Platform_(degC)', 'WDir(Deg)', 'WSpd(m/s)',
    'GustDir(Deg)', 'GustSpd(m/s)', 'WindRun(Km)', 'Rain(mm)', 'Tdry(C)', 'TWet(C)',
    'RH(%)', 'Tmax(C)', 'Tmin(C)', 'Pmsl(hPa)', 'Pstn(hPa)'
]

for col in numeric_cols:
    ground_df[col] = pd.to_numeric(ground_df[col], errors='coerce')

# Separate ground data based on sampling intervals
three_hour_avg_cols = ['Discharge_(m^3/s)']
fifteen_min_avg_cols = [
    'Water_Temp_Buoy_(degC)', 'Water_Temp_Platform_(degC)',
    'Turbidity_Buoy_(NTU)', 'Turbidity_Platform_(NTU)'
]
daily_avg_cols = [
    'Lake_Height_(m)', 'PercentFull_Active_Lake_Storage_(%)', 'Snow_Volume_Opuha_Catchment_(mm)',
    'WDir(Deg)', 'WSpd(m/s)', 'GustDir(Deg)', 'GustSpd(m/s)', 'WindRun(Km)', 'Rain(mm)',
    'Tdry(C)', 'TWet(C)', 'RH(%)', 'Tmax(C)', 'Tmin(C)', 'Pmsl(hPa)', 'Pstn(hPa)'
]

# Function to find the closest ground measurement to a given Sentinel-2 timestamp and compute time difference
def find_closest_measurement_time(sentinel_time, ground_df, cols, time_window):
    start_time = sentinel_time - time_window
    end_time = sentinel_time + time_window
    filtered_ground_df = ground_df[(ground_df['timestamp_ground'] >= start_time) & (ground_df['timestamp_ground'] <= end_time)]
    if not filtered_ground_df.empty:
        closest_time = filtered_ground_df['timestamp_ground'].iloc[(filtered_ground_df['timestamp_ground'] - sentinel_time).abs().argsort()[:1]].iloc[0]
        time_diff = abs((closest_time - sentinel_time).total_seconds())  # Time difference in seconds
        closest_rows = filtered_ground_df[filtered_ground_df['timestamp_ground'] == closest_time][cols].iloc[0].to_dict()
        return closest_rows, time_diff
    else:
        return {col: None for col in cols}, None

# Define the time window for finer intervals (e.g., 1 hour)
time_window = timedelta(hours=3)

# Prepare a list to store merged data
merged_data = []

# Process each Sentinel-2 timestamp
for index, row in sentinel_df.iterrows():
    sentinel_time = row['timestamp_sentinel2']
    merged_row = row.to_dict()
    
    # Find closest measurements and time difference for three hour average and fifteen minute interval columns
    closest_three_hour, time_diff_three_hour = find_closest_measurement_time(sentinel_time, ground_df, three_hour_avg_cols, time_window)
    closest_fifteen_min, time_diff_fifteen_min = find_closest_measurement_time(sentinel_time, ground_df, fifteen_min_avg_cols, time_window)
    
    # Update the merged row with these measurements
    merged_row.update(closest_three_hour)
    merged_row.update(closest_fifteen_min)
    
    # Add the time difference to the merged row (use the time difference from the finer interval measurements)
    merged_row['Ground_Measurements_time_diff_(seconds)'] = time_diff_fifteen_min if time_diff_fifteen_min is not None else time_diff_three_hour
    
    merged_data.append(merged_row)

# Convert the merged data to a DataFrame
merged_df = pd.DataFrame(merged_data)

# Create date columns for merging daily averages
ground_df['date'] = ground_df['timestamp_ground'].dt.date
sentinel_df['date'] = sentinel_df['timestamp_sentinel2'].dt.date
merged_df['date'] = pd.to_datetime(merged_df['timestamp_sentinel2']).dt.date

# Calculate daily averages for relevant ground data
daily_avg_df = ground_df.groupby('date')[daily_avg_cols].mean().reset_index()

# Merge the daily averages with the Sentinel-2 data
merged_df = pd.merge(merged_df, daily_avg_df, on='date', how='left')

# Drop the 'date' column as it was only for merging purposes
merged_df.drop(columns=['date'], inplace=True)

# Save the merged dataframe to a CSV file
merged_df.to_csv('New_Merged_Data.csv', index=False)

print("Merging completed. The merged dataset is saved as 'New_Merged_Data.csv'")


Merging completed. The merged dataset is saved as 'New_Merged_Data.csv'


In [3]:

# Load the merged data
merged_data_path = 'New_Merged_Data.csv'
merged_df = pd.read_csv(merged_data_path)

# List of band prefixes to aggregate
bands = ['B2', 'B3', 'B4', 'B8', 'B8A', 'B11', 'B12']

# Compute averages for each band
for band in bands:
    band_cols = [col for col in merged_df.columns if f'_{band}' in col]
    merged_df[f'{band}_AVG'] = merged_df[band_cols].mean(axis=1)

# Compute averages for cs and cs_CDF
cs_cols = [col for col in merged_df.columns if '_cs' in col and '_cs_cdf' not in col]
cs_cdf_cols = [col for col in merged_df.columns if '_cs_cdf' in col]

merged_df['cs_AVG'] = merged_df[cs_cols].mean(axis=1)
merged_df['cs_cdf_AVG'] = merged_df[cs_cdf_cols].mean(axis=1)

# Save the updated dataframe to a new CSV file
updated_data_path = 'Updated_Merged_Data.csv'
merged_df.to_csv(updated_data_path, index=False)

print("Aggregation completed. The updated dataset is saved as 'Updated_Merged_Data.csv'")


Aggregation completed. The updated dataset is saved as 'Updated_Merged_Data.csv'


In [6]:
merged_df

Unnamed: 0,timestamp_sentinel2,point1_B2,point10_B2,point11_B2,point12_B2,point13_B2,point14_B2,point15_B2,point16_B2,point17_B2,...,Pstn(hPa),B2_AVG,B3_AVG,B4_AVG,B8_AVG,B8A_AVG,B11_AVG,B12_AVG,cs_AVG,cs_cdf_AVG
0,2018-12-14 10:38:15,2068.0,2138.0,2084.0,2288.0,2678.0,2924.0,3312.0,2958.0,2958.0,...,972.1,2527.909091,2646.818182,2544.090909,3888.590909,4010.454545,3389.363636,3013.727273,0.185740,0.230125
1,2018-12-16 10:28:16,8680.0,8400.0,8472.0,8424.0,8576.0,8504.0,8472.0,8400.0,8400.0,...,979.3,8394.545455,7825.454545,7425.636364,7479.363636,7411.090909,5478.409091,4051.136364,0.003922,0.043494
2,2018-12-19 10:38:13,6748.0,6884.0,6860.0,6808.0,6784.0,6732.0,6808.0,6860.0,6860.0,...,961.9,6819.818182,6948.000000,7082.363636,7510.954545,7456.636364,3911.636364,2789.227273,0.011765,0.057041
3,2018-12-21 10:28:20,2464.0,1506.0,1236.0,1282.0,2364.0,2460.0,2360.0,2216.0,2216.0,...,953.9,1979.000000,2141.090909,2041.818182,3568.590909,3731.272727,2775.863636,2461.590909,0.065597,0.169875
4,2018-12-26 10:28:18,11912.0,11776.0,11920.0,11920.0,11944.0,11984.0,11984.0,12080.0,12080.0,...,965.0,11913.090909,10787.636364,10084.363636,9416.772727,9212.454545,5895.136364,4535.409091,0.003922,0.043137
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
749,2024-06-15 10:38:25,467.0,230.0,264.0,287.0,433.0,364.0,360.0,350.0,350.0,...,,318.818182,332.818182,333.454545,558.750000,611.227273,539.500000,467.363636,0.348128,0.626381
750,2024-06-17 10:28:31,2852.0,2836.0,2908.0,2956.0,2880.0,2862.0,2862.0,2886.0,2886.0,...,,2876.909091,2878.727273,2971.090909,3414.340909,3319.772727,1259.045455,1353.500000,0.019964,0.090018
751,2024-06-20 10:38:27,1416.0,1222.0,1372.0,1396.0,1316.0,1360.0,1312.0,1260.0,1260.0,...,,1302.545455,1328.545455,1301.181818,1449.863636,1415.181818,1078.409091,967.500000,0.349554,0.597326
752,2024-06-22 10:28:27,1026.0,966.0,1094.0,1060.0,1086.0,1072.0,1034.0,966.0,966.0,...,,1006.863636,1022.954545,953.272727,1134.159091,1239.272727,1001.545455,967.318182,0.512834,0.785918


# Fix up for the discharge

In [32]:
import pandas as pd
from datetime import timedelta

# Set display options
pd.set_option('display.max_columns', None)

# Load the data
sentinel_data_path = 'Sentinel2_Data.csv'
ground_data_path = 'Ground_Data.csv'

# Read the CSV files
sentinel_df = pd.read_csv(sentinel_data_path, parse_dates=['timestamp'])
ground_df = pd.read_csv(ground_data_path, parse_dates=['Timestamp (UTC+12:00)'], low_memory=False)

# Rename timestamp columns
sentinel_df.rename(columns={'timestamp': 'timestamp_sentinel2'}, inplace=True)
ground_df.rename(columns={'Timestamp (UTC+12:00)': 'timestamp_ground'}, inplace=True)

# Convert columns to appropriate data types
numeric_cols = [
    'Discharge_(m^3/s)', 'Lake_Height_(m)', 'PercentFull_Active_Lake_Storage_(%)',
    'Snow_Volume_Opuha_Catchment_(mm)', 'Turbidity_Buoy_(NTU)', 'Turbidity_Platform_(NTU)',
    'Water_Temp_Buoy_(degC)', 'Water_Temp_Platform_(degC)', 'WDir(Deg)', 'WSpd(m/s)',
    'GustDir(Deg)', 'GustSpd(m/s)', 'WindRun(Km)', 'Rain(mm)', 'Tdry(C)', 'TWet(C)',
    'RH(%)', 'Tmax(C)', 'Tmin(C)', 'Pmsl(hPa)', 'Pstn(hPa)'
]

for col in numeric_cols:
    ground_df[col] = pd.to_numeric(ground_df[col], errors='coerce')

# Separate ground data based on sampling intervals
fifteen_min_avg_cols = [
    'Water_Temp_Buoy_(degC)', 'Water_Temp_Platform_(degC)',
    'Turbidity_Buoy_(NTU)', 'Turbidity_Platform_(NTU)'
]
daily_avg_cols = [
    'Lake_Height_(m)', 'PercentFull_Active_Lake_Storage_(%)', 'Snow_Volume_Opuha_Catchment_(mm)',
    'WDir(Deg)', 'WSpd(m/s)', 'GustDir(Deg)', 'GustSpd(m/s)', 'WindRun(Km)', 'Rain(mm)',
    'Tdry(C)', 'TWet(C)', 'RH(%)', 'Tmax(C)', 'Tmin(C)', 'Pmsl(hPa)', 'Pstn(hPa)'
]

# Function to find the closest ground measurement to a given Sentinel-2 timestamp and compute time difference
def find_closest_measurement_time(sentinel_time, ground_df, cols, time_window):
    start_time = sentinel_time - time_window
    end_time = sentinel_time + time_window
    filtered_ground_df = ground_df[(ground_df['timestamp_ground'] >= start_time) & (ground_df['timestamp_ground'] <= end_time)]
    if not filtered_ground_df.empty:
        closest_time = filtered_ground_df['timestamp_ground'].iloc[(filtered_ground_df['timestamp_ground'] - sentinel_time).abs().argsort()[:1]].iloc[0]
        time_diff = abs((closest_time - sentinel_time).total_seconds())  # Time difference in seconds
        closest_rows = filtered_ground_df[filtered_ground_df['timestamp_ground'] == closest_time][cols].iloc[0].to_dict()
        return closest_rows, time_diff
    else:
        return {col: None for col in cols}, None

# Define the time window for measurements
time_window = timedelta(hours=1)
discharge_time_window = timedelta(hours=12)

# Prepare a list to store merged data
merged_data = []

# Process each Sentinel-2 timestamp
for index, row in sentinel_df.iterrows():
    sentinel_time = row['timestamp_sentinel2']
    merged_row = row.to_dict()
    
    # Find closest measurements and time difference for fifteen minute interval columns (excluding discharge)
    closest_fifteen_min, time_diff_fifteen_min = find_closest_measurement_time(sentinel_time, ground_df, fifteen_min_avg_cols, time_window)
    
    # Update the merged row with these measurements
    merged_row.update(closest_fifteen_min)
    
    # Add the time difference to the merged row (use the time difference from the finer interval measurements)
    merged_row['Ground_Measurements_time_diff_(seconds)'] = time_diff_fifteen_min
    
    # Add the merged row to the list
    merged_data.append(merged_row)

# Convert the merged data to a DataFrame
merged_df = pd.DataFrame(merged_data)

# Find and add the closest discharge measurement within the 12-hour window
for index, row in merged_df.iterrows():
    sentinel_time = row['timestamp_sentinel2']
    closest_discharge, _ = find_closest_measurement_time(sentinel_time, ground_df, ['Discharge_(m^3/s)'], discharge_time_window)
    merged_df.at[index, 'Discharge_(m^3/s)'] = closest_discharge['Discharge_(m^3/s)']

# Create date columns for merging daily averages
ground_df['date'] = ground_df['timestamp_ground'].dt.date
sentinel_df['date'] = sentinel_df['timestamp_sentinel2'].dt.date
merged_df['date'] = pd.to_datetime(merged_df['timestamp_sentinel2']).dt.date

# Calculate daily averages for relevant ground data
daily_avg_df = ground_df.groupby('date')[daily_avg_cols].mean().reset_index()

# Merge the daily averages with the Sentinel-2 data
merged_df = pd.merge(merged_df, daily_avg_df, on='date', how='left')

# Drop the 'date' column as it was only for merging purposes
merged_df.drop(columns=['date'], inplace=True)

# Save the merged dataframe to a CSV file
merged_df.to_csv('Updated_Merged_Data_27_3.csv', index=False)

print("Merging completed. The merged dataset is saved as 'Updated_Merged_Data.csv'")


Merging completed. The merged dataset is saved as 'Updated_Merged_Data.csv'
