# Working code: merge Sentinel-2 samples with ground measurements, then average the bands

In [1]:
import pandas as pd
from datetime import timedelta

# Input files: the Sentinel-2 samples and the ground-station log.
sentinel_data_path = 'Sentinel2_Data.csv'
ground_data_path = 'Ground_Data.csv'

# Ground columns grouped by the sampling interval their list name refers to.
three_hour_avg_cols = ['Discharge_(m^3/s)']
fifteen_min_avg_cols = [
    'Water_Temp_Buoy_(degC)', 'Water_Temp_Platform_(degC)',
    'Turbidity_Buoy_(NTU)', 'Turbidity_Platform_(NTU)'
]
daily_avg_cols = [
    'Lake_Height_(m)', 'PercentFull_Active_Lake_Storage_(%)', 'Snow_Volume_Opuha_Catchment_(mm)',
    'WDir(Deg)', 'WSpd(m/s)', 'GustDir(Deg)', 'GustSpd(m/s)', 'WindRun(Km)', 'Rain(mm)',
    'Tdry(C)', 'TWet(C)', 'RH(%)', 'Tmax(C)', 'Tmin(C)', 'Pmsl(hPa)', 'Pstn(hPa)'
]

# Every ground column that has to be numeric before it can be matched or averaged.
numeric_cols = [
    'Discharge_(m^3/s)', 'Lake_Height_(m)', 'PercentFull_Active_Lake_Storage_(%)',
    'Snow_Volume_Opuha_Catchment_(mm)', 'Turbidity_Buoy_(NTU)', 'Turbidity_Platform_(NTU)',
    'Water_Temp_Buoy_(degC)', 'Water_Temp_Platform_(degC)', 'WDir(Deg)', 'WSpd(m/s)',
    'GustDir(Deg)', 'GustSpd(m/s)', 'WindRun(Km)', 'Rain(mm)', 'Tdry(C)', 'TWet(C)',
    'RH(%)', 'Tmax(C)', 'Tmin(C)', 'Pmsl(hPa)', 'Pstn(hPa)'
]

# Read each CSV with its timestamp column parsed, then give the two timestamp
# columns distinct names so they can coexist in the merged table.
# NOTE(review): the ground column is labelled UTC+12:00 and no timezone
# conversion is applied anywhere in this notebook -- confirm that the
# Sentinel-2 timestamps are expressed on the same clock.
sentinel_df = pd.read_csv(
    sentinel_data_path, parse_dates=['timestamp']
).rename(columns={'timestamp': 'timestamp_sentinel2'})
ground_df = pd.read_csv(
    ground_data_path, parse_dates=['Timestamp (UTC+12:00)'], low_memory=False
).rename(columns={'Timestamp (UTC+12:00)': 'timestamp_ground'})

# Coerce the measurement columns to numbers; entries that cannot be parsed become NaN.
ground_df = ground_df.assign(**{
    name: pd.to_numeric(ground_df[name], errors='coerce') for name in numeric_cols
})

def find_closest_measurement_time(sentinel_time, ground_df, cols, time_window):
    """Return the ground reading nearest in time to a Sentinel-2 timestamp.

    Only rows whose ``timestamp_ground`` lies within ``time_window`` of
    ``sentinel_time`` (inclusive on both sides) AND that hold a non-null value
    in at least one of ``cols`` are considered. Without the second condition
    the nearest row by timestamp alone would be chosen even when it carries no
    reading for the requested columns, yielding NaN although a real reading
    exists inside the window.

    Parameters
    ----------
    sentinel_time : timestamp
        Reference time; compared as-is against ``ground_df['timestamp_ground']``.
    ground_df : pandas.DataFrame
        Must contain ``timestamp_ground`` and every column named in ``cols``.
    cols : list of str
        Columns to extract from the selected row.
    time_window : timedelta
        Maximum allowed distance on either side of ``sentinel_time``.

    Returns
    -------
    tuple
        ``(values, time_diff)`` where ``values`` maps each column in ``cols``
        to the selected row's value and ``time_diff`` is the absolute distance
        in seconds. When no usable row exists the result is
        ``({col: None for col in cols}, None)``.
    """
    stamps = ground_df['timestamp_ground']
    in_window = (stamps >= sentinel_time - time_window) & (stamps <= sentinel_time + time_window)
    candidates = ground_df[in_window]

    # Keep only rows that actually carry a measurement for the requested
    # columns. Skipped for an empty column list so every in-window row stays
    # eligible, as before.
    if len(cols) > 0:
        candidates = candidates.dropna(subset=cols, how='all')

    if candidates.empty:
        return {col: None for col in cols}, None

    # Positional argmin picks the first row at the minimum distance and is
    # unaffected by duplicate index labels.
    offsets = (candidates['timestamp_ground'] - sentinel_time).abs()
    nearest_pos = int(offsets.to_numpy().argmin())
    time_diff = offsets.iloc[nearest_pos].total_seconds()  # seconds, always >= 0
    closest_values = candidates[cols].iloc[nearest_pos].to_dict()
    return closest_values, time_diff

# Maximum distance between a Sentinel-2 timestamp and an accepted ground reading.
time_window = timedelta(hours=1)

# One dict per Sentinel-2 row: the row itself, the matched ground readings,
# and the distance in seconds to the matched reading.
merged_data = []
for _, sentinel_row in sentinel_df.iterrows():
    acquired_at = sentinel_row['timestamp_sentinel2']

    discharge_values, discharge_gap = find_closest_measurement_time(
        acquired_at, ground_df, three_hour_avg_cols, time_window
    )
    fine_values, fine_gap = find_closest_measurement_time(
        acquired_at, ground_df, fifteen_min_avg_cols, time_window
    )

    # Prefer the gap reported for the fifteen-minute columns; fall back to the
    # three-hour one only when the former is missing.
    if fine_gap is None:
        reported_gap = discharge_gap
    else:
        reported_gap = fine_gap

    merged_data.append({
        **sentinel_row.to_dict(),
        **discharge_values,
        **fine_values,
        'Ground_Measurements_time_diff_(seconds)': reported_gap,
    })

merged_df = pd.DataFrame(merged_data)

# Calendar-date keys used to join the daily means onto each Sentinel-2 row.
ground_df['date'] = ground_df['timestamp_ground'].dt.date
sentinel_df['date'] = sentinel_df['timestamp_sentinel2'].dt.date
merged_df['date'] = pd.to_datetime(merged_df['timestamp_sentinel2']).dt.date

# Mean of every daily column per calendar date of the ground log.
daily_avg_df = ground_df.groupby('date')[daily_avg_cols].mean().reset_index()

# Left join keeps every Sentinel-2 row; the helper 'date' key is dropped afterwards.
merged_df = merged_df.merge(daily_avg_df, how='left', on='date').drop(columns=['date'])

# Persist the result for the next cell.
merged_df.to_csv('New_Merged_Data.csv', index=False)

print("Merging completed. The merged dataset is saved as 'New_Merged_Data.csv'")


Merging completed. The merged dataset is saved as 'New_Merged_Data.csv'


In [2]:

# Load the merged data written by the previous cell
merged_data_path = 'New_Merged_Data.csv'
merged_df = pd.read_csv(merged_data_path)

# List of band prefixes to aggregate
bands = ['B2', 'B3', 'B4', 'B8', 'B8A', 'B11', 'B12']

# Column names as loaded, so the *_AVG columns added below are never
# considered as inputs to a later average.
source_cols = merged_df.columns

# Row-wise mean of all columns belonging to each band.
for band in bands:
    # The band name must follow an underscore and must NOT be followed by
    # another letter or digit. A plain substring test ('_B8' in col) would
    # also select every '..._B8A' column and contaminate B8_AVG.
    is_band_col = source_cols.str.contains(rf'_{band}(?![0-9A-Za-z])', regex=True)
    band_cols = list(source_cols[is_band_col])
    merged_df[f'{band}_AVG'] = merged_df[band_cols].mean(axis=1)

# Row-wise means for the cs and cs_cdf columns; '_cs_cdf' columns are excluded
# from the plain cs group so the two averages do not overlap.
cs_cols = [col for col in source_cols if '_cs' in col and '_cs_cdf' not in col]
cs_cdf_cols = [col for col in source_cols if '_cs_cdf' in col]

merged_df['cs_AVG'] = merged_df[cs_cols].mean(axis=1)
merged_df['cs_cdf_AVG'] = merged_df[cs_cdf_cols].mean(axis=1)

# Save the updated dataframe to a new CSV file
updated_data_path = 'Updated_Merged_Data.csv'
merged_df.to_csv(updated_data_path, index=False)

print("Aggregation completed. The updated dataset is saved as 'Updated_Merged_Data.csv'")


Aggregation completed. The updated dataset is saved as 'Updated_Merged_Data.csv'
