In [None]:
import os
import re
import shutil

# Folder containing the CSV files; the per-year folders are created inside it
source_folder = "gcoos_platform_water_temperature_csvs"

# Inclusive range of years a file may be filed under
FIRST_YEAR = 1995
LAST_YEAR = 2025


def _year_from_filename(filename, first_year=FIRST_YEAR, last_year=LAST_YEAR):
    """Return the year found in ``filename`` as a 'YYYY' string, or None.

    A standalone four-digit run (one that is not part of a longer number) is
    preferred, so four digits embedded in a longer number -- for instance a
    five-digit identifier -- are not mistaken for the year. If no standalone
    run falls inside the range, the plain substring scan (lowest year first)
    is used as a fallback.
    """
    for token in re.findall(r'(?<!\d)\d{4}(?!\d)', filename):
        if first_year <= int(token) <= last_year:
            return token

    # Fallback: year digits glued to other digits (e.g. a compact timestamp)
    for candidate in range(first_year, last_year + 1):
        if str(candidate) in filename:
            return str(candidate)
    return None


# Move every CSV that names a year into <source_folder>/<year>/
for filename in os.listdir(source_folder):
    if not filename.endswith('.csv'):
        continue

    year = _year_from_filename(filename)
    if year is None:
        # No recognizable year: the file stays where it is
        continue

    year_folder = os.path.join(source_folder, year)
    os.makedirs(year_folder, exist_ok=True)

    source_path = os.path.join(source_folder, filename)
    destination_path = os.path.join(year_folder, filename)
    shutil.move(source_path, destination_path)
    print(f"Moved {filename} to {year_folder}")

print("Files have been organized by year.")


In [None]:
def organize_files(source_folder):
    """Sort the CSV files of each year folder into ``<year>/<sensor>/<month>/``.

    Only sub-directories of ``source_folder`` with an all-digit name are
    processed. File names are parsed as
    ``...station-<sensor>_<YYYY>_<month>_sea...``; a file whose name does not
    fit that pattern is left in place and reported on stdout.

    Args:
        source_folder: Directory that holds the year-named sub-folders.
    """
    for year_folder in os.listdir(source_folder):
        year_path = os.path.join(source_folder, year_folder)

        # Skip anything that is not a directory with a numeric (year) name
        if not (os.path.isdir(year_path) and year_folder.isdigit()):
            continue

        for file_name in os.listdir(year_path):
            if not file_name.endswith('.csv'):
                continue

            try:
                # Sensor name sits between 'station-' and the next '_'
                sensor_start = file_name.index('station-') + len('station-')
                sensor_end = file_name.index('_', sensor_start)
                sensor_name = file_name[sensor_start:sensor_end]

                # Month sits between 'YYYY_' and '_sea'. The search must start
                # one past sensor_end: sensor_end is itself a '_', so searching
                # from it would stop immediately and yield 'YYYY_MM' instead.
                month_start = file_name.index('_', sensor_end + 1) + 1
                month_end = file_name.index('_sea', month_start)
                month = file_name[month_start:month_end]
            except ValueError:
                # str.index raises ValueError when a marker is missing
                print(f"Skipping file due to unexpected format: {file_name}")
                continue

            # Create <year>/<sensor>/<month>/ and move the file into it
            sensor_folder_path = os.path.join(year_path, sensor_name, month)
            os.makedirs(sensor_folder_path, exist_ok=True)

            src_file = os.path.join(year_path, file_name)
            dst_file = os.path.join(sensor_folder_path, file_name)
            shutil.move(src_file, dst_file)

# Sort the CSVs inside every year folder of source_folder into sensor/month sub-folders
organize_files(source_folder)


In [None]:
# Regroup <year>/<sensor>/[<subfolder>/]*.csv into <sensor>/<year>_<file>.csv
for year_folder in os.listdir(source_folder):
    year_path = os.path.join(source_folder, year_folder)

    # Only descend into directories with a numeric (year) name -- the same
    # check organize_files applies -- so unrelated folders are left untouched.
    if not (os.path.isdir(year_path) and year_folder.isdigit()):
        continue

    for sensor_folder in os.listdir(year_path):
        sensor_path = os.path.join(year_path, sensor_folder)
        if not os.path.isdir(sensor_path):
            continue

        # One destination folder per sensor, directly under source_folder
        sensor_destination_folder = os.path.join(source_folder, sensor_folder)
        os.makedirs(sensor_destination_folder, exist_ok=True)

        for subfolder in os.listdir(sensor_path):
            subfolder_path = os.path.join(sensor_path, subfolder)

            # Collect the CSVs to move: either those one level down (extra
            # folder layer) or the entry itself when it is already a CSV file.
            if os.path.isdir(subfolder_path):
                csv_paths = [
                    os.path.join(subfolder_path, name)
                    for name in os.listdir(subfolder_path)
                    if name.endswith('.csv')
                ]
            elif subfolder.endswith('.csv'):
                csv_paths = [subfolder_path]
            else:
                csv_paths = []

            for source_file_path in csv_paths:
                filename = os.path.basename(source_file_path)
                # Prefix the year so files from different years cannot collide
                new_filename = f"{year_folder}_{filename}"
                destination_file_path = os.path.join(sensor_destination_folder, new_filename)

                shutil.move(source_file_path, destination_file_path)
                print(f"Moved {filename} from {year_folder}/{sensor_folder} to {sensor_folder} folder")

print("All files have been reorganized by sensor.")


In [None]:
# Folder containing the CSV files (used as both source and destination).
# Same value as the assignment in the first cell; repeated here so this cell
# does not depend on that one having been run.
source_folder = "gcoos_platform_water_temperature_csvs"

def remove_empty_folders(path):
    """Delete every empty directory found beneath ``path``.

    The tree is walked bottom-up, so a directory that held nothing but empty
    sub-directories is itself empty by the time it is examined. ``path``
    itself is never removed.
    """
    for parent, child_dirs, _ in os.walk(path, topdown=False):
        for child in child_dirs:
            candidate = os.path.join(parent, child)
            print(f"Checking folder: {candidate}")  # Trace of every folder visited
            # Anything still holding an entry is left alone
            if os.listdir(candidate):
                continue
            os.rmdir(candidate)
            print(f"Removed empty folder: {candidate}")

# Remove every empty folder beneath source_folder, then report completion
remove_empty_folders(source_folder)
print("Finished checking folders.")


In [None]:
import pandas as pd

# Timestamp layout expected in the unprocessed files, e.g. 2020-01-31T12:00:00Z
RAW_DATE_FORMAT = '%Y-%m-%dT%H:%M:%SZ'


def _parse_dates(raw_dates):
    """Parse a column of date strings into timezone-naive datetimes.

    Values are first parsed strictly with RAW_DATE_FORMAT. Whatever fails is
    given a second, format-inferring pass. That second pass is what keeps this
    cell re-runnable: to_csv writes datetimes back as 'YYYY-MM-DD HH:MM:SS',
    which no longer matches RAW_DATE_FORMAT, so a strict-only parse would turn
    every row into NaT (and drop it) on the next run.
    """
    parsed = pd.to_datetime(raw_dates, format=RAW_DATE_FORMAT, errors='coerce')

    leftover = parsed.isna() & raw_dates.notna()
    if leftover.any():
        # utc=True + tz_localize(None) keeps the result timezone-naive, like
        # the strict pass, even if some strings carry an explicit offset.
        fallback = pd.to_datetime(raw_dates[leftover], errors='coerce', utc=True)
        parsed = parsed.where(~leftover, fallback.dt.tz_localize(None))
    return parsed


# Process the CSV files located directly in source_folder (sub-folders are not visited)
for filename in os.listdir(source_folder):
    if not filename.endswith('.csv'):
        continue
    file_path = os.path.join(source_folder, filename)

    df = pd.read_csv(file_path)

    # Leave files without a 'date' column untouched instead of aborting the loop
    if 'date' not in df.columns:
        print(f"Warning: {filename} does not contain a 'date' column.")
        continue

    df['date'] = _parse_dates(df['date'])

    # Drop only the rows whose date could not be parsed by either pass
    df = df.dropna(subset=['date'])

    # Save the file back to the same location
    df.to_csv(file_path, index=False)
    print(f"Converted 'date' column to datetime format in {filename}")

print("All date columns in the source folder have been converted to datetime format.")


In [None]:
# Build one combined, date-sorted CSV per sensor folder
for sensor_folder in os.listdir(source_folder):
    sensor_path = os.path.join(source_folder, sensor_folder)
    # Plain files at the top level are not sensor folders
    if not os.path.isdir(sensor_path):
        continue

    # Load every CSV found directly inside this sensor folder
    sensor_data = [
        pd.read_csv(os.path.join(sensor_path, csv_name))
        for csv_name in os.listdir(sensor_path)
        if csv_name.endswith('.csv')
    ]
    # Nothing to combine for a folder without CSV files
    if not sensor_data:
        continue

    # Stack the pieces and order the rows by the 'date' column
    combined_df = pd.concat(sensor_data, ignore_index=True).sort_values(by='date')

    # The combined file is written next to the sensor folders
    output_file_path = os.path.join(source_folder, f"{sensor_folder}_combined.csv")
    combined_df.to_csv(output_file_path, index=False)
    print(f"Combined data for {sensor_folder} saved to {output_file_path}")

print("All sensor data has been combined into continuous time series files.")


In [None]:
def delete_subfolders_and_csvs(path):
    """Strip ``path`` down to the files that sit directly inside it.

    Pass 1 deletes every ``.csv`` file located below the top level; top-level
    CSV files are reported and kept. Pass 2 removes all sub-directories,
    deepest first, together with anything they still contain.
    """
    # Pass 1: CSV files
    for current_dir, _subdirs, file_names in os.walk(path):
        at_top_level = current_dir == path
        for name in file_names:
            if not name.endswith('.csv'):
                continue
            if at_top_level:
                print(f"Keeping {name} in the root directory.")
                continue
            doomed_file = os.path.join(current_dir, name)
            os.remove(doomed_file)
            print(f"Deleted {doomed_file}")

    # Pass 2: directories, bottom-up
    for current_dir, subdirs, _file_names in os.walk(path, topdown=False):
        for sub in subdirs:
            doomed_dir = os.path.join(current_dir, sub)
            shutil.rmtree(doomed_dir)
            print(f"Deleted folder: {doomed_dir}")

# Destructive: deletes every sub-folder of source_folder together with all files
# inside it; only files located directly in source_folder are kept
delete_subfolders_and_csvs(source_folder)
print("All subfolders and non-root CSV files have been deleted.")


In [None]:
def clean_csv(file_path):
    """Normalize the column layout of one CSV file and rewrite it in place.

    Fixes applied when the relevant columns are present:
      * 'qcFlag.1' fills the gaps of 'qcFlag' and is then dropped
        (presumably the '.1' name comes from a repeated 'qcFlag' header).
      * 'owner' fills the gaps of 'network' (or becomes 'network' when that
        column is absent) and is then dropped.
      * 'sensor' fills the gaps of 'platform' (or becomes 'platform' when that
        column is absent) and is then dropped.

    The file is always written back, even when no fix was needed.

    Args:
        file_path: Path of the CSV file to clean.

    Returns:
        bool: True when at least one of the fixes above was applied.
    """
    df = pd.read_csv(file_path)
    issues_found = False

    if 'qcFlag.1' in df.columns:
        # Assign the result back instead of calling fillna(inplace=True) on
        # df['qcFlag']: an in-place call on a column selection is chained
        # assignment and does not update df under pandas Copy-on-Write.
        df['qcFlag'] = df['qcFlag'].fillna(df['qcFlag.1'])
        df = df.drop(columns=['qcFlag.1'])
        issues_found = True

    # (misplaced column, column it belongs to)
    for stray, target in (('owner', 'network'), ('sensor', 'platform')):
        if stray not in df.columns:
            continue
        if target in df.columns:
            # Keep existing target values; use the stray column only for gaps
            df[target] = df[target].combine_first(df[stray])
        else:
            df[target] = df[stray]
        df = df.drop(columns=[stray])
        issues_found = True

    # Save the cleaned file
    df.to_csv(file_path, index=False)
    return issues_found

# Clean every CSV file that sits directly inside source_folder
csv_names = [name for name in os.listdir(source_folder) if name.endswith('.csv')]
for csv_name in csv_names:
    clean_csv(os.path.join(source_folder, csv_name))

print("All files have been cleaned.")


In [None]:
import warnings


def clean_sea_water_temperature(source_folder, min_temp=10, max_temp=50):
    """Filter the 'sea_water_temperature' column of every CSV in a folder.

    For each ``.csv`` file directly inside ``source_folder`` the column is
    converted to numbers, rows whose value is missing, non-numeric or outside
    ``[min_temp, max_temp]`` are removed, and the file is rewritten in place.
    Files without the column are reported and left untouched. An error while
    handling one file is reported and does not stop the remaining files.

    Args:
        source_folder: Directory containing the CSV files.
        min_temp: Lowest value kept (inclusive), in the column's own units.
        max_temp: Highest value kept (inclusive), in the column's own units.
    """
    for csv_file in os.listdir(source_folder):
        if not csv_file.endswith(".csv"):
            continue
        csv_path = os.path.join(source_folder, csv_file)

        try:
            # Record warnings raised while loading so mixed-type columns can be reported
            with warnings.catch_warnings(record=True) as caught:
                warnings.simplefilter("always", pd.errors.DtypeWarning)
                df = pd.read_csv(csv_path)

            if any(isinstance(item.message, pd.errors.DtypeWarning) for item in caught):
                print(f"DtypeWarning in file: {csv_file}")

            if 'sea_water_temperature' not in df.columns:
                print(f"Warning: {csv_file} does not contain a 'sea_water_temperature' column.")
                continue

            # Coerce to numbers first: a column read as text (mixed types)
            # cannot be compared with the numeric bounds, and unparseable
            # entries become NaN so they are removed with the missing values.
            temps = pd.to_numeric(df['sea_water_temperature'], errors='coerce')
            df = df.assign(sea_water_temperature=temps)

            # between() is inclusive on both ends and False for NaN
            df = df[temps.between(min_temp, max_temp)]

            # Save the cleaned data back to the CSV
            df.to_csv(csv_path, index=False)

        except Exception as e:
            # Best effort per file: report and carry on with the next one
            print(f"Error processing file {csv_file}: {e}")

# Run the temperature clean-up on every CSV file directly inside source_folder
clean_sea_water_temperature(source_folder)
