# Generation of Synthetic Water Quality Data
An LLM (ChatGPT) was used while writing this code to help overcome some roadblocks.

### Install Libraries

In [12]:
#!pip install numpy
#!pip install pandas

### Import Libraries

In [1]:
import numpy as np
import pandas as pd

## Define Sample Sites

In [2]:
# Sample sites, each labelled with the event that affects it ('none' means no
# event is applied). Insertion order is significant: the generation loop walks
# this dict in order, so reordering sites changes which random draws they get.
locations = {
    **dict.fromkeys(('Sanibel Island', 'Boca Grande', 'Venice'), 'mine_leak'),
    'Siesta Key': 'none',
    **dict.fromkeys(('St. Pete Beach', 'Indian Shores', 'Clearwater Beach'), 'sewage_spill'),
}

#### Events
* **mine_leak:** A phosphate mine leaked wastewater into the Gulf of Mexico off Florida.
* **sewage_spill:** After a hurricane, sewage spilled into the Gulf of Mexico off Florida.
* **none:** No event affects the site.

## Define Water Quality Parameters

In [3]:
# Generation settings for every measured parameter. Each tuple is unpacked as
#   (baseline, variation, min_value, max_value, small_min, small_max)
# Insertion order is significant: values are drawn parameter by parameter in
# this order, so reordering entries changes the generated random sequence.
parameters = {
    #                             baseline  variation  min    max      small_min  small_max
    'Water Temperature':         (26,       2,         22,    32,      0,         0),
    'pH':                        (8.1,      0.3,       7.5,   9.5,     0,         0),
    'Salinity':                  (32,       3,         20,    37,      1,         2),
    'Dissolved Oxygen':          (6.5,      1,         3.0,   9.0,     1,         3),
    'Nitrate':                   (0.05,     0.02,      0.01,  0.1,     0.01,      0.03),
    'Phosphate':                 (0.03,     0.01,      0.01,  0.08,    0.01,      0.03),
    'Silicate':                  (2.5,      0.5,       1.0,   3.5,     0.5,       1),
    'Chlorophyll-a':             (3.0,      2.0,       0.5,   15.0,    0.5,       1),
    'Karenia brevis':            (1000,     5000,      0,     200000,  100,       500),
    'Biochemical Oxygen Demand': (2,        1,         1,     5,       0.1,       0.2),
    'Escherichia coli':          (20,       10,        0,     100,     0,         0),
    'Enterococcus':              (15,       8,         0,     80,      1,         5),
}

#### Key
**'parameter [unit]': (baseline value, variation, min_value, max_value, small_min, small_max)**
* **baseline value:** realistic median value for the parameter
* **variation:** maximum fluctuation (±) around the baseline value
* **min_value:** lower limit of realistic values
* **max_value:** upper limit of realistic values
* **small_min:** lower limit of the additional noise term added to each value
* **small_max:** upper limit of the additional noise term added to each value

## Generate Synthetic Data

In [4]:
# Set random seed for reproducibility.
# This seeds NumPy's global random state, which the np.random.uniform calls
# below draw from; the generated values therefore depend on the order in which
# those calls are made.
np.random.seed(42)

# Function to generate synthetic values with small variation
def synthetic_values(base, variation, min_value, max_value, small_min, small_max):
    """Draw one synthetic reading that always lies within [min_value, max_value].

    The reading is ``base`` plus a uniform fluctuation in
    ``[-variation, variation]``, limited to the realistic range, plus a
    uniform noise term in ``[small_min, small_max]``.

    Args:
        base: Baseline value of the parameter.
        variation: Half-width of the uniform fluctuation around ``base``.
        min_value: Lowest value the function may return.
        max_value: Highest value the function may return.
        small_min: Lower bound of the additive noise term.
        small_max: Upper bound of the additive noise term.

    Returns:
        The reading rounded to 3 decimal places.

    Note:
        Exactly two draws are taken from NumPy's global random state
        (fluctuation first, then noise), so seeded sequences keep their order.
    """
    def clamp(x):
        return max(min(x, max_value), min_value)

    fluctuated = round(clamp(base + np.random.uniform(-variation, variation)), 3)
    noise = round(np.random.uniform(small_min, small_max), 3)

    # Clamp again after adding the noise: previously the noise was added after
    # the only clamp, so the result could leave the realistic range (e.g. a
    # value at max_value plus positive noise exceeded max_value).
    return round(clamp(fluctuated + noise), 3)

# Event timeline and affected parameters (hoisted so they are built only once)
_MINE_LEAK_START = pd.Timestamp("2024-02-05")
_MINE_LEAK_END = pd.Timestamp("2024-05-01")
_SEWAGE_SPILL_DATE = pd.Timestamp("2024-09-26")
_MINE_LEAK_RAISED = ('Phosphate', 'Chlorophyll-a')
_SEWAGE_RAISED = ('Phosphate', 'Nitrate', 'Escherichia coli', 'Enterococcus',
                  'Biochemical Oxygen Demand')


# Function to apply event impact on parameters
def event(location, date, param, base_value, site_events=None):
    """Scale ``base_value`` according to the event assigned to ``location``.

    mine_leak (from 2024-02-05 up to, but excluding, 2024-05-01):
        ``decay_factor`` falls linearly from 1 on the start date towards 0 at
        the end date. Phosphate and Chlorophyll-a are multiplied by
        ``1 + decay_factor``; Silicate by ``1 - decay_factor``.
    sewage_spill (from 2024-09-26 onwards):
        On the spill date Phosphate, Nitrate, Escherichia coli, Enterococcus
        and Biochemical Oxygen Demand are multiplied by 5 and Dissolved Oxygen
        by 0.5. On later days ``decay_factor = exp(-days / 30)``; the raised
        parameters are multiplied by ``1 + decay_factor`` and Dissolved Oxygen
        by ``1 - decay_factor``.

    Any other event label, date or parameter leaves the value untouched.

    Args:
        location: Site name; must be a key of ``site_events``.
        date: ``pd.Timestamp`` of the reading.
        param: Parameter name.
        base_value: Value before the event is applied.
        site_events: Optional mapping of site name to event label. Defaults to
            the module-level ``locations`` dict.

    Returns:
        The adjusted value, or ``base_value`` unchanged when no rule applies.

    Raises:
        KeyError: If ``location`` is not in the site mapping.
    """
    if site_events is None:
        site_events = locations
    event_type = site_events[location]

    if event_type == 'mine_leak':
        # The lower bound is required: without it, dates before the leak gave
        # decay_factor > 1, i.e. negative Silicate and over-inflated Phosphate.
        if _MINE_LEAK_START <= date < _MINE_LEAK_END:
            days_since_start = (date - _MINE_LEAK_START).days
            total_days = (_MINE_LEAK_END - _MINE_LEAK_START).days
            decay_factor = 1 - (days_since_start / total_days)

            if param in _MINE_LEAK_RAISED:
                return base_value * (1 + decay_factor)
            if param == 'Silicate':
                return base_value * (1 - decay_factor)

    elif event_type == 'sewage_spill':
        if date >= _SEWAGE_SPILL_DATE:
            days_since_event = (date - _SEWAGE_SPILL_DATE).days

            if days_since_event == 0:
                if param in _SEWAGE_RAISED:
                    return base_value * 5
                if param == 'Dissolved Oxygen':
                    return base_value * 0.5
            else:
                # NOTE(review): the multipliers jump between day 0 and day 1
                # (raised parameters 5 -> ~1.967, Dissolved Oxygen 0.5 ->
                # ~0.033). Left as-is; confirm this is the intended model.
                decay_factor = np.exp(-days_since_event / 30)
                if param in _SEWAGE_RAISED:
                    return base_value * (1 + decay_factor)
                if param == 'Dissolved Oxygen':
                    return base_value * (1 - decay_factor)

    return base_value

# Sampling calendar: 365 consecutive daily timestamps, the last one being 2025-02-04
end_date = pd.Timestamp('2025-02-04')
date_range = pd.date_range(end=end_date, periods=365)

# Long-format records: one [date, time, site, parameter, value] row per reading
data = []
time_str = '12:00:00'  # every reading carries the same time of day

# Site -> day -> parameter. This nesting fixes the order of the random draws,
# so it must not be rearranged if seeded results are to stay the same.
for location in locations:
    for date in date_range:
        date_str = date.strftime('%Y-%m-%d')
        for param, values in parameters.items():
            base_value = synthetic_values(*values)
            adjusted_value = event(location, date, param, base_value)
            data.append([date_str, time_str, location, param, round(adjusted_value, 3)])

# Convert to DataFrame (the ID columns still hold names at this point)
synthetic_data = pd.DataFrame(
    data,
    columns=['Date', 'Time', 'Location ID', 'Parameter ID', 'Value'],
)

# Normalization through mapping: fixed integer IDs for every site and parameter
location_mapping = {
    'Sanibel Island': 1,
    'Boca Grande': 2,
    'Venice': 3,
    'Siesta Key': 4,
    'St. Pete Beach': 5,
    'Indian Shores': 6,
    'Clearwater Beach': 7
}

parameter_mapping = {
    'Water Temperature': 1,
    'pH': 2,
    'Salinity': 3,
    'Dissolved Oxygen': 4,
    'Nitrate': 5,
    'Phosphate': 6,
    'Silicate': 7,
    'Chlorophyll-a': 8,
    'Karenia brevis': 9,
    'Biochemical Oxygen Demand': 10,
    'Escherichia coli': 11,
    'Enterococcus': 12
}

# Series.map turns names that are missing from the mapping into NaN, which
# would silently write blank IDs to the CSV. Fail loudly instead, so a site or
# parameter that is generated but has no ID is caught before the file is saved.
for column, mapping in (('Location ID', location_mapping),
                        ('Parameter ID', parameter_mapping)):
    unmapped = sorted(set(synthetic_data[column]) - set(mapping))
    if unmapped:
        raise ValueError(
            "No ID defined for {} value(s): {}".format(column, unmapped)
        )

# Replace location and parameter names with IDs
synthetic_data['Location ID'] = synthetic_data['Location ID'].map(location_mapping)
synthetic_data['Parameter ID'] = synthetic_data['Parameter ID'].map(parameter_mapping)

# Save to CSV (written to the current working directory, without the index)
synthetic_data.to_csv('FactData.csv', index=False)

# Print sample data
print(synthetic_data.head(25))

          Date      Time  Location ID  Parameter ID    Value
0   2024-02-06  12:00:00            1             1   25.498
1   2024-02-06  12:00:00            1             2    8.239
2   2024-02-06  12:00:00            1             3   31.092
3   2024-02-06  12:00:00            1             4    8.348
4   2024-02-06  12:00:00            1             5    0.078
5   2024-02-06  12:00:00            1             6    0.097
6   2024-02-06  12:00:00            1             7    0.040
7   2024-02-06  12:00:00            1             8    4.611
8   2024-02-06  12:00:00            1             9  309.903
9   2024-02-06  12:00:00            1            10    1.993
10  2024-02-06  12:00:00            1            11   22.237
11  2024-02-06  12:00:00            1            12   14.139
12  2024-02-07  12:00:00            1             1   25.824
13  2024-02-07  12:00:00            1             2    7.920
14  2024-02-07  12:00:00            1             3   33.600
15  2024-02-07  12:00:00