In [1]:
from sdv.single_table import CTGANSynthesizer
from sdv.metadata import SingleTableMetadata
import pandas as pd
from sdv.metadata import SingleTableMetadata
from sdv.sequential import PARSynthesizer
from sdv.single_table import GaussianCopulaSynthesizer
import numpy as np
from tabulate import tabulate

In [2]:
data = pd.read_csv('imputed.csv')

GaussianCopulaSynthesizer

In [3]:
# Step 2: Set Up Metadata
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)

# Explicitly set column types
metadata.update_column('Date/Time', sdtype='datetime')
metadata.update_column('Date/time end', sdtype='datetime')
metadata.update_column('Size fraction', sdtype='categorical')

stats_data = data.iloc[:, 5:]

for col in stats_data:
    if col in data.columns:
        metadata.update_column(column_name=col, sdtype='numerical')
    else:
        print(f"Warning: Column {col} not found in dataset.")

# Step 3: Initialize and Fit the Gaussian Copula Model
model = GaussianCopulaSynthesizer(metadata=metadata)
model.fit(data)

# Step 4: Generate Synthetic Data
synthetic_data = model.sample(num_rows=1000)

# Step 5: Save Synthetic Data
synthetic_data.to_csv('synthetic_data_gaussian_copula.csv', index=False)
print("Synthetic data generation complete. Saved to 'synthetic_data_gaussian_copula.csv'.")
print(synthetic_data.shape)

 Table Name   Column Name   sdtype datetime_format
     table     Date/Time datetime            None
     table Date/time end datetime            None
Without this specification, SDV may not be able to accurately parse the data. We recommend adding datetime formats using 'update_column'.


InvalidDataError: The provided data does not match the metadata:
Errors in table:
Error: Invalid values found for numerical column 'Size fraction': ['0.05 - 0.14 µm', '0.14 - 0.42 µm', '0.42 - 1.2 µm', '+ 2 more'].

In [None]:
data = pd.read_csv('data.csv')
#replacing non-detection values with 0 for original data
new_data = data.copy()

cols_new = new_data.columns[6:]

def replace_non_detect(series):
    series = series.astype(str).str.strip()  
    series = series.replace(to_replace=r'^<.*', value="0", regex=True) 
    return pd.to_numeric(series, errors='coerce') 

for col in cols_new:
    new_data[col] = replace_non_detect(new_data[col])

#mean and standard deviation  for original
mean_org = new_data.mean(numeric_only=True).round(3)
std_org  = new_data.std(numeric_only=True).round(3)


#mean and standard deviation  for cleaned data 
mean_sub = synthetic_data.mean(numeric_only=True).round(3)
std_sub  = synthetic_data.std(numeric_only=True).round(3)

# Combine into a single DataFrame
summary_df = pd.concat([mean_org, std_org, mean_sub, std_sub], axis=1)      \
               .reset_index()                      \
               .rename(columns={
                   'index': 'Column',
                   0: 'Mean (Original)',
                   1: 'Std (Original)',
                   2: 'Mean (cleaned)',
                   3: 'std (cleaned)'
               })

print(summary_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 629 entries, 0 to 628
Data columns (total 37 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          629 non-null    int64  
 1   No                  629 non-null    int64  
 2   Date/Time           629 non-null    object 
 3   Date/time end       629 non-null    object 
 4   Altitude [m]        629 non-null    int64  
 5   Size fraction       629 non-null    object 
 6   Mass v [µg/m**3]    629 non-null    float64
 7   Na+ [µg/m**3]       629 non-null    float64
 8   [NH4]+ [µg/m**3]    629 non-null    float64
 9   K+ [µg/m**3]        629 non-null    float64
 10  Mg2+ [µg/m**3]      629 non-null    float64
 11  Ca2+ [µg/m**3]      629 non-null    float64
 12  Cl- [µg/m**3]       629 non-null    float64
 13  [NO3]- [µg/m**3]    629 non-null    float64
 14  [SO4]2- [µg/m**3]   629 non-null    float64
 15  [C2O4]2- [µg/m**3]  629 non-null    float64
 16  Br- [µg/

RMSE and MAE

CTGAN

In [None]:
# Load the original dataset
data = pd.read_csv('imputed.csv')

# Convert date columns to datetime
data['Date/Time'] = pd.to_datetime(data['Date/Time'])
data['Date/time end'] = pd.to_datetime(data['Date/time end'])

# Compute duration in seconds
data['Duration'] = (data['Date/time end'] - data['Date/Time']).dt.total_seconds()

# Drop the end date column
data = data.drop(columns=['Date/time end'])

# Create and update metadata
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data)
metadata.update_column(column_name='Date/Time', sdtype='datetime')
metadata.update_column(column_name='Duration', sdtype='numerical')

# Initialize and train the CTGAN model
model = CTGANSynthesizer(metadata=metadata)
model.fit(data)

# Generate synthetic data
synthetic_data_ctgan = model.sample(num_rows=1000)

# Ensure duration is non-negative
synthetic_data_ctgan['Duration'] = synthetic_data_ctgan['Duration'].clip(lower=0)

# Compute "Date/time end" in synthetic data
synthetic_data_ctgan['Date/time end'] = synthetic_data_ctgan['Date/Time'] + pd.to_timedelta(synthetic_data_ctgan['Duration'], unit='s')

# Save the corrected synthetic data
synthetic_data_ctgan.to_csv('corrected_synthetic_data.csv', index=False)
print("Corrected synthetic data saved to 'corrected_synthetic_data.csv'.")



Corrected synthetic data saved to 'corrected_synthetic_data.csv'.


In [None]:
syn_data = pd.read_csv('corrected_synthetic_data.csv')
data = pd.read_csv('data.csv')
#replacing non-detection values with 0 for original data
new_data = data.copy()

cols_new = new_data.columns[6:]

def replace_non_detect(series):
    series = series.astype(str).str.strip()  
    series = series.replace(to_replace=r'^<.*', value="0", regex=True) 
    return pd.to_numeric(series, errors='coerce') 

for col in cols_new:
    new_data[col] = replace_non_detect(new_data[col])

new_data.info()
#mean and standard deviation  for original
mean_org = new_data.mean(numeric_only=True).round(3)
std_org  = new_data.std(numeric_only=True).round(3)


#mean and standard deviation  for cleaned data 
mean_new = syn_data.mean(numeric_only=True).round(3)
std_new  = syn_data.std(numeric_only=True).round(3)

# Combine into a single DataFrame
summary_df = pd.concat([mean_org, std_org, mean_new, std_new], axis=1)      \
               .reset_index()                      \
               .rename(columns={
                   'index': 'Column',
                   0: 'Mean (Original)',
                   1: 'Std (Original)',
                   2: 'Mean (cleaned)',
                   3: 'std (cleaned)'
               })

print(summary_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 629 entries, 0 to 628
Data columns (total 37 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          629 non-null    int64  
 1   No                  629 non-null    int64  
 2   Date/Time           629 non-null    object 
 3   Date/time end       629 non-null    object 
 4   Altitude [m]        629 non-null    int64  
 5   Size fraction       629 non-null    object 
 6   Mass v [µg/m**3]    629 non-null    float64
 7   Na+ [µg/m**3]       629 non-null    float64
 8   [NH4]+ [µg/m**3]    629 non-null    float64
 9   K+ [µg/m**3]        629 non-null    float64
 10  Mg2+ [µg/m**3]      629 non-null    float64
 11  Ca2+ [µg/m**3]      629 non-null    float64
 12  Cl- [µg/m**3]       629 non-null    float64
 13  [NO3]- [µg/m**3]    629 non-null    float64
 14  [SO4]2- [µg/m**3]   629 non-null    float64
 15  [C2O4]2- [µg/m**3]  629 non-null    float64
 16  Br- [µg/

Deepecho

In [None]:
import pandas as pd
from sdv.metadata import SingleTableMetadata

# Load the dataset
real_data = pd.read_csv('imputed.csv', parse_dates=['Date/Time', "Date/time end"])

# Create and detect metadata
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(real_data)

# Update column types if needed
metadata.update_column(column_name='Unnamed: 0', sdtype='id')

# Set sequence key
metadata.set_sequence_key('Unnamed: 0')

# Set sequence index to a single column, e.g., 'Date/Time'
metadata.set_sequence_index('Date/Time')  # Or 'Date/time end' if preferred

In [None]:
synthesizer = PARSynthesizer(metadata)
synthesizer.fit(real_data)
synthetic_data_deepecho = synthesizer.sample(num_sequences=1000)
synthetic_data_deepecho.to_csv('synthetic_data_deepecho.csv', index=False)

  .apply(lambda x: x[self._sequence_index].diff().bfill())


In [None]:
synthetic_data_deepecho.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 38 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Unnamed: 0.1        1000 non-null   int64         
 1   Unnamed: 0          1000 non-null   int64         
 2   No                  1000 non-null   int64         
 3   Date/Time           1000 non-null   datetime64[ns]
 4   Date/time end       1000 non-null   datetime64[ns]
 5   Altitude [m]        1000 non-null   int64         
 6   Size fraction       1000 non-null   object        
 7   Mass v [µg/m**3]    1000 non-null   float64       
 8   Na+ [µg/m**3]       1000 non-null   float64       
 9   [NH4]+ [µg/m**3]    1000 non-null   float64       
 10  K+ [µg/m**3]        1000 non-null   float64       
 11  Mg2+ [µg/m**3]      1000 non-null   float64       
 12  Ca2+ [µg/m**3]      1000 non-null   float64       
 13  Cl- [µg/m**3]       1000 non-null   float64      

In [None]:
syn_data = pd.read_csv('synthetic_data_deepecho.csv')
data = pd.read_csv('data.csv')
#replacing non-detection values with 0 for original data
new_data = data.copy()

cols_new = new_data.columns[6:]

def replace_non_detect(series):
    series = series.astype(str).str.strip()  
    series = series.replace(to_replace=r'^<.*', value="0", regex=True) 
    return pd.to_numeric(series, errors='coerce') 

for col in cols_new:
    new_data[col] = replace_non_detect(new_data[col])

new_data.info()
#mean and standard deviation  for original
mean_org = new_data.mean(numeric_only=True).round(3)
std_org  = new_data.std(numeric_only=True).round(3)


#mean and standard deviation  for cleaned data 
mean_new = syn_data.mean(numeric_only=True).round(3)
std_new  = syn_data.std(numeric_only=True).round(3)

# Combine into a single DataFrame
summary_df = pd.concat([mean_org, std_org, mean_new, std_new], axis=1)      \
               .reset_index()                      \
               .rename(columns={
                   'index': 'Column',
                   0: 'Mean (Original)',
                   1: 'Std (Original)',
                   2: 'Mean (cleaned)',
                   3: 'std (cleaned)'
               })

print(summary_df)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 629 entries, 0 to 628
Data columns (total 37 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Unnamed: 0          629 non-null    int64  
 1   No                  629 non-null    int64  
 2   Date/Time           629 non-null    object 
 3   Date/time end       629 non-null    object 
 4   Altitude [m]        629 non-null    int64  
 5   Size fraction       629 non-null    object 
 6   Mass v [µg/m**3]    629 non-null    float64
 7   Na+ [µg/m**3]       629 non-null    float64
 8   [NH4]+ [µg/m**3]    629 non-null    float64
 9   K+ [µg/m**3]        629 non-null    float64
 10  Mg2+ [µg/m**3]      629 non-null    float64
 11  Ca2+ [µg/m**3]      629 non-null    float64
 12  Cl- [µg/m**3]       629 non-null    float64
 13  [NO3]- [µg/m**3]    629 non-null    float64
 14  [SO4]2- [µg/m**3]   629 non-null    float64
 15  [C2O4]2- [µg/m**3]  629 non-null    float64
 16  Br- [µg/

In [None]:
syn_data = pd.read_csv('synthetic_data_deepecho.csv')
data = pd.read_csv('data.csv')
#replacing non-detection values with 0 for original data
new_data = data.copy()

cols_new = new_data.columns[6:]

results = []

# Calculate RMSE and MAE for each column
for col in cols_new:
    # Get column values
    values = new_data[col].values
    
    # Calculate mean of the column as the reference value
    mean_value = np.mean(values)
    
    # Calculate MAE: mean of absolute differences from the mean
    mae = np.mean(np.abs(values - mean_value))
    
    # Calculate RMSE: square root of mean of squared differences from the mean
    rmse = np.sqrt(np.mean((values - mean_value) ** 2))
    
    # Determine unit based on column name
    unit = 'µg/m**3' if '[µg/m**3]' in col else 'ng/m**3'
    
    # Append results
    results.append([col, f'{rmse:.4f}', f'{mae:.4f}', unit])

def replace_non_detect(series):
    series = series.astype(str).str.strip()  
    series = series.replace(to_replace=r'^<.*', value="0", regex=True) 
    return pd.to_numeric(series, errors='coerce') 

for col in cols_new:
    new_data[col] = replace_non_detect(new_data[col])

new_data.info()
#mean and standard deviation  for original
mean_org = new_data.mean(numeric_only=True).round(3)
std_org  = new_data.std(numeric_only=True).round(3)


#mean and standard deviation  for cleaned data 
mean_deep = syn_data_deepecho.mean(numeric_only=True).round(3)
std_deep  = syn_data_deepecho.std(numeric_only=True).round(3)

# Combine into a single DataFrame
summary_df_deep = pd.concat([mean_org, std_org, mean_deep,std_deep], axis=1)      \
               .reset_index()                      \
               .rename(columns={
                   'index': 'Column',
                   0: 'Mean (Original)',
                   1: 'Std (Original)',
                   2: 'Mean (cleaned)',
                   3: 'std (cleaned)',
               })

print(summary_df_deep)

NameError: name 'synthetic_data_deepecho' is not defined

In [None]:
import pandas as pd
import numpy as np

def compute_rmse_mae(original_df, synthetic_df, start_column_index=5, merge_keys=['Date/Time', 'Size fraction']):
    """
    Compute RMSE and MAE for each numerical column in the synthetic dataset compared to the original dataset,
    starting from the specified column index, after merging on specified keys.

    Parameters:
    - original_df (pd.DataFrame): The original dataset.
    - synthetic_df (pd.DataFrame): The synthetic dataset.
    - start_column_index (int): The index of the first column to compare (0-based indexing, default=5).
    - merge_keys (list): Columns to merge the datasets on (default=['Date/Time', 'Size fraction']).

    Returns:
    - pd.DataFrame: A DataFrame with RMSE and MAE for each numerical column.
    """
    # Merge datasets on specified keys
    merged_df = pd.merge(original_df, synthetic_df, on=merge_keys, suffixes=('_original', '_synthetic'), how='inner')
    if merged_df.empty:
        raise ValueError("No matching rows after merging on specified keys.")

    # Select numerical columns starting from the specified index in original_df
    all_cols = original_df.columns[start_column_index:]
    exclude_cols = ['Size fraction', 'Date/time end']  # Non-numerical columns to exclude
    cols_to_use = [col for col in all_cols if col not in exclude_cols and pd.api.types.is_numeric_dtype(original_df[col])]

    # Initialize results dictionary
    results = {}

    # Compute RMSE and MAE for each column
    for col in cols_to_use:
        original_col = f"{col}_original"
        synthetic_col = f"{col}_synthetic"
        if original_col not in merged_df.columns or synthetic_col not in merged_df.columns:
            continue
        original_values = merged_df[original_col].values
        synthetic_values = merged_df[synthetic_col].values

        # Handle NaN values
        mask = ~np.isnan(original_values) & ~np.isnan(synthetic_values)
        original_values = original_values[mask]
        synthetic_values = synthetic_values[mask]

        if len(original_values) == 0:
            results[col] = {'RMSE': np.nan, 'MAE': np.nan}
            continue

        # Compute RMSE and MAE
        rmse = np.sqrt(np.mean((original_values - synthetic_values) ** 2))
        mae = np.mean(np.abs(original_values - synthetic_values))

        results[col] = {'RMSE': rmse, 'MAE': mae}

    # Convert results to DataFrame
    return pd.DataFrame(results).T

# Example usage
if __name__ == "__main__":
    # Load datasets (replace 'original_data.csv' with your actual file)
    synthetic_df = pd.read_csv('corrected_synthetic_data.csv')
    original_df = pd.read_csv('imputed.csv')  # Replace with your original dataset file

    # Compute RMSE and MAE starting from the 6th column (index 5)
    results = compute_rmse_mae(original_df, synthetic_df, start_column_index=5)
    print("\nRMSE and MAE for each numerical column starting from column 6:")
    print(results)


RMSE and MAE for each numerical column starting from column 6:
                           RMSE         MAE
Altitude [m]           0.000000    0.000000
Mass v [µg/m**3]      39.229466   28.151763
Na+ [µg/m**3]          3.076338    1.864696
[NH4]+ [µg/m**3]       0.035825    0.019564
K+ [µg/m**3]           0.127395    0.081489
Mg2+ [µg/m**3]         0.702220    0.478238
Ca2+ [µg/m**3]         0.503572    0.337298
Cl- [µg/m**3]          5.401331    3.252461
[NO3]- [µg/m**3]       0.336604    0.237465
[SO4]2- [µg/m**3]      0.953144    0.613610
[C2O4]2- [µg/m**3]     0.033466    0.021987
Br- [µg/m**3]          0.008861    0.004604
C org [µg/m**3]        0.562278    0.305631
EC [µg/m**3]           0.127553    0.069084
TC [µg/m**3]           0.311811    0.192546
Ca [ng/m**3]        1271.520330  594.301365
Ti [ng/m**3]         155.119878   61.549907
V [ng/m**3]            2.483974    0.774339
Cr [ng/m**3]           1.051205    0.548924
Mn [ng/m**3]          50.031910   17.917408
Fe [ng/m**3]