# Installing Necessary Libraries


In [2]:
!pip install -r requirements.txt



In [4]:
import platform

# Show which Python interpreter the kernel runs on, so the recorded outputs
# below can be tied to a concrete version.
interpreter_version = platform.python_version()
print(interpreter_version)

3.11.9



# Loading The Data

In [3]:
import pandas as pd

# Load the real training data from CSV into a DataFrame.
# The relative path uses forward slashes so it resolves on Windows as well as
# on POSIX systems (a backslash is only a path separator on Windows).
df = pd.read_csv('data/training_data/startup_data.csv')

# `data` is a second name bound to the same DataFrame object (no copy is made);
# the later cells (fit, evaluation, metrics) refer to the real data as `data`.
data = df


# Creating the Metadata Necessary for Synthesizing Data  

In [4]:
from sdv.metadata import SingleTableMetadata

# Create the metadata object with no arguments; the next cell fills it in by
# running detection against the training CSV.
metadata = SingleTableMetadata()

In [5]:
# Detect the table's metadata directly from the training CSV.
# Forward slashes keep the relative path valid on both Windows and POSIX
# (the original backslash-separated path only resolves on Windows).
metadata.detect_from_csv(filepath='data/training_data/startup_data.csv')

In [6]:
# Keep a dictionary form of the detected metadata.
# NOTE(review): `python_dict` is not referenced by any later cell visible in
# this notebook — confirm whether it is still needed.
python_dict = metadata.to_dict()

In [7]:
# Validate the detected metadata before it is saved and handed to the
# synthesizer. NOTE(review): presumably raises if the metadata is
# inconsistent — confirm against the SDV documentation.
metadata.validate()

# Saving the Metadata in a JSON File

In [9]:
# Persist the metadata as JSON so it can be reloaded without re-detecting it.
# Forward slashes keep the relative path valid on both Windows and POSIX
# (the original backslash-separated path only resolves on Windows).
# NOTE(review): SDV appears to refuse to overwrite an existing metadata file,
# which would make this cell fail on a re-run until the file name is changed —
# confirm against the SDV documentation.
metadata.save_to_json(filepath='data/training_data/metadata/my_metadata_v6.json')

In [11]:
from sdv.metadata import SingleTableMetadata

# Reload the metadata from the JSON file written by the previous cell,
# replacing the in-memory `metadata` object.
# Forward slashes keep the relative path valid on both Windows and POSIX
# (the original backslash-separated path only resolves on Windows).
metadata = SingleTableMetadata.load_from_json(
    filepath='data/training_data/metadata/my_metadata_v6.json')

# Constructing a CTGANSynthesizer


In [12]:
from sdv.single_table import CTGANSynthesizer

# Build the CTGAN synthesizer from the table metadata. Only three options are
# set explicitly; every other parameter keeps its library default (the full
# resolved set is listed by `get_parameters()` in the next cell).
synthesizer = CTGANSynthesizer(
    metadata,                # required: describes the table being modelled
    epochs=500,              # number of training epochs
    enforce_rounding=False,
    verbose=True,            # show a progress bar while fitting
)


In [13]:
# Display every parameter the synthesizer was built with, including the
# defaults that were not passed explicitly (see the recorded output below).
synthesizer.get_parameters()

{'enforce_min_max_values': True,
 'enforce_rounding': False,
 'locales': None,
 'embedding_dim': 128,
 'generator_dim': (256, 256),
 'discriminator_dim': (256, 256),
 'generator_lr': 0.0002,
 'generator_decay': 1e-06,
 'discriminator_lr': 0.0002,
 'discriminator_decay': 1e-06,
 'batch_size': 500,
 'discriminator_steps': 1,
 'log_frequency': True,
 'verbose': True,
 'epochs': 500,
 'pac': 10,
 'cuda': True}

In [14]:
# Bare dict literal listing the three options that were passed explicitly to
# CTGANSynthesizer above. It is not assigned to a name; evaluating the cell
# only echoes the dict as its output.
{
    'enforce_rounding': False,
    'epochs': 500,
    'verbose': True
}

{'enforce_rounding': False, 'epochs': 500, 'verbose': True}

In [15]:
# Rebind `metadata` to the metadata object held by the synthesizer; this is
# the object later passed to `evaluate_quality`.
metadata = synthesizer.get_metadata()

# Training the Synthesizer

In [19]:
# Train the synthesizer on the real data (`data`, loaded from the CSV above).
# The recorded run below took roughly 1m52s for the 500 configured epochs.
synthesizer.fit(data)

Gen. (-2.19) | Discrim. (-0.02): 100%|██████████| 500/500 [01:52<00:00,  4.44it/s]


# Saving the Trained GAN into a Pickle File

In [20]:
# Persist the trained synthesizer to disk so it can be reloaded for sampling
# without retraining.
# Forward slashes keep the relative path valid on both Windows and POSIX
# (the original backslash-separated path only resolves on Windows).
synthesizer.save(
    filepath='GAN_model/trained_GAN.pkl'
)

# Generating Synthetic Data

In [21]:
from sdv.single_table import CTGANSynthesizer

# Reload the trained synthesizer from the file written by the previous section,
# replacing the in-memory `synthesizer`.
# Forward slashes keep the relative path valid on both Windows and POSIX
# (the original backslash-separated path only resolves on Windows).
# SECURITY: this is a pickle file; unpickling can execute arbitrary code, so
# only load model files that were produced by this project.
synthesizer = CTGANSynthesizer.load(
    filepath='GAN_model/trained_GAN.pkl')

In [22]:
# Draw 2000 synthetic rows from the reloaded synthesizer.
synthetic_data = synthesizer.sample(num_rows=2000)

# Saving the Data as CSV File

In [23]:
# Write the synthetic rows to CSV.
# Forward slashes keep the relative path valid on both Windows and POSIX
# (the original backslash-separated path only resolves on Windows).
# NOTE(review): index=True also writes the row index as an extra, unnamed
# first column — confirm that consumers of this file expect it.
synthetic_data.to_csv('outputs/synthetic_data5.csv', index=True)

# Evaluating the Generated Synthetic Data

In [24]:
from sdv.evaluation.single_table import evaluate_quality

# Score how closely the synthetic table follows the real one. The report
# printed below covers two properties: Column Shapes and Column Pair Trends.
quality_report = evaluate_quality(
    real_data=data,
    synthetic_data=synthetic_data,
    metadata=metadata,
)

Generating report ...
(1/2) Evaluating Column Shapes: : 100%|██████████| 49/49 [00:00<00:00, 446.12it/s]
(2/2) Evaluating Column Pair Trends: : 100%|██████████| 1176/1176 [00:12<00:00, 92.72it/s] 

Overall Quality Score: 82.53%

Properties:
- Column Shapes: 85.76%
- Column Pair Trends: 79.29%


# KSComplement
This metric computes the similarity of a real column vs. a synthetic column in terms of the column shapes -- aka the marginal distribution or 1D histogram of the column.
## Score
(best) 1.0: The real data is exactly the same as the synthetic data

---


(worst) 0.0: The real and synthetic data are as different as they can be

In [25]:
from sdmetrics.single_column import KSComplement

# Compare the real and synthetic marginal distributions of a single column.
# The score is between 0.0 (worst) and 1.0 (best).
ks_column = 'avg_participants'
KSComplement.compute(
    real_data=data[ks_column],
    synthetic_data=synthetic_data[ks_column],
)

0.7077643553629469

# Missing Value Similarity
This metric compares whether the synthetic data has the same proportion of missing values as the real data for a given column.
## Score
(best) 1.0: The synthetic data perfectly captures the proportion of missing values

---


(worst) 0.0: The synthetic data has a completely different proportion of missing values than the real data

In [32]:
from sdmetrics.single_column import MissingValueSimilarity

# Check whether the synthetic column has the same proportion of missing
# values as the real column. The score is between 0.0 (worst) and 1.0 (best).
missing_column = 'labels'
MissingValueSimilarity.compute(
    real_data=data[missing_column],
    synthetic_data=synthetic_data[missing_column],
)

1.0

# StatisticSimilarity
This metric measures the similarity between a real column and a synthetic column by comparing a summary statistic. Supported summary statistics are: mean, median and standard deviation.

## Score
(best) 1.0: The statistic for the real data is exactly the same as for the synthetic data

---


(worst) 0.0: The statistic for the real data is extremely different from the synthetic data

In [33]:
from sdmetrics.single_column import StatisticSimilarity

# Compare one summary statistic (here the mean) of the real column against
# the synthetic column. The score is between 0.0 (worst) and 1.0 (best).
stat_column = 'labels'
StatisticSimilarity.compute(
    real_data=data[stat_column],
    synthetic_data=synthetic_data[stat_column],
    statistic='mean',
)

0.9296960996749729

# TVComplement
This metric computes the similarity of a real column vs. a synthetic column in terms of the column shapes -- aka the marginal distribution or 1D histogram of the column.

## Score
(best) 1.0: The real data is exactly the same as the synthetic data

---


(worst) 0.0: The real and synthetic data are as different as they can be

In [34]:
from sdmetrics.single_column import TVComplement

# Compare the real and synthetic marginal distributions of a single column.
# The score is between 0.0 (worst) and 1.0 (best).
tv_column = 'state_code'
TVComplement.compute(
    real_data=data[tv_column],
    synthetic_data=synthetic_data[tv_column],
)

0.9038645720476707