In [None]:
#Uncomment to install ydata-synthetic lib
#!pip install ydata-synthetic

# Tabular Synthetic Data Generation with Gaussian Mixture
- This notebook is an example of how to use a synthetic data generation method based on a [Gaussian Mixture Model (GMM)](https://scikit-learn.org/stable/modules/generated/sklearn.mixture.GaussianMixture.html) to generate synthetic tabular data with numeric and categorical features.

## Dataset
- The data used is the [Adult Census Income](https://www.kaggle.com/datasets/uciml/adult-census-income) dataset, which we will fetch using the `pmlb` library (a wrapper for the Penn Machine Learning Benchmark data repository).


In [None]:
from pmlb import fetch_data

from ydata_synthetic.synthesizers.regular import RegularSynthesizer
from ydata_synthetic.synthesizers import ModelParameters, TrainParameters

## Load the data

In [2]:
# Load data
# Fetch the 'adult' dataset by name through pmlb's `fetch_data`.
# NOTE(review): presumably returns a pandas DataFrame and may download the
# data on first use — confirm against the pmlb documentation.
data = fetch_data('adult')
# Columns handed to the synthesizer as numerical features.
num_cols = [
    'age',
    'fnlwgt',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]

# Columns handed to the synthesizer as categorical features
# (this list includes the `target` column).
cat_cols = [
    'workclass',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
    'target',
]

## Create and Train the synthetic data generator

In [None]:
# Build the synthesizer with the 'fast' model.
# NOTE(review): 'fast' is presumably the Gaussian-Mixture-based model described
# in the introduction — confirm against the ydata-synthetic documentation.
synth = RegularSynthesizer(modelname='fast')

# Fit on the loaded dataset, telling the synthesizer which columns to treat
# as numerical and which as categorical.
synth.fit(
    data=data,
    num_cols=num_cols,
    cat_cols=cat_cols,
)

## Generate new synthetic data

In [8]:
# Draw 1000 synthetic rows from the fitted synthesizer.
synth_data = synth.sample(1000)
# Plain-text dump of the generated frame; long frames are shown truncated
# (first and last rows only).
print(synth_data)

           age  workclass         fnlwgt  education  education-num  \
0    38.753654          4  179993.565472          8           10.0   
1    36.408844          4  245841.807958          9           10.0   
2    56.251066          4  400895.076058         11           13.0   
3    26.846605          4  240156.201048         11           10.0   
4    29.083102          1    5601.059126         11            9.0   
..         ...        ...            ...        ...            ...   
995  79.281276          4   30664.183560          1           10.0   
996  51.423132          4  414524.980527          1           10.0   
997  17.342915          6  177716.451926         11           13.0   
998  39.298867          4  132011.369567         15           12.0   
999  46.977763          2   92662.371635          9           13.0   

     marital-status  occupation  relationship  race  sex  capital-gain  \
0                 4           0             3     4    0     55.771499   
1          