### create fake data with specific patterns

In [2]:
import random
import string
import pandas as pd

def generate_id(num_digits: int = 6) -> int:
    """Return a random integer ID with exactly ``num_digits`` decimal digits.

    Args:
        num_digits: Number of digits in the ID. Defaults to 6, which draws
            uniformly from 100000-999999 (both ends inclusive).

    Returns:
        A random ``int`` whose leading digit is never zero.

    Raises:
        ValueError: If ``num_digits`` is less than 1.

    Note:
        Every call is an independent draw, so repeated calls can return the
        same ID.
    """
    if num_digits < 1:
        raise ValueError("num_digits must be at least 1")
    lowest = 10 ** (num_digits - 1)   # smallest value with num_digits digits
    highest = 10 ** num_digits - 1    # largest value with num_digits digits
    return random.randint(lowest, highest)

def generate_custom_string():
    """Build a 20-character random string with a fixed segment layout.

    Segments, in output order: 3 uppercase letters, 4 digits, 3 symbols drawn
    from ``!@#$%^&*`` and 10 lowercase letters. Characters are sampled with
    replacement, so a character may repeat within a segment.
    """
    # (alphabet, length) for each segment, listed in output order.
    layout = (
        (string.ascii_uppercase, 3),
        (string.digits, 4),
        ('!@#$%^&*', 3),
        (string.ascii_lowercase, 10),
    )
    pieces = []
    for alphabet, length in layout:
        pieces.extend(random.choices(alphabet, k=length))
    return ''.join(pieces)

# Seed the stdlib RNG so that Restart & Run All rebuilds the same table.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

# Number of synthetic rows to generate.
NUM_ROWS = 100_000

# One record per row. generate_id() draws every ID independently, so the 'ID'
# column is not guaranteed to be unique.
rows = [
    {'ID': generate_id(), 'synthetic_column': generate_custom_string()}
    for _ in range(NUM_ROWS)
]

# Build the DataFrame from the complete list of records in a single call.
df = pd.DataFrame(rows)
df

# Optional: save to CSV
# df.to_csv('synthetic_dataset_with_id.csv', index=False)

Unnamed: 0,ID,synthetic_column
0,380423,VPM2947!@@uxuebnivjk
1,952627,BKW7756%%@gpghphumjh
2,697086,EVG1325@*&spevdbwhgq
3,678070,XKZ3054#!$drowgkvzuf
4,185240,PVB0694#$*ijikqvmuqs
...,...,...
99995,179954,PUO1973&$*kpkzjuljwi
99996,341554,RAS3451!*^wfubgnvkip
99997,576065,WSF5324$$#nogrqxuavg
99998,599654,MSH1917$^%keraiitufy


### create fake PII data

In [4]:
from faker import Faker

fake = Faker()
# Seed Faker's shared random generator so the fake records are reproducible
# when the notebook is re-run from a fresh kernel.
Faker.seed(42)

# Print one sample value from several Faker providers.
print(fake.name())         # e.g., 'Christopher White'
print(fake.address())      # e.g., '638 Matthews Overpass North Courtneyborough, MH 46817'
print(fake.email())        # e.g., 'andersonjuan@example.com'
print(fake.company())      # e.g., 'Douglas Group'
print(fake.date_of_birth()) # e.g., '1924-07-06'

# Number of fake user records to generate.
NUM_USERS = 10_000

# One dict per user; the fields are generated in the order listed.
users = [
    {
        'name': fake.name(),
        'email': fake.email(),
        'address': fake.address(),
        'dob': fake.date_of_birth(),
    }
    for _ in range(NUM_USERS)
]

len(users)

Mrs. Tracey Griffin
52986 Mahoney Oval Apt. 159
Craigbury, CT 70730
qadams@example.org
Taylor-Garcia
1987-12-28


10000

### create fake data with the ML library SDV (Synthetic Data Vault)

In [6]:
import pandas as pd
import numpy as np
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata import SingleTableMetadata

In [7]:
# Build a small stand-in "real" healthcare table for the synthesizer to learn
# from. The values are randomly generated here (not actual patient records),
# and every column is drawn independently of the others.
NUM_PATIENTS = 500
rng = np.random.default_rng(42)  # seeded so the table is identical on every re-run

real_healthcare_data = pd.DataFrame({
    'Age': rng.integers(20, 90, size=NUM_PATIENTS),  # upper bound is exclusive: 20-89
    'Gender': rng.choice(['Male', 'Female'], size=NUM_PATIENTS),
    'Height_cm': rng.normal(165, 10, size=NUM_PATIENTS),   # mean 165, std 10
    'Weight_kg': rng.normal(70, 15, size=NUM_PATIENTS),    # mean 70, std 15
    'Diagnosis': rng.choice(['Hypertension', 'Diabetes', 'Asthma', 'Healthy'], size=NUM_PATIENTS),
    'Smoker': rng.choice(['Yes', 'No'], size=NUM_PATIENTS),
    'Systolic_BP': rng.normal(120, 15, size=NUM_PATIENTS),  # mean 120, std 15
    'Diastolic_BP': rng.normal(80, 10, size=NUM_PATIENTS),  # mean 80, std 10
})

# Show a sample
real_healthcare_data

Unnamed: 0,Age,Gender,Height_cm,Weight_kg,Diagnosis,Smoker,Systolic_BP,Diastolic_BP
0,30,Male,175.320483,89.555159,Diabetes,No,140.658295,98.826713
1,69,Male,146.444450,66.121372,Asthma,No,125.719460,82.011421
2,83,Male,161.444267,65.798838,Asthma,No,117.313354,92.151582
3,89,Male,160.084798,63.373337,Asthma,Yes,103.958947,73.975228
4,80,Male,170.516833,80.393534,Hypertension,Yes,107.786161,58.341331
...,...,...,...,...,...,...,...,...
495,35,Female,165.937462,69.371188,Healthy,Yes,126.106263,90.571889
496,60,Male,186.093161,40.986037,Hypertension,Yes,82.478452,94.219250
497,58,Male,171.831915,66.317973,Hypertension,Yes,120.095936,67.316913
498,44,Female,165.100164,51.460262,Asthma,No,117.223104,78.509134


In [8]:
# Describe the table: let SDV detect the metadata from the source DataFrame.
metadata = SingleTableMetadata()
metadata.detect_from_dataframe(real_healthcare_data)

# Fit a Gaussian copula synthesizer to the source table.
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(real_healthcare_data)

# Sample synthetic rows from the fitted synthesizer and display the result.
NUM_SYNTHETIC_ROWS = 5000
synthetic_data = synthesizer.sample(num_rows=NUM_SYNTHETIC_ROWS)
synthetic_data



Unnamed: 0,Age,Gender,Height_cm,Weight_kg,Diagnosis,Smoker,Systolic_BP,Diastolic_BP
0,71,Male,176.441527,70.640757,Healthy,No,121.861647,77.086952
1,27,Male,161.016303,71.192454,Hypertension,Yes,147.679475,72.551174
2,68,Male,174.603106,66.307812,Asthma,Yes,124.512247,91.420481
3,54,Male,169.315378,65.522087,Healthy,Yes,110.106245,76.418153
4,35,Female,171.742932,45.972792,Healthy,Yes,112.679768,86.819897
...,...,...,...,...,...,...,...,...
4995,88,Female,181.655462,89.851142,Diabetes,Yes,137.093145,103.781141
4996,89,Male,157.039652,69.659638,Asthma,Yes,129.821372,71.639609
4997,82,Male,190.852007,32.173422,Healthy,No,100.589490,69.262880
4998,23,Female,158.527394,69.277441,Asthma,No,113.677242,92.320923


In [9]:
from sdv.evaluation.single_table import run_diagnostic

# Run SDV's diagnostic report on the synthetic table against the source table
# and its metadata; the report object is kept for later inspection.
diagnostic_report = run_diagnostic(
    real_data=real_healthcare_data, synthetic_data=synthetic_data, metadata=metadata
)

Generating report ...

(1/2) Evaluating Data Validity: |█████████████████████████████████████████████████████| 8/8 [00:00<00:00, 1143.17it/s]|
Data Validity Score: 100.0%

(2/2) Evaluating Data Structure: |█████████████████████████████████████████████████████| 1/1 [00:00<00:00, 333.60it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 100.0%



In [10]:
from sdv.evaluation.single_table import evaluate_quality

# Run SDV's quality report on the synthetic table against the source table
# and its metadata; the report object is kept for later inspection.
quality_report = evaluate_quality(
    real_data=real_healthcare_data, synthetic_data=synthetic_data, metadata=metadata
)

Generating report ...

(1/2) Evaluating Column Shapes: |██████████████████████████████████████████████████████| 8/8 [00:00<00:00, 571.43it/s]|
Column Shapes Score: 97.17%

(2/2) Evaluating Column Pair Trends: |███████████████████████████████████████████████| 28/28 [00:00<00:00, 337.86it/s]|
Column Pair Trends Score: 94.61%

Overall Score (Average): 95.89%



In [20]:
from sdv.evaluation.single_table import get_column_plot

# Plot a single column for the real and the synthetic table.
COLUMN_TO_PLOT = 'Height_cm'

fig = get_column_plot(
    real_data=real_healthcare_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
    column_name=COLUMN_TO_PLOT,
)
fig.show()

In [18]:
from sdv.evaluation.single_table import get_column_pair_plot

# Plot a pair of columns for the real and the synthetic table.
COLUMN_PAIR_TO_PLOT = ['Systolic_BP', 'Smoker']

fig = get_column_pair_plot(
    real_data=real_healthcare_data,
    synthetic_data=synthetic_data,
    metadata=metadata,
    column_names=COLUMN_PAIR_TO_PLOT,
)
fig.show()