In [32]:
import ibis

# Create a connection to the DuckDB database
con = ibis.duckdb.connect(database="student_performance.duckdb")
# Create the tables in the DuckDB database
test_table = con.read_csv("student-performance/test.csv", table_name="test")
train_table = con.read_csv("student-performance/train.csv", table_name="train")
validation_table = con.read_csv("student-performance/validation.csv", table_name="validation")

all_tables = test_table.union(train_table).union(validation_table)


In [None]:
from sdv.single_table import GaussianCopulaSynthesizer
from sdv.metadata import Metadata

all_tables_pandas = all_tables.to_pandas()
metadata = Metadata.detect_from_dataframe(all_tables_pandas)
metadata.save("metadata.json")

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [None]:
synthesizer = GaussianCopulaSynthesizer(metadata)
synthesizer.fit(all_tables_pandas)
synthesizer.save("GaussianCopulaSynthesizer.pkl")


We strongly recommend saving the metadata using 'save_to_json' for replicability in future SDV versions.



In [35]:
synthetic_data = synthesizer.sample(num_rows=1000)
synthetic_data.head()

Unnamed: 0,Age,Grade,Gender,Race,SES_Quartile,ParentalEducation,SchoolType,Locale,TestScore_Math,TestScore_Reading,...,GPA,AttendanceRate,StudyHours,InternetAccess,Extracurricular,PartTimeJob,ParentSupport,Romantic,FreeTime,GoOut
0,14,9,Female,White,1,<HS,Public,Suburban,64.194801,69.38578,...,2.465075,0.868951,0.706296,1,1,0,0,0,3,1
1,18,9,Male,White,2,SomeCollege,Public,Rural,76.287725,78.68544,...,2.750959,0.876394,0.740991,1,0,0,1,0,1,2
2,18,9,Male,Hispanic,2,SomeCollege,Public,Town,82.439512,67.883948,...,3.395841,0.904526,1.236483,1,1,0,0,0,4,1
3,14,9,Female,White,2,HS,Private,City,53.146273,61.847485,...,2.441441,0.7917,0.776182,1,0,0,1,1,3,1
4,14,12,Male,Hispanic,1,SomeCollege,Public,Suburban,82.211638,80.871048,...,3.878573,0.954287,1.299302,1,1,0,1,0,3,3


In [None]:
from tqdm import tqdm
# Number of total rows and chunk size
total_rows = 50_000_000
chunk_size = 1_000_000

synthesizer.load("GaussianCopulaSynthesizer.pkl")

# Generate and insert the first chunk, creating the table
first_chunk = synthesizer.sample(chunk_size)
con.create_table("synthetic", first_chunk)

# Generate and append the remaining chunks
for _ in tqdm(range((total_rows // chunk_size) - 1)):
    chunk = synthesizer.sample(chunk_size)
    con.insert("synthetic", chunk)

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

100%|██████████| 49/49 [14:43<00:00, 18.02s/it]


In [39]:
# Save the Data as CSV

con.raw_sql("COPY synthetic TO 'synthetic.csv' (HEADER, DELIMITER ',')")


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

<duckdb.duckdb.DuckDBPyConnection at 0x16143f230>

In [40]:
test = ibis.read_csv("synthetic.csv")

In [44]:
test.count().execute()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

50000000