In [20]:
import os
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from datetime import datetime

In [2]:
# Load the source heart-disease dataset from the local CSV export.
raw_data = pd.read_csv(filepath_or_buffer='./dataset/heart_disease_uci.csv')

In [3]:
# Preview the first five rows of the freshly loaded frame.
raw_data.head(n=5)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num,created_timestamp
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0,2025-01-01 00:00:00.000000
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2,2025-01-01 00:00:00.000000
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1,2025-01-01 00:00:00.000000
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0,2025-01-01 00:00:00.000000
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0,2025-01-01 00:00:00.000000


In [15]:
# Derive a new frame (raw_data itself is left untouched) in which every row
# carries the same fixed creation timestamp.
raw_data_with_create_timestamp = raw_data.assign(
    created_timestamp=pd.to_datetime('2025-01-01 12:00:00.000')
)

In [16]:
# Preview the first five rows to confirm the timestamp column was overwritten.
raw_data_with_create_timestamp.head(n=5)

Unnamed: 0,id,age,sex,dataset,cp,trestbps,chol,fbs,restecg,thalch,exang,oldpeak,slope,ca,thal,num,created_timestamp
0,1,63,Male,Cleveland,typical angina,145.0,233.0,True,lv hypertrophy,150.0,False,2.3,downsloping,0.0,fixed defect,0,2025-01-01 12:00:00
1,2,67,Male,Cleveland,asymptomatic,160.0,286.0,False,lv hypertrophy,108.0,True,1.5,flat,3.0,normal,2,2025-01-01 12:00:00
2,3,67,Male,Cleveland,asymptomatic,120.0,229.0,False,lv hypertrophy,129.0,True,2.6,flat,2.0,reversable defect,1,2025-01-01 12:00:00
3,4,37,Male,Cleveland,non-anginal,130.0,250.0,False,normal,187.0,False,3.5,downsloping,0.0,normal,0,2025-01-01 12:00:00
4,5,41,Female,Cleveland,atypical angina,130.0,204.0,False,lv hypertrophy,172.0,False,1.4,upsloping,0.0,normal,0,2025-01-01 12:00:00


In [6]:
def generate_random_row(record_id):
    """Build one synthetic heart-disease record.

    Every field is drawn independently from NumPy's global random state, so
    the output changes between runs unless ``np.random.seed`` is called first.
    Categorical labels, booleans and float-valued measurements are emitted in
    the same form the loaded dataset uses (e.g. ``'reversable defect'``,
    ``True``/``False``, ``145.0``) so that synthetic rows do not introduce new
    category spellings or mixed number formats once the frame is cast to
    strings.

    Args:
        record_id: Identifier placed in the first position of the row.

    Returns:
        list: 17 values ordered as id, age, sex, dataset, cp, trestbps, chol,
        fbs, restecg, thalch, exang, oldpeak, slope, ca, thal, num,
        created_timestamp.
    """
    # np.random.randint excludes the upper bound, so e.g. age is 29..76.
    age = np.random.randint(29, 77)
    sex = np.random.choice(['Male', 'Female'])
    dataset = np.random.choice(['Cleveland', 'Hungary', 'Switzerland', 'VA Long Beach'])
    cp = np.random.choice(['typical angina', 'atypical angina', 'non-anginal', 'asymptomatic'])
    # The dataset stores these measurements as floats (145.0, 233.0, ...).
    trestbps = float(np.random.randint(94, 200))
    chol = float(np.random.randint(126, 564))
    # fbs and exang are boolean columns in the dataset.
    fbs = bool(np.random.choice([False, True]))
    restecg = np.random.choice(['normal', 'stt abnormality', 'lv hypertrophy'])
    thalach = float(np.random.randint(71, 202))
    exang = bool(np.random.choice([False, True]))
    oldpeak = np.random.uniform(0, 6)
    # slope is a text label in the dataset, not a numeric code.
    slope = np.random.choice(['upsloping', 'flat', 'downsloping'])
    ca = float(np.random.choice([0, 1, 2, 3, 4]))
    # 'reversable' (sic) is the spelling used by the dataset itself.
    thal = np.random.choice(['normal', 'fixed defect', 'reversable defect'])
    # NOTE(review): the dataset's num column also contains values above 1;
    # the generator only emits 0/1 -- confirm this is intended.
    target = np.random.choice([0, 1])
    created_timestamp = datetime.now()
    return [
        record_id, age, sex, dataset, cp, trestbps, chol, fbs, restecg,
        thalach, exang, oldpeak, slope, ca, thal, target, created_timestamp,
    ]

# Synthesize ten extra records whose ids run from len(raw_data) + 1 through
# len(raw_data) + 10 (presumably continuing the existing id sequence -- verify
# that ids in raw_data are 1..len(raw_data)).
new_records = [
    generate_random_row(next_id)
    for next_id in range(len(raw_data) + 1, len(raw_data) + 11)
]

In [7]:
# Wrap the generated rows in a frame that reuses the column labels of the
# timestamped source frame, so the two can be stacked later.
new_df = pd.DataFrame(
    data=new_records,
    columns=raw_data_with_create_timestamp.columns,
)

In [8]:
# Cast every column of both frames to str so they share one dtype per column
# before being concatenated and written out.
raw_data_with_create_timestamp, new_df = (
    frame.astype(str) for frame in (raw_data_with_create_timestamp, new_df)
)

In [9]:
# Stack the original rows and the synthetic rows, renumbering the index 0..n-1.
combined_data = pd.concat(
    (raw_data_with_create_timestamp, new_df),
    ignore_index=True,
)

In [10]:
# Stamp the output file name with the current local time so each run writes a
# distinct parquet file.
timestamp = format(datetime.now(), '%Y-%m-%d_%H-%M-%S')
file_name = f'heart_disease_uci_delta_load_{timestamp}.parquet'

In [27]:
# to_parquet does not create missing parent folders, so make sure the delta
# output directory exists before writing (no-op when it is already there).
os.makedirs('./dataset/delta', exist_ok=True)
# Write the combined frame without the pandas index.
combined_data.to_parquet(f'./dataset/delta/{file_name}', index=False)

In [32]:
# Record the current local time as a single-row, single-column watermark frame.
watermark_df = pd.DataFrame({"deltawatermark": [datetime.now().strftime('%Y-%m-%d %H:%M:%S')]})
# to_parquet does not create missing parent folders, so make sure the
# watermark directory exists before writing (no-op when it is already there).
os.makedirs("./dataset/delta_watermark_timestamp", exist_ok=True)
# The file path is fixed, so each run overwrites the previous watermark.
watermark_df.to_parquet("./dataset/delta_watermark_timestamp/watermark.parquet", index=False)