In [1]:
import os
import pandas as pd
import dlt
import requests
from dotenv import load_dotenv
from urllib.parse import unquote, urlparse
from dlt.sources.filesystem import filesystem
import zipfile
import uuid

# Load settings from a .env file into the environment; INPUT_DATA_DIR is
# read from the environment a few cells below.
load_dotenv()

True

In [2]:
# Source archive URL. A plain string literal is enough: the value contains
# neither placeholders nor backslashes.
FILEPATH = 'https://data.poltekkes-smg.ac.id/dataset/7627060d-81f1-4393-9ec0-c49740efbf6a/resource/c759b693-ac63-431b-aca8-10bee206188d/download/archive-1.zip'

# Take the last path segment of the URL (percent-decoded) as the name under
# which the archive is stored locally.
parsed_url = urlparse(FILEPATH)
url_path = parsed_url.path
filename = unquote(os.path.basename(url_path))

In [3]:
# Directory that receives the downloaded archive and its extracted files.
# The value is None when the variable is not set.
INPUT_DATA_DIR = os.environ.get('INPUT_DATA_DIR')

In [4]:
# Fail fast when INPUT_DATA_DIR is missing or empty: every later path is
# built from it. (The message reads "INPUT_DATA_DIR is not set in the .env file".)
if not INPUT_DATA_DIR:
    raise ValueError("INPUT_DATA_DIR не указан в .env файле")

In [5]:
# Make sure the target directory exists. exist_ok=True turns this into a no-op
# when the directory is already there, and removes the check-then-create race
# of a separate os.path.exists() test. It also fails early (FileExistsError)
# if the path exists but is a regular file rather than a directory.
os.makedirs(INPUT_DATA_DIR, exist_ok=True)

In [6]:
# Local path of the downloaded archive: INPUT_DATA_DIR joined with the file
# name derived from the URL.
DOWNLOAD_PATH = os.path.join(INPUT_DATA_DIR, filename)

In [7]:
# Name of the CSV file that is read from INPUT_DATA_DIR after extraction.
# Presumably this is the member name inside the archive — verify if the
# upstream dataset changes.
FILENAME = 'Breast_Cancer.csv'

In [8]:
# Download the archive to DOWNLOAD_PATH and unpack it into INPUT_DATA_DIR.
#
# stream=True defers fetching the body, so iter_content() really writes the
# file chunk by chunk instead of buffering the whole archive in memory first.
# timeout keeps the cell from hanging forever on an unresponsive server, and
# the `with` block releases the connection even if writing or unpacking fails.
with requests.get(FILEPATH, stream=True, timeout=60) as response:
    if response.status_code == 200:
        with open(DOWNLOAD_PATH, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"File successfully saved here: {DOWNLOAD_PATH}")

        with zipfile.ZipFile(DOWNLOAD_PATH, 'r') as zip_ref:
            zip_ref.extractall(INPUT_DATA_DIR)
        print(f"File successfully unpacked: {INPUT_DATA_DIR}")
    else:
        # NOTE(review): a failed download is only reported, not raised, so the
        # following cells keep running and may read a stale CSV left over from
        # a previous run. Consider raising here.
        print(f"Error during download. Status code: {response.status_code}")

File successfully saved here: /home/timosii/data/de_zoomcamp_project/archive-1.zip
File successfully unpacked: /home/timosii/data/de_zoomcamp_project/


In [9]:
# Read the CSV named by FILENAME from INPUT_DATA_DIR into a DataFrame.
df = pd.read_csv(os.path.join(INPUT_DATA_DIR, FILENAME))

In [10]:
# Give every row a freshly generated random UUID (as a string) and place it in
# the first column. Re-running this cell on the same frame raises ValueError,
# because DataFrame.insert refuses to add a column that already exists.
row_ids = [str(uuid.uuid4()) for _ in df.index]
df.insert(loc=0, column='ID', value=row_ids)

In [11]:
# Stamp every row with the load time, rounded to whole seconds.
# The lowercase 's' alias replaces 'S', which is deprecated and emits a
# FutureWarning on current pandas.
# NOTE(review): Timestamp('now') is a naive local-time value (the recorded run
# shows 16:04 here versus 13:04 UTC in the dlt load metrics) — confirm that
# local time is what the warehouse column should hold.
df['load_date'] = pd.Timestamp('now').round(freq='s')

  df['load_date'] = pd.Timestamp('now').round(freq='S')


In [12]:
# Normalise column labels: trim surrounding whitespace and replace inner
# spaces with underscores (literal replacement, not a regex).
df.columns = df.columns.str.strip().str.replace(' ', '_', regex=False)

In [13]:
# Columns to store with pandas' category dtype.
categorical_columns = (
    'Race',
    'Marital_Status',
    'T_Stage',
    'N_Stage',
    '6th_Stage',
    'differentiate',
    'Grade',
    'A_Stage',
    'Estrogen_Status',
    'Progesterone_Status',
    'Status',
)

# One astype call with a column -> dtype mapping converts all of them at once;
# a missing column raises KeyError, as the per-column loop would.
df = df.astype({name: 'category' for name in categorical_columns})

In [14]:
# Show the first row as a quick check of the added columns and renamed labels.
df.head(1)

Unnamed: 0,ID,Age,Race,Marital_Status,T_Stage,N_Stage,6th_Stage,differentiate,Grade,A_Stage,Tumor_Size,Estrogen_Status,Progesterone_Status,Regional_Node_Examined,Reginol_Node_Positive,Survival_Months,Status,load_date
0,fe42612d-a018-43d5-b8b9-a08ab9e072a4,68,White,Married,T1,N1,IIA,Poorly differentiated,3,Regional,4,Positive,Positive,24,1,60,Alive,2025-03-22 16:04:29


In [15]:
# Configure the dlt pipeline: it is named 'breast_cancer_from_df', targets the
# 'postgres' destination and uses 'public' as the dataset name.
# NOTE(review): connection credentials are not passed here; presumably dlt
# resolves them from its own config/secrets or the environment — confirm.
pipeline = dlt.pipeline(
    pipeline_name='breast_cancer_from_df',
    destination='postgres',
    dataset_name='public'
)

In [16]:
# Load the DataFrame into the 'breast_cancer' table. The 'append' write
# disposition adds the rows to whatever the table already holds.
load_info = pipeline.run(
    data=df,
    table_name='breast_cancer',
    write_disposition='append',
)

# Last expression of the cell: display the LoadInfo summary of the run.
load_info

LoadInfo(pipeline=<dlt.pipeline.pipeline.Pipeline object at 0x7fe04130a500>, metrics={'1742648672.0092244': [{'started_at': DateTime(2025, 3, 22, 13, 4, 32, 148620, tzinfo=Timezone('UTC')), 'finished_at': DateTime(2025, 3, 22, 13, 4, 32, 218418, tzinfo=Timezone('UTC')), 'job_metrics': {'_dlt_pipeline_state.d3ed173deb.insert_values': LoadJobMetrics(job_id='_dlt_pipeline_state.d3ed173deb.insert_values', file_path='/home/timosii/.dlt/pipelines/breast_cancer_from_df/load/normalized/1742648672.0092244/started_jobs/_dlt_pipeline_state.d3ed173deb.0.insert_values', table_name='_dlt_pipeline_state', started_at=DateTime(2025, 3, 22, 13, 4, 32, 184001, tzinfo=Timezone('UTC')), finished_at=DateTime(2025, 3, 22, 13, 4, 32, 187111, tzinfo=Timezone('UTC')), state='completed', remote_url=None), 'breast_cancer.950d14cf21.csv': LoadJobMetrics(job_id='breast_cancer.950d14cf21.csv', file_path='/home/timosii/.dlt/pipelines/breast_cancer_from_df/load/normalized/1742648672.0092244/started_jobs/breast_cancer.