In [1]:
import pandas as pd
import zipfile
import os
import yaml

def preprocess():
    with open("config/config.yaml", "r") as f:
        config = yaml.safe_load(f)
    
    zip_path = os.path.join(config['data']['target_dir'], config['data']['dataset_name'])
    extract_dir = "data/raw"
    
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    
    # Example: Loading the Cleveland dataset (most common)
    df = pd.read_csv(f"{extract_dir}/processed.cleveland.data", header=None)
    # Define columns based on UCI metadata
    df.columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 
                  'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target'] # (num) is the (target) column
    
    # Cleaning: Handle missing values and binary target
    df = df.replace('?', pd.NA).dropna()
    df['target'] = (df['target'] > 0).astype(int) 
    
    os.makedirs("data/processed", exist_ok=True)
    df.to_csv("data/processed/cleaned_heart_disease.csv", index=False)

# if __name__ == "__main__":
#     preprocess()

In [10]:
os.chdir(os.path.dirname(os.getcwd()))
os.getcwd()

'/home/shafi/BITS_SSH_KEYS/MLOps-Ass-01'

In [11]:
with open("config/config.yaml", "r") as f:
    config = yaml.safe_load(f)

In [12]:
config

{'project': {'repo_name': 'MLOps-Ass-01', 'remote_name': 'origin'},
 'data': {'url': 'https://archive.ics.uci.edu/static/public/45/heart+disease.zip',
  'dataset_name': 'heart_disease.zip',
  'target_dir': 'data/'}}

In [13]:
zip_path = os.path.join(config['data']['target_dir'], config['data']['dataset_name'])
extract_dir = "data/raw"

In [14]:
zip_path

'data/heart_disease.zip'

In [15]:
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

In [16]:
!ls -la data/raw

total 692
drwxr-xr-x. 1 shafi shafi    492 Dec 25 12:14 .
drwxr-xr-x. 1 shafi shafi    102 Dec 25 12:14 ..
-rw-r--r--. 1 shafi shafi    587 Dec 25 12:14 ask-detrano
-rw-r--r--. 1 shafi shafi   6737 Dec 25 12:14 bak
-rw-r--r--. 1 shafi shafi  60669 Dec 25 12:14 cleveland.data
-rw-r--r--. 1 shafi shafi  23941 Dec 25 12:14 cleve.mod
drwxr-xr-x. 1 shafi shafi    204 Dec 25 12:14 costs
-rw-r--r--. 1 shafi shafi  10060 Dec 25 12:14 heart-disease.names
-rw-r--r--. 1 shafi shafi  62192 Dec 25 12:14 hungarian.data
-rw-r--r--. 1 shafi shafi    644 Dec 25 12:14 Index
-rw-r--r--. 1 shafi shafi  39892 Dec 25 12:14 long-beach-va.data
-rw-r--r--. 1 shafi shafi 389771 Dec 25 12:14 new.data
-rw-r--r--. 1 shafi shafi  18461 Dec 25 12:14 processed.cleveland.data
-rw-r--r--. 1 shafi shafi  10263 Dec 25 12:14 processed.hungarian.data
-rw-r--r--. 1 shafi shafi   4109 Dec 25 12:14 processed.switzerland.data
-rw-r--r--. 1 shafi shafi   6737 Dec 25 12:14 processed.va.data
-rw-r--r--. 1 shafi shafi  11058 Dec 2

In [22]:
df = pd.read_csv(f"{extract_dir}/processed.cleveland.data", header=None)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [25]:
import numpy as np

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       303 non-null    float64
 1   1       303 non-null    float64
 2   2       303 non-null    float64
 3   3       303 non-null    float64
 4   4       303 non-null    float64
 5   5       303 non-null    float64
 6   6       303 non-null    float64
 7   7       303 non-null    float64
 8   8       303 non-null    float64
 9   9       303 non-null    float64
 10  10      303 non-null    float64
 11  11      303 non-null    object 
 12  12      303 non-null    object 
 13  13      303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


In [29]:
df2 = df.dropna()

In [30]:
df == df2

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,True,True,True,True,True,True,True,True,True,True,True,True,True,True
299,True,True,True,True,True,True,True,True,True,True,True,True,True,True
300,True,True,True,True,True,True,True,True,True,True,True,True,True,True
301,True,True,True,True,True,True,True,True,True,True,True,True,True,True


In [31]:
df.columns = ['age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg', 
                  'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal', 'target']

In [32]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [33]:
os.makedirs("data/processed", exist_ok=True)
df.to_csv("data/processed/cleaned_heart_disease.csv", index=False)