In [1]:
# ETL Pipeline for Liver Patient Dataset

# -----------------------
# Step 1: Extract
# -----------------------
import pandas as pd
import sqlite3
from sklearn.preprocessing import LabelEncoder

# Path to your CSV
csv_path = r"C:\Users\higas\Downloads\gen-ai-medical-data-etl\Liver_Patient_Dataset_ILPD.csv"

# Load CSV into pandas DataFrame
df = pd.read_csv(csv_path)

# Quick look at the data
print(df.head())
print(df.info())

# -----------------------
# Step 2: Transform
# -----------------------
# Identify categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns
print(f"Categorical columns: {list(categorical_cols)}")

# Apply Label Encoding to categorical columns
le = LabelEncoder()
for col in categorical_cols:
    df[col] = le.fit_transform(df[col])

# Verify transformation
print(df.head())
print(df.info())

# -----------------------
# Step 3: Load
# -----------------------
# Connect to SQLite database (it will create it if it doesn't exist)
conn = sqlite3.connect("Patient_record.db")
cursor = conn.cursor()

# Load the transformed data into SQL table
table_name = "Liver_patients"
df.to_sql(table_name, conn, if_exists='replace', index=False)

# Verify by reading the table back
df_sql = pd.read_sql(f"SELECT * FROM {table_name} LIMIT 5;", conn)
print(df_sql)

# Close the connection
conn.close()


   65 Female   0.7  0.1  187  16   18  6.8  3.3   0.9  1
0  62   Male  10.9  5.5  699  64  100  7.5  3.2  0.74  1
1  62   Male   7.3  4.1  490  60   68  7.0  3.3  0.89  1
2  58   Male   1.0  0.4  182  14   20  6.8  3.4  1.00  1
3  72   Male   3.9  2.0  195  27   59  7.3  2.4  0.40  1
4  46   Male   1.8  0.7  208  19   14  7.6  4.4  1.30  1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 582 entries, 0 to 581
Data columns (total 11 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   65      582 non-null    int64  
 1   Female  582 non-null    object 
 2   0.7     582 non-null    float64
 3   0.1     582 non-null    float64
 4   187     582 non-null    int64  
 5   16      582 non-null    int64  
 6   18      582 non-null    int64  
 7   6.8     582 non-null    float64
 8   3.3     582 non-null    float64
 9   0.9     578 non-null    float64
 10  1       582 non-null    int64  
dtypes: float64(5), int64(5), object(1)
memory usage: 50.1+ KB
None
Cate