In [1]:
import os

import pandas as pd
from sqlalchemy import create_engine

# Database connection string.
# The URL (which embeds a user name and password) is read from the
# DATABASE_URL environment variable when it is set, so real credentials do not
# have to be written into the notebook. The fallback is the original
# hard-coded URL, so existing setups keep working unchanged.
DEFAULT_DATABASE_URL = 'postgresql://uvg_user:uvg_password@db:5432/health_data'
engine = create_engine(os.environ.get('DATABASE_URL', DEFAULT_DATABASE_URL))

# Simple query to test the foundation layer: SELECT 1 succeeds only if a
# connection to the database can actually be opened.
try:
    df = pd.read_sql("SELECT 1 as connection_status", engine)
    print("Connection Successful! Your Biomedical Data Stack is ready.")
    print(df)
except Exception as e:
    # Deliberate best-effort check: report the failure instead of raising so
    # the reader sees a readable message rather than a traceback.
    print(f"Connection Failed: {e}")

Connection Successful! Your Biomedical Data Stack is ready.
   connection_status
0                  1


In [7]:
from sqlalchemy import text

# Full SQL script from the previous lab (structure + data).
# It drops and recreates the five tables (patients, admissions, diagnoses,
# d_labitems, labevents) and then inserts the sample rows, so every run starts
# again from the same freshly created tables.
# NOTE: the "--" lines inside the string are SQL comments that are sent to the
# database as part of the script; they are runtime text, not Python comments.
setup_sql = """
-- 1. Limpiar todo por si acaso (para empezar de cero)
DROP TABLE IF EXISTS labevents CASCADE;
DROP TABLE IF EXISTS d_labitems CASCADE;
DROP TABLE IF EXISTS diagnoses CASCADE;
DROP TABLE IF EXISTS admissions CASCADE;
DROP TABLE IF EXISTS patients CASCADE;

-- 2. Crear Tablas (Estructura)
CREATE TABLE patients (
  subject_id SERIAL PRIMARY KEY,
  external_id TEXT UNIQUE,
  full_name TEXT,
  sex CHAR(1) CHECK (sex IN ('M','F','O')),
  date_of_birth DATE, -- Usamos este nombre para compatibilidad
  date_of_death DATE
);

CREATE TABLE admissions (
  hadm_id SERIAL PRIMARY KEY,
  subject_id INT REFERENCES patients(subject_id),
  admittime TIMESTAMP,
  dischtime TIMESTAMP,
  admission_type TEXT,
  hospital_expire_flag BOOLEAN
);

CREATE TABLE diagnoses (
  diagnosis_id SERIAL PRIMARY KEY,
  hadm_id INT REFERENCES admissions(hadm_id),
  diagnosis_text TEXT
);

CREATE TABLE d_labitems (
  labitem_id SERIAL PRIMARY KEY,
  label TEXT,
  unit TEXT
);

CREATE TABLE labevents (
  labevent_id SERIAL PRIMARY KEY,
  hadm_id INT REFERENCES admissions(hadm_id),
  labitem_id INT REFERENCES d_labitems(labitem_id),
  charttime TIMESTAMP,
  value_num NUMERIC
);

-- 3. Insertar Datos de Ejemplo (Pacientes, Admisiones, Labs)
INSERT INTO patients (external_id, full_name, sex, date_of_birth) VALUES
('MRN-0001', 'Ana López',       'F', '1980-03-12'),
('MRN-0002', 'Carlos Pérez',    'M', '1975-07-01'),
('MRN-0003', 'María Gómez',     'F', '1992-11-23'),
('MRN-0004', 'José Martínez',   'M', '1968-05-09'),
('MRN-0005', 'Alex Rivera',     'O', '2001-08-14'),
('MRN-0006', 'Lucía Herrera',   'F', '1988-02-02'),
('MRN-0007', 'Miguel Castillo', 'M', '1959-10-30'),
('MRN-0008', 'Sofía Morales',   'F', '1979-06-18');

INSERT INTO admissions (subject_id, admittime, dischtime, admission_type, hospital_expire_flag) VALUES
(1, '2101-01-10 08:00', '2101-01-15 14:00', 'Emergency', false),
(1, '2102-06-01 10:00', '2102-06-05 09:00', 'Elective',  false),
(2, '2101-03-20 22:00', '2101-03-28 10:00', 'Emergency', true),
(3, '2101-07-11 13:00', '2101-07-14 11:00', 'Urgent',    false),
(4, '2101-09-02 06:00', '2101-09-10 15:00', 'Emergency', false),
(5, '2101-12-18 19:00', '2101-12-22 08:00', 'Emergency', false),
(6, '2101-02-05 09:30', '2101-02-08 10:00', 'Emergency', false),
(6, '2101-11-01 16:00', '2101-11-04 12:00', 'Urgent',    false),
(7, '2101-05-14 07:00', '2101-05-20 15:00', 'Emergency', false),
(8, '2101-08-21 20:00', '2101-08-24 09:00', 'Emergency', false);

INSERT INTO diagnoses (hadm_id, diagnosis_text) VALUES
(1,  'Hypertension'), (1,  'Chest pain'),
(2,  'Elective procedure follow-up'),
(3,  'Sepsis'), (3,  'Acute Kidney Injury'),
(4,  'Asthma exacerbation'),
(5,  'Pneumonia'),
(6,  'Trauma'),
(7,  'Urinary tract infection'),
(8,  'Dehydration'),
(9,  'Heart failure'),
(10, 'Appendicitis');

INSERT INTO d_labitems (label, unit) VALUES
('Creatinine', 'mg/dL'),
('Hemoglobin', 'g/dL'),
('White Blood Cells', '10^9/L'),
('Platelets', '10^9/L'),
('Lactate', 'mmol/L');

INSERT INTO labevents (hadm_id, labitem_id, charttime, value_num) VALUES
(1, 1, '2101-01-11 06:00', 1.1), (1, 1, '2101-01-13 06:00', 1.6),
(1, 2, '2101-01-11 06:00', 13.2), (1, 3, '2101-01-11 06:00',  7.8),
(1, 4, '2101-01-11 06:00', 230),
(2, 2, '2102-06-02 07:00', 12.9), (2, 3, '2102-06-02 07:00',  6.2),
(2, 4, '2102-06-02 07:00', 210),
(3, 5, '2101-03-21 06:30', 3.8), (3, 3, '2101-03-21 06:30', 18.4),
(3, 1, '2101-03-21 07:00', 2.5), (3, 1, '2101-03-23 07:00', 3.1),
(3, 2, '2101-03-21 07:00', 10.4), (3, 4, '2101-03-21 07:00', 120),
(4, 2, '2101-07-12 08:00', 11.5), (4, 3, '2101-07-12 08:00',  9.1),
(4, 4, '2101-07-12 08:00', 250),
(5, 3, '2101-09-03 06:30', 14.2), (5, 2, '2101-09-03 06:30', 12.1),
(5, 5, '2101-09-03 06:30', 2.2),
(6, 2, '2101-12-19 07:00', 10.8), (6, 3, '2101-12-19 07:00', 12.5),
(6, 4, '2101-12-19 07:00', 180),
(7, 1, '2101-02-06 06:00', 0.9), (7, 3, '2101-02-06 06:00', 11.0),
(8, 1, '2101-11-02 06:00', 1.4), (8, 1, '2101-11-03 06:00', 1.1),
(8, 3, '2101-11-02 06:00',  8.4),
(9, 1, '2101-05-15 07:00', 1.8), (9, 2, '2101-05-15 07:00', 12.7),
(9, 3, '2101-05-15 07:00', 10.2), (9, 4, '2101-05-15 07:00', 160),
(10, 3, '2101-08-22 06:00', 13.0), (10, 2, '2101-08-22 06:00', 12.9);
"""

# Run the whole setup script against the database.
# engine.begin() opens a transaction that is committed when the block exits
# without error and rolled back if an exception is raised, so no explicit
# commit() call is needed.
with engine.begin() as conn:
    conn.execute(text(setup_sql))

# Reached only after the transaction above has been committed.
print("✅ Base de datos reconstruida exitosamente. ¡Ya puedes seguir!")

✅ Base de datos reconstruida exitosamente. ¡Ya puedes seguir!


In [10]:
from sqlalchemy import text

# SQL that deliberately dirties the data (Veracity / Lab 1.1).
# It inserts three extra patients (one without a date of birth), sets the name
# of MRN-9002 to NULL, and adds one admission plus one misspelled diagnosis
# text for MRN-9003.
# NOTE: the external_id values are fixed and patients.external_id is declared
# UNIQUE in the setup script, so running this script a second time without
# rebuilding the tables first will presumably fail on that constraint.
# The "--" lines inside the string are SQL comments (runtime text).
veracity_sql = """
-- A) 3 pacientes "sucios" (missing + typo)
INSERT INTO patients (external_id, full_name, sex, date_of_birth) VALUES
('MRN-9001', 'Paciente Sin Fecha', 'F', NULL),            -- Missing DOB
('MRN-9002', 'Paciente Sin Nombre', 'M', '1990-01-01'),   -- Nombre vacío (lo simulamos abajo)
('MRN-9003', 'Paciente Ciudad Typo', 'M', '1985-09-10');  -- Ciudad con typo

-- B) Actualizamos a NULL el nombre de MRN-9002
UPDATE patients
SET full_name = NULL
WHERE external_id = 'MRN-9002';

-- C) Creamos la admisión y el diagnóstico con error de dedo
INSERT INTO admissions (subject_id, admittime, dischtime, admission_type, hospital_expire_flag)
SELECT subject_id, '2101-10-01 08:00', '2101-10-02 12:00', 'Emergency', false
FROM patients
WHERE external_id = 'MRN-9003';

INSERT INTO diagnoses (hadm_id, diagnosis_text)
SELECT a.hadm_id, 'Guateeeemala referral note'
FROM admissions a
JOIN patients p ON p.subject_id = a.subject_id
WHERE p.external_id = 'MRN-9003'
ORDER BY a.hadm_id DESC
LIMIT 1;
"""

# Apply the dirty-data script in a single transaction.
# engine.begin() commits when the block exits without error and rolls back if
# an exception is raised, so no explicit commit() call is needed.
with engine.begin() as conn:
    conn.execute(text(veracity_sql))

# Reached only after the transaction above has been committed.
print("✅ Datos 'sucios' insertados correctamente (NULLs y Typos listos).")

✅ Datos 'sucios' insertados correctamente (NULLs y Typos listos).


In [11]:
# Query 8.2: count the patients whose date_of_birth is missing (NULL).
q_8_2 = """
SELECT COUNT(*) AS pacientes_sin_fecha_nacimiento
FROM patients
WHERE date_of_birth IS NULL;
"""
# The resulting one-row DataFrame is the cell's displayed output.
pd.read_sql_query(sql=q_8_2, con=engine)


Unnamed: 0,pacientes_sin_fecha_nacimiento
0,1


In [12]:
# Query 8.3: count the patients whose full_name is missing (NULL).
q_missing_name = """
SELECT COUNT(*) AS total_missing_name
FROM patients
WHERE full_name IS NULL;
"""
# The resulting one-row DataFrame is the cell's displayed output.
pd.read_sql_query(sql=q_missing_name, con=engine)

Unnamed: 0,total_missing_name
0,1


In [15]:
from sqlalchemy import text  # <--- Importante agregar esto

# Query 8.4: look for typos by listing diagnoses whose text contains "Guate".
q_8_4 = """
SELECT hadm_id, diagnosis_text
FROM diagnoses
WHERE diagnosis_text LIKE '%Guate%';
"""

# Wrap the query in text() so the literal % wildcards of the LIKE pattern are
# passed through as written (a raw string may have % interpreted as a
# parameter placeholder by the database driver).
typo_query = text(q_8_4)
pd.read_sql_query(sql=typo_query, con=engine)


Unnamed: 0,hadm_id,diagnosis_text
0,11,Guateeeemala referral note
