# Database Connection and Initial Data Load  

This notebook is part of the data analysis workflow. Its objective is to connect to the database and extract raw data for further processing.

To begin, the necessary libraries are imported to ensure the correct functioning of the code.

In [1]:
import os
import json
import pandas as pd
from sqlalchemy import create_engine, text
from sqlalchemy.exc import OperationalError

## Connection Configuration  
The credentials are read and extracted to establish a connection to PostgreSQL. Then, the connection to the database stored in PostgreSQL is established.

In [2]:
try:
    os.chdir("../../Leukemia-Cancer-Risk-ETL")
except FileNotFoundError:
    print("""
        FileNotFoundError - The directory may not exist or you are not located in the specified path.
        """)
os.chdir("..")
print(os.getcwd())

/home/ubuntu/Escritorio


In [3]:
with open("Leukemia-Cancer-Risk-ETL/credentials.json", "r", encoding = "utf-8") as f:
    credentials = json.load(f)  

db_host = credentials["db_host"]
db_name = credentials["db_name"]
db_user = credentials["db_user"]
db_password = credentials["db_password"]

default_engine = create_engine(f"postgresql://{db_user}:{db_password}@{db_host}:5432/postgres", isolation_level = "AUTOCOMMIT")

In [4]:
try:
    with default_engine.connect() as connection:
        result = connection.execute(text("SELECT 1 FROM pg_database WHERE datname=:dbname"), {"dbname": db_name})
        exists = result.fetchone()
        if not exists:
            connection.execute(text(f"CREATE DATABASE {db_name}"))
            print("Database created")
        else:
            print("Database already exists")
except OperationalError as e:
    print(f"PostgreSQL connection failed: {e}")

Database already exists


In [5]:
with open("Leukemia-Cancer-Risk-ETL/credentials.json", "r", encoding = "utf-8") as f:
    credentials = json.load(f)  

db_host = credentials["db_host"]
db_name = credentials["db_name"]
db_user = credentials["db_user"]
db_password = credentials["db_password"]

engine = create_engine(f"postgresql+psycopg2://{db_user}:{db_password}@{db_host}:5432/{db_name}")

In [6]:
leukemia_raw_data = pd.read_csv("Leukemia-Cancer-Risk-ETL/data/biased_leukemia_dataset.csv", sep=',', encoding='utf-8')

try:
    with engine.connect() as connection:
        leukemia_raw_data.to_sql('leukemia_raw_data', con = connection , if_exists='replace', index=False)
        print("Datos insertados correctamente en la tabla 'leukemia_raw_data'.")
except Exception as e:
    print(f"Error al insertar datos: {e}")


Error al insertar datos: 'Connection' object has no attribute 'cursor'


  leukemia_raw_data.to_sql('leukemia_raw_data', con = connection , if_exists='replace', index=False)
