# ETL: Limpieza de Datos de Viviendas en Barcelona

## Importamos las librer√≠as necesarias


In [1]:
import pandas as pd
import numpy as np
import re

print("‚úÖ Todas las librer√≠as est√°n importadas correctamente")


‚úÖ Todas las librer√≠as est√°n importadas correctamente


## EXTRACT: Cargar los datos


In [2]:
# Cargar el CSV
df_raw = pd.read_csv("../data/housing-barcelona.csv")

print("‚úÖ El dataframe se ha creado correctamente")
print(f"Shape: {df_raw.shape}")
print(f"\nColumnas: {list(df_raw.columns)}")
print(f"\nPrimeras filas:")
df_raw.head()


‚úÖ El dataframe se ha creado correctamente
Shape: (10000, 20)

Columnas: ['listing_id', 'operation', 'district', 'neighborhood', 'address', 'surface_m2', 'rooms', 'bathrooms', 'price_eur', 'price_per_m2', 'floor', 'elevator', 'balcony', 'furnished', 'condition', 'energy_certificate', 'has_parking', 'latitude', 'longitude', 'agency']

Primeras filas:


Unnamed: 0,listing_id,operation,district,neighborhood,address,surface_m2,rooms,bathrooms,price_eur,price_per_m2,floor,elevator,balcony,furnished,condition,energy_certificate,has_parking,latitude,longitude,agency
0,ID_0,alquiler,Unknown,Sagrada Fam√≠lia,C/ Arag√≥ 395,89 m¬≤,?,2,?,4240 ‚Ç¨/m2,1¬∫,Y,No,partially,average,?,No,,?,Particular
1,,VENDER,Eixampl,Les Corts,Passeig de Gr√†cia,171,,1,?,7920.91,√°tico,?,N,,?,D,No,,?,Housfy
2,ID_2,lease,Sant Mart√≠,El Clot,C/ Mallorca 316,?,2+,?,317642 ‚Ç¨,?,2¬∫,Y,?,?,average,D,?,41.3997,?,Engel & V√∂lkers
3,,alquiler,SANTS,Sagrada Fam√≠lia,Calle Falsa 123,,three,two,,5484 ‚Ç¨/m2,s√≥tano,N,S√≠,?,a reformar,A,Y,,2.0,Engel & V√∂lkers
4,5,buy,SANTS,Les Corts,C/ Gran Via 245,?,2+,?,,?,4¬∫,S√≠,N,S√≠,average,F,Y,?,2.0,Particular


## EXPLORACI√ìN INICIAL DEL DATASET

### An√°lisis descriptivo y detecci√≥n de problemas


In [3]:
# Informaci√≥n general del dataset
print("=== INFORMACI√ìN GENERAL DEL DATASET ===\n")
print(f"Dimensiones: {df_raw.shape[0]} filas √ó {df_raw.shape[1]} columnas")
print(f"\nTipos de datos originales:")
print(df_raw.dtypes)
print(f"\nMemoria utilizada: {df_raw.memory_usage(deep=True).sum() / 1024**2:.2f} MB")


=== INFORMACI√ìN GENERAL DEL DATASET ===

Dimensiones: 10000 filas √ó 20 columnas

Tipos de datos originales:
listing_id            object
operation             object
district              object
neighborhood          object
address               object
surface_m2            object
rooms                 object
bathrooms             object
price_eur             object
price_per_m2          object
floor                 object
elevator              object
balcony               object
furnished             object
condition             object
energy_certificate    object
has_parking           object
latitude              object
longitude             object
agency                object
dtype: object

Memoria utilizada: 11.96 MB


In [4]:
# An√°lisis de valores faltantes
print("=== AN√ÅLISIS DE VALORES FALTANTES ===\n")
missing_data = df_raw.isnull().sum()
missing_percent = (missing_data / len(df_raw)) * 100
missing_df = pd.DataFrame({
    'Valores Faltantes': missing_data,
    'Porcentaje (%)': missing_percent.round(2)
})
missing_df = missing_df[missing_df['Valores Faltantes'] > 0].sort_values('Valores Faltantes', ascending=False)
print(missing_df)
print(f"\nTotal de valores faltantes: {df_raw.isnull().sum().sum()}")
print(f"Porcentaje total de datos faltantes: {(df_raw.isnull().sum().sum() / (df_raw.shape[0] * df_raw.shape[1])) * 100:.2f}%")


=== AN√ÅLISIS DE VALORES FALTANTES ===

                    Valores Faltantes  Porcentaje (%)
listing_id                       3321           33.21
latitude                         2531           25.31
price_per_m2                     2508           25.08
longitude                        2467           24.67
rooms                            1993           19.93
surface_m2                       1978           19.78
furnished                        1974           19.74
bathrooms                        1939           19.39
price_eur                        1652           16.52
floor                            1277           12.77
condition                        1262           12.62
energy_certificate               1085           10.85

Total de valores faltantes: 23987
Porcentaje total de datos faltantes: 11.99%


In [5]:
# Estad√≠sticas descriptivas de columnas num√©ricas
print("=== ESTAD√çSTICAS DESCRIPTIVAS (Columnas Num√©ricas) ===\n")
numeric_cols = df_raw.select_dtypes(include=[np.number]).columns
if len(numeric_cols) > 0:
    print(df_raw[numeric_cols].describe())
else:
    print("No hay columnas num√©ricas detectadas (todas son strings)")


=== ESTAD√çSTICAS DESCRIPTIVAS (Columnas Num√©ricas) ===

No hay columnas num√©ricas detectadas (todas son strings)


In [6]:
# An√°lisis de valores √∫nicos y duplicados
print("=== AN√ÅLISIS DE VALORES √öNICOS Y DUPLICADOS ===\n")
print(f"Filas duplicadas: {df_raw.duplicated().sum()}")
print(f"\nValores √∫nicos por columna:")
for col in df_raw.columns:
    unique_count = df_raw[col].nunique()
    print(f"  {col}: {unique_count} valores √∫nicos")


=== AN√ÅLISIS DE VALORES √öNICOS Y DUPLICADOS ===

Filas duplicadas: 0

Valores √∫nicos por columna:
  listing_id: 6679 valores √∫nicos
  operation: 7 valores √∫nicos
  district: 14 valores √∫nicos
  neighborhood: 14 valores √∫nicos
  address: 1148 valores √∫nicos
  surface_m2: 404 valores √∫nicos
  rooms: 9 valores √∫nicos
  bathrooms: 6 valores √∫nicos
  price_eur: 4643 valores √∫nicos
  price_per_m2: 4426 valores √∫nicos
  floor: 7 valores √∫nicos
  elevator: 8 valores √∫nicos
  balcony: 8 valores √∫nicos
  furnished: 4 valores √∫nicos
  condition: 7 valores √∫nicos
  energy_certificate: 8 valores √∫nicos
  has_parking: 8 valores √∫nicos
  latitude: 2107 valores √∫nicos
  longitude: 2361 valores √∫nicos
  agency: 7 valores √∫nicos


In [7]:
# Detecci√≥n de valores problem√°ticos (espacios, caracteres especiales, etc.)
print("=== DETECCI√ìN DE VALORES PROBLEM√ÅTICOS ===\n")
valores_problematicos = ['?', 'N/A', 'n/a', 'NULL', 'null', 'unknown', 'Unknown', '']

for col in df_raw.select_dtypes(include=['object']).columns:
    problematic_count = df_raw[col].isin(valores_problematicos).sum()
    if problematic_count > 0:
        print(f"{col}: {problematic_count} valores problem√°ticos detectados")
        
# Verificar espacios en blanco al inicio/final
print("\nVerificando espacios en blanco:")
for col in df_raw.select_dtypes(include=['object']).columns:
    if df_raw[col].dtype == 'object':
        has_leading_trailing_spaces = df_raw[col].astype(str).str.strip().ne(df_raw[col].astype(str)).sum()
        if has_leading_trailing_spaces > 0:
            print(f"  {col}: {has_leading_trailing_spaces} valores con espacios al inicio/final")


=== DETECCI√ìN DE VALORES PROBLEM√ÅTICOS ===

operation: 1399 valores problem√°ticos detectados
district: 686 valores problem√°ticos detectados
neighborhood: 1425 valores problem√°ticos detectados
address: 1966 valores problem√°ticos detectados
surface_m2: 2005 valores problem√°ticos detectados
rooms: 2011 valores problem√°ticos detectados
bathrooms: 2053 valores problem√°ticos detectados
price_eur: 1695 valores problem√°ticos detectados
price_per_m2: 2533 valores problem√°ticos detectados
elevator: 2451 valores problem√°ticos detectados
balcony: 2579 valores problem√°ticos detectados
furnished: 1933 valores problem√°ticos detectados
condition: 1271 valores problem√°ticos detectados
energy_certificate: 1149 valores problem√°ticos detectados
has_parking: 2447 valores problem√°ticos detectados
latitude: 2524 valores problem√°ticos detectados
longitude: 2452 valores problem√°ticos detectados
agency: 1422 valores problem√°ticos detectados

Verificando espacios en blanco:
  neighborhood: 73

## TRANSFORM: Limpieza de Datos

### Paso 1: Crear copia para trabajar


In [8]:
# Crear copia del dataframe
df_clean = df_raw.copy()
print(f"‚úÖ Dataframe copiado. Filas: {len(df_clean)}")


‚úÖ Dataframe copiado. Filas: 10000


### Paso 2: Eliminar espacios (strip) en columnas de texto


In [9]:
# Aplicar strip() a todas las columnas de tipo object (string)
for col in df_clean.select_dtypes(include=['object']).columns:
    df_clean[col] = df_clean[col].astype(str).str.strip()
    # Reemplazar 'nan' string por NaN
    df_clean[col] = df_clean[col].replace('nan', np.nan)

print("‚úÖ Espacios eliminados de todas las columnas de texto")


‚úÖ Espacios eliminados de todas las columnas de texto


### Paso 3: Rellenar valores vac√≠os


In [10]:
# Reemplazar valores que representan "vac√≠o" por NaN
valores_vacios = ['', ' ', 'nan', 'None', 'N/A', 'n/a', 'NULL', 'null', '?', 'unknown']

for col in df_clean.columns:
    df_clean[col] = df_clean[col].replace(valores_vacios, np.nan)

print("‚úÖ Valores vac√≠os convertidos a NaN")
print(f"\nValores NaN por columna:")
print(df_clean.isnull().sum().sort_values(ascending=False))


‚úÖ Valores vac√≠os convertidos a NaN

Valores NaN por columna:
latitude              5055
price_per_m2          5041
longitude             4919
rooms                 4004
bathrooms             3992
surface_m2            3983
furnished             3907
address               3904
price_eur             3347
listing_id            3321
balcony               2579
condition             2533
elevator              2451
has_parking           2447
energy_certificate    2234
neighborhood          1425
agency                1422
operation             1399
floor                 1277
district                 0
dtype: int64


### Paso 4: Convertir tipos de datos adecuados


In [12]:
# Funci√≥n para extraer n√∫meros de strings
def extract_number(value):
    """Extrae el primer n√∫mero de un string"""
    if pd.isna(value):
        return np.nan
    value_str = str(value)
    # Buscar n√∫meros (enteros o decimales)
    numbers = re.findall(r'\d+\.?\d*', value_str)
    if numbers:
        return float(numbers[0])
    return np.nan

# Funci√≥n para convertir texto a n√∫mero (one, two, three, etc.)
def text_to_number(value):
    """Convierte texto a n√∫mero"""
    if pd.isna(value):
        return np.nan
    value_str = str(value).lower().strip()
    
    # Mapear texto a n√∫meros
    text_map = {
        'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5,
        'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10
    }
    
    if value_str in text_map:
        return text_map[value_str]
    
    # Si tiene formato "2+", extraer el n√∫mero
    if '+' in value_str:
        nums = re.findall(r'\d+', value_str)
        if nums:
            return int(nums[0])
    
    # Intentar extraer n√∫mero directamente
    return extract_number(value)

# Limpiar surface_m2 (puede venir como "89 m¬≤", "107m2", etc.)
if 'surface_m2' in df_clean.columns:
    df_clean['surface_m2'] = df_clean['surface_m2'].apply(extract_number)

# Limpiar rooms (puede venir como "three", "2+", etc.)
if 'rooms' in df_clean.columns:
    df_clean['rooms'] = df_clean['rooms'].apply(text_to_number)

# Limpiar bathrooms (similar a rooms)
if 'bathrooms' in df_clean.columns:
    df_clean['bathrooms'] = df_clean['bathrooms'].apply(text_to_number)

# Limpiar price_eur (puede venir como "317642 ‚Ç¨", "1.200‚Ç¨", etc.)
if 'price_eur' in df_clean.columns:
    def extract_price(value):
        if pd.isna(value):
            return np.nan
        value_str = str(value).replace('‚Ç¨', '').replace('.', '').replace(',', '.').strip()
        numbers = re.findall(r'\d+\.?\d*', value_str)
        if numbers:
            return float(numbers[0])
        return np.nan
    df_clean['price_eur'] = df_clean['price_eur'].apply(extract_price)

# Limpiar price_per_m2 (puede venir como "4240 ‚Ç¨/m2", etc.)
if 'price_per_m2' in df_clean.columns:
    def extract_price_m2(value):
        if pd.isna(value):
            return np.nan
        value_str = str(value).replace('‚Ç¨/m2', '').replace('‚Ç¨/m¬≤', '').replace('.', '').replace(',', '.').strip()
        numbers = re.findall(r'\d+\.?\d*', value_str)
        if numbers:
            return float(numbers[0])
        return np.nan
    df_clean['price_per_m2'] = df_clean['price_per_m2'].apply(extract_price_m2)

# Convertir coordenadas
if 'latitude' in df_clean.columns:
    df_clean['latitude'] = pd.to_numeric(df_clean['latitude'], errors='coerce')
if 'longitude' in df_clean.columns:
    df_clean['longitude'] = pd.to_numeric(df_clean['longitude'], errors='coerce')

print("‚úÖ Columnas num√©ricas limpiadas y convertidas")
print(f"\nTipos de datos num√©ricos:")
numeric_cols = ['surface_m2', 'rooms', 'bathrooms', 'price_eur', 'price_per_m2', 'latitude', 'longitude']
for col in numeric_cols:
    if col in df_clean.columns:
        print(f"  {col}: {df_clean[col].dtype}")


‚úÖ Columnas num√©ricas limpiadas y convertidas

Tipos de datos num√©ricos:
  surface_m2: float64
  rooms: float64
  bathrooms: float64
  price_eur: float64
  price_per_m2: float64
  latitude: float64
  longitude: float64


In [13]:
# Convertir columnas que deben ser enteros
int_cols = ['rooms', 'bathrooms']

for col in int_cols:
    if col in df_clean.columns:
        # Convertir a int, pero mantener NaN usando Int64
        df_clean[col] = df_clean[col].astype('Int64')  # Int64 permite NaN

print("‚úÖ Columnas convertidas a enteros")


‚úÖ Columnas convertidas a enteros


In [14]:
# Asegurar que las columnas de texto sean string
text_cols = ['listing_id', 'operation', 'district', 'neighborhood', 'address', 
             'floor', 'condition', 'energy_certificate', 'agency']

for col in text_cols:
    if col in df_clean.columns:
        df_clean[col] = df_clean[col].astype(str).replace('nan', np.nan)

print("‚úÖ Columnas de texto convertidas a string")


‚úÖ Columnas de texto convertidas a string


In [16]:
# Convertir columnas booleanas
boolean_cols = ['elevator', 'balcony', 'furnished', 'has_parking']

for col in boolean_cols:
    if col in df_clean.columns:
        # Normalizar valores booleanos
        df_clean[col] = df_clean[col].astype(str).str.lower().str.strip()
        df_clean[col] = df_clean[col].replace({
            'y': True, 'yes': True, 's√≠': True, 'si': True, 's': True, '1': True, 'true': True, 'yes': True,
            'n': False, 'no': False, '0': False, 'false': False
        })
        # Reemplazar valores que no coincidan con NaN
        mask = ~df_clean[col].isin([True, False])
        df_clean.loc[mask, col] = np.nan

print("‚úÖ Columnas booleanas convertidas")


‚úÖ Columnas booleanas convertidas


### Paso 5: Rellenar valores faltantes


In [18]:
# Rellenar valores faltantes
# Para columnas de texto (object): rellenar con "{nombre_columna} empty"
# Para columnas num√©ricas: rellenar con la media

print("=== RELLENANDO VALORES FALTANTES ===\n")

# Identificar columnas de texto (object) y num√©ricas
text_cols = df_clean.select_dtypes(include=['object']).columns
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns

# Rellenar columnas de texto
for col in text_cols:
    if df_clean[col].isnull().sum() > 0:
        fill_value = f"{col} empty"
        df_clean[col] = df_clean[col].fillna(fill_value)
        print(f"‚úÖ {col}: valores rellenados con '{fill_value}'")

# Rellenar columnas num√©ricas con la media
for col in numeric_cols:
    if df_clean[col].isnull().sum() > 0:
        mean_value = df_clean[col].mean()
        
        # Si es Int64, redondear la media a entero
        if df_clean[col].dtype == 'Int64':
            mean_value = int(round(mean_value))
            df_clean[col] = df_clean[col].fillna(mean_value)
            print(f"‚úÖ {col}: valores rellenados con media = {mean_value} (entero)")
        else:
            df_clean[col] = df_clean[col].fillna(mean_value)
            print(f"‚úÖ {col}: valores rellenados con media = {mean_value:.2f}")

print(f"\n‚úÖ Todos los valores faltantes han sido rellenados")
print(f"Valores NaN restantes: {df_clean.isnull().sum().sum()}")


=== RELLENANDO VALORES FALTANTES ===


‚úÖ Todos los valores faltantes han sido rellenados
Valores NaN restantes: 0


## CREACI√ìN DEL DATAWAREHOUSE

### Generaci√≥n de DDLs para las tablas del Datawarehouse


In [19]:
# Generar DDL para la tabla principal del Datawarehouse
def pandas_dtype_to_sql(dtype):
    """Convierte tipos de datos de pandas a tipos SQL"""
    if pd.api.types.is_integer_dtype(dtype):
        return "INTEGER"
    elif pd.api.types.is_float_dtype(dtype):
        return "REAL"
    elif pd.api.types.is_bool_dtype(dtype):
        return "BOOLEAN"
    elif pd.api.types.is_datetime64_any_dtype(dtype):
        return "TIMESTAMP"
    else:
        return "TEXT"

# Crear DDL para la tabla principal
ddl_statements = []
ddl_statements.append("-- ============================================")
ddl_statements.append("-- DDL para Datawarehouse - Housing Barcelona")
ddl_statements.append("-- Generado desde ETL con Pandas")
ddl_statements.append("-- ============================================\n")

ddl_statements.append("-- ============================================\n")
ddl_statements.append("-- TABLA DE HECHOS\n")
ddl_statements.append("-- ============================================\n")
ddl_statements.append("-- ============================================\n")
ddl_statements.append("-- TABLAS DIMENSIONALES\n")
ddl_statements.append("-- ============================================\n")
ddl_statements.append("-- Tabla dimensional: dim_district")
ddl_statements.append("CREATE TABLE IF NOT EXISTS dim_district (")
ddl_statements.append("    district_id INTEGER PRIMARY KEY AUTOINCREMENT,")
ddl_statements.append("    district_name TEXT UNIQUE NOT NULL")
ddl_statements.append(");\n")

ddl_statements.append("-- Tabla dimensional: dim_neighborhood")
ddl_statements.append("CREATE TABLE IF NOT EXISTS dim_neighborhood (")
ddl_statements.append("    neighborhood_id INTEGER PRIMARY KEY AUTOINCREMENT,")
ddl_statements.append("    neighborhood_name TEXT UNIQUE NOT NULL,")
ddl_statements.append("    district_id INTEGER,")
ddl_statements.append("    FOREIGN KEY (district_id) REFERENCES dim_district(district_id)")
ddl_statements.append(");\n")

ddl_statements.append("-- Tabla dimensional: dim_operation")
ddl_statements.append("CREATE TABLE IF NOT EXISTS dim_operation (")
ddl_statements.append("    operation_id INTEGER PRIMARY KEY AUTOINCREMENT,")
ddl_statements.append("    operation_type TEXT UNIQUE NOT NULL")
ddl_statements.append(");\n")

ddl_statements.append("-- Tabla dimensional: dim_agency")
ddl_statements.append("CREATE TABLE IF NOT EXISTS dim_agency (")
ddl_statements.append("    agency_id INTEGER PRIMARY KEY AUTOINCREMENT,")
ddl_statements.append("    agency_name TEXT UNIQUE NOT NULL")
ddl_statements.append(");\n")

ddl_statements.append("-- Tabla dimensional: dim_condition")
ddl_statements.append("CREATE TABLE IF NOT EXISTS dim_condition (")
ddl_statements.append("    condition_id INTEGER PRIMARY KEY AUTOINCREMENT,")
ddl_statements.append("    condition_type TEXT UNIQUE NOT NULL")
ddl_statements.append(");\n")

ddl_statements.append("-- Tabla dimensional: dim_energy_certificate")
ddl_statements.append("CREATE TABLE IF NOT EXISTS dim_energy_certificate (")
ddl_statements.append("    certificate_id INTEGER PRIMARY KEY AUTOINCREMENT,")
ddl_statements.append("    certificate_type TEXT UNIQUE NOT NULL")
ddl_statements.append(");\n")

# Crear √≠ndices para mejorar el rendimiento
ddl_statements.append("-- √çndices para mejorar el rendimiento de consultas")
ddl_statements.append("CREATE INDEX IF NOT EXISTS idx_fact_price ON fact_housing(price_eur);")
ddl_statements.append("CREATE INDEX IF NOT EXISTS idx_fact_surface ON fact_housing(surface_m2);")

# Unir todas las declaraciones
ddl_sql = "\n".join(ddl_statements)

# Guardar DDL en archivo
import os
import builtins
import stat

# Usar ruta relativa desde el notebook (que est√° en notebooks/)
ddl_file_path = "../warehouse/modelo_datawarehouse_pandas.sql"

# Obtener la ruta absoluta y normalizarla
ddl_file_abs = os.path.abspath(ddl_file_path)
warehouse_dir = os.path.dirname(ddl_file_abs)

# Verificar y crear el directorio warehouse de forma robusta
success = False
try:
    # Verificar si el directorio existe
    if not os.path.exists(warehouse_dir):
        print(f"üìÅ Creando directorio: {warehouse_dir}")
        # Crear el directorio y todos los padres necesarios
        os.makedirs(warehouse_dir, exist_ok=True, mode=0o777)
        print(f"‚úÖ Directorio creado exitosamente")
    else:
        print(f"üìÅ Directorio ya existe: {warehouse_dir}")
    
    # Verificar permisos de escritura
    if not os.access(warehouse_dir, os.W_OK):
        print(f"‚ö†Ô∏è No hay permisos de escritura en: {warehouse_dir}")
        # Intentar cambiar permisos
        try:
            os.chmod(warehouse_dir, 0o777)
            print(f"‚úÖ Permisos actualizados")
        except Exception as perm_error:
            print(f"‚ö†Ô∏è No se pudieron cambiar permisos: {perm_error}")
    
    # Intentar escribir el archivo
    print(f"üíæ Escribiendo archivo en: {ddl_file_abs}")
    with builtins.open(ddl_file_abs, 'w', encoding='utf-8') as f:
        f.write(ddl_sql)
    success = True
    print("‚úÖ DDL del Datawarehouse generado exitosamente")
    print(f"üìÑ Archivo guardado en: {ddl_file_abs}\n")
    
except (FileNotFoundError, PermissionError, OSError) as e:
    print(f"‚ùå Error al escribir en {ddl_file_abs}")
    print(f"   Error: {e}")
    print(f"   Directorio existe: {os.path.exists(warehouse_dir) if warehouse_dir else 'N/A'}")
    print(f"   Permisos de escritura: {os.access(warehouse_dir, os.W_OK) if os.path.exists(warehouse_dir) else 'N/A'}")
    print(f"\nüí° El directorio warehouse/ puede no estar montado correctamente en Docker.")
    print(f"   Verifica la configuraci√≥n del volumen en docker-compose.yml")

if success:
    print("=== DDL GENERADO ===")
    print(ddl_sql)
ddl_statements.append("-- Tabla de hechos: fact_housing")
ddl_statements.append("CREATE TABLE IF NOT EXISTS fact_housing (")
ddl_statements.append("    listing_id TEXT PRIMARY KEY,")
ddl_statements.append("    operation_id INTEGER,")
ddl_statements.append("    FOREIGN KEY (operation_id) REFERENCES dim_operation(operation_id),")
ddl_statements.append("    district_id INTEGER,")
ddl_statements.append("    FOREIGN KEY (district_id) REFERENCES dim_district(district_id),")
ddl_statements.append("    neighborhood_id INTEGER,")
ddl_statements.append("    FOREIGN KEY (neighborhood_id) REFERENCES dim_neighborhood(neighborhood_id),")
ddl_statements.append("    address TEXT,")
ddl_statements.append("    surface_m2 REAL,")
ddl_statements.append("    rooms INTEGER,")
ddl_statements.append("    bathrooms INTEGER,")
ddl_statements.append("    price_eur REAL,")
ddl_statements.append("    price_per_m2 REAL,")
ddl_statements.append("    floor TEXT,")
ddl_statements.append("    elevator BOOLEAN,")
ddl_statements.append("    balcony BOOLEAN,")
ddl_statements.append("    furnished BOOLEAN,")
ddl_statements.append("    condition_id INTEGER,")
ddl_statements.append("    FOREIGN KEY (condition_id) REFERENCES dim_condition(condition_id),")
ddl_statements.append("    energy_certificate_id INTEGER,")
ddl_statements.append("    FOREIGN KEY (energy_certificate_id) REFERENCES dim_energy_certificate(certificate_id),")
ddl_statements.append("    has_parking BOOLEAN,")
ddl_statements.append("    latitude REAL,")
ddl_statements.append("    longitude REAL,")
ddl_statements.append("    agency_id INTEGER,")
ddl_statements.append("    FOREIGN KEY (agency_id) REFERENCES dim_agency(agency_id)")
ddl_statements.append(");\n")



üìÅ Directorio ya existe: /app/warehouse
üíæ Escribiendo archivo en: /app/warehouse/modelo_datawarehouse_pandas.sql
‚úÖ DDL del Datawarehouse generado exitosamente
üìÑ Archivo guardado en: /app/warehouse/modelo_datawarehouse_pandas.sql

=== DDL GENERADO ===
-- DDL para Datawarehouse - Housing Barcelona
-- Generado desde ETL con Pandas


-- TABLA DE HECHOS



-- TABLAS DIMENSIONALES


-- Tabla dimensional: dim_district
CREATE TABLE IF NOT EXISTS dim_district (
    district_id INTEGER PRIMARY KEY AUTOINCREMENT,
    district_name TEXT UNIQUE NOT NULL
);

-- Tabla dimensional: dim_neighborhood
CREATE TABLE IF NOT EXISTS dim_neighborhood (
    neighborhood_id INTEGER PRIMARY KEY AUTOINCREMENT,
    neighborhood_name TEXT UNIQUE NOT NULL,
    district_id INTEGER,
    FOREIGN KEY (district_id) REFERENCES dim_district(district_id)
);

-- Tabla dimensional: dim_operation
CREATE TABLE IF NOT EXISTS dim_operation (
    operation_id INTEGER PRIMARY KEY AUTOINCREMENT,
    operation_type TEXT UNIQ

### Verificaci√≥n: Comparaci√≥n antes/despu√©s


In [21]:
# Mostrar ejemplos de limpieza
print("=== EJEMPLOS DE LIMPIEZA ===\n")
print("ANTES (RAW):")
print(df_raw[['surface_m2', 'rooms', 'bathrooms', 'price_eur', 'price_per_m2', 'elevator', 'district']].head(10))
print("\nDESPU√âS (CLEAN):")
print(df_clean[['surface_m2', 'rooms', 'bathrooms', 'price_eur', 'price_per_m2', 'elevator', 'district']].head(10))


=== EJEMPLOS DE LIMPIEZA ===

ANTES (RAW):
  surface_m2  rooms bathrooms price_eur price_per_m2 elevator  \
0      89 m¬≤      ?         2         ?    4240 ‚Ç¨/m2        Y   
1        171    NaN         1         ?      7920.91        ?   
2          ?     2+         ?  317642 ‚Ç¨            ?        Y   
3        NaN  three       two       NaN    5484 ‚Ç¨/m2        N   
4          ?     2+         ?       NaN            ?       S√≠   
5     127 m¬≤  three         2  491626 ‚Ç¨          NaN        Y   
6          ?     2+       two       NaN            ?        N   
7          ?  three         ?   1282371    4093 ‚Ç¨/m2        Y   
8     127 m¬≤     2+         3         ?       6630.1  unknown   
9        NaN     2+       NaN      4512      7856.74       no   

              district  
0              Unknown  
1              Eixampl  
2           Sant Mart√≠  
3                SANTS  
4                SANTS  
5         Ciutat Vella  
6       Sants-Montju√Øc  
7  Sarri√†-Sant Gervasi  

### Resumen de la transformaci√≥n


In [23]:
print("=== RESUMEN DE LA TRANSFORMACI√ìN ===\n")
print(f"Filas: {len(df_clean)}")
print(f"Columnas: {len(df_clean.columns)}")
print(f"\nTipos de datos:")
print(df_clean.dtypes)
print(f"\nValores faltantes totales: {df_clean.isnull().sum().sum()}")
print(f"\nPrimeras filas del dataset limpio:")
df_clean.head()


=== RESUMEN DE LA TRANSFORMACI√ìN ===

Filas: 10000
Columnas: 20

Tipos de datos:
listing_id             object
operation              object
district               object
neighborhood           object
address                object
surface_m2            float64
rooms                   Int64
bathrooms               Int64
price_eur             float64
price_per_m2          float64
floor                  object
elevator               object
balcony                object
furnished              object
condition              object
energy_certificate     object
has_parking            object
latitude              float64
longitude             float64
agency                 object
dtype: object

Valores faltantes totales: 0

Primeras filas del dataset limpio:


Unnamed: 0,listing_id,operation,district,neighborhood,address,surface_m2,rooms,bathrooms,price_eur,price_per_m2,floor,elevator,balcony,furnished,condition,energy_certificate,has_parking,latitude,longitude,agency
0,ID_0,alquiler,Unknown,Sagrada Fam√≠lia,C/ Arag√≥ 395,89.0,3,2,2633490.0,42400.0,1¬∫,True,False,furnished empty,average,energy_certificate empty,False,41.192377,2.082139,Particular
1,listing_id empty,VENDER,Eixampl,Les Corts,Passeig de Gr√†cia,171.0,3,1,2633490.0,7920910.0,√°tico,elevator empty,False,furnished empty,condition empty,D,False,41.192377,2.082139,Housfy
2,ID_2,lease,Sant Mart√≠,El Clot,C/ Mallorca 316,106.581187,2,2,3176420.0,2819631.0,2¬∫,True,balcony empty,furnished empty,average,D,has_parking empty,41.3997,2.082139,Engel & V√∂lkers
3,listing_id empty,alquiler,SANTS,Sagrada Fam√≠lia,Calle Falsa 123,106.581187,3,2,2633490.0,54840.0,s√≥tano,False,True,furnished empty,a reformar,A,True,41.192377,2.0,Engel & V√∂lkers
4,5,buy,SANTS,Les Corts,C/ Gran Via 245,106.581187,2,2,2633490.0,2819631.0,4¬∫,True,False,True,average,F,True,41.192377,2.0,Particular


## LOAD: Guardar datos limpios


In [24]:
# Guardar el dataframe limpio como CSV
df_clean.to_csv("../data/housing-barcelona-clean.csv", index=False)

print("‚úÖ Datos limpios guardados en: ../data/housing-barcelona-clean.csv")
print(f"\nArchivo guardado exitosamente con {len(df_clean)} filas y {len(df_clean.columns)} columnas")


‚úÖ Datos limpios guardados en: ../data/housing-barcelona-clean.csv

Archivo guardado exitosamente con 10000 filas y 20 columnas


In [25]:
# CARGAR EN SQLITE: Crear Datawarehouse en SQLite
import sqlite3
from sqlalchemy import create_engine

print("=== CARGA EN SQLITE: CREANDO DATAWAREHOUSE ===\n")

# Ruta de la base de datos SQLite
db_path = "../warehouse/warehouse_pandas.db"

# Crear conexi√≥n usando SQLAlchemy (requerido para to_sql)
engine = create_engine(f'sqlite:///{db_path}', echo=False)

# Preparar datos para tablas dimensionales
print("üìä Preparando datos para tablas dimensionales...")

# Tabla dim_district
df_dim_district = pd.DataFrame({
    'district_name': df_clean['district'].unique()
}).dropna()
df_dim_district = df_dim_district[df_dim_district['district_name'] != 'district empty']
print(f"‚úÖ dim_district: {len(df_dim_district)} distritos √∫nicos")

# Tabla dim_neighborhood
df_dim_neighborhood = df_clean[['neighborhood', 'district']].drop_duplicates()
df_dim_neighborhood = df_dim_neighborhood[
    (df_dim_neighborhood['neighborhood'] != 'neighborhood empty') &
    (df_dim_neighborhood['district'] != 'district empty')
].rename(columns={'neighborhood': 'neighborhood_name', 'district': 'district_name'})
print(f"‚úÖ dim_neighborhood: {len(df_dim_neighborhood)} barrios √∫nicos")

# Tabla dim_operation
df_dim_operation = pd.DataFrame({
    'operation_type': df_clean['operation'].unique()
}).dropna()
df_dim_operation = df_dim_operation[df_dim_operation['operation_type'] != 'operation empty']
print(f"‚úÖ dim_operation: {len(df_dim_operation)} tipos de operaci√≥n √∫nicos")

# Tabla dim_agency
df_dim_agency = pd.DataFrame({
    'agency_name': df_clean['agency'].unique()
}).dropna()
df_dim_agency = df_dim_agency[df_dim_agency['agency_name'] != 'agency empty']
print(f"‚úÖ dim_agency: {len(df_dim_agency)} agencias √∫nicas")

# Tabla dim_condition
df_dim_condition = pd.DataFrame({
    'condition_type': df_clean['condition'].unique()
}).dropna()
df_dim_condition = df_dim_condition[df_dim_condition['condition_type'] != 'condition empty']
print(f"‚úÖ dim_condition: {len(df_dim_condition)} condiciones √∫nicas")

# Tabla dim_energy_certificate
df_dim_energy_certificate = pd.DataFrame({
    'certificate_type': df_clean['energy_certificate'].unique()
}).dropna()
df_dim_energy_certificate = df_dim_energy_certificate[
    df_dim_energy_certificate['certificate_type'] != 'energy_certificate empty'
]
print(f"‚úÖ dim_energy_certificate: {len(df_dim_energy_certificate)} certificados √∫nicos")

# Guardar tablas dimensionales en SQLite
print("\nüíæ Guardando tablas dimensionales en SQLite...")
df_dim_district.to_sql('dim_district', engine, if_exists='replace', index=False)
df_dim_neighborhood.to_sql('dim_neighborhood', engine, if_exists='replace', index=False)
df_dim_operation.to_sql('dim_operation', engine, if_exists='replace', index=False)
df_dim_agency.to_sql('dim_agency', engine, if_exists='replace', index=False)
df_dim_condition.to_sql('dim_condition', engine, if_exists='replace', index=False)
df_dim_energy_certificate.to_sql('dim_energy_certificate', engine, if_exists='replace', index=False)
print("‚úÖ Tablas dimensionales guardadas")

# Preparar tabla de hechos (fact_housing)
print("\nüìä Preparando tabla de hechos...")
df_fact_housing = df_clean.copy()
print(f"‚úÖ fact_housing: {len(df_fact_housing)} filas preparadas")

# Guardar tabla de hechos en SQLite
print("\nüíæ Guardando tabla de hechos en SQLite...")
df_fact_housing.to_sql('fact_housing', engine, if_exists='replace', index=False)
print("‚úÖ Tabla de hechos guardada")

print(f"\n‚úÖ Datawarehouse creado exitosamente en: {db_path}")
print(f"   ‚Ä¢ 1 tabla de hechos: fact_housing")
print(f"   ‚Ä¢ 6 tablas de dimensiones: dim_district, dim_neighborhood, dim_operation, dim_agency, dim_condition, dim_energy_certificate")


=== CARGA EN SQLITE: CREANDO DATAWAREHOUSE ===

üìä Preparando datos para tablas dimensionales...
‚úÖ dim_district: 14 distritos √∫nicos
‚úÖ dim_neighborhood: 168 barrios √∫nicos
‚úÖ dim_operation: 6 tipos de operaci√≥n √∫nicos
‚úÖ dim_agency: 6 agencias √∫nicas
‚úÖ dim_condition: 6 condiciones √∫nicas
‚úÖ dim_energy_certificate: 7 certificados √∫nicos

üíæ Guardando tablas dimensionales en SQLite...
‚úÖ Tablas dimensionales guardadas

üìä Preparando tabla de hechos...
‚úÖ fact_housing: 10000 filas preparadas

üíæ Guardando tabla de hechos en SQLite...
‚úÖ Tabla de hechos guardada

‚úÖ Datawarehouse creado exitosamente en: ../warehouse/warehouse_pandas.db
   ‚Ä¢ 1 tabla de hechos: fact_housing
   ‚Ä¢ 6 tablas de dimensiones: dim_district, dim_neighborhood, dim_operation, dim_agency, dim_condition, dim_energy_certificate
