# Notebook Data Ingestion

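In this section, we retrieve two datasets (SECOP and MEN statistics) from "Datos Abiertos" (datos.gov.co), the Colombian open-data portal, and store them as Delta tables.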

#### Initial data load

In [0]:

# Step 1: download the data with requests and read it into pandas
import requests
import pandas as pd
from io import StringIO
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# First page (up to 100000 rows) of each dataset.
# SECOP is ordered by the Socrata system field :id because the paged download
# later in this notebook continues from offset 100000, and $offset paging is
# only well defined when the row order is stable across requests.
url_secop = "https://www.datos.gov.co/resource/rpmr-utcd.csv?$limit=100000&$order=:id"
url_men = "https://www.datos.gov.co/resource/nudc-7mev.csv?$limit=100000"

# Download the content; the timeout keeps a stalled connection from hanging the cell forever
response_secop = requests.get(url_secop, timeout=300)
response_men = requests.get(url_men, timeout=300)

# Fail fast on HTTP errors: otherwise the error body would be parsed as if it were CSV
response_secop.raise_for_status()
response_men.raise_for_status()

# Parse the CSV text into pandas through an in-memory buffer
df_secop_pd = pd.read_csv(StringIO(response_secop.text))
df_men_pd = pd.read_csv(StringIO(response_men.text))

# Convert pandas to Spark
df_secop = spark.createDataFrame(df_secop_pd)
df_men = spark.createDataFrame(df_men_pd)

# Show in Databricks
display(df_secop)
display(df_men)

In [0]:
# Sanity check: number of rows currently held in the SECOP Spark DataFrame
df_secop.count()


In [0]:
%sql
-- IF NOT EXISTS keeps this cell re-runnable: a plain CREATE CATALOG fails once `main` already exists
CREATE CATALOG IF NOT EXISTS main;

In [0]:
%sql
-- Create the schema that the tables written by this notebook live in; no-op when it already exists
CREATE SCHEMA IF NOT EXISTS main.diplomado_datos;

In [0]:
# Select `main` as the current catalog for the Spark session
spark.sql("USE CATALOG main")

In [0]:
# Persist each DataFrame as a Delta table, replacing whatever the table held before
tables_to_save = [
    (df_secop, "main.diplomado_datos.secop"),
    (df_men, "main.diplomado_datos.men_estadisticas"),
]
for frame, table_name in tables_to_save:
    delta_writer = frame.write.format("delta").mode("overwrite")
    delta_writer.saveAsTable(table_name)

print("¡Tablas guardadas exitosamente en el catálogo 'main', esquema 'diplomado_datos'!")

#### Complete records download

In [0]:
import requests

# Ask the API how many records the SECOP dataset currently has
count_url = "https://www.datos.gov.co/resource/rpmr-utcd.json?$select=count(*)"

# None means "could not be determined"; replaced by the fixed fallback below
total_records = None
try:
    # The timeout keeps a stalled connection from hanging the cell forever
    response = requests.get(count_url, timeout=60)
    if response.status_code == 200:
        total_records = int(response.json()[0]['count'])
except (requests.RequestException, ValueError, KeyError, IndexError, TypeError):
    # Network failure or an unexpected payload shape: use the fallback instead of crashing
    total_records = None

if total_records is not None:
    print(f"Total de registros detectados: {total_records}")
else:
    print("No se pudo obtener el total de registros. Usando valor por defecto.")
    total_records = 19446266  # Fixed value used as a fallback

In [0]:
import time
from pyspark.sql.functions import col, when

# Page size requested from the API on every iteration of the download loop
limit = 100000
# Start past the first 100000 rows: the initial-load cell requested them with $limit=100000
# and saved them to main.diplomado_datos.secop
offset = 100000

def safe_cast(df, target_schema):
    """Cast the columns of ``df`` to the types declared in ``target_schema``.

    For every field of ``target_schema`` the column of the same name in ``df``
    is replaced by a cast to the field's data type. Numeric targets are
    validated with a regular expression first, so the input columns are
    expected to hold strings; values that do not look like a number of the
    right kind become NULL.

    Integer targets (``int``, ``bigint``) accept an optionally signed run of
    digits. Fractional targets (``float``, ``double``) additionally accept a
    decimal point and an exponent. The previous single digits-only pattern
    turned every decimal or negative value into NULL.

    Args:
        df: Spark DataFrame containing a column for every field in
            ``target_schema``.
        target_schema: ``StructType`` whose field names and data types
            describe the desired result.

    Returns:
        A new DataFrame with the listed columns cast; ``df`` is not modified.
    """
    integral_pattern = r"^[+-]?[0-9]+$"
    fractional_pattern = r"^[+-]?([0-9]+(\.[0-9]*)?|\.[0-9]+)([eE][+-]?[0-9]+)?$"

    df_casted = df
    for field in target_schema.fields:
        name = field.name
        dtype = field.dataType
        type_name = dtype.simpleString()

        if type_name in ('int', 'bigint'):
            pattern = integral_pattern
        elif type_name in ('float', 'double'):
            pattern = fractional_pattern
        else:
            pattern = None

        if pattern is None:
            # Non-numeric targets (strings, booleans, timestamps, ...) are cast directly
            df_casted = df_casted.withColumn(name, col(name).cast(dtype))
        else:
            # Only cast text that matches the numeric pattern; everything else becomes NULL
            df_casted = df_casted.withColumn(
                name,
                when(col(name).rlike(pattern), col(name).cast(dtype)).otherwise(None)
            )
    return df_casted

from pyspark.sql.types import StringType, StructField, StructType

start_time = time.time()  # Start of the timed download

# Download the rest of the SECOP dataset page by page and append each page to the Delta table
while offset < total_records:
    print(f"Descargando registros desde {offset} hasta {offset + limit}...")

    # $order=:id pins the row order so consecutive $offset pages neither overlap nor skip rows
    url_secop = (
        "https://www.datos.gov.co/resource/rpmr-utcd.csv"
        f"?$limit={limit}&$offset={offset}&$order=:id"
    )
    # The timeout keeps a stalled connection from hanging the loop forever
    response_secop = requests.get(url_secop, timeout=300)
    # Stop on an HTTP error instead of parsing the error body as CSV and appending it to the table
    response_secop.raise_for_status()

    # Read every column as text; types are applied afterwards by safe_cast
    df_secop_pd = pd.read_csv(
        StringIO(response_secop.text),
        delimiter=',',
        header=0,
        dtype=str,
        low_memory=False
    )

    if df_secop_pd.empty:
        print("No hay más datos para descargar.")
        break

    # Missing cells must reach Spark as NULL. Converting the frame with astype(str)
    # would turn them into the literal text "nan", which then gets stored in string columns.
    df_secop_pd = df_secop_pd.astype(object).where(df_secop_pd.notna(), None)

    # An explicit all-string schema is required because columns that are entirely NULL
    # cannot be type-inferred
    string_schema = StructType(
        [StructField(str(column), StringType(), True) for column in df_secop_pd.columns]
    )
    df_secop_spark = spark.createDataFrame(df_secop_pd, schema=string_schema)

    # Re-read the table schema on every page: mergeSchema below may have extended it
    target_schema = spark.table("main.diplomado_datos.secop").schema

    df_secop_aligned = safe_cast(df_secop_spark, target_schema)

    df_secop_aligned.write.format("delta") \
        .mode("append") \
        .option("mergeSchema", "true") \
        .saveAsTable("main.diplomado_datos.secop")

    print(f"Datos del offset {offset} guardados.")
    offset += limit

end_time = time.time()  # End of the timed download

total_seconds = end_time - start_time
print(f"Carga completa de SECOP en {total_seconds:.2f} segundos.")

# Optional: human-readable hours, minutes, seconds
hours = int(total_seconds // 3600)
minutes = int((total_seconds % 3600) // 60)
seconds = int(total_seconds % 60)

print(f"Tiempo total: {hours}h {minutes}m {seconds}s")


# Limpieza de datos