In [1]:
import urllib.request
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import polars as pl

ruta = "Prestaciones de salud asociadas a los asegurados con Diabetes Mellitus.csv"

schema_fix = {
    "CODPREST": pl.Utf8,
    "IDCIE10": pl.Utf8,
}

df = pl.read_csv(
    ruta,
    separator=",",
    encoding="latin1",
    schema_overrides=schema_fix,
    null_values=["", "S.I.", "S0001"],
    truncate_ragged_lines=True,  # This will truncate extra fields in long rows
    ignore_errors=True,          # This will skip rows that still can't be parsed
    try_parse_dates=True         # Helps with date parsing if needed
)



In [19]:
df.head(7)

ULTIMO_MES_CONSUMOS,CODIGO_ANONIMIZADO,FECHA_NACIMIENTO,EDAD,SEXO,TIPO_DIABETES,FECHA_FALLECIMIENTO,UBIGEO,DEPARTAMENTO,PROVINCIA,DISTRITO,NIVEL_EESS,CODIGO_SERV_PRESTACIONAL,SERVICIO_PRESTACIONAL,DIAS_HOSP,ID_REGISTRO_REL,FECHA_ATENCION,TIPO_PERSONAL_SALUD,FECATE_POST_FECFED,CODDIA,C10_NOMBRE,TIPO_DIAGNOSTICO,TIPO_CONSUMO,CODIGO_CONSUMO,NOMBRE_CONSUMO,PRESENTACION_MEDICAMENTO,FORMA_FARMACEUTICA,CANTIDAD_ENTREGADA,VALOR_NETO
i64,str,i64,i64,i8,str,i64,i64,str,str,str,i64,i64,str,i64,str,i64,str,i8,str,str,str,i8,i64,str,str,str,i64,f64
202309,"""18C9F497A5208F212E68D50C112AFD…",19830209,38,0,"""DIABETES MELLITUS TIPO 2""",,150108,"""LIMA""","""LIMA""","""CHORRILLOS""",1,9,"""ATENCIÃN PRENATAL""",0.0,"""DM0042047563""",20220106,"""ENFERMERO(A)""",0,"""Z348""","""SUPERVISION DE OTROS EMBARAZOS…","""DEFINITIVO """,1,20635,"""CALCIO CARBONATO""",,"""TAB""",60,0.0
202309,"""8337E3C9F5F90AB9ADADD63D319C47…",19760507,47,0,"""DIABETES MELLITUS TIPO 2""",,150106,"""LIMA""","""LIMA""","""CARABAYLLO""",1,56,"""CONSULTA EXTERNA""",,"""DM0043980595""",20230620,"""MEDICO""",1,"""F200""","""ESQUIZOFRENIA PARANOIDE""","""REPETIDO """,1,3874,"""HALOPERIDOL (COMO DECANOATO)""","""1 mL""","""INY""",2,0.0
202309,"""10991DD5539E8E7DA5F1543EE71B1D…",19781024,43,1,"""DIABETES MELLITUS NO ESPECIFIC…",20220612.0,150135,"""LIMA""","""LIMA""","""SAN MARTÃN DE PORRES""",3,65,"""INTERNAMIENTO EN EESS SIN INTE…",7.0,"""DM0043974259""",20220606,"""MEDICO""",1,"""C859""","""LINFOMA NO HODGKIN NO ESPECIF…","""REPETIDO """,1,35040,"""OMEPRAZOL""",,"""TAB_LM""",15,0.945
202309,"""9ED115C3E24F0B26F8B2BAD3849F2D…",19550720,67,0,"""DIABETES MELLITUS TIPO 2""",,150110,"""LIMA""","""LIMA""","""COMAS""",1,56,"""CONSULTA EXTERNA""",0.0,"""DM0044094856""",20220801,"""MEDICO""",1,"""J00X""","""RINOFARINGITIS AGUDA [RESFRIAD…","""DEFINITIVO """,1,4982,"""NAPROXENO""",,"""TAB""",8,0.0
202309,"""8F5CDAF371AFCFB9B93BD52BFE5682…",19540906,67,0,"""DIABETES MELLITUS TIPO 2""",,110301,"""ICA""","""NAZCA""","""NASCA""",2,62,"""ATENCIÃN POR EMERGENCIA""",,"""DM0044097912""",20220825,"""MEDICO""",1,"""J208""","""BRONQUITIS AGUDA DEBIDA A OTRO…","""DEFINITIVO """,1,4982,"""NAPROXENO""",,"""TAB""",10,0.0
202309,"""F0362A16D552BA88EC574825F40185…",19790224,43,0,"""DIABETES MELLITUS TIPO 2""",,60101,"""CAJAMARCA""","""CAJAMARCA""","""CAJAMARCA""",2,65,"""INTERNAMIENTO EN EESS SIN INTE…",15.0,"""DM0042244574""",20221117,"""MEDICO""",1,"""I10X""","""HIPERTENSION ESENCIAL (PRIMARI…","""DEFINITIVO """,1,4695,"""METFORMINA CLORHIDRATO""",,"""TAB""",102,0.0
202309,"""36CAB77EADAE0D4F1ECFD393D02D52…",19970421,24,0,"""DIABETES MELLITUS TIPO 2""",,200701,"""PIURA""","""TALARA""","""PARIÃAS""",1,62,"""ATENCIÃN POR EMERGENCIA""",,"""DM0033447540""",20220324,"""MEDICO""",0,"""K805""","""CALCULO DE CONDUCTO BILIAR SIN…","""DEFINITIVO """,1,4677,"""METAMIZOL SODICO""","""2 mL""","""INY""",1,0.0


In [21]:
df.shape

(45666822, 29)

In [5]:
df = df.drop("FECHA_CORTE")


In [6]:
df = df.filter(
    pl.col("FECHA_ATENCION").cast(pl.Utf8).str.starts_with("2019") |
    pl.col("FECHA_ATENCION").cast(pl.Utf8).str.starts_with("2020") |
    pl.col("FECHA_ATENCION").cast(pl.Utf8).str.starts_with("2021") |
    pl.col("FECHA_ATENCION").cast(pl.Utf8).str.starts_with("2022") |
    pl.col("FECHA_ATENCION").cast(pl.Utf8).str.starts_with("2023") |
    pl.col("FECHA_ATENCION").cast(pl.Utf8).str.starts_with("2024") |
    pl.col("FECHA_ATENCION").cast(pl.Utf8).str.starts_with("2025")
)


In [None]:
tiene_2020 = df.filter(
    pl.col("FECHA_ATENCION").cast(pl.Utf8).str.starts_with("2022")
).height > 0

print("¿Hay fechas que empiezan con 2020?", tiene_2022)


¿Hay fechas que empiezan con 2020? True


In [8]:
import polars as pl

# 1) Añade una columna con el año (primeros 4 caracteres).
df_anios = df.with_columns(
    pl.col("FECHA_ATENCION")
      .cast(pl.Utf8)          # asegúrate de que sea texto
      .str.slice(0, 4)        # extrae los 4 primeros caracteres
      .alias("anio")
)

# 2) Cuenta cuántos registros hay para 2022‑2025
cuentas_22_25 = (
    df_anios
    .filter(pl.col("anio").is_in(["2022", "2023", "2024", "2025"]))
    .group_by("anio")
    .count()                  # devuelve columnas: anio | count
    .sort("anio")             # orden ascendente
)

print(cuentas_22_25)

# 3) ¿Quedan fechas que empiecen con 2020?
hay_2020 = (
    df_anios
    .filter(pl.col("anio") == "2020")
    .height > 0
)

print("¿Hay fechas que empiezan con 2020?", hay_2020)


  .count()                  # devuelve columnas: anio | count


shape: (2, 2)
┌──────┬──────────┐
│ anio ┆ count    │
│ ---  ┆ ---      │
│ str  ┆ u32      │
╞══════╪══════════╡
│ 2022 ┆ 24017621 │
│ 2023 ┆ 21649201 │
└──────┴──────────┘
¿Hay fechas que empiezan con 2020? False


In [9]:
# Tamaño en bytes
size_bytes = df.estimated_size()

# Convertir a megabytes (MB) y gigabytes (GB)
size_mb = size_bytes / (1024 ** 2)
size_gb = size_bytes / (1024 ** 3)

print(f"Tamaño aproximado del DataFrame:")
print(f"- {size_mb:.2f} MB")
print(f"- {size_gb:.4f} GB")


Tamaño aproximado del DataFrame:
- 17290.49 MB
- 16.8852 GB


In [10]:
df = df.with_columns(
    pl.col("SEXO")
    .replace({"FEMENINO": 0, "MASCULINO": 1})
    .cast(pl.Int8)
    .alias("SEXO")
)


FEMENINO 0
MASCULINO 1

In [11]:
# Tamaño en bytes
size_bytes = df.estimated_size()

# Convertir a megabytes (MB) y gigabytes (GB)
size_mb = size_bytes / (1024 ** 2)
size_gb = size_bytes / (1024 ** 3)

print(f"Tamaño aproximado del DataFrame:")
print(f"- {size_mb:.2f} MB")
print(f"- {size_gb:.4f} GB")


Tamaño aproximado del DataFrame:
- 16972.13 MB
- 16.5743 GB


In [12]:
df['FECATE_POST_FECFED'].value_counts()

FECATE_POST_FECFED,count
str,u32
"""SI""",39020624
"""NO""",6646198


In [13]:
import polars as pl

df = df.with_columns(
    pl.col("FECATE_POST_FECFED")
      .str.to_uppercase()            # normaliza mayúsculas/minúsculas
      .replace({"NO": 0, "SI": 1})   # ahora sí coinciden todas
      .cast(pl.Int8)                 # ya solo hay enteros y nulos
      .alias("FECATE_POST_FECFED")
)


In [14]:
# Tamaño en bytes
size_bytes = df.estimated_size()

# Convertir a megabytes (MB) y gigabytes (GB)
size_mb = size_bytes / (1024 ** 2)
size_gb = size_bytes / (1024 ** 3)

print(f"Tamaño aproximado del DataFrame:")
print(f"- {size_mb:.2f} MB")
print(f"- {size_gb:.4f} GB")


Tamaño aproximado del DataFrame:
- 16928.58 MB
- 16.5318 GB


In [15]:
df['TIPO_CONSUMO'].value_counts()

TIPO_CONSUMO,count
str,u32
"""MEDICAMENTO""",14456467
"""REGISTRO SIN CONSUMO""",2923211
"""PROCEDIMIENTO""",21844927
"""INSUMO""",6442217


In [16]:
df = df.with_columns(
    pl.col("TIPO_CONSUMO")
      .replace({
          "INSUMO": 0,
          "MEDICAMENTO": 1,
          "PROCEDIMIENTO": 2,
          "REGISTRO SIN CONSUMO": 3
      })
      .cast(pl.Int8)
      .alias("TIPO_CONSUMO")
)


In [43]:
# Tamaño en bytes
size_bytes = df.estimated_size()

# Convertir a megabytes (MB) y gigabytes (GB)
size_mb = size_bytes / (1024 ** 2)
size_gb = size_bytes / (1024 ** 3)

print(f"Tamaño aproximado del DataFrame:")
print(f"- {size_mb:.2f} MB")
print(f"- {size_gb:.4f} GB")


Tamaño aproximado del DataFrame:
- 16433.11 MB
- 16.0480 GB


In [18]:

df['TIPO_DIAGNOSTICO'].value_counts()

TIPO_DIAGNOSTICO,count
str,u32
"""DEFINITIVO """,36005251
"""PRESUNTIVO """,3249876
"""REPETIDO """,6411695


## Data Quality and cleaning


In [22]:
df.head()

ULTIMO_MES_CONSUMOS,CODIGO_ANONIMIZADO,FECHA_NACIMIENTO,EDAD,SEXO,TIPO_DIABETES,FECHA_FALLECIMIENTO,UBIGEO,DEPARTAMENTO,PROVINCIA,DISTRITO,NIVEL_EESS,CODIGO_SERV_PRESTACIONAL,SERVICIO_PRESTACIONAL,DIAS_HOSP,ID_REGISTRO_REL,FECHA_ATENCION,TIPO_PERSONAL_SALUD,FECATE_POST_FECFED,CODDIA,C10_NOMBRE,TIPO_DIAGNOSTICO,TIPO_CONSUMO,CODIGO_CONSUMO,NOMBRE_CONSUMO,PRESENTACION_MEDICAMENTO,FORMA_FARMACEUTICA,CANTIDAD_ENTREGADA,VALOR_NETO
i64,str,i64,i64,i8,str,i64,i64,str,str,str,i64,i64,str,i64,str,i64,str,i8,str,str,str,i8,i64,str,str,str,i64,f64
202309,"""18C9F497A5208F212E68D50C112AFD…",19830209,38,0,"""DIABETES MELLITUS TIPO 2""",,150108,"""LIMA""","""LIMA""","""CHORRILLOS""",1,9,"""ATENCIÃN PRENATAL""",0.0,"""DM0042047563""",20220106,"""ENFERMERO(A)""",0,"""Z348""","""SUPERVISION DE OTROS EMBARAZOS…","""DEFINITIVO """,1,20635,"""CALCIO CARBONATO""",,"""TAB""",60,0.0
202309,"""8337E3C9F5F90AB9ADADD63D319C47…",19760507,47,0,"""DIABETES MELLITUS TIPO 2""",,150106,"""LIMA""","""LIMA""","""CARABAYLLO""",1,56,"""CONSULTA EXTERNA""",,"""DM0043980595""",20230620,"""MEDICO""",1,"""F200""","""ESQUIZOFRENIA PARANOIDE""","""REPETIDO """,1,3874,"""HALOPERIDOL (COMO DECANOATO)""","""1 mL""","""INY""",2,0.0
202309,"""10991DD5539E8E7DA5F1543EE71B1D…",19781024,43,1,"""DIABETES MELLITUS NO ESPECIFIC…",20220612.0,150135,"""LIMA""","""LIMA""","""SAN MARTÃN DE PORRES""",3,65,"""INTERNAMIENTO EN EESS SIN INTE…",7.0,"""DM0043974259""",20220606,"""MEDICO""",1,"""C859""","""LINFOMA NO HODGKIN NO ESPECIF…","""REPETIDO """,1,35040,"""OMEPRAZOL""",,"""TAB_LM""",15,0.945
202309,"""9ED115C3E24F0B26F8B2BAD3849F2D…",19550720,67,0,"""DIABETES MELLITUS TIPO 2""",,150110,"""LIMA""","""LIMA""","""COMAS""",1,56,"""CONSULTA EXTERNA""",0.0,"""DM0044094856""",20220801,"""MEDICO""",1,"""J00X""","""RINOFARINGITIS AGUDA [RESFRIAD…","""DEFINITIVO """,1,4982,"""NAPROXENO""",,"""TAB""",8,0.0
202309,"""8F5CDAF371AFCFB9B93BD52BFE5682…",19540906,67,0,"""DIABETES MELLITUS TIPO 2""",,110301,"""ICA""","""NAZCA""","""NASCA""",2,62,"""ATENCIÃN POR EMERGENCIA""",,"""DM0044097912""",20220825,"""MEDICO""",1,"""J208""","""BRONQUITIS AGUDA DEBIDA A OTRO…","""DEFINITIVO """,1,4982,"""NAPROXENO""",,"""TAB""",10,0.0


In [27]:
df.describe()


statistic,ULTIMO_MES_CONSUMOS,CODIGO_ANONIMIZADO,FECHA_NACIMIENTO,EDAD,SEXO,TIPO_DIABETES,FECHA_FALLECIMIENTO,UBIGEO,DEPARTAMENTO,PROVINCIA,DISTRITO,NIVEL_EESS,CODIGO_SERV_PRESTACIONAL,SERVICIO_PRESTACIONAL,DIAS_HOSP,ID_REGISTRO_REL,FECHA_ATENCION,TIPO_PERSONAL_SALUD,FECATE_POST_FECFED,CODDIA,C10_NOMBRE,TIPO_DIAGNOSTICO,TIPO_CONSUMO,CODIGO_CONSUMO,NOMBRE_CONSUMO,PRESENTACION_MEDICAMENTO,FORMA_FARMACEUTICA,CANTIDAD_ENTREGADA,VALOR_NETO
str,f64,str,f64,f64,f64,str,f64,f64,str,str,str,f64,f64,str,f64,str,f64,str,f64,str,str,str,f64,f64,str,str,str,f64,f64
"""count""",45666822.0,"""45666822""",45666822.0,45666822.0,45666822.0,"""45666822""",4234851.0,45666822.0,"""45666822""","""45666822""","""45666822""",45666822.0,45666821.0,"""45666822""",14383863.0,"""45666822""",45666822.0,"""45666803""",45666822.0,"""45666822""","""45666822""","""45666822""",45666822.0,41373914.0,"""42743611""","""5341166""","""20898681""",42743611.0,42743611.0
"""null_count""",0.0,"""0""",0.0,0.0,0.0,"""0""",41431971.0,0.0,"""0""","""0""","""0""",0.0,1.0,"""0""",31282959.0,"""0""",0.0,"""19""",0.0,"""0""","""0""","""0""",0.0,4292908.0,"""2923211""","""40325656""","""24768141""",2923211.0,2923211.0
"""mean""",202309.0,,19647000.0,57.292522,0.309926,,20227000.0,135858.059037,,,,1.89906,111.569133,,7.690697,,20225000.0,,0.854463,,,,1.465308,47777.640504,,,,49.752353,12.193144
"""std""",0.0,,164429.919976,16.446489,0.462463,,5368.494774,52177.174301,,,,0.837164,204.260502,,16.349572,,4921.42565,,0.352641,,,,0.811763,52957.653455,,,,5083.235038,176.448667
"""min""",202309.0,"""000015CA7003D37272A17DE250F6AF…",19040512.0,0.0,0.0,"""DIABETES MELLITUS ASOCIADA CON…",20220101.0,10101.0,"""AMAZONAS""","""ABANCAY""","""ABANCAY""",0.0,1.0,"""APOYO AL DIAGNÃSTICO""",0.0,"""DM0023457864""",20220101.0,"""AUXILIAR DE ENFERMERIA""",0.0,"""A000""","""(OSTEO)ARTROSIS EROSIVA""","""DEFINITIVO """,0.0,1.0,"""2 3 5-TRIFENIL-2H-TETRAZOLIUM …",""" ""","""AER""",1.0,0.0
"""25%""",202309.0,,19530609.0,47.0,0.0,,20220920.0,110501.0,,,,1.0,56.0,,0.0,,20220630.0,,1.0,,,,1.0,5103.0,,,,1.0,0.0
"""50%""",202309.0,,19630513.0,59.0,0.0,,20230324.0,150110.0,,,,2.0,56.0,,2.0,,20221213.0,,1.0,,,,2.0,35080.0,,,,1.0,0.65
"""75%""",202309.0,,19741203.0,69.0,1.0,,20230807.0,150142.0,,,,3.0,65.0,,9.0,,20230511.0,,1.0,,,,2.0,85018.0,,,,5.0,6.19
"""max""",202309.0,"""FFFFFBA01CCD1C2A4085CCA471BCF6…",20230926.0,118.0,1.0,"""SIN IDENTIFICAR""",20240226.0,250401.0,"""ÃNCASH""","""ZARUMILLA""","""ÃAHUIMPUQUIO""",3.0,911.0,"""TRATAMIENTO PROFILACTICO A NIÃ…",742.0,"""DM0090683210""",20230930.0,"""TRABAJADORA SOCIAL""",1.0,"""Z999""","""YERSINIOSIS EXTRAINTESTINAL""","""REPETIDO """,3.0,9712401.0,"""Ãcido Ãºrico; otra fuente""","""m3""","""UNI""",7377000.0,334461.6


## Limpiando valores nulos

In [74]:
# Count nulls per column
null_counts = [
    df.select(pl.col(col).is_null().sum()).item()
    for col in df.columns
]

# Total values in column
total_rows = df.height

# Build summary table
summary = pl.DataFrame({
    "column": df.columns,
    "null_count": null_counts,
    "null_percentage": [count / total_rows * 100 for count in null_counts]
}).sort("null_percentage", descending=True)

# Total nulls and overall percentage
total_nulls = sum(null_counts)
total_values = df.height * df.width
total_null_percentage = total_nulls / total_values * 100

# Force full display of the table
pl.Config.set_tbl_cols(len(summary.columns))  # Show all columns
pl.Config.set_tbl_rows(len(summary))          # Show all rows

# Print summary table
print("Null Summary Per Column:\n")
summary


Null Summary Per Column:



column,null_count,null_percentage
str,i64,f64
"""FECHA_FALLECIMIENTO""",41431971,90.726635
"""DIAS_HOSP""",31282959,68.502597
"""NOMBRE_CONSUMO""",2923211,6.40117
"""CANTIDAD_ENTREGADA""",2923211,6.40117
"""CODIGO_SERV_PRESTACIONAL""",1,2e-06
"""ULTIMO_MES_CONSUMOS""",0,0.0
"""CODIGO_ANONIMIZADO""",0,0.0
"""FECHA_NACIMIENTO""",0,0.0
"""EDAD""",0,0.0
"""SEXO""",0,0.0


In [None]:
# Print totals
print(f"\nTotal null values: {total_nulls}")
print(f"Percentage of total nulls: {total_null_percentage:.2f}%")


Total null values: 150871288
Percentage of total nulls: 11.39%


### FECHA_FALLECIMIENTO

Los valores nulos en fecha fallecimiento representan aquellos que aún no han fallecido

### PRESENTACION_MEDICAMENTO

In [None]:
df.select("PRESENTACION_MEDICAMENTO") \
  .group_by("PRESENTACION_MEDICAMENTO") \
  .agg(pl.count().alias("count")) \
  .sort("count", descending=True)


(Deprecated in version 0.20.5)
  .agg(pl.count().alias("count")) \


PRESENTACION_MEDICAMENTO,count
str,u32
,40325656
"""2 mL""",1234224
"""1 mL""",543246
"""3 mL""",483945
"""1 L""",464443
"""5 mL""",358903
"""10 mL""",325120
"""100 mL""",217513
"""20 mL""",205555
"""120 mL""",166769


In [42]:
df = df.drop("PRESENTACION_MEDICAMENTO")


### DIAS_HOSP 

Vemos que los nulos representan atenciones no presenciales dentro de un establecimiento de un hospital, mientras que 0 días representa horas que pudo haber estado dentro de un hospital como para la prevención de caries.

Posteriormente crearemos una variable ¿Estuvo hospitalizado?

In [46]:
df.select("DIAS_HOSP") \
  .group_by("DIAS_HOSP") \
  .agg(pl.count().alias("count")) \
  .sort("count", descending=True)


(Deprecated in version 0.20.5)
  .agg(pl.count().alias("count")) \


DIAS_HOSP,count
i64,u32
,31282959
0,6409974
3,668391
2,668045
4,573380
1,513168
5,507012
6,455787
7,437683
8,379899


In [54]:
import polars as pl

pl.Config.set_tbl_cols(100)  # or a large enough number to cover all your columns
pl.Config.set_tbl_width_chars(400)  # increase width if needed

df.filter(pl.col("DIAS_HOSP") == 0)


ULTIMO_MES_CONSUMOS,CODIGO_ANONIMIZADO,FECHA_NACIMIENTO,EDAD,SEXO,TIPO_DIABETES,FECHA_FALLECIMIENTO,UBIGEO,DEPARTAMENTO,PROVINCIA,DISTRITO,NIVEL_EESS,CODIGO_SERV_PRESTACIONAL,SERVICIO_PRESTACIONAL,DIAS_HOSP,ID_REGISTRO_REL,FECHA_ATENCION,TIPO_PERSONAL_SALUD,FECATE_POST_FECFED,CODDIA,C10_NOMBRE,TIPO_DIAGNOSTICO,TIPO_CONSUMO,CODIGO_CONSUMO,NOMBRE_CONSUMO,FORMA_FARMACEUTICA,CANTIDAD_ENTREGADA,VALOR_NETO
i64,str,i64,i64,i8,str,i64,i64,str,str,str,i64,i64,str,i64,str,i64,str,i8,str,str,str,i8,i64,str,str,i64,f64
202309,"""18C9F497A5208F212E68D50C112AFD…",19830209,38,0,"""DIABETES MELLITUS TIPO 2""",,150108,"""LIMA""","""LIMA""","""CHORRILLOS""",1,9,"""ATENCIÃN PRENATAL""",0,"""DM0042047563""",20220106,"""ENFERMERO(A)""",0,"""Z348""","""SUPERVISION DE OTROS EMBARAZOS…","""DEFINITIVO """,1,20635,"""CALCIO CARBONATO""","""TAB""",60,0.0
202309,"""9ED115C3E24F0B26F8B2BAD3849F2D…",19550720,67,0,"""DIABETES MELLITUS TIPO 2""",,150110,"""LIMA""","""LIMA""","""COMAS""",1,56,"""CONSULTA EXTERNA""",0,"""DM0044094856""",20220801,"""MEDICO""",1,"""J00X""","""RINOFARINGITIS AGUDA [RESFRIAD…","""DEFINITIVO """,1,4982,"""NAPROXENO""","""TAB""",8,0.0
202309,"""8B4E5D835EB1692921C929529C02CE…",19590806,62,0,"""DIABETES MELLITUS TIPO 1""",,60307,"""CAJAMARCA""","""CELENDÃN""","""MIGUEL IGLESIAS""",1,56,"""CONSULTA EXTERNA""",0,"""DM0042432103""",20220105,"""ENFERMERO(A)""",1,"""I10X""","""HIPERTENSION ESENCIAL (PRIMARI…","""REPETIDO """,1,4523,"""LOSARTAN POTASICO""","""TAB""",30,0.0
202309,"""F85467AB4F0FC3096046A688EB2558…",19850814,36,0,"""DIABETES MELLITUS NO ESPECIFIC…",,150143,"""LIMA""","""LIMA""","""VILLA MARÃA DEL TRIUNFO""",1,62,"""ATENCIÃN POR EMERGENCIA""",0,"""DM0040310575""",20220608,"""MEDICO""",0,"""J209""","""BRONQUITIS AGUDA NO ESPECIFIC…","""DEFINITIVO """,1,947,"""AZITROMICINA""","""TAB""",5,0.0
202309,"""17CFBEB082B7887DE145A55D698F7B…",19550918,66,0,"""DIABETES MELLITUS TIPO 2""",,150142,"""LIMA""","""LIMA""","""VILLA EL SALVADOR""",1,56,"""CONSULTA EXTERNA""",0,"""DM0044700429""",20220630,"""MEDICO""",1,"""E119""","""DIABETES MELLITUS TIPO 2 SIN M…","""DEFINITIVO """,1,3758,"""GLIBENCLAMIDA""","""TAB""",90,4.5
202309,"""736D9EBCDD690DF245186202B33BA8…",19650102,57,0,"""DIABETES MELLITUS TIPO 2""",,150132,"""LIMA""","""LIMA""","""SAN JUAN DE LURIGANCHO""",1,56,"""CONSULTA EXTERNA""",0,"""DM0056517096""",20220411,"""MEDICO""",1,"""E785""","""HIPERLIPIDEMIA NO ESPECIFICADA""","""REPETIDO """,1,904,"""ATORVASTATINA (COMO SAL CALCIC…","""TAB""",15,0.0
202309,"""0BA9C896DBB215860E1E575C4BD42F…",19690902,52,1,"""DIABETES MELLITUS TIPO 2""",,51107,"""AYACUCHO""","""VILCAS HUAMÃN""","""SAURAMA""",1,56,"""CONSULTA EXTERNA""",0,"""DM0042997257""",20220721,"""TEC. ENFERMERIA""",1,"""J039""","""AMIGDALITIS AGUDA NO ESPECIFI…","""DEFINITIVO """,1,2149,"""CLORFENAMINA MALEATO""","""TAB""",5,0.0
202309,"""60CD668F25E7BCDC64DE26BDAE0839…",19930612,29,0,"""DIABETES MELLITUS NO ESPECIFIC…",,150106,"""LIMA""","""LIMA""","""CARABAYLLO""",1,9,"""ATENCIÃN PRENATAL""",0,"""DM0033583454""",20220624,"""ENFERMERO(A)""",0,"""Z348""","""SUPERVISION DE OTROS EMBARAZOS…","""DEFINITIVO """,1,3513,"""ACIDO FOLICO + FERROSO SULFATO…","""TAB""",30,0.0
202309,"""37B86FE74C4A67B5953287BC74D693…",19700728,51,0,"""DIABETES MELLITUS TIPO 2""",,160107,"""LORETO""","""MAYNAS""","""NAPO""",1,56,"""CONSULTA EXTERNA""",0,"""DM0052671199""",20220420,"""PSICOLOGO""",0,"""E780""","""HIPERCOLESTEROLEMIA PURA""","""DEFINITIVO """,1,903,"""ATORVASTATINA (COMO SAL CALCIC…","""TAB""",30,0.0
202309,"""846FDAF0F065AC319421C928B8BE4D…",19540325,68,0,"""DIABETES MELLITUS TIPO 2""",,230110,"""TACNA""","""TACNA""","""CORONEL GREGORIO ALBARRACÃN L…",1,56,"""CONSULTA EXTERNA""",0,"""DM0052673530""",20220716,"""MEDICO""",1,"""M159""","""POLIARTROSIS NO ESPECIFICADA""","""PRESUNTIVO """,1,4982,"""NAPROXENO""","""TAB""",20,0.0


In [57]:
df.filter(
    pl.col("CODIGO_ANONIMIZADO").str.starts_with("E61A9207C93CC140C64BB93735F7E5")
)


ULTIMO_MES_CONSUMOS,CODIGO_ANONIMIZADO,FECHA_NACIMIENTO,EDAD,SEXO,TIPO_DIABETES,FECHA_FALLECIMIENTO,UBIGEO,DEPARTAMENTO,PROVINCIA,DISTRITO,NIVEL_EESS,CODIGO_SERV_PRESTACIONAL,SERVICIO_PRESTACIONAL,DIAS_HOSP,ID_REGISTRO_REL,FECHA_ATENCION,TIPO_PERSONAL_SALUD,FECATE_POST_FECFED,CODDIA,C10_NOMBRE,TIPO_DIAGNOSTICO,TIPO_CONSUMO,CODIGO_CONSUMO,NOMBRE_CONSUMO,FORMA_FARMACEUTICA,CANTIDAD_ENTREGADA,VALOR_NETO
i64,str,i64,i64,i8,str,i64,i64,str,str,str,i64,i64,str,i64,str,i64,str,i8,str,str,str,i8,i64,str,str,i64,f64
202309,"""E61A9207C93CC140C64BB93735F7E5…",19571010,64,1,"""DIABETES MELLITUS TIPO 2""",,220204,"""SAN MARTÃN""","""BELLAVISTA""","""HUALLAGA""",1,910,"""TELEMONITOREO CON PRESCRIPCIÃ…",,"""DM0040911071""",20220923,"""PSICOLOGO""",0,"""E149""","""DIABETES MELLITUS NO ESPECIFI…","""REPETIDO """,2,,"""Telemonitoreo""",,1,0.0
202309,"""E61A9207C93CC140C64BB93735F7E5…",19571010,65,1,"""DIABETES MELLITUS TIPO 2""",,220601,"""SAN MARTÃN""","""MARISCAL CÃCERES""","""JUANJUÃ""",2,71,"""APOYO AL DIAGNÃSTICO""",,"""DM0046235051""",20230503,"""OBSTETRIZ""",0,"""E119""","""DIABETES MELLITUS TIPO 2 SIN M…","""PRESUNTIVO """,0,24728,"""COLESTEROL TOTAL ENZIMATICO AU…","""KIT""",1,0.0
202309,"""E61A9207C93CC140C64BB93735F7E5…",19571010,65,1,"""DIABETES MELLITUS TIPO 2""",,220601,"""SAN MARTÃN""","""MARISCAL CÃCERES""","""JUANJUÃ""",2,71,"""APOYO AL DIAGNÃSTICO""",,"""DM0065969415""",20230804,"""OBSTETRIZ""",1,"""E119""","""DIABETES MELLITUS TIPO 2 SIN M…","""PRESUNTIVO """,0,10436,"""TUBO CAPILARES PARA HEMATOCRIT…","""UNI""",1,0.0
202309,"""E61A9207C93CC140C64BB93735F7E5…",19571010,65,1,"""DIABETES MELLITUS TIPO 2""",,220204,"""SAN MARTÃN""","""BELLAVISTA""","""HUALLAGA""",1,910,"""TELEMONITOREO CON PRESCRIPCIÃ…",,"""DM0064528597""",20230324,"""PSICOLOGO""",0,"""E149""","""DIABETES MELLITUS NO ESPECIFI…","""REPETIDO """,2,,"""Telemonitoreo""",,1,0.0
202309,"""E61A9207C93CC140C64BB93735F7E5…",19571010,65,1,"""DIABETES MELLITUS TIPO 2""",,220601,"""SAN MARTÃN""","""MARISCAL CÃCERES""","""JUANJUÃ""",2,71,"""APOYO AL DIAGNÃSTICO""",,"""DM0046235051""",20230503,"""OBSTETRIZ""",0,"""E119""","""DIABETES MELLITUS TIPO 2 SIN M…","""PRESUNTIVO """,2,85018,"""Hemoglobina""",,1,0.0
202309,"""E61A9207C93CC140C64BB93735F7E5…",19571010,64,1,"""DIABETES MELLITUS TIPO 2""",,220601,"""SAN MARTÃN""","""MARISCAL CÃCERES""","""JUANJUÃ""",2,71,"""APOYO AL DIAGNÃSTICO""",0,"""DM0065208386""",20220407,"""OBSTETRIZ""",0,"""E789""","""TRASTORNO DEL METABOLISMO DE L…","""PRESUNTIVO """,0,31386,"""LAMINILLA CUBRE OBJETO 25 mm X…","""UNI""",1,0.0
202309,"""E61A9207C93CC140C64BB93735F7E5…",19571010,65,1,"""DIABETES MELLITUS TIPO 2""",,220601,"""SAN MARTÃN""","""MARISCAL CÃCERES""","""JUANJUÃ""",2,71,"""APOYO AL DIAGNÃSTICO""",,"""DM0045670333""",20230508,"""OBSTETRIZ""",1,"""E119""","""DIABETES MELLITUS TIPO 2 SIN M…","""PRESUNTIVO """,0,31340,"""TUBO PARA EXTRACCION DE SANGRE…","""UNI""",1,0.0
202309,"""E61A9207C93CC140C64BB93735F7E5…",19571010,65,1,"""DIABETES MELLITUS TIPO 2""",,220601,"""SAN MARTÃN""","""MARISCAL CÃCERES""","""JUANJUÃ""",2,71,"""APOYO AL DIAGNÃSTICO""",,"""DM0065969415""",20230804,"""OBSTETRIZ""",1,"""E119""","""DIABETES MELLITUS TIPO 2 SIN M…","""PRESUNTIVO """,0,24728,"""COLESTEROL TOTAL ENZIMATICO AU…","""KIT""",1,0.0
202309,"""E61A9207C93CC140C64BB93735F7E5…",19571010,64,1,"""DIABETES MELLITUS TIPO 2""",,220204,"""SAN MARTÃN""","""BELLAVISTA""","""HUALLAGA""",1,910,"""TELEMONITOREO CON PRESCRIPCIÃ…",,"""DM0065346886""",20220826,"""PSICOLOGO""",0,"""E149""","""DIABETES MELLITUS NO ESPECIFI…","""REPETIDO """,2,,"""Telemonitoreo""",,1,0.0
202309,"""E61A9207C93CC140C64BB93735F7E5…",19571010,65,1,"""DIABETES MELLITUS TIPO 2""",,220601,"""SAN MARTÃN""","""MARISCAL CÃCERES""","""JUANJUÃ""",2,71,"""APOYO AL DIAGNÃSTICO""",,"""DM0065969415""",20230804,"""OBSTETRIZ""",1,"""E119""","""DIABETES MELLITUS TIPO 2 SIN M…","""PRESUNTIVO """,0,31385,"""LAMINA PORTA OBJETO 25 mm X 75…","""UNI""",1,0.0


### FORMA_FARMACEUTICA

In [60]:

df.select("FORMA_FARMACEUTICA") \
  .group_by("FORMA_FARMACEUTICA") \
  .agg(pl.count().alias("count")) \
  .sort("count", descending=True)


(Deprecated in version 0.20.5)
  .agg(pl.count().alias("count")) \


FORMA_FARMACEUTICA,count
str,u32
,24768141
"""TAB""",8438924
"""UNI""",5778977
"""INY""",4436520
"""PAR""",535836
"""SOL""",270606
"""SUS""",268356
"""CRM""",227401
"""TAB_LM""",162027
"""JBE""",140576


In [61]:
df = df.drop("FORMA_FARMACEUTICA")


### CODIGO_CONSUMO

In [None]:
df_filtered = df.filter(
    pl.col("NOMBRE_CONSUMO").is_null() & pl.col("CODIGO_CONSUMO").is_not_null()
)

df_filtered

ULTIMO_MES_CONSUMOS,…,VALOR_NETO
i64,…,f64


In [None]:
df = df.drop("CODIGO_CONSUMO")

### NOMBRE_CONSUMO

### VALOR_NETO

In [None]:
import polars as pl

df = df.with_columns(
    pl.col("VALOR_NETO").fill_null(0)
)


### TIPO_PERSONAL_SALUD

In [73]:
df.select("TIPO_PERSONAL_SALUD") \
  .group_by("TIPO_PERSONAL_SALUD") \
  .agg(pl.count().alias("count")) \
  .sort("count", descending=True)


(Deprecated in version 0.20.5)
  .agg(pl.count().alias("count")) \


TIPO_PERSONAL_SALUD,count
str,u32
"""MEDICO""",36976876
"""ENFERMERO(A)""",1447995
"""AUXILIAR DE ENFERMERIA""",1307509
"""TECNOLOGO MEDICO""",1279968
"""PSICOLOGO""",1064757
"""OBSTETRIZ""",970923
"""BIOLOGO""",843908
"""TEC. ENFERMERIA""",768794
"""NUTRICIONISTA""",426233
"""FARMACEUTICO""",235792


In [72]:
pl.Config.set_tbl_cols(100)  # or a large enough number to cover all your columns
pl.Config.set_tbl_width_chars(400)  # increase width if needed


df.filter(
    (pl.col("TIPO_PERSONAL_SALUD").is_null()) 
)


ULTIMO_MES_CONSUMOS,CODIGO_ANONIMIZADO,FECHA_NACIMIENTO,EDAD,SEXO,TIPO_DIABETES,FECHA_FALLECIMIENTO,UBIGEO,DEPARTAMENTO,PROVINCIA,DISTRITO,NIVEL_EESS,CODIGO_SERV_PRESTACIONAL,SERVICIO_PRESTACIONAL,DIAS_HOSP,ID_REGISTRO_REL,FECHA_ATENCION,TIPO_PERSONAL_SALUD,FECATE_POST_FECFED,CODDIA,C10_NOMBRE,TIPO_DIAGNOSTICO,TIPO_CONSUMO,NOMBRE_CONSUMO,CANTIDAD_ENTREGADA,VALOR_NETO
i64,str,i64,i64,i8,str,i64,i64,str,str,str,i64,i64,str,i64,str,i64,str,i8,str,str,str,i8,str,i64,f64


In [71]:
df = df.with_columns(
    pl.col("TIPO_PERSONAL_SALUD").fill_null("AUXILIAR DE ENFERMERIA")
)


# EDA

# Data Transformation