<a href="https://colab.research.google.com/github/veiro/tesis-imputacion-datos/blob/main/codigo/imputadores/sk_imputer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# Imputing missing values before building an estimator
- imputation by the constant value 0
- imputation by the mean value of each feature combined with a missing-ness
  indicator auxiliary variable
- k nearest neighbor imputation
- iterative imputation



In [None]:
# Authors: Maria Telenczuk  <https://github.com/maikia>
# License: BSD 3 clause

# Configuración

In [None]:
# Chooses where the processed data and the helper module are read from:
# True -> folders on a mounted Google Drive, False -> local /mnt paths.
USAR_GOOGLE_DRIVE =  True
# Development-mode flag: forwarded to the utils data loaders and embedded in
# the file names of the CSV results written by this notebook.
MODO_DESARROLLO = False



In [None]:
import os
import sys
import random
import numpy as np
import pandas as pd
from math import sqrt

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from google.colab import drive
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tqdm import tqdm


# Descarga de datos y preprocesamiento





In [None]:
%%time
# Environment setup: pick the data/helper locations, make the project's
# `utils` module importable and verify that it is the expected version.
if USAR_GOOGLE_DRIVE:
  drive.mount('/content/gdrive')
  PATH_DATA_PROCESADA = "/content/gdrive/MyDrive/Maestria/Tesis/Codigo/data/data-procesada"
  PATH_UTILS = '/content/gdrive/MyDrive/Maestria/Tesis/Codigo/utils'
else:
  PATH_DATA_PROCESADA = "/mnt/data"
  PATH_UTILS = '/mnt/utils'

# Re-running this cell must not keep appending the same entry to sys.path.
if PATH_UTILS not in sys.path:
  sys.path.append(PATH_UTILS)

import importlib

import utils

# Reload so that edits made to the utils module after its first import are
# picked up without restarting the kernel.
importlib.reload(utils)

# Version of the utils module this notebook was written against.
VERSION_UTILS_ESPERADA = 30

version_utils = utils.version()
if version_utils == VERSION_UTILS_ESPERADA:
  print("version correcta")
else:
  # Report both versions so the mismatch can be diagnosed from the traceback.
  raise Exception(
      "Version de Util vieja: se esperaba " + str(VERSION_UTILS_ESPERADA)
      + ", se encontro " + str(version_utils))

Mounted at /content/gdrive
version correcta
CPU times: user 3.8 s, sys: 506 ms, total: 4.31 s
Wall time: 27 s


In [None]:
# Snapshot of the packages installed in the runtime, kept for reproducibility.
!python3 -m pip freeze

absl-py==1.4.0
aiohttp==3.9.5
aiosignal==1.3.1
alabaster==0.7.16
albumentations==1.3.1
altair==4.2.2
annotated-types==0.7.0
anyio==3.7.1
argon2-cffi==23.1.0
argon2-cffi-bindings==21.2.0
array_record==0.5.1
arviz==0.15.1
astropy==5.3.4
astunparse==1.6.3
async-timeout==4.0.3
atpublic==4.1.0
attrs==23.2.0
audioread==3.0.1
autograd==1.6.2
Babel==2.15.0
backcall==0.2.0
beautifulsoup4==4.12.3
bidict==0.23.1
bigframes==1.6.0
bleach==6.1.0
blinker==1.4
blis==0.7.11
blosc2==2.0.0
bokeh==3.3.4
bqplot==0.12.43
branca==0.7.2
build==1.2.1
CacheControl==0.14.0
cachetools==5.3.3
catalogue==2.0.10
certifi==2024.2.2
cffi==1.16.0
chardet==5.2.0
charset-normalizer==3.3.2
chex==0.1.86
click==8.1.7
click-plugins==1.1.1
cligj==0.7.2
cloudpathlib==0.16.0
cloudpickle==2.2.1
cmake==3.27.9
cmdstanpy==1.2.2
colorcet==3.1.0
colorlover==0.3.0
colour==0.1.5
community==1.0.0b1
confection==0.1.4
cons==0.4.6
contextlib2==21.6.0
contourpy==1.2.1
cryptography==42.0.7
cuda-python==12.2.1
cudf-cu12 @ https://pypi.nvidia.c

# Utils

In [None]:
import warnings
# Silence every FutureWarning from here on.
warnings.simplefilter(action='ignore', category=FutureWarning)

# Results table with one column for the strategy name and one for its RMSE;
# it starts empty and is the module-level frame referenced by guardoResultados.
df_resultados = pd.DataFrame(columns=['Strategy', 'rmse'])


def guardoResultados(strat, rmse):
  """Print the RMSE of a strategy and append it to the results table.

  Args:
    strat: Name of the imputation strategy; must be a string because it is
      concatenated into the printed line.
    rmse: RMSE value obtained with that strategy.

  Side effects:
    Prints one line and rebinds the module-level ``df_resultados`` to a new
    DataFrame that contains one extra row ``{'Strategy': strat, 'rmse': rmse}``.
  """
  global df_resultados
  print("Strategy: " + strat + ' - RMSE: ' + str(rmse))
  nueva_fila = pd.DataFrame([{ 'Strategy':strat, 'rmse':rmse }])
  # pd.concat returns a new frame instead of modifying its inputs, so the
  # result has to be assigned back or the row is silently lost.
  df_resultados = pd.concat([df_resultados, nueva_fila], ignore_index=True)


def calcularMetricas(data, imputed_data, mask, strat):
  """Score one imputation result, record it and export it as CSV.

  Args:
    data: DataFrame holding the reference values; its columns label the
      imputed array.
    imputed_data: Array-like produced by an imputer, wrapped into a DataFrame
      with the columns of ``data``.
    mask: Mask handed to ``utils.rmse`` and ``utils.obtener_df_imputados``
      (presumably marks the artificially removed entries; confirm in utils).
    strat: Strategy name (string); used in the results table and in the name
      of the CSV file.

  Side effects:
    Calls ``guardoResultados`` and writes a CSV file under
    ``PATH_DATA_PROCESADA``.
  """
  imputados = pd.DataFrame(imputed_data, columns=data.columns)
  # The RMSE is computed on the full arrays, before they are post-processed.
  error = utils.rmse(data.to_numpy(), imputados.to_numpy(), mask)

  originales, imputados = utils.obtener_df_imputados(data, imputados, mask)
  guardoResultados(strat, error)

  decodificados = utils.decode(imputados, originales, PATH_DATA_PROCESADA)
  nombre_csv = '/csse_covid_19_data_strategy=' + strat + '_Desarrollo=' + str(MODO_DESARROLLO) + '.csv'
  decodificados.to_csv(PATH_DATA_PROCESADA + nombre_csv, index=False)



In [None]:
# Load the train and test splits through the project's utils module. Each
# loader returns three objects; judging by the variable names these are the
# complete encoded data, the same data with missing values, and the mask of
# the missing entries (TODO: confirm against utils).
df_train_encoded, df_train_encoded_missing, mask_train_open = utils.obtenerDatosTrain(MODO_DESARROLLO, PATH_DATA_PROCESADA)
df_test_encoded, df_test_encoded_missing, mask_test_open = utils.obtenerDatosTest(MODO_DESARROLLO, PATH_DATA_PROCESADA)

# Rebuild the test frame with the column labels of the train frame.
df_test_encoded = pd.DataFrame(df_test_encoded, columns = df_train_encoded.columns)


# Imputación

In [None]:
%%time
from sklearn.impute import SimpleImputer

# Univariate baselines: each strategy is fitted on the train split and then
# evaluated on the test split. 'constant' is used without an explicit
# fill_value, so scikit-learn's default fill applies.
strategy = ['mean', 'median', 'most_frequent', 'constant']

for strat in strategy:
  # fit() returns the imputer itself, so it can be built and fitted in one go.
  imp = SimpleImputer(missing_values=np.nan, strategy=strat).fit(df_train_encoded_missing)
  imputed_data = imp.transform(df_test_encoded_missing)
  calcularMetricas(df_test_encoded, imputed_data, mask_test_open, strat)
  # Release the fitted imputer and its output before the next strategy.
  del imp, imputed_data



Strategy: mean - RMSE: 28318.109369500897


897667it [02:12, 6791.46it/s]


Strategy: median - RMSE: 28673.434503305656


897667it [02:27, 6101.99it/s]


Strategy: most_frequent - RMSE: 28781.188546184534


897667it [02:19, 6449.71it/s]


Strategy: constant - RMSE: 28781.212087124954


897667it [02:20, 6393.55it/s]


In [None]:
%%time
import numpy as np
# This import has to run before IterativeImputer is imported from
# sklearn.impute.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Fit on the train split, then impute and score the test split.
imp = IterativeImputer(max_iter=10, random_state=0).fit(df_train_encoded_missing)
imputed_data = imp.transform(df_test_encoded_missing)
calcularMetricas(df_test_encoded, imputed_data, mask_test_open, 'IterativeImputer')
# Release the fitted imputer and its output once the metrics are stored.
del imp, imputed_data



Strategy: IterativeImputer - RMSE: 19036.470040735236


897667it [02:17, 6521.31it/s]


CPU times: user 1h 27s, sys: 48min 17s, total: 1h 48min 45s
Wall time: 26min 47s
