In [1]:
import math
import warnings
from pathlib import Path
from pickle import dump

import matplotlib.pyplot as plt
import missingno as msno
import numpy as np
import pandas as pd
import seaborn as sns
from pandas_profiling import ProfileReport
from sklearn.preprocessing import MinMaxScaler

pd.set_option('mode.chained_assignment',None)
warnings.simplefilter(action='ignore', category=FutureWarning)

%matplotlib inline

  from .autonotebook import tqdm as notebook_tqdm
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import codigo
from codigo import utils
from codigo import model_utils

# Carga del conjunto de datos inicial

In [3]:
df_Inicial = pd.read_csv(r'dataset/FreeStyle.csv')

# Vista previa del conjunto de datos

In [4]:
df_Inicial.head()

Unnamed: 0,ID,Date,Glucose level
0,LIB193263,2020-06-09 19:08:00,99.0
1,LIB193263,2020-06-09 19:23:00,92.0
2,LIB193263,2020-06-09 19:38:00,86.0
3,LIB193263,2020-06-09 19:53:00,85.0
4,LIB193263,2020-06-09 20:08:00,85.0


Unnamed: 0,ID,Date,Glucose level
0,LIB193263,2020-06-09 19:08:00,99.0
1,LIB193263,2020-06-09 19:23:00,92.0
2,LIB193263,2020-06-09 19:38:00,86.0
3,LIB193263,2020-06-09 19:53:00,85.0
4,LIB193263,2020-06-09 20:08:00,85.0


In [5]:
df_Inicial.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000001 entries, 0 to 3000000
Data columns (total 3 columns):
 #   Column         Dtype  
---  ------         -----  
 0   ID             object 
 1   Date           object 
 2   Glucose level  float64
dtypes: float64(1), object(2)
memory usage: 68.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000001 entries, 0 to 3000000
Data columns (total 3 columns):
 #   Column         Dtype  
---  ------         -----  
 0   ID             object 
 1   Date           object 
 2   Glucose level  float64
dtypes: float64(1), object(2)
memory usage: 68.7+ MB


# Ajuste de tipos de datos

In [6]:
# Convert the 'Date' column from strings to pandas datetimes (see the dtype change in the next .info() output).
df_Inicial['Date']=pd.to_datetime(df_Inicial['Date'])

In [7]:
df_Inicial.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000001 entries, 0 to 3000000
Data columns (total 3 columns):
 #   Column         Dtype         
---  ------         -----         
 0   ID             object        
 1   Date           datetime64[ns]
 2   Glucose level  float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 68.7+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000001 entries, 0 to 3000000
Data columns (total 3 columns):
 #   Column         Dtype         
---  ------         -----         
 0   ID             object        
 1   Date           datetime64[ns]
 2   Glucose level  float64       
dtypes: datetime64[ns](1), float64(1), object(1)
memory usage: 68.7+ MB


In [8]:
df_Inicial['Date'].min()

Timestamp('2016-10-05 08:47:00')

Timestamp('2016-10-05 08:47:00')

In [9]:
df_Inicial['Date'].max()

Timestamp('2044-03-22 03:44:00')

Timestamp('2044-03-22 03:44:00')

# Limpieza del conjunto de datos

## Eliminamos los registros duplicados por paciente y fecha

In [None]:
# Keep only the first reading for each (patient ID, timestamp) pair and
# renumber the rows from zero so the index has no holes.
df_Inicial = (
    df_Inicial
    .drop_duplicates(subset=['ID', 'Date'], keep='first')
    .reset_index(drop=True)
)

In [None]:
df_Inicial.info()

## Seleccionamos el rango de fechas válidas

In [None]:
# Lower bound of the accepted date window: rows dated before it are discarded.
DateStart = '2018-01-06'
df_depurado = df_Inicial.loc[df_Inicial['Date'] >= DateStart]
df_depurado.info()

In [None]:
# Upper bound of the accepted date window: rows dated after it are discarded.
# NOTE(review): a date-only string is compared as midnight at the start of that
# day, so readings taken later on 2022-03-21 are dropped — confirm this is intended.
DateEnd = '2022-03-21'
df_depurado = df_depurado.loc[df_depurado['Date'] <= DateEnd]
df_depurado.info()

In [None]:
df_depurado.describe()

In [None]:
# Work on an independent copy of the filtered frame. `df_depurado` is a
# boolean-mask selection of `df_Inicial`, and a later cell overwrites
# df['Glucose level'] in place while the chained-assignment warning is disabled;
# copying makes the ownership explicit and leaves `df_depurado` untouched.
df = df_depurado.copy()

In [None]:
df['Date'].min()

In [None]:
df['Date'].max()

In [None]:
# Persist the cleaned dataset. The output folder is created first so the cell
# also works on a fresh checkout where 'dataset/generado' does not exist yet.
Path('dataset/generado').mkdir(parents=True, exist_ok=True)
df.to_csv('dataset/generado/datasetDepurado.csv', index=False)

In [None]:
# From here on `df_Inicial` refers to the date-filtered frame instead of the raw CSV load.
# This is a plain rebinding, not a copy: `df_Inicial` and `df` name the same object,
# so an in-place change made through one name is visible through the other.
df_Inicial = df

In [None]:
df_Inicial.info()

## Detección y tratamiento de datos faltantes

### Analizamos los datos de 1 paciente con las muestras ordenadas cada 15 min

In [None]:
# Patient inspected throughout the missing-data analysis below.
pacienteID="LIB193277"
# NOTE(review): presumably returns this patient's readings aligned to a
# 15-minute grid — confirm against utils.getDataPatient.
paciente = utils.getDataPatient(df,pacienteID,'15min')
paciente

Realizamos un análisis exploratorio

In [None]:
# Build an exploratory report for the selected patient; the optional sections
# (missing diagrams, correlations, interactions, samples) are switched off.
profile = ProfileReport(
    paciente,
    title="Pandas Profiling Report",
    missing_diagrams=None,
    correlations=None,
    interactions=None,
    samples=None,
)
# Derive the file name from `pacienteID` so the report always matches the
# patient that was actually loaded instead of a hard-coded ID.
profile.to_file(f"analisisExploratorio/analisis_paciente_ID_{pacienteID}.html")

In [None]:
# Boolean frame marking every missing cell, followed by the per-column count of missing values.
paciente_Null = paciente.isna()
paciente_Null.sum()

Se observa que no todos los registros fueron tomados cada 15 min, y existen datos faltantes

In [None]:
paciente_Null.tail()

In [None]:
paciente['Glucose level'].plot(figsize=(20, 10))

In [None]:
# Draw the missing-value matrix for the patient's frame.
# `msno` is imported with the rest of the dependencies in the notebook's first
# cell rather than here, so all imports live in one place.
msno.matrix(paciente)

### Estrategia de reemplazo de datos faltantes

In [None]:
# Date window used in the plots that compare the gap-filling strategies.
ObjRangeDateStart, ObjRangeDateEnd = '2020-6-12', '2020-6-25'

In [None]:
utils.plotRangeDates(paciente,ObjRangeDateStart,ObjRangeDateEnd)

Intentar usar -1 como valor por defecto para datos faltantes

In [None]:
# Fill the gaps with the '-1' strategy, then hand the filled and the original
# frames to utils.plotTwoDf for the selected date window.
# NOTE(review): the strategy is passed as the string '-1', not the number -1 —
# confirm that this is what utils.fillNullData expects.
paciente_fill=utils.fillNullData(paciente,'-1')
utils.plotTwoDf(paciente_fill,paciente,ObjRangeDateStart,ObjRangeDateEnd)

ffill reemplaza los valores faltantes con el valor de la muestra anterior

In [None]:
# Fill the gaps with the 'ffill' strategy, then hand the filled and the original
# frames to utils.plotTwoDf for the selected date window.
paciente_fill=utils.fillNullData(paciente,'ffill')
utils.plotTwoDf(paciente_fill,paciente,ObjRangeDateStart,ObjRangeDateEnd)

bfill reemplaza los valores faltantes con el valor de la muestra siguiente

In [None]:
# Fill the gaps with the 'bfill' strategy, then hand the filled and the original
# frames to utils.plotTwoDf for the selected date window.
paciente_fill=utils.fillNullData(paciente,'bfill')
utils.plotTwoDf(paciente_fill,paciente,ObjRangeDateStart,ObjRangeDateEnd)

interpolate_linear reemplaza cada valor faltante por el valor que le corresponde sobre la línea recta que une las dos muestras conocidas más cercanas (la anterior y la siguiente)

In [None]:
# Fill the gaps with the 'interpolate_linear' strategy, then hand the filled and
# the original frames to utils.plotTwoDf for the selected date window.
# `paciente_fill` from this cell is the one checked for remaining nulls next.
paciente_fill=utils.fillNullData(paciente,'interpolate_linear')
utils.plotTwoDf(paciente_fill,paciente,ObjRangeDateStart,ObjRangeDateEnd)

In [None]:
# Count, per column, the missing values that remain after the last fill strategy.
paciente_fill_Null = paciente_fill.isna()
paciente_fill_Null.sum()

### Normalización de los datos

En este paso solo vamos a entrenar los normalizadores que vamos a utilizar, luego integraremos todos los pasos en una función general

#### Entrenamiento para datos de glucosa

In [None]:
df['Glucose level'].head()

In [None]:
# Min-max scaler mapping glucose readings onto the [0, 1] range.
scalerGlucosa = MinMaxScaler(feature_range=(0,1))

# Fit on the whole cleaned dataset and overwrite the column with its scaled values.
df[['Glucose level']]=scalerGlucosa.fit_transform(df[['Glucose level']].values)
# Persist the fitted scaler; the context manager guarantees the file is flushed
# and closed instead of leaving an open handle behind.
with open('dataset/generado/scalerGlucosa.scaler', 'wb') as scaler_file:
    dump(scalerGlucosa, scaler_file)

In [None]:
df

In [None]:
# Undo the scaling so that `df` holds glucose values in their original units again.
df[['Glucose level']] = scalerGlucosa.inverse_transform(
    df[['Glucose level']].to_numpy()
)

In [None]:
df

#### Entrenamiento para datos de horas

In [None]:
# Every hour of the day (0 through 23) as a one-column frame used to fit the hour scaler.
arrayHours = list(range(24))
dfHours = pd.DataFrame({'hours': arrayHours})
dfHours

In [None]:
# Min-max scaler for the hour-of-day feature.
scalerHours = MinMaxScaler()

# Fit on the reference hours; `arrayHours` now holds their scaled values.
arrayHours=scalerHours.fit_transform(dfHours[['hours']].values)
# Persist the fitted scaler; the context manager guarantees the file is flushed
# and closed instead of leaving an open handle behind.
with open('dataset/generado/scalerHours.scaler', 'wb') as scaler_file:
    dump(scalerHours, scaler_file)
arrayHours

In [None]:
# Sanity check: map the scaled value 1 back to the original hour scale.
arrayHoursT=scalerHours.inverse_transform([[1]])
arrayHoursT[0]

#### Entrenamiento para datos de minutos

In [None]:
# Every minute of the hour (0 through 59) as a one-column frame used to fit the minute scaler.
arrayMins = list(range(60))
dfMins = pd.DataFrame({'mins': arrayMins})
dfMins

In [None]:
# Min-max scaler for the minute-of-hour feature.
scalerMin = MinMaxScaler()

# Fit on the reference minutes; `arrayMins` now holds their scaled values.
arrayMins=scalerMin.fit_transform(dfMins[['mins']].values)
# Persist the fitted scaler; the context manager guarantees the file is flushed
# and closed instead of leaving an open handle behind.
with open('dataset/generado/scalerMin.scaler', 'wb') as scaler_file:
    dump(scalerMin, scaler_file)
arrayMins

In [None]:
# Sanity check: map the scaled value 1 back to the original minute scale.
arrayMinT=scalerMin.inverse_transform([[1]])
arrayMinT[0]

#### Entrenamiento para datos de partes del día

In [None]:
# Part-of-day identifiers 1 through 6 as a one-column frame used to fit the part-of-day scaler.
arrayPod = list(range(1, 7))
dfPod = pd.DataFrame({'podid': arrayPod})
dfPod

In [None]:
# Min-max scaler for the part-of-day identifier.
scalerPodId = MinMaxScaler()

# Fit on the reference identifiers; `arrayPod` now holds their scaled values.
arrayPod=scalerPodId.fit_transform(dfPod[['podid']].values)
# Persist the fitted scaler; the context manager guarantees the file is flushed
# and closed instead of leaving an open handle behind.
with open('dataset/generado/scalerPodId.scaler', 'wb') as scaler_file:
    dump(scalerPodId, scaler_file)
arrayPod

In [None]:
# Sanity check: map the scaled value 0.6 back to the original part-of-day scale.
arrayPodT=scalerPodId.inverse_transform([[0.6]])
arrayPodT[0]

#### Entrenamiento para datos de niveles de glucosa

In [None]:
# The three glucose-level codes (-1, 0, 1) as a one-column frame used to fit the level scaler.
arrayLevel = [-1, 0, 1]
dfLevel = pd.DataFrame({'levelid': arrayLevel})
dfLevel

In [None]:
# Min-max scaler for the glucose-level code.
scalerLevelId = MinMaxScaler()

# Fit on the reference codes; `arrayLevel` now holds their scaled values.
arrayLevel=scalerLevelId.fit_transform(dfLevel[['levelid']].values)
# Persist the fitted scaler; the context manager guarantees the file is flushed
# and closed instead of leaving an open handle behind.
with open('dataset/generado/scalerLevelId.scaler', 'wb') as scaler_file:
    dump(scalerLevelId, scaler_file)
arrayLevel

In [None]:
# Sanity check: map the scaled value 0.5 back to the original level-code scale.
arrayLevelT=scalerLevelId.inverse_transform([[0.5]])
arrayLevelT[0]

### Ingeniería de características

In [None]:
df

In [None]:
# Load the same patient again, this time passing False as the fourth argument.
# NOTE(review): the meaning of that flag is defined in utils.getDataPatient — confirm before relying on it.
paciente_data = utils.getDataPatient(df,pacienteID,'15min',False)

In [None]:
paciente_data

In [None]:
# Pass the patient frame and the five scalers fitted above to utils.generateNewColumns
# and replace `paciente_data` with its result.
# NOTE(review): the columns added and the meaning of the trailing True flag are
# defined in utils.generateNewColumns — confirm there.
paciente_data=utils.generateNewColumns(paciente_data,scalerLevelId,scalerHours,scalerMin,scalerPodId,scalerGlucosa,True)

In [None]:
paciente_data