In [1]:
import pandas as pd

import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides any pandas deprecation or chained-assignment
# warnings the later cells might raise — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [2]:
# Read the penguins table from the CSV next to the notebook and preview
# its first five rows.
data = pd.read_csv(filepath_or_buffer="penguins-bi.csv")
data.head(n=5)

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen
0,39.1,18.7,181.0,3750.0,0.0,1,0,0,0,0,1
1,39.5,17.4,186.0,3800.0,1.0,1,0,0,0,0,1
2,40.3,18.0,195.0,3250.0,1.0,1,0,0,0,0,1
3,,,,,,1,0,0,0,0,1
4,36.7,19.3,193.0,3450.0,1.0,1,0,0,0,0,1


In [3]:
# (rows, columns) of the freshly loaded frame.
data.shape

(344, 11)

In [4]:
# Per-column dtype and non-null count; the columns with fewer non-null
# entries than rows are the ones that need imputation.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   bill_length_mm     318 non-null    float64
 1   bill_depth_mm      342 non-null    float64
 2   flipper_length_mm  342 non-null    float64
 3   body_mass_g        323 non-null    float64
 4   sex                333 non-null    float64
 5   species_Adelie     344 non-null    int64  
 6   species_Chinstrap  344 non-null    int64  
 7   species_Gentoo     344 non-null    int64  
 8   island_Biscoe      344 non-null    int64  
 9   island_Dream       344 non-null    int64  
 10  island_Torgersen   344 non-null    int64  
dtypes: float64(5), int64(6)
memory usage: 29.7 KB


In [5]:
# Count the missing entries in every column.
data.isna().sum()

bill_length_mm       26
bill_depth_mm         2
flipper_length_mm     2
body_mass_g          21
sex                  11
species_Adelie        0
species_Chinstrap     0
species_Gentoo        0
island_Biscoe         0
island_Dream          0
island_Torgersen      0
dtype: int64

## **Imputación de valores con medidas de tendencia central** 
- Reemplazo los valores faltantes de las columnas *'bill_depth_mm'* y *'flipper_length_mm'*  con el **valor promedio** de los datos válidos de las mismas.
<br> <br>
- También podría reemplazarse con valores como la **mediana** o la **moda**. <br> Todas estas medidas solo tienen en cuenta la distribución de esa sola variable (columna)

In [6]:
# Fill the gaps in bill depth with the mean of the observed values.
bill_depth_mean = data['bill_depth_mm'].mean()
data['bill_depth_mm'] = data['bill_depth_mm'].fillna(value=bill_depth_mean)

In [7]:
# Fill the gaps in flipper length with the mean of the observed values.
flipper_length_mean = data['flipper_length_mm'].mean()
data['flipper_length_mm'] = data['flipper_length_mm'].fillna(value=flipper_length_mean)

- Para la variable *'sex'* se utiliza **forward-fill** (método **.ffill()**, equivalente a **.fillna(method='ffill')**), que completa los valores faltantes tomando el último valor no nulo observado y propagándolo hasta encontrar otro valor no nulo.

In [8]:
# Forward-fill: each missing 'sex' value takes the last non-null value above it.
# Uses .ffill() because fillna(method='ffill') is deprecated in recent pandas;
# the filled values are the same.
# The result depends on row order, and a missing value in the very first row
# would stay missing because nothing precedes it.
data['sex'] = data['sex'].ffill()

- La suma de los **valores faltantes** de las variables *'bill_depth_mm'* , *'flipper_length_mm'* y *'sex'* ahora es 0

In [9]:
# Re-count missing entries to confirm the three imputed columns are complete.
data.isna().sum()

bill_length_mm       26
bill_depth_mm         0
flipper_length_mm     0
body_mass_g          21
sex                   0
species_Adelie        0
species_Chinstrap     0
species_Gentoo        0
island_Biscoe         0
island_Dream          0
island_Torgersen      0
dtype: int64

##  Imputación de valores con un modelo de regresión lineal
- Se utilizan los datos de las otras variables (columnas) para predecir los valores faltantes mediante un modelo de regresión, el cual estima la relación entre una variable dependiente (y) y una o más variables independientes (x).

- Primero separo las filas con **valores nulos** en la variable *'body_mass_g'* (en la mayoría de ellas también falta *'bill_length_mm'*) y las guardo en una variable como un dataset para **prueba**

In [10]:
# Hold-out rows: every row whose body_mass_g is missing. Their targets are
# predicted by the regression further down.
# An earlier filter on bill_length_mm was assigned to this same name and then
# immediately overwritten, so it never had any effect and is dropped here.
# .copy() makes `test` an independent frame, so the column edits applied to it
# later do not operate on a filtered selection of `data`.
# NOTE(review): rows where only bill_length_mm is missing are not selected
# here — confirm that is intended.
test = data[data['body_mass_g'].isnull()].copy()
test.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen
3,,17.15117,200.915205,,1.0,1,0,0,0,0,1
7,,19.6,195.0,,0.0,1,0,0,0,0,1
11,,17.3,180.0,,0.0,1,0,0,0,0,1
12,,17.6,182.0,,1.0,1,0,0,0,0,1
18,,18.4,184.0,,1.0,1,0,0,0,0,1


In [11]:
# (rows, columns) of the hold-out subset.
test.shape

(21, 11)

In [12]:
# Count the populated entries per column in the hold-out subset.
test.notna().sum()

bill_length_mm        2
bill_depth_mm        21
flipper_length_mm    21
body_mass_g           0
sex                  21
species_Adelie       21
species_Chinstrap    21
species_Gentoo       21
island_Biscoe        21
island_Dream         21
island_Torgersen     21
dtype: int64

 - En el grupo de **test** hay 2 valores **no nulos** en la variable *'bill_length_mm'*, los cuales reemplazo por NaN

In [13]:
# Distinct bill_length_mm values present in the hold-out subset.
test.loc[:, 'bill_length_mm'].unique()

array([ nan, 42.1, 36.7])

In [14]:
import numpy as np

In [15]:
# Blank the whole bill_length_mm column of the hold-out subset so that every
# row is treated as "to be predicted".
# This replaces two look-ups of hard-coded readings (42.1 and 36.7): the
# resulting all-NaN column is the same, but it no longer depends on exact float
# matches against this particular CSV. assign() returns a new frame instead of
# writing into a filtered selection.
# NOTE(review): the two blanked readings are real measurements that end up
# replaced by model predictions — confirm that discarding them is intended.
test = test.assign(bill_length_mm=np.nan)

In [16]:
# Confirm that only NaN remains in the hold-out bill_length_mm column.
test.loc[:, 'bill_length_mm'].unique()

array([nan])

- Luego creo otra variable para **entrenar** al modelo con un dataset que contiene todas las filas cuyo valor de *'body_mass_g'* es **no nulo** (los nulos que aún quedan en *'bill_length_mm'* se tratan más abajo)

In [17]:
# Training rows: every row whose body_mass_g is present.
# An earlier filter on bill_length_mm was assigned to this same name and then
# immediately overwritten, so it never had any effect and is dropped here.
# .copy() makes `train` an independent frame, so the later fill of
# bill_length_mm does not operate on a filtered selection of `data`.
# NOTE(review): bill_length_mm can still be missing in these rows; it is
# handled separately after this cell.
train = data[data['body_mass_g'].notnull()].copy()
train.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen
0,39.1,18.7,181.0,3750.0,0.0,1,0,0,0,0,1
1,39.5,17.4,186.0,3800.0,1.0,1,0,0,0,0,1
2,40.3,18.0,195.0,3250.0,1.0,1,0,0,0,0,1
4,36.7,19.3,193.0,3450.0,1.0,1,0,0,0,0,1
5,39.3,20.6,190.0,3650.0,0.0,1,0,0,0,0,1


In [18]:
# (rows, columns) of the training subset.
train.shape

(323, 11)

In [19]:
# Count the populated entries per column in the training subset.
train.notna().sum()

bill_length_mm       316
bill_depth_mm        323
flipper_length_mm    323
body_mass_g          323
sex                  323
species_Adelie       323
species_Chinstrap    323
species_Gentoo       323
island_Biscoe        323
island_Dream         323
island_Torgersen     323
dtype: int64

- En el grupo de **train** hay 7 **valores nulos** en la variable *'bill_length_mm'*, los cuales reemplazo con el valor promedio

In [20]:
# Fill the remaining gaps in the training bill_length_mm with the mean of the
# observed training values. assign() builds a new frame instead of writing a
# column into a filtered selection of `data`; the resulting values are unchanged.
# NOTE(review): bill_length_mm is one of the regression targets, so these
# mean-filled rows are used as training targets — confirm that is acceptable.
bill_length_mean = train['bill_length_mm'].mean()
train = train.assign(bill_length_mm=train['bill_length_mm'].fillna(bill_length_mean))

- Creo variables **x_train, y_train** a partir de los datos separados para el entrenamiento

In [21]:
# Target columns: the two variables whose missing values the model predicts.
cols = [
    'bill_length_mm',
    'body_mass_g',
]

In [22]:
# Targets are the two columns being imputed; every remaining column is a feature.
y_train = train.loc[:, cols]
x_train = train.drop(columns=cols)

- Importo el **algoritmo de regresión lineal**

In [23]:
from sklearn.linear_model import LinearRegression

In [24]:
# Linear regression estimator with its default settings.
regresion_lineal = LinearRegression()

- **Entreno** el modelo con los datos

In [25]:
# Fit on the training features against both target columns at once.
# fit() returns the estimator itself, which is what the cell displays.
regresion_lineal.fit(x_train, y_train)

LinearRegression()

- Primero elimino del dataset de **prueba** las columnas de las cuales quiero predecir los valores faltantes
<br>
- Luego aplico el modelo para hacer la **predicción** a los valores que quedaron en el dataset de prueba 

In [26]:
# Remove the two target columns so `test` holds only the feature columns the
# model was fitted on. The name is rebound to the result instead of using
# inplace=True, which would mutate a frame that was obtained by filtering `data`.
test = test.drop(columns=cols)

In [27]:
# Predict both targets for every hold-out row; the result has one row per
# test row and one column per entry of `cols`.
prediccion = regresion_lineal.predict(test)

In [28]:
# Raw predictions: first column is bill_length_mm, second is body_mass_g
# (same order as `cols`).
prediccion

array([[  38.75201168, 3547.63859718],
       [  41.4420073 , 4026.52956292],
       [  39.1295268 , 3522.6383057 ],
       [  37.22003633, 3220.00510617],
       [  37.73224487, 3332.52680497],
       [  39.69630749, 3768.89590735],
       [  39.54430549, 3815.01289831],
       [  40.57924985, 3956.96829909],
       [  40.58799734, 3961.66914727],
       [  37.90509327, 3366.82949694],
       [  41.23537439, 3980.58071377],
       [  38.16994502, 3427.79119451],
       [  41.17775825, 3969.14648311],
       [  35.62730744, 3071.48267344],
       [  36.63781748, 3210.07138298],
       [  38.62028803, 3644.02309932],
       [  41.14153246, 4083.38061589],
       [  38.20641949, 3548.54678326],
       [  37.60766241, 3301.37887772],
       [  37.06438764, 3384.84515157],
       [  45.75326654, 4762.90995259]])

- **Agrego la predicción** de las dos columnas al dataset de **prueba**

In [29]:
# Write each prediction column back under its target name (same order as `cols`).
for position, column in enumerate(cols):
    test[column] = prediccion[:, position]

In [30]:
# Preview the hold-out rows with the predicted targets appended at the end.
test.head(n=5)

Unnamed: 0,bill_depth_mm,flipper_length_mm,sex,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen,bill_length_mm,body_mass_g
3,17.15117,200.915205,1.0,1,0,0,0,0,1,38.752012,3547.638597
7,19.6,195.0,0.0,1,0,0,0,0,1,41.442007,4026.529563
11,17.3,180.0,0.0,1,0,0,0,0,1,39.129527,3522.638306
12,17.6,182.0,1.0,1,0,0,0,0,1,37.220036,3220.005106
18,18.4,184.0,1.0,1,0,0,0,0,1,37.732245,3332.526805


- Ahora **no hay valores faltantes** en ninguno de los dos datasets

In [31]:
# Confirm the training subset has no missing entries left.
train.isna().sum()

bill_length_mm       0
bill_depth_mm        0
flipper_length_mm    0
body_mass_g          0
sex                  0
species_Adelie       0
species_Chinstrap    0
species_Gentoo       0
island_Biscoe        0
island_Dream         0
island_Torgersen     0
dtype: int64

In [32]:
# Confirm the hold-out subset has no missing entries left.
test.isna().sum()

bill_depth_mm        0
flipper_length_mm    0
sex                  0
species_Adelie       0
species_Chinstrap    0
species_Gentoo       0
island_Biscoe        0
island_Dream         0
island_Torgersen     0
bill_length_mm       0
body_mass_g          0
dtype: int64

- **Fusiono** los dos datasets para que me quede nuevamente toda la información completa en uno solo sin valores faltantes

In [33]:
# Size of the hold-out subset before merging it back with the training rows.
test.shape

(21, 11)

In [34]:
# Column order check: the two target columns now sit at the end of `test`.
test.head()

Unnamed: 0,bill_depth_mm,flipper_length_mm,sex,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen,bill_length_mm,body_mass_g
3,17.15117,200.915205,1.0,1,0,0,0,0,1,38.752012,3547.638597
7,19.6,195.0,0.0,1,0,0,0,0,1,41.442007,4026.529563
11,17.3,180.0,0.0,1,0,0,0,0,1,39.129527,3522.638306
12,17.6,182.0,1.0,1,0,0,0,0,1,37.220036,3220.005106
18,18.4,184.0,1.0,1,0,0,0,0,1,37.732245,3332.526805


In [35]:
# Size of the training subset before merging it with the hold-out rows.
train.shape

(323, 11)

In [36]:
# Column order check: `train` still has the original column order.
train.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen
0,39.1,18.7,181.0,3750.0,0.0,1,0,0,0,0,1
1,39.5,17.4,186.0,3800.0,1.0,1,0,0,0,0,1
2,40.3,18.0,195.0,3250.0,1.0,1,0,0,0,0,1
4,36.7,19.3,193.0,3450.0,1.0,1,0,0,0,0,1
5,39.3,20.6,190.0,3650.0,0.0,1,0,0,0,0,1
