### Objetivo:
- Preparar el dataset para aplicar un modelo de regresión lineal y así poder predecir valores faltantes de las columnas *'bill_length_mm'* y *'body_mass_g'*

In [1]:
import pandas as pd

In [2]:
# Load the penguins dataset from a CSV file given by a path relative to the working directory.
data = pd.read_csv("penguins.csv")
# Preview the first rows to check that the file was parsed as expected.
data.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,MALE
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,FEMALE
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,FEMALE
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,FEMALE


In [3]:
# Dataset dimensions as a (rows, columns) tuple.
data.shape

(344, 7)

In [4]:
# Per-column dtype and non-null count; reveals which columns contain missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     318 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        323 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [5]:
# Rows that exactly repeat an earlier row; an empty result means there are no duplicates.
data.loc[data.duplicated(keep='first')]

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex


In [6]:
# Number of missing values in each column.
data.isna().sum()

species               0
island                0
bill_length_mm       26
bill_depth_mm         2
flipper_length_mm     2
body_mass_g          21
sex                  11
dtype: int64

- ### Creo variables dummies para *'species'* y para *'island'*

In [7]:
# Distinct values present in the 'species' column.
data['species'].unique()

array(['Adelie', 'Chinstrap', 'Gentoo'], dtype=object)

In [8]:
# Distinct values present in the 'island' column.
data['island'].unique()

array(['Torgersen', 'Biscoe', 'Dream'], dtype=object)

In [9]:
# One-hot encode the two categorical columns: one indicator column per category,
# named '<prefix>_<category>'.
# dtype=int forces 0/1 integers on every pandas version. Since pandas 2.0 the
# default indicator dtype is bool, which would be exported as True/False.
dummy_species = pd.get_dummies(data['species'], prefix='species', dtype=int)
dummy_island = pd.get_dummies(data['island'], prefix='island', dtype=int)

In [10]:
# Append the indicator columns to the right of the existing columns (row-aligned on the index).
data = pd.concat(objs=[data, dummy_species, dummy_island], axis='columns')

In [11]:
# Remove the original text columns now that their indicator columns are in the frame.
data = data.drop(columns=['species', 'island'])

In [12]:
# Preview the first rows of the frame in its current state.
data.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen
0,39.1,18.7,181.0,3750.0,MALE,1,0,0,0,0,1
1,39.5,17.4,186.0,3800.0,FEMALE,1,0,0,0,0,1
2,40.3,18.0,195.0,3250.0,FEMALE,1,0,0,0,0,1
3,,,,,,1,0,0,0,0,1
4,36.7,19.3,193.0,3450.0,FEMALE,1,0,0,0,0,1


- ### Hago un mapeo de valores para *'sex'*

In [13]:
# Distinct values present in the 'sex' column, including NaN when values are missing.
data['sex'].unique()

array(['MALE', 'FEMALE', nan], dtype=object)

In [14]:
# Encode 'sex' numerically: MALE -> 0, FEMALE -> 1. Values that are not keys of
# the dictionary (including missing values) become NaN.
diccionario = {'MALE': 0, 'FEMALE': 1}
# Guard against re-running this cell: mapping an already-encoded column would
# turn every value into NaN, because 0 and 1 are not keys of `diccionario`.
if not pd.api.types.is_numeric_dtype(data['sex']):
    data['sex'] = data['sex'].map(diccionario)

In [17]:
# Preview the first rows of the frame in its current state.
data.head()

Unnamed: 0,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,species_Adelie,species_Chinstrap,species_Gentoo,island_Biscoe,island_Dream,island_Torgersen
0,39.1,18.7,181.0,3750.0,0.0,1,0,0,0,0,1
1,39.5,17.4,186.0,3800.0,1.0,1,0,0,0,0,1
2,40.3,18.0,195.0,3250.0,1.0,1,0,0,0,0,1
3,,,,,,1,0,0,0,0,1
4,36.7,19.3,193.0,3450.0,1.0,1,0,0,0,0,1


#### Exporto el dataset

In [16]:
# Export the prepared dataset using a path relative to the working directory
# (the same convention as the input file), so the notebook does not depend on
# one machine's folder layout or user name.
ruta = 'penguins-bi.csv'

# index=False leaves the row index out of the exported file.
data.to_csv(ruta, index=False)