In [29]:
import pandas as pd
import numpy as np
import seaborn as sns

In [30]:
df = sns.load_dataset('penguins')
df.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


As variáveis de tal conjunto podem ser descritas da seguinte forma:

* index: o índice do registro 
* species: um fator denotando a espécie do pinguin (Adélie, Chinstrap ou Gentoo)
* island: um fator denotando a ilha no arquipélago de Palmer, Antarctica (Biscoe, Dream ou Torgersen)
* bill_length_mm: um inteiro que denota o comprimento do bico de um pinguin (em milímetros)
* bill_depth_mm: um inteiro que denota a profundidade do bico de um pinguin (em milímetros)
* flipper_length_mm: um inteiro denotando o comprimento da nadadeira (em milímetros)
* body_mass_g: um inteiro denotando a massa do pinguin (em gramas)
* sex: um fator denotando o sexo do pinguin (femea ou macho)

### Dummies

In [31]:
pd.get_dummies(df[['island', 'species']], prefix=['island', 'species'])

Unnamed: 0,island_Biscoe,island_Dream,island_Torgersen,species_Adelie,species_Chinstrap,species_Gentoo
0,0,0,1,1,0,0
1,0,0,1,1,0,0
2,0,0,1,1,0,0
3,0,0,1,1,0,0
4,0,0,1,1,0,0
...,...,...,...,...,...,...
339,1,0,0,0,0,1
340,1,0,0,0,0,1
341,1,0,0,0,0,1
342,1,0,0,0,0,1


In [32]:
pd.get_dummies(df[['island', 'species']], prefix=['island', 'species'], drop_first=True)

Unnamed: 0,island_Dream,island_Torgersen,species_Chinstrap,species_Gentoo
0,0,1,0,0
1,0,1,0,0
2,0,1,0,0
3,0,1,0,0
4,0,1,0,0
...,...,...,...,...
339,0,0,0,1
340,0,0,0,1
341,0,0,0,1
342,0,0,0,1


#### Tratando dados faltantes

In [33]:
df.isnull().sum()

species               0
island                0
bill_length_mm        2
bill_depth_mm         2
flipper_length_mm     2
body_mass_g           2
sex                  11
dtype: int64

In [34]:
df.dropna()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [35]:
df.dropna(axis=1)

Unnamed: 0,species,island
0,Adelie,Torgersen
1,Adelie,Torgersen
2,Adelie,Torgersen
3,Adelie,Torgersen
4,Adelie,Torgersen
...,...,...
339,Gentoo,Biscoe
340,Gentoo,Biscoe
341,Gentoo,Biscoe
342,Gentoo,Biscoe


In [36]:
df.fillna(df.mean())

  df.fillna(df.mean())


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.10000,18.70000,181.000000,3750.000000,Male
1,Adelie,Torgersen,39.50000,17.40000,186.000000,3800.000000,Female
2,Adelie,Torgersen,40.30000,18.00000,195.000000,3250.000000,Female
3,Adelie,Torgersen,43.92193,17.15117,200.915205,4201.754386,
4,Adelie,Torgersen,36.70000,19.30000,193.000000,3450.000000,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,43.92193,17.15117,200.915205,4201.754386,
340,Gentoo,Biscoe,46.80000,14.30000,215.000000,4850.000000,Female
341,Gentoo,Biscoe,50.40000,15.70000,222.000000,5750.000000,Male
342,Gentoo,Biscoe,45.20000,14.80000,212.000000,5200.000000,Female


In [37]:
df.fillna(df.mode().squeeze())

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,41.1,17.0,190.0,3800.0,Male
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,41.1,17.0,190.0,3800.0,Male
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [38]:
df.fillna(df.median())

  df.fillna(df.median())


Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.10,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.50,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.30,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,44.45,17.3,197.0,4050.0,
4,Adelie,Torgersen,36.70,19.3,193.0,3450.0,Female
...,...,...,...,...,...,...,...
339,Gentoo,Biscoe,44.45,17.3,197.0,4050.0,
340,Gentoo,Biscoe,46.80,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.40,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.20,14.8,212.0,5200.0,Female


In [39]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.tree import DecisionTreeRegressor
imputer = IterativeImputer(estimator=DecisionTreeRegressor)
df = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

TypeError: Cannot clone object. You should provide an instance of scikit-learn estimator instead of a class.

In [40]:
from scipy import stats
outliers_zscore = df[np.abs(stats.zscore(df)) > 3]

TypeError: can only concatenate str (not "float") to str

In [48]:
Q1 = df.quantile(0.25)
Q3 = df.quantile(0.75)
IQR = Q3 - Q1
outliers = (df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))
outliers_iqr = df[outliers.any(axis=1)]

  outliers = (df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))


In [42]:
outliers_iqr

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex


In [47]:
(df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))

  (df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))


Unnamed: 0,bill_depth_mm,bill_length_mm,body_mass_g,flipper_length_mm,island,sex,species
0,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...
339,False,False,False,False,False,False,False
340,False,False,False,False,False,False,False
341,False,False,False,False,False,False,False
342,False,False,False,False,False,False,False
