# Aula 12 - Tratamento de Dados Ausentes
---
Hoje veremos os métodos:
- `isnull`
- `dropna`
- `fillna`

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Toy 3x3 matrix used throughout the first half of the lesson:
# column A is complete, B has one missing value, C has two.
dados = np.array([
    [1, 2, np.nan],
    [4, np.nan, np.nan],
    [7, 8, 9],
])

# Wrap the array in a DataFrame with one label per column.
df = pd.DataFrame(data=dados, columns=list('ABC'))

In [3]:
# Show the frame; missing entries are rendered as NaN.
df

Unnamed: 0,A,B,C
0,1.0,2.0,
1,4.0,,
2,7.0,8.0,9.0


In [4]:
# Boolean mask with the same shape as df: True wherever a value is missing.
df.isnull()

Unnamed: 0,A,B,C
0,False,False,True
1,False,True,True
2,False,False,False


In [5]:
# Same check on a single column (returns a boolean Series); A has no gaps.
df['A'].isnull()

0    False
1    False
2    False
Name: A, dtype: bool

In [6]:
# Column B has one missing value, at index 1.
df['B'].isnull()

0    False
1     True
2    False
Name: B, dtype: bool

In [7]:
# Summing the boolean mask counts the missing values (True counts as 1).
df['B'].isnull().sum()

1

In [8]:
# Number of missing values in column C.
df['C'].isnull().sum()

2

In [9]:
# Drop every row that contains at least one NaN.
# Returns a new frame; df itself is left unchanged.
df.dropna()

Unnamed: 0,A,B,C
2,7.0,8.0,9.0


In [10]:
# axis=1 drops columns instead of rows: only the fully populated column A survives.
df.dropna(axis=1)

Unnamed: 0,A
0,1.0
1,4.0
2,7.0


In [12]:
# Replace the missing values of column B with 0 and assign the result back,
# so the change is persisted in df.
df['B'] = df['B'].fillna(0)

In [13]:
# Column B is now filled; column C still has its NaN values.
df

Unnamed: 0,A,B,C
0,1.0,2.0,
1,4.0,0.0,
2,7.0,8.0,9.0


In [14]:
# Fill every remaining NaN in the whole frame with 0.
# The result is only displayed, not assigned, so df is not modified.
df.fillna(0)

Unnamed: 0,A,B,C
0,1.0,2.0,0.0
1,4.0,0.0,0.0
2,7.0,8.0,9.0


In [19]:
# Put the NaN back into row 1 of column B for the next example.
# A single .loc call is used instead of chained indexing (df['B'][1] = ...):
# the chained form assigns through an intermediate Series, which triggers
# chained-assignment warnings and does not update df under Copy-on-Write.
df.loc[1, 'B'] = np.nan

In [20]:
# Confirm that row 1 of column B is missing again.
df

Unnamed: 0,A,B,C
0,1.0,2.0,
1,4.0,,
2,7.0,8.0,9.0


In [22]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# DataFrame.bfill() replaces fillna(method='bfill'), whose `method` argument
# is deprecated in recent pandas releases.
df.bfill()

Unnamed: 0,A,B,C
0,1.0,2.0,9.0
1,4.0,8.0,9.0
2,7.0,8.0,9.0


In [23]:
# Load the Titanic training set from a path relative to the notebook.
# NOTE: this rebinds `df`, replacing the toy frame used in the cells above.
df = pd.read_csv('datasets/titanic_train.csv')

In [24]:
# Preview the first five rows.
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [26]:
# Count the missing values in each column.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [27]:
# (rows, columns) of the dataset, used to put the missing counts in proportion.
df.shape

(891, 12)

In [28]:
# Fraction of rows where Cabin is missing, computed from the data rather than
# from numbers copied out of earlier outputs (the mean of a boolean mask is
# the share of True values).
df['Cabin'].isnull().mean()

0.7710437710437711

In [29]:
# Most Cabin values are missing, so the column is discarded entirely.
# Rebinding the result (instead of inplace=True) keeps the step explicit.
df = df.drop(columns='Cabin')

In [30]:
# Re-check the missing counts now that Cabin has been dropped.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         2
dtype: int64

In [31]:
# Frequency of each embarkation port, to find the most common value.
df['Embarked'].value_counts()

S    644
C    168
Q     77
Name: Embarked, dtype: int64

In [32]:
# Fill the missing ports with 'S', the most frequent value.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection operates on an intermediate object and does not
# update df under Copy-on-Write.
df['Embarked'] = df['Embarked'].fillna('S')

In [33]:
# Embarked should now have no missing values.
df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Embarked         0
dtype: int64

In [34]:
# Fraction of rows where Age is missing, computed from the data rather than
# from numbers copied out of earlier outputs.
df['Age'].isnull().mean()

0.19865319865319866

In [36]:
# Mean age, used below as the fill value for the missing ages
# (Series.mean skips NaN by default).
media_idade = df['Age'].mean()

In [37]:
# Impute the missing ages with the mean age.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on a column selection operates on an intermediate object and does not
# update df under Copy-on-Write.
df['Age'] = df['Age'].fillna(media_idade)

In [38]:
# Final check: every column should report zero missing values.
df.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64