In [39]:
import pandas as pd
import numpy as np

In [40]:
# Toy records used by every cell below. Missing data is deliberately encoded
# four different ways -- np.nan, None, and the placeholder strings "NA" and
# "Missing" -- and the ages are stored as text rather than as numbers.
people = dict(
    first=["Anakin", "Jane", "John", "Luke", np.nan, None, "NA"],
    last=["SkyWalker", "Doe", "Doe", "SkyWalker", np.nan, np.nan, "Missing"],
    email=[
        "AnakinMSkyWalker@gmail.com",
        "JaneDoe@email.com",
        "JohnDoe@email.com",
        None,
        np.nan,
        "Anonymous@email.com",
        "NA",
    ],
    age=["33", "55", "63", "36", None, None, "Missing"],
)

In [41]:
# Build the frame and turn the placeholder strings "NA" and "Missing" into
# real NaN in a single pass. replace() returns a new frame, so the result is
# bound directly to df instead of mutating it twice with inplace=True; that
# keeps the cell a single expression that can be re-run safely.
df = pd.DataFrame(people).replace(["NA", "Missing"], np.nan)
df

Unnamed: 0,first,last,email,age
0,Anakin,SkyWalker,AnakinMSkyWalker@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Luke,SkyWalker,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [42]:
# With no arguments, dropna() removes every row that has at least one missing
# value; only rows 0-2 are complete in all four columns.
df.dropna()

Unnamed: 0,first,last,email,age
0,Anakin,SkyWalker,AnakinMSkyWalker@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [43]:
# Same result as the bare dropna() call, with the defaults spelled out:
# axis="index" drops rows, how="any" drops a row as soon as one value is missing.
df.dropna(axis="index", how="any")  # default args

Unnamed: 0,first,last,email,age
0,Anakin,SkyWalker,AnakinMSkyWalker@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63


In [44]:
# how="all" is the lenient variant: rows 4 and 6 (entirely empty) are removed,
# while partially filled rows such as 3 and 5 are kept.
df.dropna(
    axis="index", how="all"
)  # drops a row only when every column in that row is NaN

Unnamed: 0,first,last,email,age
0,Anakin,SkyWalker,AnakinMSkyWalker@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Luke,SkyWalker,,36.0
5,,,Anonymous@email.com,


In [45]:
# Drop every column that contains at least one missing value. All four
# columns have gaps, so only the index is left.
df.dropna(axis="columns", how="any")

0
1
2
3
4
5
6


In [46]:
# Only the "email" column decides which rows are dropped: rows without an
# email are removed, whatever the other columns hold (row 5 survives).
df.dropna(axis="index", subset=["email"])

Unnamed: 0,first,last,email,age
0,Anakin,SkyWalker,AnakinMSkyWalker@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
5,,,Anonymous@email.com,


In [47]:
# Combine subset with how="all": a row is dropped only when BOTH "email" and
# "last" are missing.
df.dropna(subset=["email", "last"], how="all")  # keeps rows that have at least one of email or last

Unnamed: 0,first,last,email,age
0,Anakin,SkyWalker,AnakinMSkyWalker@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Luke,SkyWalker,,36.0
5,,,Anonymous@email.com,


In [49]:
# Boolean mask with the same rows and columns as df: True where a value is missing.
df.isna()

Unnamed: 0,first,last,email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,True,False
4,True,True,True,True
5,True,True,False,True
6,True,True,True,True


In [50]:
# Replace every missing value with 0. Note this also puts the integer 0 into
# the text columns (first, last, email), not just into "age".
df.fillna(0)  # returns a filled copy and leaves df unchanged; assign the result to keep it

Unnamed: 0,first,last,email,age
0,Anakin,SkyWalker,AnakinMSkyWalker@gmail.com,33
1,Jane,Doe,JaneDoe@email.com,55
2,John,Doe,JohnDoe@email.com,63
3,Luke,SkyWalker,0,36
4,0,0,0,0
5,0,0,Anonymous@email.com,0
6,0,0,0,0


In [51]:
# Fill only the "age" column. The column still holds strings at this point
# (dtype: object), so the filled result mixes text ages with the integer 0.
df['age'].fillna(0)

0    33
1    55
2    63
3    36
4     0
5     0
6     0
Name: age, dtype: object

In [53]:
# Inspect the column dtypes: "age" is object because its values were entered as strings.
df.dtypes

first    object
last     object
email    object
age      object
dtype: object

In [57]:
# Convert "age" from strings to numbers so numeric operations work. A float
# dtype can hold the NaN entries, which a plain int dtype cannot.
df["age"] = df["age"].astype(float)

In [58]:
# Re-check the dtypes after the conversion: "age" now reports float64.
df.dtypes

first     object
last      object
email     object
age      float64
dtype: object

In [59]:
# Display the frame again: known ages are now floats and missing ages are NaN.
df

Unnamed: 0,first,last,email,age
0,Anakin,SkyWalker,AnakinMSkyWalker@gmail.com,33.0
1,Jane,Doe,JaneDoe@email.com,55.0
2,John,Doe,JohnDoe@email.com,63.0
3,Luke,SkyWalker,,36.0
4,,,,
5,,,Anonymous@email.com,
6,,,,


In [60]:
# Mean of the numeric ages. mean() skips NaN by default, so this averages
# only the four known ages: (33 + 55 + 63 + 36) / 4.
df['age'].mean()

46.75