In [20]:
import pandas as pd
import numpy as np

In [26]:
# Column data for the demo frame, stored as a dict that maps each column name
# to a list of values. Missing entries appear in several forms: np.nan, None,
# and the placeholder strings 'NA' and 'Missing'.
people = {
    "FirstName": [
        "John", "Jacob", "Jennifer", "Jordan", "Jaqueline",
        np.nan, None, "NA",
    ],
    "LastName": [
        "Jingle", "Heimer", "Schmidt", "Daniel", "Fishwall",
        np.nan, np.nan, "Missing",
    ],
    "Email": [
        "johnjingle33@fakeemail.com",
        "jacobheimer@fakeemail.com",
        "Jennifer32@email.com",
        "jdaniel@email.com",
        "jaqlfish@email.com",
        None, np.nan, "NA",
    ],
    "age": ["33", "55", "63", "36", None, None, "Missing", "Missing"],
}

In [27]:
# Build the DataFrame from the dict: each key becomes a column and each list
# supplies that column's values.
df = pd.DataFrame(people)

In [28]:
# Show the raw frame before any cleaning.
df

Unnamed: 0,FirstName,LastName,Email,age
0,John,Jingle,johnjingle33@fakeemail.com,33
1,Jacob,Heimer,jacobheimer@fakeemail.com,55
2,Jennifer,Schmidt,Jennifer32@email.com,63
3,Jordan,Daniel,jdaniel@email.com,36
4,Jaqueline,Fishwall,jaqlfish@email.com,
5,,,,
6,,,,Missing
7,,Missing,,Missing


In [29]:
# With no arguments, dropna() returns a copy without the rows that contain at
# least one missing value. Row 7 is kept because 'NA' and 'Missing' are plain
# strings, not missing values.
df.dropna()

Unnamed: 0,FirstName,LastName,Email,age
0,John,Jingle,johnjingle33@fakeemail.com,33
1,Jacob,Heimer,jacobheimer@fakeemail.com,55
2,Jennifer,Schmidt,Jennifer32@email.com,63
3,Jordan,Daniel,jdaniel@email.com,36
7,,Missing,,Missing


In [30]:
# axis='index' drops rows; axis='columns' would drop columns instead.
# how='any' drops a row as soon as it contains a single missing value.
# Both are the defaults, so the result matches a bare df.dropna().
df.dropna(axis='index', how='any')

Unnamed: 0,FirstName,LastName,Email,age
0,John,Jingle,johnjingle33@fakeemail.com,33
1,Jacob,Heimer,jacobheimer@fakeemail.com,55
2,Jennifer,Schmidt,Jennifer32@email.com,63
3,Jordan,Daniel,jdaniel@email.com,36
7,,Missing,,Missing


In [31]:
# With how='all' a row is dropped only when every one of its values is missing,
# so rows that still hold at least one real value are kept. Here only row 5 is
# removed, which is why more rows come back than with how='any'.
df.dropna(axis='index', how='all')

Unnamed: 0,FirstName,LastName,Email,age
0,John,Jingle,johnjingle33@fakeemail.com,33
1,Jacob,Heimer,jacobheimer@fakeemail.com,55
2,Jennifer,Schmidt,Jennifer32@email.com,63
3,Jordan,Daniel,jdaniel@email.com,36
4,Jaqueline,Fishwall,jaqlfish@email.com,
6,,,,Missing
7,,Missing,,Missing


In [33]:
# axis='columns' applies the same rule to columns: how='any' drops each column
# that contains at least one missing value. Every column here has one, so only
# the index is left.
df.dropna(axis='columns', how='any')

0
1
2
3
4
5
6
7


In [35]:
# subset restricts the missing-value check to the listed columns; several
# columns can be passed at once.
# how='any' drops a row when LastName or Email is missing. With how='all' a row
# is dropped only when both are missing:
# df.dropna(axis='index', how='all', subset=['LastName','Email'])
df.dropna(axis='index', how='any', subset=['LastName','Email'])

Unnamed: 0,FirstName,LastName,Email,age
0,John,Jingle,johnjingle33@fakeemail.com,33
1,Jacob,Heimer,jacobheimer@fakeemail.com,55
2,Jennifer,Schmidt,Jennifer32@email.com,63
3,Jordan,Daniel,jdaniel@email.com,36
4,Jaqueline,Fishwall,jaqlfish@email.com,
7,,Missing,,Missing


In [36]:
# 'NA' and 'Missing' are ordinary strings, so pandas does not treat them as
# missing values. replace() swaps every exact match of the listed values for
# np.nan; passing both placeholders in one list handles them in a single pass.
# The result is assigned back to df rather than mutated with inplace=True.
df = df.replace(['NA', 'Missing'], np.nan)

In [38]:
# Boolean mask of the frame: True wherever a value is missing. Row 7 is now all
# True because the placeholder strings were replaced with NaN above.
df.isna()

Unnamed: 0,FirstName,LastName,Email,age
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,True
5,True,True,True,True
6,True,True,True,True
7,True,True,True,True


In [39]:
# fillna() returns a copy with every missing value replaced by the given value;
# df itself is left unchanged.
df.fillna('MISSING')

Unnamed: 0,FirstName,LastName,Email,age
0,John,Jingle,johnjingle33@fakeemail.com,33
1,Jacob,Heimer,jacobheimer@fakeemail.com,55
2,Jennifer,Schmidt,Jennifer32@email.com,63
3,Jordan,Daniel,jdaniel@email.com,36
4,Jaqueline,Fishwall,jaqlfish@email.com,MISSING
5,MISSING,MISSING,MISSING,MISSING
6,MISSING,MISSING,MISSING,MISSING
7,MISSING,MISSING,MISSING,MISSING


In [40]:
# Same idea with a numeric fill value. age still holds strings at this point,
# so the filled column mixes strings with the integer 0.
df.fillna(0)

Unnamed: 0,FirstName,LastName,Email,age
0,John,Jingle,johnjingle33@fakeemail.com,33
1,Jacob,Heimer,jacobheimer@fakeemail.com,55
2,Jennifer,Schmidt,Jennifer32@email.com,63
3,Jordan,Daniel,jdaniel@email.com,36
4,Jaqueline,Fishwall,jaqlfish@email.com,0
5,0,0,0,0
6,0,0,0,0
7,0,0,0,0


In [41]:
# Every column is object dtype. age is object too, since its values were
# entered as strings rather than numbers.
df.dtypes

FirstName    object
LastName     object
Email        object
age          object
dtype: object

In [42]:
# age is an object column holding strings such as '33', so .mean() fails:
# pandas ends up trying to add the strings together, which raises the
# TypeError below. The column has to be converted to a numeric dtype first.
df['age'].mean()

TypeError: can only concatenate str (not "int") to str

In [43]:
# np.nan is a float, so a column that contains NaN cannot be cast to a plain
# int dtype.
type(np.nan)

float

In [45]:
# Casting age to int would not work because the column contains NaN, which is
# a float. We could replace the NaN values with 0 first, but that would skew
# the average. Instead, cast the column to float, which can represent NaN.
df['age'] = df['age'].astype(float)

In [46]:
# Confirm that age is now float64 while the other columns remain object.
df.dtypes

FirstName     object
LastName      object
Email         object
age          float64
dtype: object

In [48]:
# Now the mean works. NaN entries are skipped, so this is the average of the
# four known ages: (33 + 55 + 63 + 36) / 4.
df['age'].mean()

46.75

In [None]:
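# DataFrame.astype(dtype) casts every column in the frame at once, e.g. to float.
# That only works when every column can be converted; here the name and email
# columns hold text, so it would fail.
# df.astype(float)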