In [1]:
# Missing data is a common problem in real-life scenarios.
# Let us see how we can handle missing values (NA or NaN) using pandas.

In [2]:
import pandas as pd
import numpy as np

In [4]:
# Build a 5x3 frame of standard-normal draws labelled a, c, e, f, h, then
# reindex it onto the full label set a..h. Labels with no source row
# (b, d, g) come back as all-NaN rows, which gives us missing data to work with.
# NOTE: the values are random draws, so the numbers change on every run
# unless a seed has been set beforehand.
df1 = pd.DataFrame(
    np.random.randn(5, 3),
    index=list('acefh'),
    columns=['one', 'two', 'three'],
).reindex(list('abcdefgh'))
print(df1)

        one       two     three
a -0.597408 -0.925170  0.142090
b       NaN       NaN       NaN
c -1.880841  1.929226 -1.787383
d       NaN       NaN       NaN
e  1.349597  0.929139  3.240791
f  1.139649 -0.138620  0.387304
g       NaN       NaN       NaN
h  0.349733 -0.286408  0.313719


In [5]:
# Detect missing values with isnull():
# the result is a boolean Series that is True wherever column 'one' is NaN.
print(df1.loc[:, 'one'].isnull())

a    False
b     True
c    False
d     True
e    False
f    False
g     True
h    False
Name: one, dtype: bool


In [6]:
# notnull() is the complement of isnull():
# True wherever column 'one' holds an actual (non-missing) value.
print(df1.loc[:, 'one'].notnull())

a     True
b    False
c     True
d    False
e     True
f     True
g    False
h     True
Name: one, dtype: bool


In [7]:
# Calculations with missing data
# When summing, NA values are skipped, i.e. they contribute zero to the total.
# If the data are all NA, the sum is therefore 0 as well (not NA).
# (sum() accepts a min_count argument if NaN is wanted in that case -- see the pandas docs.)
print(df1['one'].sum())

0.36073001594779885


In [9]:
# A frame built from labels alone carries no data, so every cell is NaN.
df2 = pd.DataFrame(columns=['one', 'two'], index=list(range(6)))
print(df2)
# Summing a column that is entirely NaN yields 0.
print("SUM of column 'one':")
print(df2.loc[:, 'one'].sum())

   one  two
0  NaN  NaN
1  NaN  NaN
2  NaN  NaN
3  NaN  NaN
4  NaN  NaN
5  NaN  NaN
SUM of column 'one':
0


In [12]:
# CLEANING / FILLING MISSING DATA

# Replace NaN with a scalar value.
# Start from a complete 3x3 frame of random draws labelled a, c, e.
df3 = pd.DataFrame(
    np.random.randn(3, 3),
    index=list('ace'),
    columns=['one', 'two', 'three'],
)
print(df3)
# Reindexing onto a, b, c drops row 'e' and introduces an all-NaN row 'b'.
df3 = df3.reindex(list('abc'))
print(df3)
print("NaN replaces with '0':")
# fillna() returns a new frame; df3 itself still contains the NaN row.
print(df3.fillna(value=0))

        one       two     three
a -0.564644 -0.862081 -1.713962
c  0.559825  2.060316 -0.125762
e -0.467817  0.473137  0.808570
        one       two     three
a -0.564644 -0.862081 -1.713962
b       NaN       NaN       NaN
c  0.559825  2.060316 -0.125762
NaN replaces with '0':
        one       two     three
a -0.564644 -0.862081 -1.713962
b  0.000000  0.000000  0.000000
c  0.559825  2.060316 -0.125762


In [14]:
# Fill NA forward and backward
#
# pad/ffill      - fill forward: carry the previous row's values into the gap
# bfill/backfill - fill backward: pull the following row's values into the gap
#
# DataFrame.ffill() is used instead of fillna(method='pad'): the `method`
# argument of fillna() is deprecated (FutureWarning in pandas 2.1+), and
# ffill() produces the same result.

print(df1)
print("Fill methods forward:")
print(df1.ffill())

        one       two     three
a -0.597408 -0.925170  0.142090
b       NaN       NaN       NaN
c -1.880841  1.929226 -1.787383
d       NaN       NaN       NaN
e  1.349597  0.929139  3.240791
f  1.139649 -0.138620  0.387304
g       NaN       NaN       NaN
h  0.349733 -0.286408  0.313719
Fill methods forward:
        one       two     three
a -0.597408 -0.925170  0.142090
b -0.597408 -0.925170  0.142090
c -1.880841  1.929226 -1.787383
d -1.880841  1.929226 -1.787383
e  1.349597  0.929139  3.240791
f  1.139649 -0.138620  0.387304
g  1.139649 -0.138620  0.387304
h  0.349733 -0.286408  0.313719


In [16]:
# Backward fill: each NaN row takes the values of the next valid row below it.
# DataFrame.bfill() is used instead of fillna(method='backfill'): the `method`
# argument of fillna() is deprecated (FutureWarning in pandas 2.1+), and
# bfill() produces the same result.
print("Fill methods backward:")
print(df1.bfill())

Fill methods backward:
        one       two     three
a -0.597408 -0.925170  0.142090
b -1.880841  1.929226 -1.787383
c -1.880841  1.929226 -1.787383
d  1.349597  0.929139  3.240791
e  1.349597  0.929139  3.240791
f  1.139649 -0.138620  0.387304
g  0.349733 -0.286408  0.313719
h  0.349733 -0.286408  0.313719


In [18]:
# DROP MISSING VALUES
# dropna() excludes missing data instead of filling it in.
# axis=0 (the default) works row-wise: if any value within a row is NA,
# the whole row is excluded from the result.
print(df1)
print("After dropping missing values along rows:")
print(df1.dropna(axis=0))

        one       two     three
a -0.597408 -0.925170  0.142090
b       NaN       NaN       NaN
c -1.880841  1.929226 -1.787383
d       NaN       NaN       NaN
e  1.349597  0.929139  3.240791
f  1.139649 -0.138620  0.387304
g       NaN       NaN       NaN
h  0.349733 -0.286408  0.313719
After dropping missing values along rows:
        one       two     three
a -0.597408 -0.925170  0.142090
c -1.880841  1.929226 -1.787383
e  1.349597  0.929139  3.240791
f  1.139649 -0.138620  0.387304
h  0.349733 -0.286408  0.313719


In [19]:
# axis=1 works column-wise: a column containing any NA is excluded entirely.
# Every column of df1 has NaN in rows b, d and g, so no column survives.
print("After dropping missing values along columns:")
print(df1.dropna(axis='columns'))

After dropping missing values along columns:
Empty DataFrame
Columns: []
Index: [a, b, c, d, e, f, g, h]


In [20]:
# Replace Missing (or) Generic Values:
# replace() swaps chosen values for others wherever they occur in the frame.
df4 = pd.DataFrame({
    'one': [10, 20, 30, 40, 50, 2000],
    'two': [1000, 0, 30, 40, 50, 60],
})
print(df4)

print("After replacing the values:")
# Mapping of old value -> new value: 1000 becomes 10 and 2000 becomes 60.
print(df4.replace(to_replace={1000: 10, 2000: 60}))

    one   two
0    10  1000
1    20     0
2    30    30
3    40    40
4    50    50
5  2000    60
After replacing the values:
   one  two
0   10   10
1   20    0
2   30   30
3   40   40
4   50   50
5   60   60
