In [1]:
import pandas as pd
import numpy as np
from numpy import nan as NaN
# Remove pandas' display limits so frames are shown with all rows and all columns.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [2]:
# Small demo frame, written column by column; NaN marks a missing entry.
# Row 0 is complete, rows 1 and 2 are partially missing, row 3 is entirely missing.
data = pd.DataFrame(
    {
        'age': [12, 19, 17, NaN],
        'gender': ['man', 'woman', NaN, NaN],
        'phone': ['13980738665', NaN, NaN, NaN],
    }
)
data.head()

Unnamed: 0,age,gender,phone
0,12.0,man,13980738665.0
1,19.0,woman,
2,17.0,,
3,,,


In [3]:
# Load the UFO reports table from the local data directory and preview the first rows.
# NOTE(review): the path has no file extension — confirm it matches the file name on disk.
ufo = pd.read_csv('./data/uforeports')
ufo.head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,Ithaca,,TRIANGLE,NY,6/1/1930 22:00
1,Willingboro,,OTHER,NJ,6/30/1930 20:00
2,Holyoke,,OVAL,CO,2/15/1931 14:00
3,Abilene,,DISK,KS,6/1/1931 13:00
4,New York Worlds Fair,,LIGHT,NY,4/18/1933 19:00


In [4]:
# Cell-by-cell missing-value mask (True = missing); show the last rows.
ufo.isnull().tail()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
18236,False,True,False,False,False
18237,False,True,False,False,False
18238,False,True,True,False,False
18239,False,False,False,False,False
18240,False,True,False,False,False


In [5]:
# The inverse mask: True where a value is present, False where it is missing.
ufo.notnull().tail()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
18236,True,False,True,True,True
18237,True,False,True,True,True
18238,True,False,False,True,True
18239,True,True,True,True,True
18240,True,False,True,True,True


In [6]:
# Number of missing values in each column (summing the boolean mask counts the True cells).
ufo.isnull().sum()

City                  25
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
dtype: int64

In [7]:
# Same per-column count using isna(); the result matches the isnull() version.
ufo.isna().sum()

City                  25
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
dtype: int64

In [8]:
# Total number of missing cells in the whole frame:
# the first sum() counts per column, the second adds those counts together.
ufo.isna().sum().sum()

18028

In [43]:
# isna() is an alias for isnull(): the same boolean mask, shown for the first rows.
ufo.isna().head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,False,True,False,False,False
1,False,True,False,False,False
2,False,True,False,False,False
3,False,True,False,False,False
4,False,True,False,False,False


In [44]:
# notna() is an alias for notnull(): True where a value is present.
ufo.notna().head()

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
0,True,False,True,True,True
1,True,False,True,True,True
2,True,False,True,True,True
3,True,False,True,True,True
4,True,False,True,True,True


In [9]:
# axis = 0 sums down each column, giving the same per-column counts as the default sum().
ufo.isnull().sum(axis = 0)

City                  25
Colors Reported    15359
Shape Reported      2644
State                  0
Time                   0
dtype: int64

In [10]:
# Keep only the rows whose City value is missing.
ufo.loc[ufo['City'].isnull()]

Unnamed: 0,City,Colors Reported,Shape Reported,State,Time
21,,,,LA,8/15/1943 0:00
22,,,LIGHT,LA,8/15/1943 0:00
204,,,DISK,CA,7/15/1952 12:30
241,,BLUE,DISK,MT,7/4/1953 14:00
613,,,DISK,NV,7/1/1960 12:00
1877,,YELLOW,CIRCLE,AZ,8/15/1969 1:00
2013,,,,NH,8/1/1970 9:30
2546,,,FIREBALL,OH,10/25/1973 23:30
3123,,RED,TRIANGLE,WV,11/25/1975 23:00
4736,,,SPHERE,CA,6/23/1982 23:00


In [11]:
# Frequency of each reported shape; dropna= False keeps NaN as its own entry in the counts.
ufo['Shape Reported'].value_counts(dropna= False)

LIGHT        2803
NaN          2644
DISK         2122
TRIANGLE     1889
OTHER        1402
CIRCLE       1365
SPHERE       1054
FIREBALL     1039
OVAL          845
CIGAR         617
FORMATION     434
VARIOUS       333
RECTANGLE     303
CYLINDER      294
CHEVRON       248
DIAMOND       234
EGG           197
FLASH         188
TEARDROP      119
CONE           60
CROSS          36
DELTA           7
ROUND           2
CRESCENT        2
PYRAMID         1
FLARE           1
HEXAGON         1
DOME            1
Name: Shape Reported, dtype: int64

# Delete Rows which have NaN 

## Delete Rows with ALL NaN

In [12]:
# Drop rows (axis = 0) in which every column is NaN; the result is a new frame, `data` is not modified.
data.dropna(axis = 0, how = 'all')

Unnamed: 0,age,gender,phone
0,12.0,man,13980738665.0
1,19.0,woman,
2,17.0,,


In [13]:
# Drop rows in which every column is missing; .shape reports the remaining (rows, columns).
ufo.dropna(how = 'all').shape

(18241, 5)

In [14]:
# Only the City and 'Shape Reported' columns are inspected:
# a row is dropped when both of them are missing.
ufo.dropna(subset = ['City', 'Shape Reported'], how = 'all').shape

(18237, 5)

## Delete Rows with ANY NaN

In [15]:
# Drop rows (axis = 0) that contain at least one NaN; only fully populated rows remain.
data.dropna(axis = 0, how = 'any')

Unnamed: 0,age,gender,phone
0,12.0,man,13980738665


In [16]:
# Drop every row that has a missing value in at least one column; .shape reports what is left.
ufo.dropna(how = 'any').shape    

(2486, 5)

In [17]:
# Drop a row when City or 'Shape Reported' (or both) is missing; other columns are ignored.
ufo.dropna(subset = ['City', 'Shape Reported'], how = 'any').shape

(15576, 5)

#  Delete Columns which have NaN

## Delete Columns with ALL NaN

In [18]:
# Drop columns (axis = 1) whose values are all NaN; no column qualifies here, so nothing is removed.
data.dropna(axis = 1, how = 'all')

Unnamed: 0,age,gender,phone
0,12.0,man,13980738665.0
1,19.0,woman,
2,17.0,,
3,,,


## Delete Columns with ANY NaN

In [19]:
# Drop columns (axis = 1) that contain any NaN; every column has one, so only the index is left.
data.dropna(axis = 1, how = 'any')

0
1
2
3


# Fill NaN


In [20]:
# Replace every NaN with 0 in all columns; the result is a new frame, `data` is not modified.
data.fillna(0)

Unnamed: 0,age,gender,phone
0,12.0,man,13980738665
1,19.0,woman,0
2,17.0,0,0
3,0.0,0,0


In [21]:
# Inspect the dtype of each column.
data.dtypes

age       float64
gender     object
phone      object
dtype: object

In [22]:
# Fill missing values column by column: a default gender and a placeholder phone number.
# The placeholder is a string because the existing phone values are strings (the column
# dtype is object); an int placeholder would leave the column holding mixed str/int values.
# Columns not listed in the dict (age) keep their NaN.
data.fillna({'gender': 'man', 'phone': '13000000000'})

Unnamed: 0,age,gender,phone
0,12.0,man,13980738665
1,19.0,woman,13000000000
2,17.0,man,13000000000
3,,man,13000000000


In [23]:
# Replace missing shapes with "VARIOUS" by assigning the filled column back to the frame.
# Calling fillna(inplace = True) on a column selected via ufo[...] is chained assignment:
# it is deprecated in pandas 2.x and does not update `ufo` when Copy-on-Write is active.
ufo['Shape Reported'] = ufo['Shape Reported'].fillna(value = "VARIOUS")

In [24]:
# Re-count missing values per column to confirm that 'Shape Reported' has none left.
ufo.isnull().sum()

City                  25
Colors Reported    15359
Shape Reported         0
State                  0
Time                   0
dtype: int64

#  Find Rows with NaN

In [25]:
# 5x4 grid holding 0.0 .. 19.0, with two missing cells in the last column (rows 3 and 4).
n = np.arange(20, dtype=float).reshape(5, 4)
n[[2, 3], 3] = NaN

# Label rows r1..r5 and columns a..d.
idx = [f'r{row_number}' for row_number in range(1, 6)]
cols = list('abcd')

data = pd.DataFrame(n, index=idx, columns=cols)
data.head()

Unnamed: 0,a,b,c,d
r1,0.0,1.0,2.0,3.0
r2,4.0,5.0,6.0,7.0
r3,8.0,9.0,10.0,
r4,12.0,13.0,14.0,
r5,16.0,17.0,18.0,19.0


In [26]:
# Boolean mask of the frame: True marks a missing cell.
data.isnull()

Unnamed: 0,a,b,c,d
r1,False,False,False,False
r2,False,False,False,False
r3,False,False,False,True
r4,False,False,False,True
r5,False,False,False,False


## Solution 1. Drawback: a row is output more than once if it contains multiple NaN values

In [27]:
# Select rows using the 2-D boolean array of missing cells.
# Drawback (see the section heading): a row is returned once per NaN it contains, so a row
# with several NaN values would be repeated. Here r3 and r4 hold one NaN each, so each appears once.
data[data.isnull().values == True]  # a row would repeat once per NaN it contains

Unnamed: 0,a,b,c,d
r3,8.0,9.0,10.0,
r4,12.0,13.0,14.0,


In [28]:
# Missing-value mask for column 'd' only: one boolean per row.
data['d'].isnull()

r1    False
r2    False
r3     True
r4     True
r5    False
Name: d, dtype: bool

In [29]:
# Rows whose 'd' value is missing. The boolean Series is used directly as the row mask;
# converting it with .values and comparing against True is redundant.
data[data['d'].isnull()]

Unnamed: 0,a,b,c,d
r3,8.0,9.0,10.0,
r4,12.0,13.0,14.0,


In [30]:
# Show the frame again; the selections so far returned new objects and left `data` as it was.
data.head()

Unnamed: 0,a,b,c,d
r1,0.0,1.0,2.0,3.0
r2,4.0,5.0,6.0,7.0
r3,8.0,9.0,10.0,
r4,12.0,13.0,14.0,
r5,16.0,17.0,18.0,19.0


## Solution 2: better solution


In [31]:
# Column-wise check: True for each column that contains at least one NaN.
data.isnull().any()

a    False
b    False
c    False
d     True
dtype: bool

In [32]:
# Row-wise check: True for each row that contains at least one NaN.
# any(axis = 1) reduces across the columns directly, so the frame does not need transposing.
data.isnull().any(axis = 1)

r1    False
r2    False
r3     True
r4     True
r5    False
dtype: bool

In [33]:
# Select every row that contains at least one NaN. The mask has one boolean per row,
# so each matching row appears exactly once. any(axis = 1) avoids transposing the frame.
data[data.isnull().any(axis = 1)]

Unnamed: 0,a,b,c,d
r3,8.0,9.0,10.0,
r4,12.0,13.0,14.0,


In [34]:
# Index labels of the rows that contain at least one NaN, as a plain Python list.
# The row mask is applied to the index itself, so no intermediate frame is built.
data.index[data.isnull().any(axis = 1)].tolist()

['r3', 'r4']

### Get the row index with NaN

In [35]:
# How many rows have a value in 'd' (False) versus a missing one (True).
data['d'].isnull().value_counts()

False    3
True     2
Name: d, dtype: int64

In [36]:
# Column dtypes before the missing values in 'd' are filled.
data.dtypes

a    float64
b    float64
c    float64
d    float64
dtype: object

In [37]:
# Replace NaN in column 'd' with the placeholder -1; this overwrites the column inside `data`.
# NOTE(review): assumes -1 never occurs as a genuine value in 'd' — confirm before reusing on other data.
data['d'] = data['d'].fillna(-1)
data.head()

Unnamed: 0,a,b,c,d
r1,0.0,1.0,2.0,3.0
r2,4.0,5.0,6.0,7.0
r3,8.0,9.0,10.0,-1.0
r4,12.0,13.0,14.0,-1.0
r5,16.0,17.0,18.0,19.0


In [38]:
# Column dtypes after the fill: 'd' is still float64.
data.dtypes

a    float64
b    float64
c    float64
d    float64
dtype: object

In [39]:
# Row labels where 'd' holds the -1 placeholder.
null_index = data.index[data['d'].eq(-1)].tolist()
null_index

['r3', 'r4']

In [40]:
# Remove the rows listed in null_index; drop() returns a new frame and leaves `data` untouched.
# The former bare `new_data.reset_index()` call was removed: reset_index() returns a new
# frame instead of modifying new_data, so with its result discarded it had no effect.
# The original row labels are therefore kept.
new_data = data.drop(null_index)
new_data.head()

Unnamed: 0,a,b,c,d
r1,0.0,1.0,2.0,3.0
r2,4.0,5.0,6.0,7.0
r5,16.0,17.0,18.0,19.0


In [41]:
# Column dtypes of the filtered frame.
new_data.dtypes

a    float64
b    float64
c    float64
d    float64
dtype: object

In [42]:
# Keep only column 'd', cast to float, and preview it.
# NOTE(review): this rebinds `new_data` from a DataFrame to a Series, so running the cell a
# second time would presumably fail on the ['d'] lookup — consider a separate name.
# 'd' is already float64 at this point, so astype(float) does not change the values here.
new_data = new_data['d'].astype(float)
new_data.head()

r1     3.0
r2     7.0
r5    19.0
Name: d, dtype: float64