In [1]:
import pandas as pd
import numpy as np
from numpy import nan as NaN
# Lift pandas' display truncation so every row and column is rendered.
pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [2]:
# Demo frame: each successive row has one more missing value,
# and the last row is entirely NaN.
data = pd.DataFrame(
    data=[
        [12, 'man', '13980738665'],
        [19, 'woman', np.nan],
        [17, np.nan, np.nan],
        [np.nan, np.nan, np.nan],
    ],
    columns=['age', 'gender', 'phone'],
)
data.head()

Unnamed: 0,age,gender,phone
0,12.0,man,13980738665.0
1,19.0,woman,
2,17.0,,
3,,,


# Delete Rows which have NaN 

## Delete Rows with ALL NaN

In [3]:
# Drop only the rows in which every column is NaN.
data.dropna(how='all', axis='index')

Unnamed: 0,age,gender,phone
0,12.0,man,13980738665.0
1,19.0,woman,
2,17.0,,


## Delete Rows with ANY NaN

In [4]:
# Drop every row that contains at least one NaN.
data.dropna(how='any', axis='index')

Unnamed: 0,age,gender,phone
0,12.0,man,13980738665


#  Delete Columns which have NaN

## Delete Columns with ALL NaN

In [5]:
# Drop only the columns that are NaN in every row (axis given as an integer).
data.dropna(how='all', axis=1)

Unnamed: 0,age,gender,phone
0,12.0,man,13980738665.0
1,19.0,woman,
2,17.0,,
3,,,


In [6]:
# Same operation as with axis=1, with the axis given by name.
data.dropna(how='all', axis='columns')

Unnamed: 0,age,gender,phone
0,12.0,man,13980738665.0
1,19.0,woman,
2,17.0,,
3,,,


## Delete Columns with ANY NaN

In [7]:
# Drop every column that contains at least one NaN; here all three columns
# have one, so only the index remains.
data.dropna(how='any', axis='columns')

0
1
2
3


# Fill NaN


In [8]:
# Replace every NaN with the scalar 0 (the result is displayed, not assigned back).
data.fillna(value=0)

Unnamed: 0,age,gender,phone
0,12.0,man,13980738665
1,19.0,woman,0
2,17.0,0,0
3,0.0,0,0


In [9]:
# Column dtypes of `data`; the fillna result above was not assigned back, so `data` is unchanged.
data.dtypes

age       float64
gender     object
phone      object
dtype: object

In [10]:
# Fill NaN per column with a column-specific default. Columns not listed
# ('age') keep their NaN. The 'phone' column stores strings, so its default is
# a string too; an int default would leave mixed str/int values in the column.
data.fillna({'gender': 'man', 'phone': '13000000000'})

Unnamed: 0,age,gender,phone
0,12.0,man,13980738665
1,19.0,woman,13000000000
2,17.0,man,13000000000
3,,man,13000000000


#  Find Rows with NaN

In [11]:
# 5x4 float grid holding 0..19, with NaN placed in the last column of the
# third and fourth rows.
n = np.arange(20, dtype=float).reshape(5, 4)
n[2:4, 3] = np.nan

idx = [f'r{i}' for i in range(1, 6)]
cols = list('abcd')

data = pd.DataFrame(n, index=idx, columns=cols)
data.head()

Unnamed: 0,a,b,c,d
r1,0.0,1.0,2.0,3.0
r2,4.0,5.0,6.0,7.0
r3,8.0,9.0,10.0,
r4,12.0,13.0,14.0,
r5,16.0,17.0,18.0,19.0


In [12]:
# Element-wise mask: True wherever a cell is NaN.
data.isna()

Unnamed: 0,a,b,c,d
r1,False,False,False,False
r2,False,False,False,False
r3,False,False,False,True
r4,False,False,False,True
r5,False,False,False,False


## Solution 1 (drawback: a row is returned once for each NaN it contains, so rows with several NaNs appear multiple times)

In [13]:
# Index with the full 2-D boolean array. Per the note above, a row is emitted
# once per NaN it holds; here each affected row has a single NaN, so no
# duplicates show up.
data[data.isnull().values]

Unnamed: 0,a,b,c,d
r3,8.0,9.0,10.0,
r4,12.0,13.0,14.0,


In [14]:
# Boolean Series marking which rows have NaN in column 'd'.
data['d'].isna()

r1    False
r2    False
r3     True
r4     True
r5    False
Name: d, dtype: bool

In [15]:
# Keep only the rows whose 'd' value is NaN (1-D mask, one entry per row).
data[data['d'].isnull().values]

Unnamed: 0,a,b,c,d
r3,8.0,9.0,10.0,
r4,12.0,13.0,14.0,


In [16]:
# Re-display the frame; the filters above did not assign anything back to `data`.
data.head()

Unnamed: 0,a,b,c,d
r1,0.0,1.0,2.0,3.0
r2,4.0,5.0,6.0,7.0
r3,8.0,9.0,10.0,
r4,12.0,13.0,14.0,
r5,16.0,17.0,18.0,19.0


## Solution 2: better solution


In [17]:
# Per column: does it contain at least one NaN?
data.isna().any(axis=0)

a    False
b    False
c    False
d     True
dtype: bool

In [18]:
# Per row: does it contain at least one NaN? Reducing along axis=1 gives the
# same Series as transposing first.
data.isnull().any(axis=1)

r1    False
r2    False
r3     True
r4     True
r5    False
dtype: bool

In [19]:
# Select each row that has at least one NaN; the 1-D row mask yields every
# matching row exactly once.
data[data.isnull().any(axis=1)]

Unnamed: 0,a,b,c,d
r3,8.0,9.0,10.0,
r4,12.0,13.0,14.0,


In [20]:
# Labels of the rows that contain at least one NaN. The row mask is applied to
# the index directly, so no transpose or intermediate filtered frame is needed.
data.index[data.isnull().any(axis=1)].tolist()

['r3', 'r4']

### Get the row index with NaN

In [21]:
# Count how many entries of 'd' are NaN (True) versus present (False).
data['d'].isna().value_counts()

False    3
True     2
Name: d, dtype: int64

In [22]:
# Column dtypes before the NaNs in 'd' are replaced.
data.dtypes

a    float64
b    float64
c    float64
d    float64
dtype: object

In [23]:
# Overwrite column 'd' in place, replacing NaN with the sentinel -1.
# NOTE: this mutates `data`, so re-running earlier NaN-detection cells
# afterwards will no longer find missing values in 'd'.
data['d'] = data['d'].fillna(value=-1)
data.head()

Unnamed: 0,a,b,c,d
r1,0.0,1.0,2.0,3.0
r2,4.0,5.0,6.0,7.0
r3,8.0,9.0,10.0,-1.0
r4,12.0,13.0,14.0,-1.0
r5,16.0,17.0,18.0,19.0


In [24]:
# Column dtypes after filling 'd'; the -1 sentinel is stored as a float, so 'd' stays float64.
data.dtypes

a    float64
b    float64
c    float64
d    float64
dtype: object

In [25]:
# Row labels where 'd' equals the -1 sentinel written by the fill step.
# NOTE: any genuine -1 value in 'd' would match as well.
null_index = data.index[data['d'] == -1].tolist()
null_index

['r3', 'r4']

In [26]:
# Remove the rows flagged in `null_index`; the surviving rows keep their
# original labels. The former bare `new_data.reset_index()` call discarded its
# return value and had no effect, so it was removed. To renumber the rows,
# assign the result back: new_data = new_data.reset_index(drop=True).
new_data = data.drop(null_index)
new_data.head()

Unnamed: 0,a,b,c,d
r1,0.0,1.0,2.0,3.0
r2,4.0,5.0,6.0,7.0
r5,16.0,17.0,18.0,19.0


In [27]:
# Column dtypes of the frame left after dropping the flagged rows.
new_data.dtypes

a    float64
b    float64
c    float64
d    float64
dtype: object

In [28]:
# Take column 'd' as a float64 Series and rebind `new_data` to it.
# NOTE: after this cell `new_data` is a Series, not a DataFrame, so running
# the cell a second time would look up 'd' in the Series' row labels.
new_data = new_data['d'].astype('float64')
new_data.head()

r1     3.0
r2     7.0
r5    19.0
Name: d, dtype: float64