In [29]:
# clean up datasets
import pandas as pd
from io import StringIO

# Small in-memory CSV with two blank cells (row 1 / column C and
# row 2 / column D); read_csv parses those blanks as NaN.
csv_data = '''
A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,
'''
df = pd.read_csv(StringIO(csv_data))

# Element-wise mask: True wherever a cell is missing.
nan_mask = df.isnull()
# Missing cells per column (each True counts as 1).
nan_per_column = nan_mask.sum()

print(df, '\n')
print("List boolean value showing if value is NaN:\n", nan_mask, '\n')
print("sum of NaN values in a column\n", nan_per_column, '\n')
# .isnull()
# ---------
# Returns a boolean DataFrame of the same shape as the original:
# True wherever a value is missing (NaN), False everywhere else

      A     B     C    D
0   1.0   2.0   3.0  4.0
1   5.0   6.0   NaN  8.0
2  10.0  11.0  12.0  NaN 

List boolean value showing if value is NaN:
        A      B      C      D
0  False  False  False  False
1  False  False   True  False
2  False  False  False   True 

sum of NaN values in a column
 A    0
B    0
C    1
D    1
dtype: int64 



In [30]:
# Method 1: delete empty data
# Each dropna() call returns a new frame that is only printed here;
# df itself is never reassigned, so every call starts from the same data.
print("delete whole rows that have NaN:\n", df.dropna(axis=0), '\n')
print("delete whole columns that have NaN:\n", df.dropna(axis=1), '\n')
print("delete rows whose values are all NaN:\n", df.dropna(how='all'), '\n')  # in this situation delete nothing
# thresh=4 keeps only rows with at least 4 non-NaN values (i.e. all four columns).
print("delete whole rows that have less than 'thresh' real values:\n", df.dropna(thresh=4), '\n')
# subset is given as a list: a bare string label is only accepted by newer
# pandas releases, while a list of labels works on every version.
print("delete whole rows that designated columns have NaN:\n", df.dropna(subset=['D']))
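# .dropna()
# ---------
# axis: drop rows (0) or columns (1) that contain NaN
# how: 'any' (default) drops when at least one value is NaN; 'all' drops only when every value is NaN
# thresh: minimum number of non-NaN values a row/column must have in order to be kept
# subset: only check these columns for NaN when deciding which rows to drop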

delete whole rows that have NaN:
      A    B    C    D
0  1.0  2.0  3.0  4.0 

delete whole columns that have NaN:
       A     B
0   1.0   2.0
1   5.0   6.0
2  10.0  11.0 

delete rows whose values are all NaN:
       A     B     C    D
0   1.0   2.0   3.0  4.0
1   5.0   6.0   NaN  8.0
2  10.0  11.0  12.0  NaN 

delete whole rows that have less than 'thresh' real values:
      A    B    C    D
0  1.0  2.0  3.0  4.0 

delete whole rows that designated columns have NaN:
      A    B    C    D
0  1.0  2.0  3.0  4.0
1  5.0  6.0  NaN  8.0


In [31]:
# Method 2: fill up empty data
from sklearn.impute import SimpleImputer
import numpy as np

# Replace every NaN with the mean of its column.
# Available strategies noted here: 'mean', 'median', 'most_frequent'.
imr = SimpleImputer(missing_values=np.nan, strategy='mean')
# .values hands the imputer the frame's underlying NumPy array;
# fit_transform learns the per-column statistic and fills the gaps in one step,
# leaving imr fitted for later reuse.
imputed_data = imr.fit_transform(df.values)
print(imputed_data)

[[ 1.   2.   3.   4. ]
 [ 5.   6.   7.5  8. ]
 [10.  11.  12.   6. ]]
