In [1]:
import pandas as pd
from io import StringIO

import sys

## Step 1: Read CSV data into a pandas DataFrame

In [2]:
# Sample table in CSV form. The two empty fields (column C in the second
# data row, column D in the third) are parsed as NaN.
csv_data = '''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,'''

# Wrap the text in an in-memory buffer so read_csv can treat it like a file.
with StringIO(csv_data) as csv_stream:
    df = pd.read_csv(csv_stream)

df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


## Step 2: Count the missing values in each column

In [3]:
# isnull() marks every NaN cell as True; summing down the rows then gives
# the number of missing entries per column.
missing_values = df.isnull().sum(axis='index')
missing_values

A    0
B    0
C    1
D    1
dtype: int64

## Step 3: Access the underlying NumPy array via the `values` attribute

In [4]:
# `.values` exposes the frame's data as a plain NumPy array; missing
# entries show up as nan in it.
# NOTE(review): the pandas documentation recommends `DataFrame.to_numpy()`
# over `.values` for new code - kept here because this step demonstrates
# the attribute itself.
numpy_arr = df.values
numpy_arr

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

## Step 4: Remove rows from df that contain missing values

In [5]:
# Discard every row that has a NaN in any column; `df` itself is untouched
# because dropna returns a new frame.
drop_row = df.dropna(axis='index', how='any')
drop_row

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


## Step 5: Remove columns from df that contain missing values

In [6]:
# Discard every column that has a NaN in any row, leaving only the
# fully populated columns.
drop_col = df.dropna(axis='columns', how='any')
drop_col

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


## Step 6: Only drop rows where all columns are NaN

In [7]:
# how='all' removes a row only when every one of its values is NaN.
# `df` has no such row, so the result is the same as the input.
drop_all = df.dropna(axis='index', how='all')
drop_all

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [8]:
# Second sample table: column D is empty throughout and the last data row
# consists of nothing but empty fields, i.e. it is entirely NaN.
csv_data2 = '''A,B,C,D
1.0,2.0,3.0,
5.0,6.0,,
,,,'''

# Wrap the text in an in-memory buffer so read_csv can treat it like a file.
with StringIO(csv_data2) as csv_stream2:
    df2 = pd.read_csv(csv_stream2)

df2

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,
1,5.0,6.0,,
2,,,,


In [9]:
# The last row of `df2` is NaN in every column, so how='all' removes it;
# rows that are only partly missing are kept.
drop_all2 = df2.dropna(axis='index', how='all')
drop_all2

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,
1,5.0,6.0,,


## Step 7: Drop rows that have fewer than 3 non-NaN values

In [10]:
# thresh=3 keeps a row only if it holds at least 3 non-NaN values.
# Every row of `df` meets that bar, so nothing is removed here.
drop_thresh = df.dropna(axis='index', thresh=3)
drop_thresh

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [11]:
# Third sample table: column D is empty in every row, and the middle row
# is additionally missing its C value (only 2 real values remain).
csv_data3 = '''A,B,C,D
1.0,2.0,3.0,
5.0,6.0,,
10.0,11.0,12.0,'''

# Wrap the text in an in-memory buffer so read_csv can treat it like a file.
with StringIO(csv_data3) as csv_stream3:
    df3 = pd.read_csv(csv_stream3)

df3

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,
1,5.0,6.0,,
2,10.0,11.0,12.0,


In [12]:
# With thresh=3 the middle row of `df3` (only 2 non-NaN values) is dropped,
# while the rows holding 3 real values survive.
drop_thresh2 = df3.dropna(axis='index', thresh=3)
drop_thresh2

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,
2,10.0,11.0,12.0,


## Step 8: Only drop rows where NaN appears in specific columns (here: 'C')

In [13]:
# Restrict the NaN check to column 'C': a row is dropped only when its C
# value is missing, and NaNs in other columns are ignored.
drop_subset = df.dropna(axis='index', subset=['C'])
drop_subset

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


## B. Imputing missing values
### Step 1: Impute missing values via the column mean

In [14]:
from sklearn.impute import SimpleImputer
import numpy as np

In [15]:
# Replace every NaN with the mean of the non-missing values in its column.
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')

# fit_transform learns the per-column means and fills the gaps in one call,
# rather than running a separate fit() and transform() over the same array.
# `imputer` is still left fitted for any later transform() calls.
imputed_values = imputer.fit_transform(df.values)

# Carry over the row index along with the column labels so the imputed
# frame stays aligned with `df` even when its index is not the default 0..n-1
# (for example after rows have been dropped).
imputed = pd.DataFrame(imputed_values, index=df.index, columns=df.columns)

imputed

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,7.5,8.0
2,10.0,11.0,12.0,6.0
