In [None]:
import pandas as pd
import numpy as np
from io import StringIO

# Build a tiny in-memory CSV fixture.
# Two fields are left empty on purpose (column C in the second data row and
# column D in the third), which pandas parses as NaN.
# The text starts with a newline, exactly like the triple-quoted form would.
csv_data = (
    "\n"
    "A,B,C,D\n"
    "1.0,2.0,3.0,4.0\n"
    "5.0,6.0,,8.0\n"
    "10.0,11.0,12.0,\n"
)

# Wrap the text in a file-like buffer and parse it into a pandas DataFrame
df = pd.read_csv(filepath_or_buffer=StringIO(csv_data))

# show the dataset
def show_dataset(df):
    return df.style.background_gradient(cmap='Greens').highlight_null('orange').format("{:.2f}")

# Render the freshly loaded frame; as the last expression of the cell, the
# styled table becomes the cell's output
show_dataset(df)

# Removing missing values

In [None]:
# =============================================================================
# Identify columns containing missing values
# =============================================================================
# Build the boolean "is missing" mask and sum it down the rows, which yields
# the number of NaN entries per column
df.isnull().sum(axis=0)


In [None]:
# ===========================================
# Access numpy array from pandas data frame
# ============================================
# access the frame's data as a NumPy array
# via `to_numpy()`, which the pandas documentation recommends
# over the older `values` attribute
df_array = df.to_numpy()
print(df_array)

In [None]:
# show initial dataset again, as a reference point for the dropna() variants
# demonstrated in the following cells
show_dataset(df)

In [None]:
# ===========================================
# (A) Remove rows that contain missing values
# ============================================
# axis="index" is the label form of axis=0: rows are the unit being dropped
a = df.dropna(axis="index")

# render the filtered frame
show_dataset(a)

In [None]:
# ===========================================
# (B) Remove columns that contain missing values
# ============================================
# axis="columns" is the label form of axis=1: columns are the unit being dropped
b = df.dropna(axis="columns")

# render the filtered frame
show_dataset(b)

In [None]:
# make one sample completely empty:
# work on a copy so `df` stays untouched, then overwrite every column
# of the row labelled 2 with NaN
modified_df = df.copy()
modified_df.loc[2, :] = np.nan
show_dataset(modified_df)

In [None]:
# ===========================================
# (C) Remove row where all columns are NaN
# ============================================
# how="all" drops a row only if every one of its values is missing
c = modified_df.dropna(axis="index", how="all")

# render the filtered frame
show_dataset(c)

In [None]:
# ===========================================
# (D) Remove rows that have fewer than 4 non-missing values
# ============================================
# thresh=4 keeps only the rows holding at least 4 non-NaN entries
d = df.dropna(axis="index", thresh=4)

# render the filtered frame
show_dataset(d)

In [None]:
# ===========================================
# (E) Only drop rows where NaN appear in specific columns (here: 'C')
# ============================================
# subset restricts the NaN check to the listed columns; other columns may
# still contain missing values afterwards
f = df.dropna(axis="index", subset=["C"])

# render the filtered frame
show_dataset(f)

# Imputing missing values

In [None]:
# Display the original frame (NaNs still present) as the starting point for imputation
show_dataset(df)

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Impute missing values using the column mean
imputer = SimpleImputer(missing_values=np.nan, strategy='mean') # other popular choices: "median", "most_frequent"
# fit_transform learns the per-column statistics and fills the NaNs in one call
imputed_data = imputer.fit_transform(df.values)

# show the dataset
# note that the output of the SimpleImputer is a NumPy array
# so we need to convert it back to a pandas DataFrame to use our helper function;
# the original column labels and index are passed along explicitly, otherwise
# the table would be displayed with default 0..n-1 labels instead of A-D
# NOTE(review): this relies on the imputer returning one output column per
# input column -- confirm if the data could contain an all-NaN column
show_dataset(pd.DataFrame(imputed_data, columns=df.columns, index=df.index))