**Types of Missing Data**
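- MCAR (Missing Completely At Random): the chance of a value being missing is unrelated to any data, observed or unobserved
- MAR (Missing At Random): the chance of a value being missing depends only on other observed variables
- MNAR (Missing Not At Random): the chance of a value being missing depends on the unobserved value itself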

## Removing missing data

In [13]:
# Drop columns with missing data
import numpy as np
import pandas as pd
from IPython.display import display, HTML
# Small 3x4 example matrix containing two NaN entries
data = np.array(
    [
        [1, 2, np.nan, 4],
        [5, np.nan, 7, 8],
        [9, 10, 11, 12],
    ]
)
df = pd.DataFrame(data)
display(df)

# A column is kept only when every one of its entries is a real number
mask = (~np.isnan(data)).all(axis=0)
data_cleaned = data[:, mask]

print(data_cleaned)

Unnamed: 0,0,1,2,3
0,1.0,2.0,,4.0
1,5.0,,7.0,8.0
2,9.0,10.0,11.0,12.0


[[ 1.  4.]
 [ 5.  8.]
 [ 9. 12.]]


In [14]:
# Drop rows with missing data
import numpy as np
# Same 3x4 example matrix; only the last row is free of NaN
data = np.array(
    [
        [1, 2, np.nan, 4],
        [5, np.nan, 7, 8],
        [9, 10, 11, 12],
    ]
)

# True for every row that contains no NaN at all
mask = np.logical_not(np.isnan(data).any(axis=1))
data_cleaned = data[mask, :]

print(data_cleaned)

[[ 9. 10. 11. 12.]]


## Imputation

In [15]:
# Setup
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris

# Build a DataFrame from the iris feature matrix, one column per feature name
iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names)

# Blank out 'sepal length (cm)' wherever a uniform draw falls below 0.1
# (roughly 10% of rows); the fixed seed makes the selection reproducible
np.random.seed(42)
mask = np.random.rand(data.shape[0]) < 0.1
data.loc[mask, 'sepal length (cm)'] = np.nan

print("Dataset with missing values:", data.head(10), sep="\n")

Dataset with missing values:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2
5                5.4               3.9                1.7               0.4
6                NaN               3.4                1.4               0.3
7                5.0               3.4                1.5               0.2
8                4.4               2.9                1.4               0.2
9                4.9               3.1                1.5               0.1


In [16]:
from sklearn.impute import SimpleImputer

# Replace each missing 'sepal length (cm)' with the mean of the observed values.
# Work on a copy so the frame with NaNs stays available to later cells.
imputer = SimpleImputer(strategy='mean')
data_mean_imputed = data.copy()
data_mean_imputed[['sepal length (cm)']] = imputer.fit_transform(
    data_mean_imputed[['sepal length (cm)']]
)
print("After mean imputation:", data_mean_imputed.head(10), sep="\n")

After mean imputation:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0           5.100000               3.5                1.4               0.2
1           4.900000               3.0                1.4               0.2
2           4.700000               3.2                1.3               0.2
3           4.600000               3.1                1.5               0.2
4           5.000000               3.6                1.4               0.2
5           5.400000               3.9                1.7               0.4
6           5.843182               3.4                1.4               0.3
7           5.000000               3.4                1.5               0.2
8           4.400000               2.9                1.4               0.2
9           4.900000               3.1                1.5               0.1


In [17]:
# Replace each missing 'sepal length (cm)' with the constant 0.
# The imputer returns an (n, 1) array, flattened here before it is stored as the column.
imputer = SimpleImputer(strategy='constant', fill_value=0)
data_const_imputed = data.copy()
data_const_imputed['sepal length (cm)'] = imputer.fit_transform(
    data[['sepal length (cm)']]
).ravel()
print("After constant imputation (0):", data_const_imputed.head(10), sep="\n")

After constant imputation (0):
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2
5                5.4               3.9                1.7               0.4
6                0.0               3.4                1.4               0.3
7                5.0               3.4                1.5               0.2
8                4.4               2.9                1.4               0.2
9                4.9               3.1                1.5               0.1


## Model-based imputation
### Regression Imputation
Use a regression model, trained on the rows where the value is observed, to predict the missing values from the other features.

In [None]:
# Regression imputation: fit a linear model on the rows where
# 'sepal length (cm)' is observed, then predict it for the rows where it is missing.

# LinearRegression is not imported by any earlier cell, so import it here;
# without this the cell fails with NameError on a fresh kernel.
from sklearn.linear_model import LinearRegression

# Load iris dataset
iris = load_iris()
data = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Introduce missing values randomly (roughly 10% missing in 'sepal length (cm)');
# the fixed seed makes the selection of rows reproducible
np.random.seed(42)
mask = np.random.rand(len(data)) < 0.1
data.loc[mask, 'sepal length (cm)'] = np.nan

print("Dataset with missing values:")
print(data.head(10))

# Split data into complete and missing
is_missing = data['sepal length (cm)'].isna()
complete_data = data.dropna()
missing_data = data[is_missing]

# Train regression model: the other three features predict sepal length
X_train = complete_data.drop(columns=['sepal length (cm)'])
y_train = complete_data['sepal length (cm)']
model = LinearRegression()
model.fit(X_train, y_train)

# Predict missing values
X_missing = missing_data.drop(columns=['sepal length (cm)'])
predicted_values = model.predict(X_missing)

# Fill in predictions on a copy. predicted_values is in the row order of
# missing_data, which was selected with the same boolean mask used here,
# so each prediction lands on the row it was computed for.
data_regression_imputed = data.copy()
data_regression_imputed.loc[is_missing, 'sepal length (cm)'] = predicted_values
print("After regression imputation:")
print(data_regression_imputed.head(10))

Dataset with missing values:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2
5                5.4               3.9                1.7               0.4
6                NaN               3.4                1.4               0.3
7                5.0               3.4                1.5               0.2
8                4.4               2.9                1.4               0.2
9                4.9               3.1                1.5               0.1
After regression imputation:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0           5.100000          