## Handling Missing Numerical Values

####  Deleting Observations with Missing Values

In [2]:
import numpy as np

# Feature matrix: five observations with two features each.
# The first feature of the last observation is missing (NaN).
features = np.array([
    [1.1, 11.1],
    [2.2, 22.2],
    [3.3, 33.3],
    [4.4, 44.4],
    [np.nan, 55],
])

# Flag every row that contains at least one NaN, then display
# only the rows that are complete.
row_has_missing = np.isnan(features).any(axis=1)
features[~row_has_missing]

array([[ 1.1, 11.1],
       [ 2.2, 22.2],
       [ 3.3, 33.3],
       [ 4.4, 44.4]])

####  Deleting Observations with Missing Values with pandas

In [4]:
import pandas as pd

# Wrap the feature matrix in a DataFrame with named columns
df = pd.DataFrame(data=features, columns=["feat_1", "feat_2"])

# Note: deleting observations can introduce bias into our data, because
# we remove observations whose missingness may reflect an unobserved effect.

# Display only the observations (rows) that have no missing values
df.dropna(axis=0, how="any")

Unnamed: 0,feat_1,feat_2
0,1.1,11.1
1,2.2,22.2
2,3.3,33.3
3,4.4,44.4


#### Imputing Missing Values Using K-Nearest Neighbors

In [7]:
from fancyimpute import KNN
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_blobs

# Generate a synthetic feature matrix (100 samples, 2 features)
features, _ = make_blobs(n_samples=100,
                         n_features=2,
                         random_state=420)

# Standardize the features before imputing
scaler = StandardScaler()
standardized_features = scaler.fit_transform(features)

# Remember the true (standardized) value of the first entry, then replace it
# with NaN in the SAME matrix that is passed to the imputer, so the imputed
# value and the true value are on the same scale.
true_value = standardized_features[0, 0]
standardized_features[0, 0] = np.nan

# Mask the same entry in the raw matrix so both representations of the data
# have the identical missing entry.
features[0, 0] = np.nan

# Predict the missing value from its 5 nearest neighbours.
# `fit_transform` returns the completed matrix, so `features_knn_imputed`
# is an array (not the KNN solver object) and can be indexed below.
features_knn_imputed = KNN(k=5, verbose=0).fit_transform(standardized_features)

# Compare true and imputed values
print("True Value:", true_value)
print("Imputed Value:", features_knn_imputed[0, 0])

Further reading on KNN imputation: https://towardsdatascience.com/the-use-of-knn-for-missing-values-cf33d935c637

#### Impute Missing Values with Imputer

In [None]:
from sklearn.impute import SimpleImputer

# `true_value` was taken from `standardized_features`, so the mean imputation
# must run on the standardized matrix too; otherwise the two printed numbers
# are on different scales and cannot be compared.
# Work on a copy with entry [0, 0] masked, so this cell does not depend on
# whether that entry has already been set to NaN elsewhere.
features_with_missing = standardized_features.copy()
features_with_missing[0, 0] = np.nan

# Create an imputer that fills each missing entry with its column mean
mean_imputer = SimpleImputer(strategy="mean")

# Impute values
features_mean_imputed = mean_imputer.fit_transform(features_with_missing)

# Compare true and imputed values
print("True Value:", true_value)
print("Imputed Value:", features_mean_imputed[0, 0])