<a href="https://colab.research.google.com/github/thunguyen177/tutorials/blob/main/missing%20data%20imputation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Libraries and functions


In [2]:
# !pip install impyute
from sklearn import datasets
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from scipy import stats
import numpy as np
# import impyute as impy
# from fancyimpute import IterativeSVD, SoftImpute, NuclearNormMinimization
import pandas as pd
import time

In [None]:
# Toy 4x3 matrix used throughout the first sections; the third column is
# missing (NaN) in the last two rows.
Xstar = np.array([
    [168, 30, 200],
    [200, 45, 300],
    [180, 90, np.nan],
    [129, 70, np.nan],
])
Xstar

array([[168.,  30., 200.],
       [200.,  45., 300.],
       [180.,  90.,  nan],
       [129.,  70.,  nan]])

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer

# Two-step API: learn the per-column fill value (the mean) with fit(),
# then replace the NaNs with transform().
imputer = SimpleImputer(strategy='mean', missing_values=np.nan).fit(Xstar)
print(imputer.transform(Xstar))

[[168.  30. 200.]
 [200.  45. 300.]
 [180.  90. 250.]
 [129.  70. 250.]]


In [None]:
# Mean imputation again, this time fitting and transforming in a single call.
imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
imputer.fit_transform(Xstar)

array([[168.,  30., 200.],
       [200.,  45., 300.],
       [180.,  90., 250.],
       [129.,  70., 250.]])

In [None]:
# Median imputation: each NaN is replaced by the median of the observed
# values in its column.
imputer = SimpleImputer(strategy='median', missing_values=np.nan)
imputer.fit_transform(Xstar)

array([[168.,  30., 200.],
       [200.,  45., 300.],
       [180.,  90., 250.],
       [129.,  70., 250.]])

In [None]:
# Mode imputation: each NaN is replaced by the most frequent observed value
# in its column.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
imputer.fit_transform(Xstar)

array([[168.,  30., 200.],
       [200.,  45., 300.],
       [180.,  90., 200.],
       [129.,  70., 200.]])

## Data imputation with fancyimpute package

**SoftImpute** paper:     "Spectral Regularization Algorithms for Learning Large Incomplete Matrices"    by Mazumder, Hastie, and Tibshirani.

**BiScaler**: iterative estimation of row and column centering/scaling, using the algorithm from page 31 of "Matrix Completion and Low-Rank SVD via Fast Alternating Least Squares" by Hastie, Mazumder, Lee, and Zadeh.

In [None]:
!pip install fancyimpute
from fancyimpute import KNN, NuclearNormMinimization, SoftImpute, BiScaler 

In [None]:
# Nearest-neighbour imputation with a single neighbour (k=1); the completed
# matrix is printed above the original for comparison.
knn_imputer = KNN(k=1)
X = knn_imputer.fit_transform(Xstar)
print(X, '\n\n', Xstar)

Imputing row 1/4 with 0 missing, elapsed time: 0.001
[[168.  30. 200.]
 [200.  45. 300.]
 [180.  90. 300.]
 [129.  70. 200.]] 

 [[168.  30. 200.]
 [200.  45. 300.]
 [180.  90.  nan]
 [129.  70.  nan]]


In [None]:
# Centre/scale rows and columns with BiScaler first, then complete the matrix
# with SoftImpute (singular value thresholding). SoftImpute runs on the
# normalised matrix, so the printed result is in the normalised scale too.
biscaler = BiScaler()
X_incomplete_normalized = biscaler.fit_transform(Xstar)
softimpute = SoftImpute()
X = softimpute.fit_transform(X_incomplete_normalized)
print(X, '\n\n', X_incomplete_normalized)

[BiScaler] Initial log residual value = 12.065274
[BiScaler] Iter 1: log residual = -2.899786, log improvement ratio=14.965060
[BiScaler] Iter 2: log residual = -4.226582, log improvement ratio=1.326796
[BiScaler] Iter 3: log residual = -4.788862, log improvement ratio=0.562280
[BiScaler] Iter 4: log residual = -4.943716, log improvement ratio=0.154854
[BiScaler] Iter 5: log residual = -5.003397, log improvement ratio=0.059680
[BiScaler] Iter 6: log residual = -5.019292, log improvement ratio=0.015896
[BiScaler] Iter 7: log residual = -5.021997, log improvement ratio=0.002705
[BiScaler] Iter 8: log residual = -5.019859, log improvement ratio=-0.002139
[SoftImpute] Max Singular Value of X_init = 2.663978
[SoftImpute] Iter 1: observed MAE=0.021302 rank=2
[SoftImpute] Iter 2: observed MAE=0.021302 rank=2
[SoftImpute] Iter 3: observed MAE=0.021302 rank=2
[SoftImpute] Iter 4: observed MAE=0.021302 rank=2
[SoftImpute] Iter 5: observed MAE=0.021302 rank=2
[SoftImpute] Iter 6: observed MAE=0.0

In [None]:
# SoftImpute (singular value thresholding) applied directly to the raw,
# un-normalised matrix, with the per-iteration log turned off.
quiet_softimpute = SoftImpute(verbose=False)
X = quiet_softimpute.fit_transform(Xstar)
print(X, '\n\n', Xstar)

[[168.          30.         200.        ]
 [200.          45.         300.        ]
 [180.          90.         134.17830118]
 [129.          70.          90.70721687]] 

 [[168.  30. 200.]
 [200.  45. 300.]
 [180.  90.  nan]
 [129.  70.  nan]]


## MissForest
Stekhoven, Daniel J., and Peter Bühlmann. "MissForest—non-parametric missing value imputation for mixed-type data." Bioinformatics 28.1 (2011): 112-118.

In [19]:
!pip install missingpy
# if ModuleNotFoundError: No module named 'sklearn.neighbors.base', use the following 3 lines
import sys
import sklearn.neighbors._base
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base

from missingpy import MissForest
# note that MissForest uses sklearn.__version__ 0.22.2.post1


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting missingpy
  Downloading missingpy-0.2.0-py3-none-any.whl (49 kB)
[K     |████████████████████████████████| 49 kB 2.6 MB/s 
[?25hInstalling collected packages: missingpy
Successfully installed missingpy-0.2.0


In [None]:
# Random-forest based imputation in one call; random_state=0 fixes the seed
# passed to MissForest.
missforest = MissForest(random_state=0)
X = missforest.fit_transform(Xstar)
print(X, '\n\n', Xstar)



Iteration: 0




Iteration: 1
[[168.  30. 200.]
 [200.  45. 300.]
 [180.  90. 250.]
 [129.  70. 250.]] 

 [[168.  30. 200.]
 [200.  45. 300.]
 [180.  90.  nan]
 [129.  70.  nan]]


In [None]:
# MissForest with the two-step API: fit the imputer on the incomplete matrix,
# then transform that same matrix, and print the result above the original.
imputer = MissForest(random_state=0)
imputer.fit(Xstar)
X = imputer.transform(Xstar)
print(X,'\n\n',Xstar)



Iteration: 0




Iteration: 1
[[168.  30. 200.]
 [200.  45. 300.]
 [180.  90. 250.]
 [129.  70. 250.]] 

 [[168.  30. 200.]
 [200.  45. 300.]
 [180.  90.  nan]
 [129.  70.  nan]]


# Categorical missing data 

In [39]:
def generate_nan(Xtrain, missing_rate):
  """Return a copy of ``Xtrain`` with a fraction of its entries set to NaN.

  Positions are drawn uniformly at random *without replacement* over the
  flattened array, so exactly ``round(missing_rate * Xtrain.size)`` entries
  become NaN. (Sampling with replacement would hit some positions more than
  once and leave the realised missing rate below the requested one.)

  The global ``np.random`` state is used, so call ``np.random.seed(...)``
  beforehand for reproducible masks.

  Parameters
  ----------
  Xtrain : array-like
      Input data; it is not modified.
  missing_rate : float
      Fraction of entries to blank out, between 0 and 1 inclusive.

  Returns
  -------
  numpy.ndarray
      Array with the same shape as ``Xtrain``. Floating dtypes are kept;
      integer and boolean inputs are converted to float64 because they
      cannot hold NaN.

  Raises
  ------
  ValueError
      If ``missing_rate`` is not within [0, 1].
  """
  if not 0 <= missing_rate <= 1:
    raise ValueError("missing_rate must be between 0 and 1, got %r" % (missing_rate,))
  Xtrain = np.asarray(Xtrain)
  Xtr_nan = Xtrain.flatten()  # flatten() always returns a copy
  if Xtr_nan.dtype.kind in 'biu':
    # bool / signed / unsigned integer arrays cannot represent NaN
    Xtr_nan = Xtr_nan.astype(float)
  n_missing = int(round(missing_rate * Xtr_nan.size))
  if n_missing > 0:
    na_id = np.random.choice(Xtr_nan.size, size=n_missing, replace=False)
    Xtr_nan[na_id] = np.nan
  return Xtr_nan.reshape(Xtrain.shape)

def generate_nan_col(X, non_missing_cols = None, missing_rate = 0.2):
    """Blank out entries at random, but only in columns that may have gaps.

    The columns listed in ``non_missing_cols`` are kept fully observed. Among
    the remaining columns, exactly ``round(missing_rate * n_entries)`` entries
    are set to NaN, chosen uniformly at random without replacement.

    The global ``np.random`` state is used, so call ``np.random.seed(...)``
    beforehand for reproducible masks.

    Parameters
    ----------
    X : array-like, 2-D
        Input data; it is not modified.
    non_missing_cols : sequence of int, optional
        Indices of the columns to keep complete. ``None`` (the default)
        protects no column, so NaNs may land anywhere.
    missing_rate : float, default 0.2
        Fraction of the entries in the unprotected columns to blank out,
        between 0 and 1 inclusive.

    Returns
    -------
    numpy.ndarray
        The protected columns (in the order given) followed by the remaining
        columns (in their original order). The column order therefore only
        matches ``X`` when the protected columns are the leading ones.

    Raises
    ------
    ValueError
        If ``missing_rate`` is not within [0, 1].
    """
    if not 0 <= missing_rate <= 1:
        raise ValueError("missing_rate must be between 0 and 1, got %r" % (missing_rate,))
    X = np.asarray(X)
    n_cols = X.shape[1]

    if non_missing_cols is None:
        keep_idx = np.array([], dtype=int)
    else:
        # Indexing arange normalises negative indices and bounds-checks them.
        keep_idx = np.arange(n_cols)[np.asarray(non_missing_cols, dtype=int).ravel()]
    protected = set(keep_idx.tolist())
    miss_idx = np.array([i for i in range(n_cols) if i not in protected], dtype=int)

    X_non_missing = X[:, keep_idx]
    X_missing = X[:, miss_idx]
    XmShape = X_missing.shape

    X_nan = X_missing.flatten()  # flatten() always returns a copy
    if X_nan.dtype.kind in 'biu':
        # bool / signed / unsigned integer arrays cannot represent NaN
        X_nan = X_nan.astype(float)
    n_missing = int(round(missing_rate * X_nan.size))
    if n_missing > 0:
        na_id = np.random.choice(X_nan.size, size=n_missing, replace=False)
        X_nan[na_id] = np.nan

    return np.hstack((X_non_missing, X_nan.reshape(XmShape)))

In [34]:
# Comma-separated file without a header row, so the columns are numbered.
cmc_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/cmc/cmc.data'
data = pd.read_csv(cmc_url, header=None, sep=",")
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,24,2,3,3,1,1,2,3,0,1
1,45,1,3,10,1,1,3,4,0,1
2,43,2,3,7,1,1,3,4,0,1
3,42,3,2,9,1,1,3,3,0,1
4,36,3,3,8,1,1,3,2,0,1


In [35]:
# np.asarray accepts both a DataFrame and an ndarray, so this cell can be
# re-run safely (data.to_numpy() fails once `data` is already an ndarray).
data = np.asarray(data)
# First nine columns are the features (cast to float32 so they can hold NaN);
# the last column is the target.
X, y = data[:, :9].astype(np.float32), data[:, -1]
X[:3]

array([[24.,  2.,  3.,  3.,  1.,  1.,  2.,  3.,  0.],
       [45.,  1.,  3., 10.,  1.,  1.,  3.,  4.,  0.],
       [43.,  2.,  3.,  7.,  1.,  1.,  3.,  4.,  0.]], dtype=float32)

In [36]:
# Blank out entries of the feature matrix at random, with a 0.5 missing rate.
Xstar = generate_nan(X, missing_rate=0.5)
Xstar[:3]

array([[nan,  2.,  3., nan, nan, nan,  2.,  3.,  0.],
       [nan,  1.,  3., nan,  1.,  1., nan, nan, nan],
       [43.,  2.,  3.,  7.,  1., nan, nan,  4.,  0.]], dtype=float32)

**mode** imputation:

In [37]:
from sklearn.impute import SimpleImputer

# Column-wise mode imputation: NaNs take the most frequent observed value of
# their column.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
imputer.fit_transform(Xstar)

array([[25.,  2.,  3., ...,  2.,  3.,  0.],
       [25.,  1.,  3., ...,  3.,  4.,  0.],
       [43.,  2.,  3., ...,  3.,  4.,  0.],
       ...,
       [25.,  3.,  4., ...,  1.,  4.,  0.],
       [33.,  4.,  3., ...,  2.,  2.,  0.],
       [25.,  3.,  4., ...,  3.,  4.,  0.]], dtype=float32)

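**Logistic regression** imputation: keep the first five columns complete, then predict the missing entries of column 5 with a classifier trained on the rows where that column is observed.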

In [41]:
# Keep columns 0-4 fully observed and blank out entries of the remaining
# columns at random, with a 0.3 missing rate.
Xstar = generate_nan_col(X, non_missing_cols=np.arange(5), missing_rate=0.3)
Xstar[:3]

array([[24.,  2.,  3.,  3.,  1., nan,  2., nan,  0.],
       [45.,  1.,  3., 10.,  1.,  1., nan,  4.,  0.],
       [43.,  2.,  3.,  7.,  1.,  1.,  3., nan, nan]], dtype=float32)

In [67]:
from sklearn.linear_model import LogisticRegression

# Impute column 5 with a classifier: train on the rows where column 5 is
# observed, using columns 0-4 as features, then predict the rows where it is
# missing. Assumes columns 0-4 contain no NaN, as produced by
# generate_nan_col(X, np.arange(5), ...).
ymiss = Xstar[:, 5].copy()  # snapshot of the column before it is filled in
missing_id = np.flatnonzero(np.isnan(ymiss))
non_missing_id = np.flatnonzero(~np.isnan(ymiss))

# 1-D row indices give 2-D slices directly; no reshape is needed.
Xinp = Xstar[non_missing_id, :5]
clf = LogisticRegression(random_state=0).fit(Xinp, ymiss[non_missing_id])

# predict() raises on an empty input, so skip it when nothing is missing.
if missing_id.size > 0:
    Xstar[missing_id, 5] = clf.predict(Xstar[missing_id, :5])

In [68]:
# Show the first three rows of Xstar after column 5 has been filled in.
Xstar[:3]

array([[24.,  2.,  3.,  3.,  1.,  1.,  2., nan,  0.],
       [45.,  1.,  3., 10.,  1.,  1., nan,  4.,  0.],
       [43.,  2.,  3.,  7.,  1.,  1.,  3., nan, nan]], dtype=float32)

In [None]:
# Same column-5 imputation as in the previous logistic-regression cell
# (which also imports LogisticRegression). Columns 0-4 are the features and
# are assumed to contain no NaN.
ymiss = Xstar[:, 5].copy()  # snapshot of the column before it is filled in
missing_id = np.flatnonzero(np.isnan(ymiss))
non_missing_id = np.flatnonzero(~np.isnan(ymiss))

# 1-D row indices give 2-D slices directly; no reshape is needed.
Xinp = Xstar[non_missing_id, :5]
clf = LogisticRegression(random_state=0).fit(Xinp, ymiss[non_missing_id])

# When column 5 is already complete (e.g. this cell is run right after the
# previous imputation), there is nothing to predict, and predict() would
# raise on the empty input.
if missing_id.size > 0:
    Xstar[missing_id, 5] = clf.predict(Xstar[missing_id, :5])

In [71]:
# Fresh incomplete copy of X with a 0.3 missing rate; any column can have gaps.
Xstar = generate_nan(X, 0.3)
Xstar[:3]

array([[nan, nan,  3.,  3.,  1.,  1.,  2.,  3.,  0.],
       [45.,  1.,  3., 10.,  1.,  1.,  3., nan,  0.],
       [43., nan, nan, nan,  1.,  1.,  3.,  4.,  0.]], dtype=float32)

## Iterative Imputer in sklearn

In [None]:
# The experimental flag must be imported before IterativeImputer itself.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Iterative imputation with a logistic-regression estimator, capped at 10
# imputation rounds.
imputer = IterativeImputer(
    estimator=LogisticRegression(),
    max_iter=10,
)
X = imputer.fit_transform(Xstar)

In [79]:
# Show the first three rows of the imputed matrix.
X[:3]

array([[25.,  3.,  3.,  3.,  1.,  1.,  2.,  3.,  0.],
       [45.,  1.,  3., 10.,  1.,  1.,  3.,  3.,  0.],
       [43.,  3.,  4.,  4.,  1.,  1.,  3.,  4.,  0.]], dtype=float32)

# Heart

In [3]:
# Both SPECTF files are comma-separated with no header row; stack the train
# and test parts into a single frame.
train_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECTF.train'
test_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/spect/SPECTF.test'
data = pd.read_csv(train_url, sep=',', header=None)
test = pd.read_csv(test_url, sep=',', header=None)
data = pd.concat([data, test])
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,35,36,37,38,39,40,41,42,43,44
0,1,59,52,70,67,73,66,72,61,58,...,66,56,62,56,72,62,74,74,64,67
1,1,72,62,69,67,78,82,74,65,69,...,65,71,63,60,69,73,67,71,56,58
2,1,71,62,70,64,67,64,79,65,70,...,73,70,66,65,64,55,61,41,51,46
3,1,69,71,70,78,61,63,67,65,59,...,61,61,66,65,72,73,68,68,59,63
4,1,70,66,61,66,61,58,69,69,72,...,67,69,70,66,70,64,60,55,49,41


In [None]:
# np.asarray accepts both a DataFrame and an ndarray, so this cell can be
# re-run safely (data.to_numpy() fails once `data` is already an ndarray).
data = np.asarray(data)
# Column 0 is the class label; the remaining columns are the features.
X, y = data[:, 1:], data[:, 0]
X = X.astype(np.float32)

# Count the samples per class from the labels actually present, instead of
# assuming they are exactly 0..G-1.
labels, counts = np.unique(y, return_counts=True)
G = len(labels)
print(np.shape(X))
for count in counts:
  print(count)

(267, 44)
55
212


In [None]:
# Blank out entries of the feature matrix at random, with a 0.2 missing rate.
Xstar = generate_nan(X, missing_rate=0.2)
Xstar[:3]

array([[59., 52., 70., nan, 73., 66., 72., 61., 58., 52., 72., 71., 70.,
        77., 66., 65., 67., 55., 61., 57., 68., 66., 72., 74., 63., 64.,
        56., 54., 67., 54., 76., 74., 65., nan, 66., 56., 62., 56., 72.,
        62., 74., 74., nan, 67.],
       [72., 62., 69., 67., 78., 82., 74., 65., 69., 63., 70., 70., 72.,
        74., 70., 71., 72., 75., 66., 65., 73., 78., 74., nan, 74., 69.,
        69., 70., 71., 69., 72., nan, 62., nan, 65., 71., 63., 60., 69.,
        73., 67., 71., nan, 58.],
       [nan, 62., 70., 64., 67., 64., 79., 65., 70., nan, 72., 71., 68.,
        nan, 61., 61., 73., 71., 75., 74., 80., 74., 54., 47., 53., 37.,
        77., 68., 72., 59., 72., 68., nan, 60., 73., nan, 66., 65., 64.,
        nan, nan, nan, nan, 46.]], dtype=float32)