In [2]:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_squared_error
from math import sqrt
import random
import numpy as np
random.seed(0)

# Fetching dataset
import pandas as pd

In [3]:
dataset = fetch_california_housing()
housing_data = pd.DataFrame(dataset.data)
housing_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25


In [4]:
housing_data.shape

(20640, 8)

# Dealing with missing data

In [5]:
train, target = pd.DataFrame(dataset.data), pd.DataFrame(dataset.target)
train.columns = ['zero','one','two','three','four','five','six','seven']
train.insert(loc=len(train.columns), column='target', value=target)

# Randomly replace 30% of data of first column with NaN values

column = train['zero']
missing_pct = int(column.size*0.3)
i = [random.choice(range(column.shape[0])) for _ in range(missing_pct)]
column[i] = np.NaN
train

Unnamed: 0,zero,one,two,three,four,five,six,seven,target
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [11]:
# Remove observations that have missing values
# Will drop all rows that have any missing values

train.dropna(inplace=True)
train

(15275, 9)

In [6]:
# Replace all NaN values with the mean of that feature over the whole dataset

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.NaN, strategy='mean')
imputer = imputer.fit(train[['zero']])
train['zero'] = imputer.transform(train[['zero']]).ravel()
train

Unnamed: 0,zero,one,two,three,four,five,six,seven,target
0,8.32520,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.30140,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,3.87695,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.64310,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.87695,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.56030,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.55680,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.70000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,3.87695,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [9]:
# Replace all NaN values with the median of the feature

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.NaN, strategy='median')
imputer = imputer.fit(train[['zero']])
train['zero'] = imputer.transform(train[['zero']]).ravel()
train

Unnamed: 0,zero,one,two,three,four,five,six,seven,target
0,3.5559,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,3.5559,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,3.5559,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.5559,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,3.5559,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [11]:
# Replace all NaN values with the most frequent of the feature

from sklearn.impute import SimpleImputer
imputer = SimpleImputer(missing_values=np.NaN, strategy='most_frequent')
imputer = imputer.fit(train[['zero']])
train['zero'] = imputer.transform(train[['zero']]).ravel()
train

Unnamed: 0,zero,one,two,three,four,five,six,seven,target
0,2.1250,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,2.1250,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,2.1250,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,2.1250,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,2.1250,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,2.1250,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [6]:
# Impute the values using scikit-learn KNNImputer Class
# Install the KNNImputer pip package in the current jupyter kernel
import sys
import sklearn.neighbors._base
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install missingpy
sys.modules['sklearn.neighbors.base'] = sklearn.neighbors._base
from missingpy import KNNImputer
# Replace all NaN values using K-Nearest Neighbors
imputer = KNNImputer(n_neighbors=2, weights="uniform")
imputer.fit_transform(train[['zero']])
train['zero'] = imputer.transform(train[['zero']]).ravel()
train

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com
Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com




Unnamed: 0,zero,one,two,three,four,five,six,seven,target
0,8.32520,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.30140,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,3.87695,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.64310,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.87695,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.56030,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.55680,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.70000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,3.87695,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [None]:
# Other ways of handling Missing data:
    # Regression model
        # Predictors of the variable with missing values identified via correlation matrix
        # Best predictors are selected and used as independent variables in a regression equation
        # Variable with missing data is used as the target variable
    # Deep learning
        # Works very well with categorical and non-numerical features
    # Interpolation/Extrapolation
        # Estimate values from other observations within the range of a discrete set of known data points
    # Forward filling/Backward filling
        # Fill the missing value by filling it from the preceding value or the succeeding value
    # Hot deck imputation
        # Randomly choosing the missing value from a set of related and similar variables