<div style="color:#006666; padding:0px 10px; border-radius:5px; font-size:18px; text-align:center"><h1 style='margin:10px 5px'>Missing Data</h1>
<hr>
<p style="color:#006666; text-align:right;font-size:10px">
Copyright by MachineLearningPlus. All Rights Reserved.
</p>

</div>

In Python, anything that is missing is represented as `None`. 

In NumPy, which deals specifically with numeric data, missing values are represented by a dedicated floating-point value: `np.nan`. Infinity is represented as `np.inf`.

In [1]:
import numpy as np

In [2]:
# NumPy's marker for a missing numeric value (displays as `nan`).
np.nan

nan

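Take care when comparing against missing values (`np.nan`): `np.nan == np.nan` is `False`, while `in` and `is` return `True` only because they check object identity. Use `np.isnan` to test for missing values.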

In [3]:
# NaN never compares equal to anything, itself included, so this is False.
np.nan == np.nan

False

In [4]:
# True even though `np.nan == np.nan` is False: list membership tests
# identity (`is`) before equality, and this is the very same `np.nan` object.
np.nan in [np.nan]

True

In [5]:
# True: `is` compares object identity, and both operands are the same object.
np.nan is np.nan

True

<div class="alert alert-info" style="background-color:#006666; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:7px 5px; font-size:16px'>Import Data with missing value</h2>
</div>

In [6]:
# Read the tab-separated file into a float array and display it.
data = np.genfromtxt(
    fname='Datasets/data_miss.txt',
    delimiter="\t",
)
data

array([[ 1.     , 87.     , 57.54435],
       [ 2.     ,  8.     ,  7.31704],
       [ 3.     , 56.     , 56.82095],
       [ 4.     , 63.     , 64.15579],
       [ 5.     ,  2.     ,  5.74522],
       [ 6.     , 45.     , 19.56758],
       [ 7.     , 43.     , 39.62271],
       [ 8.     , 47.     , 34.95107],
       [ 9.     ,  2.     ,      nan],
       [10.     , 79.     , 36.41022],
       [11.     , 67.     , 49.83894],
       [12.     , 24.     ,      inf],
       [13.     , 61.     , 72.55357],
       [14.     , 85.     , 39.24693],
       [15.     , 63.     , 53.6279 ],
       [16.     ,  2.     , 16.72441],
       [17.     , 29.     ,      nan],
       [18.     , 45.     , 18.78498],
       [19.     , 33.     , 19.8089 ],
       [20.     , 28.     , 46.03384],
       [21.     , 21.     , 23.7864 ],
       [22.     , 27.     , 44.42627],
       [23.     , 65.     , 34.94804],
       [24.     , 61.     , 53.49576],
       [25.     , 10.     , 25.98564]])

<div class="alert alert-info" style="background-color:#006666; color:white; padding:0px 10px; border-radius:5px;"><h2 style='margin:7px 5px; font-size:16px'>Check for missing data in array</h2>
</div>

In [7]:
# Element-wise test: True wherever an entry of `data` is nan.
np.isnan(data)

array([[False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False,  True],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False,  True],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False]])

__Check for infinity__

In [8]:
# Element-wise test: True wherever an entry of `data` is infinite.
np.isinf(data)

array([[False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False,  True],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False]])

__Missing or Infinity__

In [9]:
# Element-wise OR of the two masks: True where an entry is nan or infinite.
np.logical_or(np.isnan(data), np.isinf(data))

array([[False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False,  True],
       [False, False, False],
       [False, False, False],
       [False, False,  True],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False,  True],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False],
       [False, False, False]])

__Fill missing or infinite entries with some value__

In [10]:
# Anything that is not finite is either nan or +/-inf, so one inverted
# `np.isfinite` mask selects the same entries as `isnan | isinf`.
# Those entries are overwritten in place with 0.
data[~np.isfinite(data)] = 0

In [11]:
# Show the array after the fill; the former nan/inf entries now read 0.
data

array([[ 1.     , 87.     , 57.54435],
       [ 2.     ,  8.     ,  7.31704],
       [ 3.     , 56.     , 56.82095],
       [ 4.     , 63.     , 64.15579],
       [ 5.     ,  2.     ,  5.74522],
       [ 6.     , 45.     , 19.56758],
       [ 7.     , 43.     , 39.62271],
       [ 8.     , 47.     , 34.95107],
       [ 9.     ,  2.     ,  0.     ],
       [10.     , 79.     , 36.41022],
       [11.     , 67.     , 49.83894],
       [12.     , 24.     ,  0.     ],
       [13.     , 61.     , 72.55357],
       [14.     , 85.     , 39.24693],
       [15.     , 63.     , 53.6279 ],
       [16.     ,  2.     , 16.72441],
       [17.     , 29.     ,  0.     ],
       [18.     , 45.     , 18.78498],
       [19.     , 33.     , 19.8089 ],
       [20.     , 28.     , 46.03384],
       [21.     , 21.     , 23.7864 ],
       [22.     , 27.     , 44.42627],
       [23.     , 65.     , 34.94804],
       [24.     , 61.     , 53.49576],
       [25.     , 10.     , 25.98564]])