## Handling Outliers with Numerical Data

### Part 1: Detecting Outliers

#### Detecting Outliers with EllipticEnvelope

In [1]:
# note: we assume that the data is normally distributed, and we 
# draw an ellipse aroung it. All data outside the ellipse are
# considered outliers

from sklearn.datasets import make_blobs
from sklearn.covariance import EllipticEnvelope

# create dummy data
features, _ = make_blobs(n_samples = 10,
                        n_features = 2,
                        centers =1 ,
                        random_state = 420)
# replace first observation with extreme values
features[0,0] = 10000
features[0,1] = 10000

# create detector
outlier_detector = EllipticEnvelope(contamination = .1)

# fit detector
outlier_detector.fit(features)

# predict outliers
outlier_detector.predict(features)

array([-1,  1,  1,  1,  1,  1,  1,  1,  1,  1])

#### Detecting Outliers with Interquartile range (IQR) 

In [2]:
import numpy as np

def indicies_of_outliers(x):
    q1, q3 = np.percentile(x, [25,75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * 1.5)
    upper_bound = q3 - (iqr * 1.5)
    return np.where((x > upper_bound) | (x<lower_bound))

indicies_of_outliers(features[:, 0])

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int64),)

#### Detecting Outliers using Z-score

In [4]:
# note: we center and scale the data around the 
# zero, and anything too far is considered outliers
# It is not robust bc they are affected by outliers

def detect_outliers_z_score(x):
    threshold = 3
    mean_x = np.mean(x)
    std_x = np.std(x)
    z_scores = [(x - mean_x) / std_x for obs in x]
    return np.where(np.abs(z_scores) > threshold)

detect_outliers_z_score(features[:,0])

(array([], dtype=int64), array([], dtype=int64))

### Handling Outliers 

#### Droping Outliers

In [6]:
import pandas as pd

# Create DataFrame
houses = pd.DataFrame()
houses['Price'] = [534433, 392333, 293222, 4322032]
houses['Bathrooms'] = [2, 3.5, 2, 116]
houses['Square_Feet'] = [1500, 2500, 1500, 48000]

# filter observations
houses[houses['Bathrooms'] < 20]

Unnamed: 0,Price,Bathrooms,Square_Feet
0,534433,2.0,1500
1,392333,3.5,2500
2,293222,2.0,1500


#### Mark observations as outliers to use them as features

In [7]:
houses['Outliers'] = np.where(houses['Bathrooms'] < 20, 0, 1)
houses

Unnamed: 0,Price,Bathrooms,Square_Feet,Outliers
0,534433,2.0,1500,0
1,392333,3.5,2500,0
2,293222,2.0,1500,0
3,4322032,116.0,48000,1


#### Transform the feature to dampen the outliers effects 

In [8]:
houses['Log_of_Square_Feet'] = [
    np.log(x) for x in houses['Square_Feet']
]

houses

# note: standardization might not be a appropriate with
# outliers because its mean and variance are influenced by them

Unnamed: 0,Price,Bathrooms,Square_Feet,Outliers,Log_of_Square_Feet
0,534433,2.0,1500,0,7.31322
1,392333,3.5,2500,0,7.824046
2,293222,2.0,1500,0,7.31322
3,4322032,116.0,48000,1,10.778956
