# Outlier Detection and Removal

In [1]:
import pandas as pd
import numpy as np

#Libraries used for Feature Elimination
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.utils import shuffle
from sklearn.feature_selection import RFECV
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.svm import SVC

#Sampling
from collections import Counter
from imblearn.over_sampling import SMOTE

#Grid Search
from sklearn.experimental import enable_halving_search_cv  
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.model_selection import GridSearchCV

# To ignore warnings
import warnings  

In [None]:
def warn(*args, **kwargs):
    """No-op replacement for ``warnings.warn``.

    Accepts any positional and keyword arguments, ignores them and
    returns ``None``.
    """
    return None


# Monkey-patch the standard-library hook: from here on every call to
# ``warnings.warn(...)`` made through the module attribute does nothing.
warnings.warn = warn

In [2]:
def removeOutliers(train):
    """Drop rows holding an IQR outlier in any non-excluded column.

    Columns are processed one after another. For each column the
    quartiles (midpoint interpolation) are computed on the rows that are
    still present, and every row whose value is at or beyond
    ``Q1 - 1.5 * IQR`` / ``Q3 + 1.5 * IQR`` is removed before the next
    column is examined.

    The ID and target columns ('Reading_ID', 'donation_id', 'id',
    'hdl_cholesterol_human', 'hemoglobin(hgb)_human',
    'cholesterol_ldl_human') are never inspected.

    Args:
        train: pandas DataFrame. It is modified IN PLACE.

    Returns:
        The same DataFrame object that was passed in, with the outlier
        rows dropped.

    Notes:
        * Rows are selected with a boolean mask and removed by index
          label, so the result is correct after earlier drops and for
          frames whose index is not ``0..n-1``. Rows that share an index
          label with an outlier row are dropped together with it.
        * The fences are inclusive (``>=`` / ``<=``), so a column whose
          IQR is 0 loses every row equal to its quartile value.
    """
    excluded = {'Reading_ID', 'donation_id', 'id',
                'hdl_cholesterol_human',
                'hemoglobin(hgb)_human',
                'cholesterol_ldl_human'}
    train2 = train  # same object on purpose: callers get in-place removal

    for colname in train2.columns:
        if colname in excluded:
            # Skip the ID column and target variables in outlier detection
            continue
        if train2.empty:
            # Nothing left to inspect; percentiles of no data are undefined
            break

        values = train2[colname].to_numpy()

        # Quartiles of the rows that are still present
        try:
            Q1, Q3 = np.percentile(values, [25, 75], method='midpoint')
        except TypeError:
            # NumPy < 1.22 only knows the keyword under its old name
            Q1, Q3 = np.percentile(values, [25, 75],
                                   interpolation='midpoint')
        IQR = Q3 - Q1

        # True where the value sits on or outside either fence
        is_outlier = (values >= Q3 + 1.5 * IQR) | (values <= Q1 - 1.5 * IQR)

        if is_outlier.any():
            # Translate mask positions to index labels before dropping
            train2.drop(index=train2.index[is_outlier], inplace=True)

    return train2

In [7]:
def removeOutliersCols(train, cols):
    """Drop rows holding an IQR outlier in any of the given columns.

    The columns in ``cols`` are processed in order. For each one the
    quartiles (midpoint interpolation) are computed on the rows that are
    still present, and every row whose value is at or beyond
    ``Q1 - 1.5 * IQR`` / ``Q3 + 1.5 * IQR`` is removed before the next
    column is examined.

    The ID and target columns ('Reading_ID', 'donation_id', 'id',
    'hdl_cholesterol_human', 'hemoglobin(hgb)_human',
    'cholesterol_ldl_human') are skipped even when listed in ``cols``.

    Args:
        train: pandas DataFrame. It is modified IN PLACE.
        cols: iterable of column names to inspect.

    Returns:
        The same DataFrame object that was passed in, with the outlier
        rows dropped.

    Notes:
        * Rows are selected with a boolean mask and removed by index
          label, so the result is correct after earlier drops and for
          frames whose index is not ``0..n-1``. Rows that share an index
          label with an outlier row are dropped together with it.
        * The fences are inclusive (``>=`` / ``<=``), so a column whose
          IQR is 0 loses every row equal to its quartile value.
    """
    excluded = {'Reading_ID', 'donation_id', 'id',
                'hdl_cholesterol_human', 'hemoglobin(hgb)_human',
                'cholesterol_ldl_human'}
    train2 = train  # same object on purpose: callers get in-place removal

    for colname in cols:
        if colname in excluded:
            # Skip the ID column and target variables in outlier detection
            continue
        if train2.empty:
            # Nothing left to inspect; percentiles of no data are undefined
            break

        values = train2[colname].to_numpy()

        # Quartiles of the rows that are still present
        try:
            Q1, Q3 = np.percentile(values, [25, 75], method='midpoint')
        except TypeError:
            # NumPy < 1.22 only knows the keyword under its old name
            Q1, Q3 = np.percentile(values, [25, 75],
                                   interpolation='midpoint')
        IQR = Q3 - Q1

        # True where the value sits on or outside either fence
        is_outlier = (values >= Q3 + 1.5 * IQR) | (values <= Q1 - 1.5 * IQR)

        if is_outlier.any():
            # Translate mask positions to index labels before dropping
            train2.drop(index=train2.index[is_outlier], inplace=True)

    return train2

### SMOTE Sampling

In [3]:
#Perform Sampling for each label
def overSampleData(train, labels_n, target):
    """Oversample the minority classes of one target column with SMOTE.

    Args:
        train: DataFrame; its first 173 columns (``iloc[:, 0:173]``) are
            used as the feature matrix.
        labels_n: 2-D array of class labels, indexed as
            ``labels_n[:, target]``.
        target: column index in ``labels_n`` of the target to balance.

    Returns:
        ``(X, y)``: the resampled feature array and the matching labels
        for the requested target, as returned by ``SMOTE.fit_resample``.

    The class distribution is printed before and after resampling.
    """
    trainS = train.iloc[:, 0:173].to_numpy()

    # Use the SAME label column for the report and for the resampling;
    # previously column 0 was always resampled regardless of `target`.
    y_target = labels_n[:, target]

    # summarize class distribution
    print('Samples per class before oversampling minority class:')
    print(Counter(y_target))

    # transform the dataset
    oversample = SMOTE()
    X, y = oversample.fit_resample(trainS, y_target)

    # summarize the new class distribution
    print('Samples per class after oversampling minority class:')
    print(Counter(y))

    return X, y

# Dimensionality Reduction

### 1. Recursive Feature Elimination

In [4]:
#Function to perform RFE with a linear SVR estimator
def RFE_DimReduction(X, labels_n, targetVal, noFeaturesToSelect, 
                     validation, validationLabels_n, test):
    """Select features with RFE and drop the rejected columns everywhere.

    A linear-kernel SVR is wrapped in ``RFE`` (one feature eliminated per
    step) and fitted on ``X`` against column ``targetVal`` of
    ``labels_n``. The columns RFE did not keep are then deleted from the
    train, validation and test arrays alike.

    Args:
        X: training feature array (columns are features).
        labels_n: 2-D label array; ``labels_n[:, targetVal]`` is the
            target to predict, i.e. Hdl_cholesterol_human,
            Cholesterol_ldl_human or Hemoglobin(hgb)_human.
        targetVal: column index of that target in ``labels_n``.
        noFeaturesToSelect: passed to RFE as ``n_features_to_select``.
        validation: validation feature array with the same columns as X.
        validationLabels_n: not used by this function.
        test: test feature array with the same columns as X.

    Returns:
        Tuple ``(train, validation, test)`` of the reduced arrays.
    """
    target = labels_n[:, targetVal]

    rfe = RFE(SVR(kernel="linear"),
              n_features_to_select=noFeaturesToSelect,
              step=1).fit(X, target)
    keep_mask = rfe.support_

    # Column indexes RFE rejected (mask entry is False)
    dropFeatures = [col for col in range(X.shape[1]) if not keep_mask[col]]

    # Remove the same columns from all three splits
    updatedFeatures, updatedValidation, updatedTest = (
        np.delete(split, dropFeatures, 1)
        for split in (X, validation, test)
    )

    print("Feature Indexes to be dropped: ")
    print(dropFeatures)

    print("Train Data: ", updatedFeatures.shape)
    print("Validation Data: ", updatedValidation.shape)
    print("Test Data: ", updatedTest.shape)

    return updatedFeatures, updatedValidation, updatedTest

### 2. Recursive Feature Elimination Cross Validation

In [5]:
def RFECV_DimReduction(X, labels_n, targetVal, noFeaturesToSelect, 
                       validation, validationLabels_n, test):
    """Select features with cross-validated RFE and drop the rest.

    A linear-kernel SVR is wrapped in ``RFECV`` (one feature eliminated
    per step) and fitted on ``X`` against column ``targetVal`` of
    ``labels_n``. The columns the selector did not keep are then deleted
    from the train, validation and test arrays alike.

    Args:
        X: training feature array (columns are features).
        labels_n: 2-D label array; ``labels_n[:, targetVal]`` is the
            target to predict, i.e. Hdl_cholesterol_human,
            Cholesterol_ldl_human or Hemoglobin(hgb)_human.
        targetVal: column index of that target in ``labels_n``.
        noFeaturesToSelect: not used; RFECV picks the number of features
            itself. Kept so the signature matches RFE_DimReduction.
        validation: validation feature array with the same columns as X.
        validationLabels_n: not used by this function.
        test: test feature array with the same columns as X.

    Returns:
        Tuple ``(train, validation, test)`` of the reduced arrays.
    """
    y = labels_n[:, targetVal]
    estimator = SVR(kernel = "linear")
    selector = RFECV(estimator, step = 1)
    selector = selector.fit(X, y)

    support = selector.support_

    # Walk the whole support mask instead of assuming exactly 173
    # features, so any input width is handled correctly.
    dropFeatures = [i for i, kept in enumerate(support) if not kept]

    #Dropping the features after performing RFE
    updatedFeatures = np.delete(X, dropFeatures, 1) 
    updatedValidation = np.delete(validation, dropFeatures, 1) 
    updatedTest = np.delete(test, dropFeatures, 1) 

    print("Feature Indexes to be dropped: ")
    print(dropFeatures)

    print("Train Data: ", updatedFeatures.shape)
    print("Validation Data: ", updatedValidation.shape)
    print("Test Data: ", updatedTest.shape)

    return updatedFeatures, updatedValidation, updatedTest

### 3. Principal Component Analysis

In [6]:
def ApplyPCA(features, labels_n, nofeature, validation, validationLabels_n, 
              test, noFeatures):
    """Fit PCA on the training features and project all three splits.

    Args:
        features: training feature array; the PCA is fitted on it.
        labels_n: not used by this function.
        nofeature: not used by this function.
        validation: validation feature array, projected with the fitted PCA.
        validationLabels_n: not used by this function.
        test: test feature array, projected with the fitted PCA.
        noFeatures: passed to PCA as ``n_components``.

    Returns:
        Tuple ``(train, validation, test)`` of the projected arrays. The
        shape of each is printed as soon as it has been computed.
    """
    reducer = PCA(n_components = noFeatures).fit(features)

    # (message, data) pairs in the order they are projected and reported
    splits = (
        ("Dimenssion of feature array after applying PCA: ", features),
        ("Dimenssion of validation array after applying PCA: ", validation),
        ("Dimenssion of test array after applying PCA: ", test),
    )

    projected = []
    for message, data in splits:
        reduced = reducer.transform(data)
        print(message, reduced.shape)
        projected.append(reduced)

    updatedFeatures, updatedValidation, updatedTest = projected
    return updatedFeatures, updatedValidation, updatedTest