<a href="https://colab.research.google.com/github/sureshmecad/Google-Colab/blob/master/7_Outliers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy import stats

# Load the 2018 cost-of-living table into a DataFrame.
train = pd.read_csv('../input/cost-of-living/cost-of-living-2018.csv')

# Box plot of the raw column, drawn before any rows are removed.
# The Series is passed as `x=` explicitly: recent seaborn releases treat a lone
# positional argument as `data` rather than `x`, which changes the orientation.
sns.boxplot(x=train['Cost of Living Index'])
plt.title("Box Plot before outlier removing")
plt.show()

def drop_outliers(df, field_name):
    """Drop, in place, the rows of ``df`` whose ``field_name`` value is an IQR outlier.

    A value counts as an outlier when it lies strictly above ``Q3 + 1.5 * IQR``
    or strictly below ``Q1 - 1.5 * IQR``, where ``Q1`` and ``Q3`` are the 25th
    and 75th percentiles of the column and ``IQR = Q3 - Q1``.

    Args:
        df: DataFrame to filter; it is modified in place.
        field_name: Label of the numeric column to test.

    Returns:
        None. The matching rows are removed from ``df`` itself.
    """
    values = df[field_name]
    # Both fences are derived once, from the untouched column. Recomputing a
    # percentile after some rows have already been dropped would shift the
    # second fence and let genuine outliers through.
    q1, q3 = np.percentile(values, [25, 75])
    whisker = 1.5 * (q3 - q1)
    is_outlier = (values > q3 + whisker) | (values < q1 - whisker)
    df.drop(df.index[is_outlier], inplace=True)
    
# Remove the outlier rows from `train` in place, then redraw the box plot.
drop_outliers(train, 'Cost of Living Index')
# `x=` is given explicitly: recent seaborn releases treat a lone positional
# argument as `data` rather than `x`, which changes the orientation.
sns.boxplot(x=train['Cost of Living Index'])
plt.title("Box Plot after outlier removing")
plt.show()

https://medium.com/analytics-vidhya/outlier-detection-in-machine-learning-26e0bc7105cd

### What are outliers ?

- An outlier is an observation that lies far away from the other values in a dataset. For example, suppose we have a dataset of the distances (in km) between home and school for a group of students: **5, 6, 7, 8, 9, 10, 6, 7, 100**. In this dataset the value **100** is an outlier.

### Should we remove outliers or not ?

- If a large share of the data points (say, at least 30%) are flagged as outliers, they probably carry interesting and meaningful information, and you should not remove them.

### Detection of outliers based on Distributions

i. Normally Distributed data :
- For normally distributed data, any data point that lies outside the range from (μ − 3σ) to (μ + 3σ) is considered an outlier.

In [None]:
# Visual check for outliers in RM: histogram, box plot and normal Q-Q plot
# drawn side by side in one figure.
rm_values = df["RM"]
plt.figure(figsize=(16, 4))  # one wide figure holding a 1x3 grid of panels

# Panel 1: histogram, to judge whether the data looks normally distributed
plt.subplot(131)
sns.distplot(rm_values, bins=30)
plt.title('Histogram')

# Panel 2: box plot
plt.subplot(132)
sns.boxplot(y=rm_values)
plt.title('Boxplot')

# Panel 3: Q-Q plot comparing the data against a normal distribution
plt.subplot(133)
stats.probplot(rm_values, dist="norm", plot=plt)
plt.ylabel('RM quantiles')

plt.show()

### To find minimum and maximum boundary value

In [None]:
# outlier boundary value for normally distributed dataset
def min_max_boundary(data,col):
    """Return the 3-sigma outlier bounds for one column of ``data``.

    Args:
        data: DataFrame holding the column.
        col: Label of the numeric column.

    Returns:
        tuple: ``(min_value, max_value)``, i.e. the column mean minus and plus
        three times the column standard deviation (as computed by ``.std()``).
    """
    # Read from the `data` argument so the result reflects the frame that was
    # actually passed in, not whatever a global variable happens to hold.
    center = data[col].mean()
    spread = 3 * data[col].std()
    return center - spread, center + spread
# Evaluate the (min_value, max_value) bounds for the RM column; as the last
# expression of the cell, the returned tuple is what the notebook displays.
min_max_boundary(df,"RM")

#### Any value greater than 8.39 or less than 4.17 is considered an outlier.

### Removing outliers :

In [None]:
# Keep only the rows whose RM lies strictly between the minimum and maximum
# boundary values. The bounds come from min_max_boundary() instead of pasted
# literals, so they always match the data currently held in `df`.
rm_lower, rm_upper = min_max_boundary(df, "RM")
df = df[(df["RM"] > rm_lower) & (df["RM"] < rm_upper)]

# Redraw the three RM diagnostics (histogram, box plot, normal Q-Q plot)
# on the data left after the outliers were filtered out.
rm_values = df["RM"]
plt.figure(figsize=(16, 4))

# Panel 1: histogram
plt.subplot(131)
sns.distplot(rm_values, bins=30)
plt.title('Histogram')

# Panel 2: box plot
plt.subplot(132)
sns.boxplot(y=rm_values)
plt.title('Boxplot')

# Panel 3: Q-Q plot against a normal distribution
plt.subplot(133)
stats.probplot(rm_values, dist="norm", plot=plt)
plt.ylabel('RM quantiles')

plt.show()

### ii. Skewed Distributed data :
- If a value does not lie within the range from Q1 − 1.5 × IQR to Q3 + 1.5 × IQR (where Q1 is the 25th percentile and Q3 is the 75th percentile), the data point is considered an outlier.
- Here IQR = Q3 − Q1.

In [None]:
# Visual check for outliers in LSTAT: histogram, box plot and normal Q-Q plot
# drawn side by side in one figure.
lstat_values = df["LSTAT"]
plt.figure(figsize=(16, 4))  # one wide figure holding a 1x3 grid of panels

# Panel 1: histogram, to judge whether the data is normally distributed
plt.subplot(131)
sns.distplot(lstat_values, bins=30)
plt.title('Histogram', fontsize=20)

# Panel 2: box plot
plt.subplot(132)
sns.boxplot(y=lstat_values)
plt.title('Boxplot', fontsize=20)

# Panel 3: Q-Q plot comparing the data against a normal distribution
plt.subplot(133)
stats.probplot(lstat_values, dist="norm", plot=plt)
plt.title("Q-Q plot", fontsize=20)

plt.show()

### To find minimum and maximum boundary value

In [None]:
# finding upper and lower boundary limit
def non_normal_outliers(data,col):
    """Describe the IQR-based outlier limits for one column of ``data``.

    The lower limit is ``Q1 - 1.5 * IQR`` and the upper limit is
    ``Q3 + 1.5 * IQR``, where ``Q1``/``Q3`` are the 0.25/0.75 quantiles of the
    column and ``IQR = Q3 - Q1``.

    Args:
        data: DataFrame holding the column.
        col: Label of the numeric column.

    Returns:
        str: A message containing the lower limit followed by the upper limit.
    """
    # Quantiles are read from the `data` argument, not from a global frame.
    q1 = data[col].quantile(0.25)
    q3 = data[col].quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    lower_limit = q1 - whisker
    upper_limit = q3 + whisker
    # The message is a single literal; a string cannot span two source lines
    # unless it is triple-quoted or explicitly continued.
    return (
        "lower limit of dataset : {0},  upper limit of dataset {1}"
        .format(lower_limit, upper_limit)
    )
  non_normal_outliers(df,"LSTAT")


# The same limits can also be computed another way and collected in a list (maximum first, then minimum).
# Module-level accumulator filled by outer_function(): each call appends the
# upper limit first and the lower limit second.
list1=[]
def outer_function(data,col):
    """Append the IQR-based upper and lower limits of ``data[col]`` to ``list1``.

    The upper limit is ``Q3 + 1.5 * IQR`` and the lower limit is
    ``Q1 - 1.5 * IQR``, where ``Q1``/``Q3`` are the 0.25/0.75 quantiles of the
    column and ``IQR = Q3 - Q1``.

    Args:
        data: DataFrame holding the column.
        col: Label of the numeric column.

    Returns:
        None. The two limits are appended to the module-level ``list1``.
    """
    # Quantiles are read from the `data` argument, not from a global frame.
    col_values = data[col]
    q1 = col_values.quantile(0.25)
    q3 = col_values.quantile(0.75)
    whisker = 1.5 * (q3 - q1)
    # Order matters: the upper limit is appended before the lower limit.
    list1.append(q3 + whisker)
    list1.append(q1 - whisker)
# Append the LSTAT limits to list1, then evaluate list1 so the notebook
# displays it as the cell output.
outer_function(df,"LSTAT")
list1

### Removing outliers :

In [None]:
# Keep only the rows whose LSTAT lies strictly between the minimum (list1[1])
# and the maximum (list1[0]) boundary values.
df=df.loc[(df["LSTAT"]<list1[0]) & (df["LSTAT"]>list1[1])]
# --------------------------------------------------------------------
# Plot the dataset after eliminating outliers: histogram, box plot and
# normal Q-Q plot side by side.
plt.figure(figsize=(16, 4))
plt.subplot(1, 3, 1)
sns.distplot(df["LSTAT"], bins=30)
plt.title('Histogram')
plt.subplot(1, 3, 2)
sns.boxplot(y=df["LSTAT"])
plt.title('Boxplot')
plt.subplot(1, 3, 3)
stats.probplot(df["LSTAT"], dist="norm", plot=plt)
# The panel shows LSTAT, so the axis is labelled accordingly.
plt.ylabel('LSTAT quantiles')
plt.show()