# Outlier Detection Process

* Create Importtant Functions for Outliers
  1. Create Function to detect outliers
  2. Are there outliers in the desired columns?
  3. Create function to seperate categroical and numerical variables
  4. Are there outliers in the dataframe?
  5. Create outlier detection function

* Univariate Outlier Analysis
  1. Remove outliers
  2. Fill with average outliers
  3. Fill with median outliers
  4. Replace with Threshold

* Multivariate Outlier Analysis
  1. Remove outliers
  2. Replace with threshold

# Import Necessary Libraries

In [42]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import LocalOutlierFactor
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
pd.set_option("display.width", 500)
pd.set_option("display.float_format", lambda x: "%.3f" % x)
import warnings
warnings.filterwarnings("ignore")

# Import Dataset

In [2]:
titanic = sns.load_dataset("titanic")
df = titanic.copy()
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.283,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


# Create Function to detect outliers

In [3]:
def outlier_threshold(dataframe, col_name, q1=0.25, q3=0.75):
  quartile1 = dataframe[col_name].quantile(q1)
  quartile3 = dataframe[col_name].quantile(q3)
  interquartile_range = quartile3 - quartile1
  low_limit = quartile1 - 1.5 * interquartile_range
  up_limit = quartile3 + 1.5 * interquartile_range
  return low_limit, up_limit

In [4]:
outlier_threshold(df, "age")

(-6.6875, 64.8125)

# Are there outliers in the desired columns?

In [5]:
def check_outlier(dataframe, col_name):
  low_limit, up_limit = outlier_threshold(dataframe, col_name)
  if dataframe[(dataframe[col_name] > up_limit) | (dataframe[col_name] < low_limit)].any(axis=None):
    return True
  else:
    return False

In [6]:
check_outlier(df, "age")

True

In [7]:
check_outlier(df, "fare")

True

# Create function to seperate categroical and numerical variables

In [8]:
def grab_col_names(dataframe, cat_th=10, car_th=20):
    cat_cols = [col for col in dataframe.columns if str(dataframe[col].dtypes) in ["category", "object", "bool"]]
    num_but_cat = [col for col in dataframe.columns if dataframe[col].nunique() < cat_th and dataframe[col].dtypes in ["uint8", "int64", "float64"]]
    cat_but_car = [col for col in dataframe.columns if dataframe[col].nunique() > car_th and str(dataframe[col].dtypes) in ["category", "object"]]
    cat_cols = cat_cols + num_but_cat
    cat_cols = [col for col in cat_cols if col not in cat_but_car]

    num_cols = [col for col in dataframe.columns if dataframe[col].dtypes in ["uint8", "int64", "float64"]]
    num_cols = [col for col in num_cols if col not in cat_cols]

    return cat_cols, num_cols, cat_but_car, num_but_cat

In [9]:
cat_cols, num_cols, cat_but_car, num_but_cat = grab_col_names(df)

In [10]:
print(f"observations: {df.shape[0]}")
print(f"variables: {df.shape[1]}")
print(f"cat_cols: {len(cat_cols)}")
print(f"num_cols: {len(num_cols)}")
print(f"cat_but_car: {len(cat_but_car)}")
print(f"num_but_cat: {len(num_but_cat)}")

observations: 891
variables: 15
cat_cols: 13
num_cols: 2
cat_but_car: 0
num_but_cat: 4


# Are there outliers in the dataframe?

In [11]:
def check_outlier_dataframe(dataframe):
  cat_cols, num_cols, cat_but_car, num_but_cat = grab_col_names(df)
  for col in num_cols:
    print(col, check_outlier(dataframe, col))

In [12]:
check_outlier_dataframe(df)

age True
fare True


# Create outlier detection function

In [13]:
def grab_outliers(dataframe, col_name, index=False):
  low_limit, up_limit = outlier_threshold(dataframe, col_name)
  if dataframe[(dataframe[col_name] < low_limit) | (dataframe[col_name] > up_limit)].shape[0] > 10:
    print(dataframe[(dataframe[col_name] < low_limit) | (dataframe[col_name] > up_limit)].head())
  else:
    print(dataframe[(dataframe[col_name] < low_limit) | (dataframe[col_name] > up_limit)])

  if index:
    outlier_index = dataframe[(dataframe[col_name] < low_limit) | (dataframe[col_name] > up_limit)].index
    return outlier_index

In [14]:
outlier_index = grab_outliers(df, "age", index=True)

     survived  pclass   sex    age  sibsp  parch   fare embarked   class  who  adult_male deck  embark_town alive  alone
33          0       2  male 66.000      0      0 10.500        S  Second  man        True  NaN  Southampton    no   True
54          0       1  male 65.000      0      1 61.979        C   First  man        True    B    Cherbourg    no  False
96          0       1  male 71.000      0      0 34.654        C   First  man        True    A    Cherbourg    no   True
116         0       3  male 70.500      0      0  7.750        Q   Third  man        True  NaN   Queenstown    no   True
280         0       3  male 65.000      0      0  7.750        Q   Third  man        True  NaN   Queenstown    no   True


In [15]:
outlier_index

Int64Index([33, 54, 96, 116, 280, 456, 493, 630, 672, 745, 851], dtype='int64')

# Univariate Outlier Analysis

# Remove outliers

In [16]:
def remove_outlier(dataframe, col_name):
  low_limit, up_limit = outlier_threshold(dataframe, col_name)
  df_without_outliers = dataframe[~((dataframe[col_name] < low_limit) | (dataframe[col_name] > up_limit))]
  return df_without_outliers

In [17]:
df_without_outliers = remove_outlier(df, "age")

In [18]:
df.shape

(891, 15)

In [19]:
df_without_outliers.shape

(880, 15)

In [20]:
def remove_outlier_dataframe(dataframe):
  cat_cols, num_cols, cat_but_car, num_but_cat = grab_col_names(dataframe)
  for col in num_cols:
    df_without_outliers = remove_outlier(dataframe, col)
  return df_without_outliers

In [21]:
df_without_outliers = remove_outlier_dataframe(df)

In [22]:
df_without_outliers.shape

(775, 15)

# Fill with average outliers

In [23]:
def fill_with_average_outliers(dataframe, col_name):
  low_limit, up_limit = outlier_threshold(dataframe, col_name)
  dataframe.loc[((dataframe[col_name] < low_limit) | (dataframe[col_name] > up_limit)), col_name] = dataframe[col_name].mean()
  return dataframe

In [24]:
yeni_df = fill_with_average_outliers(df, "age")

In [25]:
low_limit, up_limit = outlier_threshold(yeni_df, "age")
print((low_limit, up_limit))

(-5.1875, 62.3125)


In [26]:
def fill_with_average_outliers_dataframe(dataframe):
  cat_cols, num_cols, cat_but_car, num_but_cat = grab_col_names(dataframe)
  for col in num_cols:
    fill_with_average_outliers(dataframe, col)
  return dataframe

In [27]:
yeni_df = fill_with_average_outliers_dataframe(df)

In [28]:
low_limit, up_limit = outlier_threshold(yeni_df, "fare")
print((low_limit, up_limit))

(-26.724, 65.6344)


# Fill with median outliers

In [29]:
def fill_with_median_outliers(dataframe, col_name):
  low_limit, up_limit = outlier_threshold(dataframe, col_name)
  dataframe.loc[((dataframe[col_name] < low_limit) | (dataframe[col_name] > up_limit)), col_name] = dataframe[col_name].median()
  return dataframe

In [30]:
yeni_df = fill_with_median_outliers(df, "age")

In [31]:
low_limit, up_limit = outlier_threshold(yeni_df, "age")
print((low_limit, up_limit))

(-5.0, 62.0)


In [32]:
def fill_with_median_outliers_dataframe(dataframe):
  cat_cols, num_cols, cat_but_car, num_but_cat = grab_col_names(dataframe)
  for col in num_cols:
    fill_with_median_outliers(dataframe, col)
  return dataframe

In [33]:
yeni_df = fill_with_median_outliers_dataframe(df)

In [34]:
low_limit, up_limit = outlier_threshold(yeni_df, "fare")
print((low_limit, up_limit))

(-26.724, 65.6344)


# Replace with Threshold

In [37]:
def replace_with_thresholds(dataframe, col_name):
  low_limit, up_limit = outlier_threshold(dataframe, col_name)
  dataframe.loc[(dataframe[col_name] < low_limit), col_name] = low_limit
  dataframe.loc[(dataframe[col_name] > up_limit), col_name] = up_limit

In [38]:
replace_with_thresholds(df, "age")

In [39]:
def replace_with_thresholds_dataframe(dataframe):
  cat_cols, num_cols, cat_but_car, num_but_cat = grab_col_names(dataframe)
  for col in num_cols:
    replace_with_thresholds(dataframe, col)

In [40]:
replace_with_thresholds_dataframe(df)

# Multivariate Outlier Analysis

# Remove Outliers

In [59]:
def multivariate_remove_outliers(dataframe, n_neighbors=20, contamination=0.1, threshold_number=10):
  dataframe = dataframe.dropna() # remove missing values
  cat_cols, num_cols, cat_but_car, num_but_cat = grab_col_names(dataframe)
  dataframe = dataframe[num_cols]
  clf = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
  clf.fit_predict(dataframe)
  df_scores = clf.negative_outlier_factor_
  threshold_value = np.sort(df_scores)[threshold_number]
  dataframe = dataframe.loc[df_scores > threshold_value]
  return dataframe

In [60]:
dataframe = multivariate_remove_outliers(df)

In [57]:
dataframe.head()

Unnamed: 0,age,fare
1,38.0,32.204
3,35.0,53.1
11,58.0,26.55
21,34.0,13.0
23,28.0,35.5


In [58]:
dataframe.shape

(168, 2)

# Replace With Thresholds (NOT RECOMMENDED)

In [66]:
def multivariate_replace_with_thresholds(dataframe, n_neighbors=20, contamination=0.1, threshold_number=10):
  dataframe = dataframe.dropna() # remove missing values
  cat_cols, num_cols, cat_but_car, num_but_cat = grab_col_names(dataframe)
  dataframe = dataframe[num_cols]
  clf = LocalOutlierFactor(n_neighbors=n_neighbors, contamination=contamination)
  clf.fit_predict(dataframe)
  df_scores = clf.negative_outlier_factor_
  threshold_value = np.sort(df_scores)[threshold_number]
  threshold_row = dataframe[df_scores == threshold_value]
  no_outlier_df = df_scores > threshold_value
  outliers = dataframe[~no_outlier_df]
  results = outliers.to_records(index=False)
  results[:] = threshold_row.to_records(index=False)
  dataframe[~no_outlier_df] = pd.DataFrame(results, index=dataframe[~no_outlier_df].index)
  return dataframe[~no_outlier_df]

In [67]:
multivariate_replace_with_thresholds(df)

Unnamed: 0,age,fare
10,4.0,32.204
183,4.0,32.204
193,4.0,32.204
205,4.0,32.204
297,4.0,32.204
305,4.0,32.204
340,4.0,32.204
445,4.0,32.204
583,4.0,32.204
618,4.0,32.204
