# Outlier Analysis
## Outlier Detection

In [None]:
import seaborn as sns

# Fetch the diamonds dataset with caching disabled, keep only the
# float64/int64 columns, and drop any row that contains a missing value.
df = (
    sns.load_dataset("diamonds", cache=False)
    .select_dtypes(include=["float64", "int64"])
    .dropna()
)
df.head()

In [None]:
df_table = df["table"]

In [None]:
df_table.head()

In [None]:
sns.boxplot(x = df_table);

In [None]:
Q1 = df_table.quantile(0.25)

In [None]:
Q3 = df_table.quantile(0.75)

In [None]:
IQR = Q3 - Q1

In [None]:
# Tukey fences: anything more than 1.5 * IQR below Q1 or above Q3 is treated
# as an outlier by the cells that follow.
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

In [None]:
lower_bound

In [None]:
upper_bound

In [None]:
# Boolean mask: True where a value lies strictly below lower_bound or strictly
# above upper_bound.
outlier = (df_table < lower_bound) | (df_table > upper_bound)

In [None]:
df_table[outlier]

In [None]:
df_table[outlier].index

### Deletion

In [None]:
import pandas as pd

In [None]:
type(df_table)

In [None]:
# Wrap the Series in a one-column DataFrame so the row-wise `.any(axis = 1)`
# filter used for deletion can be applied to it.
df_table = pd.DataFrame(df_table)

In [None]:
df_table.shape

In [None]:
# Deletion: keep only the rows in which no value falls outside the
# [lower_bound, upper_bound] interval.
new_df = df_table[~((df_table < lower_bound) | (df_table > upper_bound)).any(axis = 1)]
new_df

In [None]:
new_df.shape

### Filling with Mean

In [None]:
# Rebuild `df` from the raw dataset: numeric (float64/int64) columns only,
# rows with missing values removed.
# NOTE(review): presumably reloaded so this section starts from unmodified data — confirm.
df = sns.load_dataset("diamonds", cache = False)
df = df.select_dtypes(include = ["float64", "int64"])
df = df.dropna()
df.head()

In [None]:
df_table = df["table"]
df_table.head()

In [None]:
df_table.mean()

In [None]:
# Overwrite the values flagged by the `outlier` mask with the column mean.
# The mean on the right-hand side is evaluated first, so it still includes
# the outlying values themselves.
df_table[outlier] = df_table.mean()

In [None]:
df_table[outlier]

### Suppression

In [None]:
# Start again from the raw diamonds data: numeric columns only, no missing rows.
numeric_kinds = ["float64", "int64"]
df = sns.load_dataset("diamonds", cache=False).select_dtypes(include=numeric_kinds).dropna()
del numeric_kinds  # helper name only; keep the notebook namespace unchanged
df.head()

In [None]:
df_table = df["table"]
df_table.head()

In [None]:
# Flag both tails first, then pull each flagged value back onto the nearest
# bound (values below the lower bound become lower_bound, values above the
# upper bound become upper_bound).
outlier_low = df_table.lt(lower_bound)
outlier_up = df_table.gt(upper_bound)
df_table.loc[outlier_low] = lower_bound
df_table.loc[outlier_up] = upper_bound

In [None]:
df_table[outlier_low]

In [None]:
df_table[outlier_up]

# Multivariate Outlier Analysis

## Local Outlier Factor

Local Outlier Factor assigns each observation a score based on the density at its location, which lets us identify observations that may be outliers.

The local density of a point is compared with that of its neighbors. If a point's density is significantly lower than that of its neighbors, the point is interpreted as lying in a sparser region than they do. The method therefore relies on a neighborhood structure: if the surroundings of a value are not dense, the value is considered an outlier.


In [None]:
import seaborn as sns
# Load diamonds and keep only the float64/int64 columns.
diamonds = sns.load_dataset('diamonds')
diamonds = diamonds.select_dtypes(include = ['float64', 'int64']) 
# Work on a copy so the numeric `diamonds` frame itself is left untouched.
df = diamonds.copy()
df = df.dropna()
df.head()

In [None]:
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

In [None]:
# Density-based outlier detector that uses 20 neighbours per observation.
# NOTE(review): contamination = 0.1 presumably sets the expected share of
# outliers to 10% — confirm against the scikit-learn documentation.
clf = LocalOutlierFactor(n_neighbors = 20, contamination = 0.1)

In [None]:
clf.fit_predict(df)

In [None]:
# Per-observation scores from the fitted model; the cells below treat the
# lowest (most negative) scores as the strongest outlier candidates.
df_scores = clf.negative_outlier_factor_

In [None]:
df_scores[:10]

In [None]:
np.sort(df_scores)[:20]

In [None]:
np.sort(df_scores)[-20:]

In [None]:
# Use the 14th-lowest score (index 13 of the ascending sort) as the cut-off.
# NOTE(review): 13 is hard-coded; presumably chosen by inspecting the sorted
# scores printed above — confirm before reusing on other data.
threshold = np.sort(df_scores)[13]
threshold

In [None]:
# True for observations scoring strictly below the threshold; the observation
# that defines the threshold is itself not flagged.
outlier_tf = df_scores < threshold
outlier_tf

In [None]:
new_df = df[outlier_tf]
new_df

In [None]:
new_df = df[~outlier_tf]
new_df

In [None]:
df[df_scores == threshold]

#### Suppressor method

In [None]:
# Row(s) whose score equals the threshold; used below as the replacement
# values for every flagged outlier.
suppressor_value = df[df_scores == threshold]

In [None]:
outliers = df[outlier_tf]

In [None]:
outliers

In [None]:
res = outliers.to_records(index = False)
res

In [None]:
# Overwrite every outlier record with the threshold observation's values.
# NOTE(review): this assumes `suppressor_value` holds exactly one row so the
# assignment broadcasts; tied scores at the threshold would presumably make
# this fail — confirm.
res[:] = suppressor_value.to_records(index = False)
res

In [None]:
import pandas as pd
# Write the suppressed records back into `df`, reusing the outlier rows' own
# index so the replacement frame lines up with the rows being assigned.
df[outlier_tf] = pd.DataFrame(res, index = df[outlier_tf].index)

In [None]:
df[outlier_tf]

# Missing Data Analysis

## Fast Solution

In [None]:
import numpy as np
import pandas as pd

# Toy data for the missing-value examples: three numeric columns with NaNs
# scattered across rows (row 6 is missing in every column).
# `np.nan` is the canonical spelling; the `np.NaN` alias was removed in
# NumPy 2.0 and raises AttributeError there.
V1 = np.array([1, 3, 6, np.nan, 7, 1, np.nan, 9, 15])
V2 = np.array([7, np.nan, 5, 8, 12, np.nan, np.nan, 2, 3])
V3 = np.array([np.nan, 12, 5, 6, 14, 7, np.nan, 2, 31])
df = pd.DataFrame(
        {"V1" : V1,
         "V2" : V2,
         "V3" : V3}
)

df

In [None]:
df.isnull().sum()

In [None]:
df.notnull().sum()

In [None]:
df.isnull().sum().sum()

In [None]:
df.isnull()

In [None]:
df[df.isnull().any(axis = 1)]

In [None]:
df[df.notnull().all(axis = 1)]

In [None]:
df[df["V1"].notnull() & df["V2"].notnull() & df["V3"].notnull()]

In [None]:
df[df["V1"].isnull() | df["V2"].isnull() | df["V3"].isnull()]

In [None]:
#Deletion
df.dropna()

In [None]:
#Simple Imputation

In [None]:
df["V1"]

In [None]:
df["V1"].mean()

In [None]:
df["V1"].fillna(df["V1"].mean())

In [None]:
df["V2"]

In [None]:
df["V2"].fillna(0)

In [None]:
df

In [None]:
df.apply(lambda x: x.fillna(x.mean()), axis = 0)

## Missing Value Visualization

In [None]:
!pip install missingno

In [None]:
import missingno as msno

In [None]:
msno.bar(df);

In [None]:
msno.matrix(df);

In [None]:
df = sns.load_dataset("planets", cache = False)
df.head()

In [None]:
msno.matrix(df);

In [None]:
msno.heatmap(df);

## Deletion Methods

In [None]:
import numpy as np
import pandas as pd

# Toy data for the deletion examples: three numeric columns with NaNs.
# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0.
V1 = np.array([1, 3, 6, np.nan, 7, 1, np.nan, 9, 15])
V2 = np.array([7, np.nan, 5, 8, 12, np.nan, np.nan, 2, 3])
V3 = np.array([np.nan, 12, 5, 6, 14, 7, np.nan, 2, 31])

df = pd.DataFrame(
        {"V1" : V1,
         "V2" : V2,
         "V3" : V3}
)

df

In [None]:
df.dropna()

In [None]:
df.dropna(how = "all")

In [None]:
df.dropna(axis = 1)

## Imputation

### Numeric

In [None]:
df

In [None]:
df.apply(lambda x: x.fillna(x.mean()), axis = 0)

In [None]:
df.fillna(df.mean()[:])

In [None]:
df.fillna(df.mean()["V1":"V2"])

In [None]:
df.where(pd.notna(df), df.mean(), axis = "columns")

In [None]:
# Toy data for group-wise imputation: `salary` has missing values and
# `department` is the grouping key.
# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0.
V1 = np.array([1, 3, 6, np.nan, 7, 1, np.nan, 9, 15])
V2 = np.array([7, np.nan, 5, 8, 12, np.nan, np.nan, 2, 3])
V3 = np.array([np.nan, 12, 5, 6, 14, 7, np.nan, 2, 31])
V4 = np.array(["IT", "IT", "HR", "HR", "HR", "HR", "HR", "IT", "IT"])

df = pd.DataFrame(
        {"salary" : V1,
         "V2" : V2,
         "V3" : V3,
         "department" : V4}
)

df

In [None]:
df.groupby("department")["salary"].mean()

In [None]:
df["salary"]

In [None]:
# Fill each missing salary with the mean salary of that row's department;
# transform("mean") yields one group mean per original row, so it lines up
# with the `salary` column being filled.
df["salary"].fillna(df.groupby("department")["salary"].transform("mean"))

### Categorical

In [None]:
import numpy as np
import pandas as pd

# Toy data for categorical imputation: `department` contains one missing label.
# `np.nan` replaces the `np.NaN` alias, which was removed in NumPy 2.0.
V1 = np.array([1, 3, 6, np.nan, 7, 1, np.nan, 9, 15])
# dtype=object keeps the NaN as a real missing value instead of the string "nan".
V4 = np.array(["IT", np.nan, "HR", "HR", "HR", "HR", "HR", "IT", "IT"], dtype=object)

df = pd.DataFrame(
        {"salary" : V1,
         "department" : V4}
)

df

In [None]:
df["department"].mode()

In [None]:
df["department"].fillna(df["department"].mode()[0])

In [None]:
# Backward fill: each missing department takes the next valid value after it.
# `.bfill()` replaces `fillna(method="bfill")`, whose `method` argument is
# deprecated in pandas.
df["department"].bfill()

In [None]:
# Forward fill: each missing department takes the last valid value before it.
# `.ffill()` replaces `fillna(method="ffill")`, whose `method` argument is
# deprecated in pandas.
df["department"].ffill()

# Prediction-Based Imputation Methods

In [None]:
import seaborn as sns
import missingno as msno
# Load Titanic and keep only the int64/float64 columns.
df = sns.load_dataset("titanic", cache=False)
df = df.select_dtypes(include = ["int64", "float64"])
print(df.head())
# Missing-value count per remaining column.
df.isnull().sum()

In [None]:
!pip install ycimpute

## KNN

In [None]:
from ycimpute.imputer import knnimput

In [None]:
var_names = list(df)

In [None]:
import numpy as np
n_df = np.array(df)

In [None]:
n_df[:10]

In [None]:
n_df.shape

In [None]:
dff = knnimput.KNN(k = 4).complete(n_df)

In [None]:
type(dff)

In [None]:
dff = pd.DataFrame(dff, columns=var_names)

In [None]:
dff.isnull().sum()

## Random Forest

import seaborn as sns
import missingno as msno
# Reload Titanic and keep only the int64/float64 columns.
df = sns.load_dataset("titanic", cache=False)
df = df.select_dtypes(include = ["int64", "float64"])
df.isnull().sum()

# Remember the column names; the NumPy array built below does not carry them.
var_names = list(df)

import numpy as np
n_df = np.array(df)

from ycimpute.imputer import iterforest
# NOTE(review): presumably fills the missing entries of n_df with an iterative
# random-forest imputer — confirm against the ycimpute documentation.
dff = iterforest.IterImput().complete(n_df)

## EM

In [None]:
import seaborn as sns 
import missingno as msno 
df = sns.load_dataset("titanic", cache=False) 
df = df.select_dtypes(include = ["int64", "float64"])
df.isnull().sum()


In [None]:
var_names = list(df)

import numpy as np 
n_df = np.array(df)

In [None]:
from ycimpute.imputer import EM

In [None]:
dff = EM().complete(n_df)

In [None]:
dff = pd.DataFrame(dff, columns=var_names)
dff.isnull().sum()

# Data Standardization

In [None]:
import numpy as np
import pandas as pd

# Three small integer vectors that become the columns of the demo frame.
V1, V2, V3 = (
    np.array(values)
    for values in ([1, 3, 6, 5, 7], [7, 7, 5, 8, 12], [6, 12, 5, 6, 14])
)

# Assemble the frame and cast every column to float in a single expression.
df = pd.DataFrame({"V1": V1, "V2": V2, "V3": V3}).astype(float)
df

## Standardization

In [None]:
from sklearn import preprocessing

In [None]:
preprocessing.scale(df)

## Normalization

In [None]:
preprocessing.normalize(df)

## Binarize

In [None]:
binarizer = preprocessing.Binarizer(threshold=5).fit(df)
binarizer.transform(df)

## Min-Max

In [None]:
scaler = preprocessing.MinMaxScaler(feature_range=(10, 20))

In [None]:
scaler.fit_transform(df)

# Variable Transformations

In [None]:
import seaborn as sns
df = sns.load_dataset('tips')
df.head()

## 0-1 Transformation

In [None]:
from sklearn.preprocessing import LabelEncoder
lbe = LabelEncoder()

In [None]:
lbe.fit_transform(df["sex"])

In [None]:
df["n_sex"] = lbe.fit_transform(df["sex"])
df

## 1 and Others(0) Transformation

In [None]:
# Binary indicator: 1 where the `day` label contains the substring "Sun",
# 0 for every other day.
df["n_day"] = np.where(df["day"].str.contains("Sun"), 1, 0)
df

In [None]:
df["n_day"].value_counts()

In [None]:
df["day"].value_counts()

## Multi-class Transformation

In [None]:
lbe = LabelEncoder()

In [None]:
lbe.fit_transform(df["day"])

## One-Hot Transformation and Dummy Variable

In [None]:
df.head()

In [None]:
# One-hot encode `sex`. dtype = int yields 0/1 indicator columns (recent pandas
# versions otherwise return True/False), consistent with the `day` encoding
# in the following cell.
df_one_hot = pd.get_dummies(df, columns= ["sex"], prefix = ["sex"], dtype = int)
df_one_hot.head()

In [None]:
pd.get_dummies(df, columns= ["day"], prefix = ["day"], dtype = int)

## Converting a Continuous Variable to a Categorical Variable

In [None]:
import seaborn as sns
df = sns.load_dataset('tips')
df.head()

In [None]:
dff = df.select_dtypes(include = ["float64", "int64"])

In [None]:
# Quantile-based binning with ordinal (integer) bin codes; n_bins gives one
# bin count per column of `dff`, in column order (3, 2 and 2 bins).
# NOTE(review): assumes `dff` has exactly three numeric columns — confirm.
est = preprocessing.KBinsDiscretizer(n_bins=[3,2,2], encode = "ordinal", strategy="quantile").fit(dff)

In [None]:
est.transform(dff)[:10]

## Index to Variable

In [None]:
df.head()

In [None]:
df["new_var"] = df.index

In [None]:
df

In [None]:
df["new_var"] = df["new_var"] + 10

In [None]:
df.head()

In [None]:
# Promote the shifted `new_var` column to be the frame's index; the column
# itself also remains in the frame.
df.index = df["new_var"]

In [None]:
df.index

In [None]:
df.head()