# In [1]:
#############################################
# FEATURE ENGINEERING & DATA PRE-PROCESSING
#############################################

import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import missingno as msno
from sklearn.neighbors import LocalOutlierFactor

# Load datasets
def load_application_train(path="application_train.csv"):
    """Read the application_train dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to "application_train.csv"
            (resolved against the current working directory), which is the
            path this loader previously had hard-coded.

    Returns:
        pandas.DataFrame with the parsed file contents.
    """
    return pd.read_csv(path)

def load(path="titanic.csv"):
    """Read the Titanic dataset from a CSV file.

    Args:
        path: Location of the CSV file. Defaults to "titanic.csv" (resolved
            against the current working directory), which is the path this
            loader previously had hard-coded.

    Returns:
        pandas.DataFrame with the parsed file contents.
    """
    return pd.read_csv(path)

# See the shape of the datasets
df_application = load_application_train()
print(df_application.shape)  # (307511, 122)

df_titanic = load()
print(df_titanic.shape)  # (891, 12)

# Detect outliers in 'Age' column using boxplot
sns.boxplot(x=df_titanic["Age"])
plt.show()

# Find Q1, Q3, and IQR for 'Age'; the fences sit 1.5 * IQR beyond the quartiles
q1 = df_titanic["Age"].quantile(0.25)
q3 = df_titanic["Age"].quantile(0.75)
iqr = q3 - q1
up = q3 + 1.5 * iqr
low = q1 - 1.5 * iqr

# Build the boolean masks once and reuse them for every inspection below
age_below_low = df_titanic["Age"] < low
age_outlier_mask = age_below_low | (df_titanic["Age"] > up)

# See outliers in 'Age'
print(df_titanic[age_outlier_mask])

# Index of outliers
print(df_titanic[age_outlier_mask].index)

# Check whether any outlier row exists. The mask itself is tested: calling
# .any(axis=None) on the filtered rows would instead ask whether any *cell*
# of those rows is truthy, which is False for rows holding only 0/NaN/"".
print(age_outlier_mask.any())  # True
print(age_below_low.any())  # False

# Define function for outlier thresholds
def outlier_thresholds(dataframe, col_name, q1=0.25, q3=0.75):
    """Compute the lower and upper outlier fences for one column.

    The fences are placed 1.5 times the inter-quantile range below the
    lower quantile and above the upper quantile.

    Args:
        dataframe: Frame that contains the column.
        col_name: Name of the column to analyse.
        q1: Quantile level used as the lower reference (default 0.25).
        q3: Quantile level used as the upper reference (default 0.75).

    Returns:
        Tuple ``(low_limit, up_limit)``.
    """
    column = dataframe[col_name]
    lower_quantile = column.quantile(q1)
    upper_quantile = column.quantile(q3)
    margin = 1.5 * (upper_quantile - lower_quantile)
    return lower_quantile - margin, upper_quantile + margin

# IQR fences for 'Age'
age_limits = outlier_thresholds(df_titanic, "Age")
print(age_limits)  # (-6.6875, 64.8125)

# Rebind the module-level fences to those of 'Fare' and preview its outliers
low, up = outlier_thresholds(df_titanic, "Fare")
fare = df_titanic["Fare"]
fare_outliers = df_titanic[(fare < low) | (fare > up)]
print(fare_outliers.head())

# Function to check if there are outliers
def check_outlier(dataframe, col_name):
    """Report whether a column holds any value outside its IQR fences.

    The fences come from ``outlier_thresholds`` with its default quantiles.

    Args:
        dataframe: Frame that contains the column.
        col_name: Name of the column to test.

    Returns:
        True when at least one row lies below the lower fence or above the
        upper fence, otherwise False.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    column = dataframe[col_name]
    outlier_mask = (column > up_limit) | (column < low_limit)
    # Test the mask, not the filtered rows: `.any(axis=None)` on the rows asks
    # whether any cell is truthy, so an outlier row made only of 0/NaN/""
    # (e.g. a lone 0 below the fence in a one-column frame) was reported as
    # "no outliers".
    return bool(outlier_mask.any())

# Both 'Age' and 'Fare' are expected to report outliers (True)
for column_name in ("Age", "Fare"):
    print(check_outlier(df_titanic, column_name))

# Function to grab column names
def grab_col_names(dataframe, cat_th=10, car_th=20):
    """Split the columns of a frame into categorical, numeric and cardinal groups.

    Classification rules:
      * object-dtype columns are categorical;
      * non-object columns with fewer than ``cat_th`` distinct values are
        treated as categorical as well ("numeric but categorical");
      * object columns with more than ``car_th`` distinct values are
        "categorical but cardinal" and are excluded from the categorical list;
      * the remaining non-object columns are numeric.

    A short summary of the counts is printed.

    Args:
        dataframe: Frame whose columns are classified.
        cat_th: Distinct-value threshold below which a non-object column
            counts as categorical.
        car_th: Distinct-value threshold above which an object column counts
            as cardinal.

    Returns:
        Tuple ``(cat_cols, num_cols, cat_but_car)`` of column-name lists.
    """
    object_cols = []
    non_object_cols = []
    num_but_cat = []
    cat_but_car = []

    # Single pass over the columns, preserving their original order.
    for name in dataframe.columns:
        series = dataframe[name]
        if series.dtypes == "O":
            object_cols.append(name)
            if series.nunique() > car_th:
                cat_but_car.append(name)
        else:
            non_object_cols.append(name)
            if series.nunique() < cat_th:
                num_but_cat.append(name)

    cat_cols = [name for name in object_cols + num_but_cat if name not in cat_but_car]
    num_cols = [name for name in non_object_cols if name not in num_but_cat]

    n_rows, n_cols = dataframe.shape[0], dataframe.shape[1]
    print(f"Observations: {n_rows}")
    print(f"Variables: {n_cols}")
    print(f"cat_cols: {len(cat_cols)}")
    print(f"num_cols: {len(num_cols)}")
    print(f"cat_but_car: {len(cat_but_car)}")
    print(f"num_but_cat: {len(num_but_cat)}")

    return cat_cols, num_cols, cat_but_car

# Classify the Titanic columns; this binds the module-level cat_cols,
# num_cols and cat_but_car names that later steps rebind again.
cat_cols, num_cols, cat_but_car = grab_col_names(df_titanic)
# Exclude "PassengerId" from the numeric columns before checking for outliers
# (presumably a row identifier rather than a measurement; verify).
num_cols = [col for col in num_cols if col != "PassengerId"]
print(num_cols)  # ['Age', 'Fare']

# Check outliers in numerical columns
for col in num_cols:
    print(col, check_outlier(df_titanic, col))

# Same procedure on application_train: the CSV is read again into `dff`
# and the three column lists are rebound to describe that frame.
dff = load_application_train()
cat_cols, num_cols, cat_but_car = grab_col_names(dff)
# list.remove raises ValueError if 'SK_ID_CURR' is not among the numeric columns.
num_cols.remove('SK_ID_CURR')

# Check outliers in application_train dataset
for col in num_cols:
    print(col, check_outlier(dff, col))

# Function to grab outliers
def grab_outliers(dataframe, col_name, outlier_index=False, f=5):
    """Print the rows whose value in a column lies outside its IQR fences.

    When more than 10 outlier rows are found, only the first ``f`` of them
    are printed; otherwise all of them are printed.

    Args:
        dataframe: Frame that contains the column.
        col_name: Name of the column to inspect.
        outlier_index: When truthy, return the index of the outlier rows.
        f: Number of rows shown when the result is truncated.

    Returns:
        The index of the outlier rows if ``outlier_index`` is truthy,
        otherwise None.
    """
    lower, upper = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    flagged = dataframe[(values < lower) | (values > upper)]

    # Long results are cut down to a preview of `f` rows.
    preview = flagged.head(f) if flagged.shape[0] > 10 else flagged
    print(preview)

    if not outlier_index:
        return None
    return flagged.index

# Print the 'Age' outlier rows and keep their index labels
age_index = grab_outliers(df_titanic, "Age", outlier_index=True)
print(age_index)

# Remove outliers
def remove_outlier(dataframe, col_name):
    """Return the rows whose value in a column is not outside its IQR fences.

    Rows are kept unless their value is strictly below the lower fence or
    strictly above the upper fence; the input frame is not modified.

    Args:
        dataframe: Frame that contains the column.
        col_name: Name of the column used for filtering.

    Returns:
        A frame holding only the rows that were not flagged as outliers.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    values = dataframe[col_name]
    is_outlier = (values < low_limit) | (values > up_limit)
    return dataframe[~is_outlier]

# Re-read the CSV so df_titanic is rebound to a freshly loaded frame,
# then rebuild the column lists for it.
df_titanic = load()
cat_cols, num_cols, cat_but_car = grab_col_names(df_titanic)
# list.remove raises ValueError if 'PassengerId' is not among the numeric columns.
num_cols.remove('PassengerId')

# Filter column by column. df_titanic is reassigned on every pass, so each
# column's fences are computed on the frame already filtered by earlier columns.
for col in num_cols:
    df_titanic = remove_outlier(df_titanic, col)

# NOTE(review): the shape below is the author's recorded output; re-run to confirm.
print(df_titanic.shape)  # (765,12)

# Reassign outliers with thresholds
def replace_with_thresholds(dataframe, variable):
    """Cap a column's outliers at its IQR fences, modifying the frame in place.

    Values below the lower fence become the lower fence and values above the
    upper fence become the upper fence; missing values are left untouched.

    Args:
        dataframe: Frame that contains the column; it is mutated.
        variable: Name of the column to cap.

    Returns:
        None.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    # Clip and reassign the whole column instead of writing through .loc:
    # the fences are floats, and setting a float into an integer column via
    # .loc is an incompatible-dtype assignment (deprecated in pandas 2.1),
    # whereas clip upcasts the column when needed.
    dataframe[variable] = dataframe[variable].clip(lower=low_limit, upper=up_limit)

# Re-read the CSV so df_titanic is rebound to a freshly loaded frame,
# then rebuild the column lists for it.
df_titanic = load()
cat_cols, num_cols, cat_but_car = grab_col_names(df_titanic)
# list.remove raises ValueError if 'PassengerId' is not among the numeric columns.
num_cols.remove('PassengerId')

# Cap every numeric column at its fences; df_titanic is modified in place.
for col in num_cols:
    replace_with_thresholds(df_titanic, col)

# Re-run the check on the capped frame (each column should now report False; verify).
for col in num_cols:
    print(col, check_outlier(df_titanic, col))

# Process diamonds dataset: keep only float64/int64 columns and drop rows with NaN
numeric_dtypes = ['float64', 'int64']
df_diamonds = sns.load_dataset('diamonds').select_dtypes(include=numeric_dtypes).dropna()
print(df_diamonds.shape)  # (53940, 7)

for col in df_diamonds.columns:
    print(col, check_outlier(df_diamonds, col))

# Count the IQR outliers of two columns.
# Recorded shapes: carat -> (1889, 7), depth -> (2545, 7)
for column_name in ("carat", "depth"):
    low, up = outlier_thresholds(df_diamonds, column_name)
    values = df_diamonds[column_name]
    print(df_diamonds[(values < low) | (values > up)].shape)

# Local Outlier Factor (LOF)
clf = LocalOutlierFactor(n_neighbors=20)
clf.fit_predict(df_diamonds)  # predicted labels are discarded; only the fitted scores are used
df_scores = clf.negative_outlier_factor_

# Sort once (ascending, so the lowest scores come first) and reuse the result
sorted_scores = np.sort(df_scores)
print(sorted_scores[:5])  # array([-8.60430658, -8.20889984, -5.86084355, -4.98415175, -4.81502092])

# Plot LOF scores
scores = pd.DataFrame(sorted_scores)
scores.plot(stacked=True, xlim=[0, 20], style='.-')
plt.show()

# Set threshold (the 4th lowest score) and drop the rows scoring below it
th = sorted_scores[3]  # -4.9841
lof_outliers = df_diamonds[df_scores < th]
print(lof_outliers)
print(df_diamonds.drop(index=lof_outliers.index).shape)  # (53937, 7)


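# Captured output: ModuleNotFoundError: No module named 'missingno'
# (the `missingno` package imported at the top must be installed before running)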