Datasets to upload before running:
Pokémon dataset (`pokemon.csv`)

# **Extreme Value Analysis Using Univariate Methods**

In [None]:
import seaborn as sns

# Load seaborn's "diamonds" example dataset and preview its first rows.
diamonds = sns.load_dataset("diamonds")
diamonds.head()

In [None]:
# Show the column names of the diamonds frame.
diamonds.columns

In [None]:
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

# Split the frame: every column except "carat" is a feature, "carat" is the target.
X, y = diamonds.drop("carat", axis=1), diamonds["carat"]
# Names of the non-numeric (categorical) feature columns
cats = X.select_dtypes(exclude=np.number).columns.tolist()

# Encode categoricals as ordinal codes.
# Assign via X[cats] (whole-column replacement) instead of X.loc[:, cats]:
# .loc first tries to write the float codes into the existing non-numeric
# columns in place, which recent pandas versions warn about as an
# incompatible-dtype assignment.
X[cats] = OrdinalEncoder().fit_transform(X[cats])

In [None]:
# Summary statistics of the target (carat).
y.describe()

In [None]:
import matplotlib.pyplot as plt
# Histogram of the carat target.
sns.histplot(y)

# Trailing ';' suppresses the text repr of the label object in the cell output.
plt.xlabel("Carat");

In [None]:
# Summary statistics of the features with the (encoded) categorical columns left out.
X.drop(cats, axis=1).describe()

In [None]:
# Histogram of the price feature.
sns.histplot(X["price"])

plt.title("Histogram of Diamond Pricing")
plt.xlabel("price");

In [None]:
# Price distribution per cut; whis=2 overrides the plotting library's default whisker setting.
sns.boxplot(x="cut", y="price", data=diamonds, whis=2)

# The x axis label is blanked out; the tick labels already name each cut.
plt.xlabel("")
plt.ylabel("Price");

In [None]:
from scipy.stats import zscore

# Absolute z-scores of the first five carat values.
# np.abs works whether zscore() hands back a pandas Series or a plain ndarray
# (the return type depends on the SciPy version); ndarray has no .abs() method.
np.abs(zscore(y))[:5]

In [None]:
from scipy.stats import zscore

# Flag observations whose absolute z-score exceeds 3.
# np.abs is used instead of the .abs() method so this works whether zscore()
# returns a pandas Series or a plain ndarray (SciPy-version dependent).
is_outlier = np.abs(zscore(y)) > 3
outliers_y = y[is_outlier]

num_outliers = len(outliers_y)
print(f"The number of outliers: {num_outliers}")
print(f"Percentage of outliers: {num_outliers / len(y):.3f}")

In [None]:
# Median absolute deviation, computed by hand in three steps.

# 1) centre of the distribution
median_y = y.median()

# 2) absolute distance of every observation from that centre
abs_diffs = y.sub(median_y).abs()

# 3) the median of those distances is the MAD
mad = abs_diffs.median()

mad

In [None]:
from scipy.stats import median_abs_deviation

# Same statistic via SciPy, for comparison with the hand-computed `mad` above.
median_abs_deviation(y)

In [None]:
%pip install pyod

In [None]:
from pyod.models.mad import MAD

# Reshape the target into a single-column 2D array (n_samples, 1)
y_2d = y.values.reshape(-1, 1)
# Fit the MAD detector to the target.
# NOTE: this rebinds `mad`, which previously held the hand-computed MAD value.
mad = MAD().fit(y_2d)

# Extract the per-observation inlier/outlier labels from the fitted detector
labels = mad.labels_
labels

In [None]:
# Carat values whose detector label is 1 (the label used for outliers here)
outliers_y_mad = y[labels == 1]

# Report the count and the fraction of observations flagged
num_outliers = len(outliers_y_mad)
print(f"The number of outliers: {num_outliers}")
print(f"Percentage of outliers: {num_outliers / len(y):.3f}")

In [None]:
# Keep only the inliers, which the detector marks with label 0
inliers = y[labels == 0]

# Histogram of the carat values that remain once flagged outliers are excluded
sns.histplot(inliers)

plt.title("Histogram of Diamond Carats After Outliers")
plt.xlabel("Carat");

In [None]:
def tukey_method(data, threshold=1.5):
    """Flag outliers with Tukey's fences.

    A value is flagged when it lies more than ``threshold`` interquartile
    ranges below the 25th percentile or above the 75th percentile.

    Args:
        data: one-dimensional numeric array-like (e.g. NumPy array or pandas Series).
        threshold: multiple of the IQR that defines the fences (default 1.5).

    Returns:
        NumPy array with the positional indices (0-based) of the flagged values.
    """
    first_q, third_q = (np.percentile(data, pct) for pct in (25, 75))
    spread = third_q - first_q
    below = data < first_q - threshold * spread
    above = data > third_q + threshold * spread
    return np.where(below | above)[0]

In [None]:
# Positions of the carat values flagged by Tukey's method, and how many there are.
list_to_label = tukey_method(y)
len(list_to_label)

In [None]:
# Horizontal boxplot of carat with the Tukey-flagged values overlaid in red.
plt.figure(figsize=(8, 6))
plt.boxplot(y, vert=False, labels=['carat'])
# tukey_method returns positions, so select with .iloc (y[...] would look up index labels).
# The constant 1 is the y-position of the single box drawn above.
plt.scatter(y.iloc[list_to_label], [1] * len(list_to_label), color='red', label='Outliers')
plt.legend()
plt.show()

In [None]:
# list_to_label holds row positions, so values are fetched positionally with .iloc
# (y[...] would perform a label lookup, which only matches on a default index).
print("Indexes of Outliers:", list_to_label)
print("Values of Outliers:", y.iloc[list_to_label])

In [None]:
import pandas as pd
# Work on an independent copy: helper columns added to `df` below must never
# leak into `diamonds` (pd.DataFrame(diamonds) does not guarantee a copy).
df = diamonds.copy()
# One row per flagged observation; 'index' holds the row position from tukey_method
odf = pd.DataFrame(list_to_label, columns = ['index'])

In [None]:
# Every row in odf is a flagged observation, so mark them all with 'yes'.
odf['outliers_status'] = 'yes'
odf

In [None]:
# Expose the row index as a regular column so it can serve as the join key.
df['index'] = df.index
df

In [None]:
# Attach the outlier flag to every row (NaN for rows that were not flagged).
# Dropping any existing 'outliers_status' first keeps the cell re-runnable:
# joining a second time would otherwise fail on the overlapping column name.
df = df.drop(columns='outliers_status', errors='ignore').join(odf.set_index('index'), on='index')

In [None]:
# Show the full rows that were flagged as outliers.
df[df['outliers_status'] == 'yes']

# **Multivariate Analysis**

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px

# Read the Pokémon dataset from pokemon.csv (path relative to the working directory) and preview it.
pokemon = pd.read_csv('pokemon.csv')
pokemon.head()

In [None]:
# Column dtypes and non-null counts of the raw frame.
pokemon.info()

In [None]:
# Horizontal boxplots of the attack and defense columns, side by side
ax = sns.boxplot(data = pokemon[["attack", "defense"]], orient = "h", palette = "Set2")
# Axis labels and title
ax.set_xlabel("Value", fontsize = 20, labelpad = 20)
ax.set_ylabel("Attributes", fontsize = 20, labelpad = 20)
ax.set_title("Boxplot of pokemon Attack \nand Defense attributes", fontsize = 20,
            pad = 20)
# Enlarge the tick labels (major and minor ticks)
ax.tick_params(which = "both", labelsize = 15)

In [None]:
def IQR_bounds(dataframe, column_name, multiple):
    """Compute IQR-based outlier fences for a single column.

    Input:
        dataframe: DataFrame that holds the column of interest
        column_name: name of the column to compute the fences for
        multiple: number of IQRs beyond the quartiles at which the fences sit

    Output:
        lower_bound = first quartile minus multiple * IQR
        upper_bound = third quartile plus multiple * IQR"""

    # first and third quartile of the requested column
    values = dataframe[column_name]
    q1, q3 = (values.quantile(q) for q in (0.25, 0.75))

    # distance from each quartile to its fence
    margin = multiple * (q3 - q1)

    return q1 - margin, q3 + margin


In [None]:
# Columns to screen for outliers
columns = ["attack", "defense"]
# Maps column name -> [lower_bound, upper_bound]
column_bounds = {}
# Compute the IQR fences for each column
for column in columns:
    # 1.5 * IQR fences for this column
    lower_bound, upper_bound =  IQR_bounds(pokemon, column, 1.5)
    # store them under the column name
    column_bounds[column] = [lower_bound, upper_bound]
# Keep the rows flagged as outliers: attack OR defense lies outside its fences
pokemon_IQR_AD = pokemon[(pokemon["attack"] < column_bounds["attack"][0]) |
                         (pokemon["attack"] > column_bounds["attack"][1]) |
                         (pokemon["defense"] < column_bounds["defense"][0]) |
                         (pokemon["defense"] > column_bounds["defense"][1])
                        ]

In [None]:
# (rows, columns) of the IQR-flagged subset; the row count is the number of flagged Pokémon.
pokemon_IQR_AD.shape

## ***Isolation Forest***

In [None]:
from sklearn.ensemble import IsolationForest
# Create the detector; random_state is fixed so repeated runs give the same result
isf = IsolationForest(n_estimators = 100, random_state = 42, contamination = 0.02)
# Fit and predict on the same data, since the whole dataset is being screened
preds = isf.fit_predict(pokemon[["attack", "defense"]])
# Store the predicted labels on the frame
pokemon["iso_forest_outliers"] = preds
# Cast to str so the labels are treated as categories (not a numeric scale) when plotting
pokemon["iso_forest_outliers"] = pokemon["iso_forest_outliers"].astype(str)
# Store the decision-function score of each row (a continuous measure of outlier strength)
pokemon["iso_forest_scores"] = isf.decision_function(pokemon[["attack", "defense"]])
# Print how many rows received each label
print(pokemon["iso_forest_outliers"].value_counts())

In [None]:
#this plot will be repeated so it is better to create a function
def scatter_plot(dataframe, x, y, color, title, hover_name):
    """Draw and display a coloured plotly express scatter plot.

    Input:
        dataframe: DataFrame containing the x, y, color and hover_name columns
        x: column name plotted on the x axis
        y: column name plotted on the y axis
        color: column name that determines the point colour
        title: title of the plot (centred horizontally)
        hover_name: column name shown when hovering over a point

    Returns:
        None. The figure is displayed via ``fig.show()`` and not returned.
    """
    # build the scatter plot itself
    figure = px.scatter(dataframe, x=x, y=y, color=color, hover_name=hover_name)
    # add the title and centre it
    figure.update_layout(title=title, title_x=0.5)
    # render it
    figure.show()

# Attack vs. defense, coloured by the Isolation Forest label; hovering shows the Pokémon name
scatter_plot(pokemon, "attack", "defense", "iso_forest_outliers", "Isolation Forest Outlier Detection","name")

In [None]:
# Same scatter, coloured by the continuous Isolation Forest score instead of the label
scatter_plot(pokemon, "attack", "defense", "iso_forest_scores",
             "Isolation Forest Outlier Detection Scores",
            "name")

In [None]:
# Distribution of the Isolation Forest scores.
sns.histplot(pokemon['iso_forest_scores'])

## ***Local Outlier Factor***

In [None]:
#import the algorithm
from sklearn.neighbors import LocalOutlierFactor
# Initialise the algorithm with 20 neighbours
lof = LocalOutlierFactor(n_neighbors = 20)
# Fit and predict on the same data; this is fine because the model is not used for novelty detection
y_pred = lof.fit_predict(pokemon[["attack", "defense"]])
# NOTE: y_pred is a new name and does not touch the diamonds target `y` defined earlier
# Store the predictions as strings so they are treated as categories when plotting
pokemon["lof_outliers"] = y_pred.astype(str)
# Print how many rows received each label
print(pokemon["lof_outliers"].value_counts())
# Store the per-row negative outlier factor exposed by the fitted model
pokemon["lof_scores"] = lof.negative_outlier_factor_


In [None]:
# Attack vs. defense, coloured by the Local Outlier Factor label
scatter_plot(pokemon, "attack", "defense", "lof_outliers", "Local Outlier Factor Detection","name")

In [None]:
# Same scatter, coloured by the continuous LOF score instead of the label
scatter_plot(pokemon, "attack", "defense", "lof_scores",
             "Local Outlier Factor Detection Scores",
            "name")

In [None]:
# Re-inspect the frame now that the outlier label/score columns have been added.
pokemon.info()

In [None]:
# Inspect the distinct values stored in capture_rate.
pokemon['capture_rate'].unique()

In [None]:
# Subset of stat columns plus the Isolation Forest label and score, used for the pair plot below.
pokemon_num = pokemon[['attack', 'defense', 'capture_rate', 'hp', 'speed', 'height_m', 'weight_kg', 'sp_attack', 'sp_defense', 'iso_forest_outliers', 'iso_forest_scores']]

In [None]:
# Check dtypes and non-null counts of the selected columns.
pokemon_num.info()

In [None]:
# Fill missing values in the numeric columns with that column's mean.
# The previous fillna(..., inplace=True) ran on the temporary frame returned by
# select_dtypes(), so pokemon_num itself was never modified. Assign the result back.
# fillna with a Series fills each column named in the Series index; other columns are untouched.
numeric_means = pokemon_num.select_dtypes(include = 'number').mean()
pokemon_num = pokemon_num.fillna(numeric_means)

In [None]:
# Pairwise relationships between the selected columns, coloured by the Isolation Forest label.
sns.set_theme(style="ticks")
sns.pairplot(pokemon_num, hue='iso_forest_outliers')

## ***DBSCAN***

In [None]:
from sklearn.cluster import DBSCAN

# Initialise the algorithm
# Neighbourhood radius (eps) of 20 with Euclidean distance, and min_samples of 10
outlier_detection = DBSCAN(eps = 20, metric = "euclidean", min_samples = 10, n_jobs = -1)

# Fit and predict on the existing data; the result is one cluster label per row
clusters = outlier_detection.fit_predict(pokemon[["attack", "defense"]])

# Store the raw cluster labels on the frame
pokemon["dbscan_outliers"] = clusters

# Collapse the labels to two string classes: any label above -1 becomes "1" (inlier),
# everything else becomes "-1" (outlier)
pokemon["dbscan_outliers"] = pokemon["dbscan_outliers"].apply(lambda x: str(1) if x>-1 else str(-1))

# Print the value counts
print(pokemon["dbscan_outliers"].value_counts())

In [None]:
# Colour each point by its raw DBSCAN label from the fitted model (not the collapsed "1"/"-1" classes).
colors = outlier_detection.labels_
plt.scatter(pokemon["attack"], pokemon["defense"], c = colors)