# 2. Data pre-processing: handling outliers and inconsistent data values

In [8]:
# Importing required libraries
import statistics as st            # For calculating statistical measures like mean, median, etc.
import numpy as np                 # For numerical computations
import pandas as pd                # (Not used here, but often useful for handling tabular data)
import matplotlib.pyplot as plt    # For plotting (not used in this code but imported)
import seaborn as sns              # For visualizations (not used in this code but imported)

# Sample dataset: mostly small values, with 101 sitting far away from the rest
data = [15, 101, 18, 7, 13, 16, 11, 21, 5, 15, 10, 9]

# Descriptive statistics to report, as (label, function) pairs in display order.
# st.variance / st.stdev are the *sample* variance and standard deviation.
summary_stats = (
    ('Mean           : ', st.mean),       # Average value
    ('Median         : ', st.median),     # Middle value
    ('Mode           : ', st.mode),       # Most frequent value
    ('Varience       : ', st.variance),   # Spread of the data (std dev squared)
    ('Std Deviation  : ', st.stdev),      # Standard deviation
)

# Print one line per statistic
for stat_label, stat_func in summary_stats:
    print(stat_label, stat_func(data))


Mean           :  20.083333333333332
Median         :  14.0
Mode           :  15
Varience       :  670.6287878787879
Std Deviation  :  25.896501460212495


## Removing outliers using Z-Score method

In [9]:
# --- Removing outliers using Z-Score method ---

# Detect outliers with the Z-score method
def detect_outliers_zscore(data, thres=3):
    """Return the values of ``data`` whose absolute Z-score exceeds ``thres``.

    The Z-score of a value is ``(value - mean) / std``, where ``std`` is the
    population standard deviation (NumPy's default, ``ddof=0``).

    Results are collected in a new list on every call, so calling the function
    repeatedly (or re-running the cell) never accumulates earlier results.

    Parameters
    ----------
    data : iterable of numbers
        Observations to screen.
    thres : float, default 3
        Z-score magnitude above which a value is reported as an outlier.

    Returns
    -------
    list
        The outlying values, in their original order and with their original
        types. Empty when ``data`` is empty or all values are identical.

    Notes
    -----
    With the population standard deviation, no |Z-score| can exceed
    ``sqrt(n - 1)``; with the default threshold of 3, samples of 10 or fewer
    values can therefore never produce an outlier.
    """
    values = list(data)
    if not values:
        return []

    arr = np.asarray(values, dtype=float)
    std = arr.std()
    if std == 0:
        # Constant data: Z-scores are undefined (0 / 0), so nothing is flagged.
        return []

    z_scores = (arr - arr.mean()) / std
    # Return the original elements (not the float copies) so that membership
    # tests such as ``x not in result`` behave as callers expect.
    return [value for value, z in zip(values, z_scores) if abs(z) > thres]

# Find the Z-score outliers, then keep only the values that were not flagged
out = detect_outliers_zscore(data)
NewData = [value for value in data if value not in out]

# Side-by-side comparison of each statistic before and after outlier removal.
# Each row is (label, statistic function, tab padding between the two columns).
print("               with Outlier          without Outlier")
comparison_rows = (
    ("Mean           : ", st.mean, "\t\t"),
    ('Median         : ', st.median, "\t\t\t"),
    ('Mode           : ', st.mode, "\t\t\t"),
    ('Variance       : ', st.variance, "\t\t"),
    ('Std deviation  : ', st.stdev, "\t\t\t"),
)
for row_label, stat_func, padding in comparison_rows:
    print(row_label, round(stat_func(data), 2), padding, round(stat_func(NewData), 2))

               with Outlier          without Outlier
Mean           :  20.08 		 12.73
Median         :  14.0 			 13
Mode           :  15 			 15
Variance       :  670.63 		 23.42
Std deviation  :  25.9 			 4.84


## Removing outliers using the IQR method

In [11]:
# --- Removing outliers using IQR method ---

# Detect outliers with the IQR (interquartile range) method
def detect_outliers_iqr(data, factor=1.5):
    """Return the values of ``data`` lying outside the IQR fences.

    The fences are ``q1 - factor * IQR`` and ``q3 + factor * IQR``, where
    ``q1``/``q3`` are the 25th/75th percentiles computed by ``np.percentile``
    and ``IQR = q3 - q1``.

    Results are collected in a new list on every call, so calling the function
    repeatedly (or re-running the cell) never accumulates earlier results.

    Parameters
    ----------
    data : iterable of numbers
        Observations to screen.
    factor : float, default 1.5
        Multiple of the IQR used to place the lower and upper fences.

    Returns
    -------
    list
        The outlying values in ascending order. Empty when ``data`` is empty.
    """
    ordered = sorted(data)
    if not ordered:
        # No observations means no quartiles, so there is nothing to flag.
        return []

    q1, q3 = np.percentile(ordered, [25, 75])
    iqr = q3 - q1
    lwr_bound = q1 - factor * iqr
    upr_bound = q3 + factor * iqr
    return [value for value in ordered if value < lwr_bound or value > upr_bound]

# Run the IQR detector on the sample dataset and report what it flagged
iqr_report_label = "Outliers from IQR method: "
sample_outliers = detect_outliers_iqr(data)
print(iqr_report_label, sample_outliers)


Outliers from IQR method:  [101]


In [10]:
# --- Percentile Capping (also called Winsorization) ---

# Calculate the 10th and 90th percentiles, which serve as the capping limits
tenth_percentile = np.percentile(data, 10)
ninetieth_percentile = np.percentile(data, 90)

# Cap values below the 10th percentile up to it, and values above the 90th
# percentile down to it. The list is converted to a float array explicitly so
# the capping does not depend on how a plain Python list compares with a NumPy
# scalar; np.clip then applies both limits in a single step.
b = np.clip(np.asarray(data, dtype=float), tenth_percentile, ninetieth_percentile)

# Print the limits and the capped data
print("10 %", tenth_percentile, " \n90 %", ninetieth_percentile, "\nNew array:", b)


10 % 7.2  
90 % 20.700000000000003 
New array: [15.  20.7 18.   7.2 13.  16.  11.  20.7  7.2 15.  10.   9. ]
