## 1. Noise Removal

In [None]:
# This script automates the detection and removal of outliers from CSV files in a specified directory.
# It uses PyCaret for anomaly detection and a rolling window method to detect outliers.
# The script supports user input to customize hyperparameters and select detection algorithms.
# Results can be plotted or saved based on user preferences.

import os  # For interacting with the file system
import pandas as pd  # For data manipulation and analysis with DataFrames
import matplotlib.pyplot as plt  # For creating visualizations
from pycaret.anomaly import setup, create_model, assign_model  # For anomaly detection with PyCaret
import re  # For regular expressions to find patterns in strings
from collections import Counter  # For counting occurrences of elements in iterables

# Initialize PyCaret for anomaly detection with the given dataset
def setup_pycaret(data, normalize=True, transformation=False, transformation_method='quantile', numeric_imputation='mean'):
    """
    Shuffles the dataset and passes it to PyCaret's anomaly-detection ``setup``.

    Args:
    data (pd.DataFrame): The dataset to prepare for anomaly detection.
    normalize (bool): Forwarded to ``setup`` as ``normalize``.
    transformation (bool): Forwarded to ``setup`` as ``transformation``.
    transformation_method (str): Forwarded to ``setup`` as ``transformation_method`` ('quantile' by default).
    numeric_imputation (str): Forwarded to ``setup`` as ``numeric_imputation`` ('mean' by default).

    Returns:
    object: Whatever ``setup`` returns for the shuffled dataset.
    """
    # frac=1 keeps every row and only permutes the order; the fixed seed makes the
    # permutation reproducible. The index is NOT reset, so each row keeps its
    # original label.
    shuffled = data.sample(frac=1, random_state=786)

    setup_options = {
        'normalize': normalize,
        'transformation': transformation,
        'numeric_imputation': numeric_imputation,
        'transformation_method': transformation_method,
    }
    return setup(shuffled, **setup_options)

# Function to detect outliers in a rolling window fashion
def window_outliers(data, window_size=15, step_size=5, method='lowest_quantile', num_lowest_points=3):
    """
    Detects outliers using a rolling window approach and returns indices of detected outliers.

    Windows start at positions 0, step_size, 2*step_size, ... and only full windows are
    evaluated: a series shorter than window_size yields no outliers, and trailing points
    that do not fall inside a full window are never examined.

    Args:
    data (pd.Series): The data series to analyze for outliers.
    window_size (int): The size of the rolling window (default is 15).
    step_size (int): The step size for the rolling window (default is 5).
    method (str): Method to use for outlier detection ('lowest_quantile', 'both_quantiles', or 'lowest_points').
    num_lowest_points (int): Number of lowest points to consider as outliers if using 'lowest_points' method.

    Returns:
    list: Unique index labels of detected outliers, in the order they were first flagged.

    Raises:
    ValueError: If method is not one of the supported names.
    """
    valid_methods = ('lowest_quantile', 'both_quantiles', 'lowest_points')
    # Fail fast: an unknown method used to surface as an UnboundLocalError inside the
    # loop, or pass silently when the series was too short to form a single window.
    if method not in valid_methods:
        raise ValueError(f"Unknown window outlier method {method!r}; expected one of {valid_methods}")

    outliers = []  # Flagged index labels; overlapping windows may add duplicates
    for start in range(0, len(data) - window_size + 1, step_size):
        window = data.iloc[start:start + window_size]
        if method == 'lowest_quantile':
            q1 = window.quantile(0.25)
            flagged = window[window < q1].index  # Strictly below the first quartile
        elif method == 'both_quantiles':
            q1 = window.quantile(0.25)
            q3 = window.quantile(0.75)
            flagged = window[(window < q1) | (window > q3)].index  # Outside [Q1, Q3]
        else:  # 'lowest_points'
            flagged = window.nsmallest(num_lowest_points).index
        outliers.extend(flagged)

    # dict.fromkeys de-duplicates while keeping first-seen order, so the result is
    # deterministic (a plain set gives an arbitrary order).
    return list(dict.fromkeys(outliers))

# Function to count files with repeated IDs in a directory
def count_repeated_files(directory):
    """
    Tallies how many directory entries share each ID and prints the tally.

    The ID is the text captured between a 'gas_' or 'oil_' prefix and the next
    underscore in the entry name; entries without that pattern are ignored.

    Args:
    directory (str): The directory path to search for files.

    Returns:
    Counter: A Counter object mapping each ID to the number of entries carrying it.
    """
    id_regex = re.compile(r'(?:gas_|oil_)([^_]+)_')
    # Lazily search every entry name; non-matching names yield None and are skipped below
    found = (id_regex.search(entry) for entry in os.listdir(directory))
    id_counts = Counter(hit.group(1) for hit in found if hit)

    for file_id in id_counts:
        print(f"ID: {file_id} - Count: {id_counts[file_id]}")
    return id_counts

# Function to get hyperparameters for anomaly detection algorithms from user input
def get_hyperparameters(algorithm_choice, default_hyperparameters):
    """
    Prompts user to input custom hyperparameters for the selected anomaly detection algorithm(s).

    Args:
    algorithm_choice (str): The algorithm chosen by the user ('all' or a specific algorithm).
    default_hyperparameters (dict): Default hyperparameters, keyed by algorithm name.

    Returns:
    dict: For 'all', either {} (the user accepted the defaults) or a mapping
          {algorithm: {param: value}}. For a single algorithm, a flat {param: value}
          mapping. For a choice with no entry in default_hyperparameters (for example
          'window'), {}.
    """
    if algorithm_choice == 'all':
        print("\nDefault hyperparameters for all algorithms:")
        for algo, params in default_hyperparameters.items():
            print(f"{algo}: {params}")  # Display default hyperparameters for each algorithm
        answer = input("Would you like to use these default hyperparameters? (yes/no) (default is yes): ").strip().lower()
        # A blank answer must count as "yes", as the prompt promises.
        if answer in ('', 'y', 'yes'):
            return {}
        hyperparameters = {}
        for algo, params in default_hyperparameters.items():
            print(f"\nEnter hyperparameters for {algo}:")
            hyperparameters[algo] = _prompt_for_params(params)
        return hyperparameters

    # Choices without tunable defaults (e.g. 'window') have nothing to ask for;
    # indexing the defaults directly would raise KeyError here.
    if algorithm_choice not in default_hyperparameters:
        print(f"\nNo configurable hyperparameters for {algorithm_choice}.")
        return {}

    print(f"\nDefault hyperparameters for {algorithm_choice}: {default_hyperparameters[algorithm_choice]}")
    return _prompt_for_params(default_hyperparameters[algorithm_choice])


def _prompt_for_params(defaults):
    """Prompts once per parameter; a blank or unparsable answer keeps that parameter's default."""
    chosen = {}
    for param, value in defaults.items():
        raw = input(f"{param} (default={value}): ").strip()
        if not raw:
            chosen[param] = value
            continue
        try:
            # Coerce the text to the same type as the default (int, float, ...)
            chosen[param] = type(value)(raw)
        except ValueError:
            print(f"Invalid value {raw!r} for {param}; using default {value}.")
            chosen[param] = value
    return chosen

# Main function to detect and remove outliers from CSV files in a directory
def detect_and_remove_outliers(directory, algorithm_choice, hyperparameters=None, use_window=True, window_size=15, step_size=5, drop_repeated=True, window_method='lowest_quantile', num_lowest_points=3):
    """
    Detects and removes outliers from CSV files in the specified directory using PyCaret and/or a window method.

    Input files are the directory entries whose name contains 'clear', ends with '.csv'
    and does not look like the output of an earlier run. The second column of each file
    is used as the target series for the window method.

    Args:
    directory (str): The directory path containing CSV files to process.
    algorithm_choice (str): 'all', 'window' (window method only) or a single algorithm name.
    hyperparameters (dict): For 'all', a mapping {algorithm: {param: value}} (algorithms
        missing from it use the built-in defaults). For a single algorithm, a flat
        {param: value} mapping. None falls back to the built-in defaults.
    use_window (bool): Whether to use the rolling window method for initial outlier detection.
    window_size (int): The size of the rolling window (default is 15).
    step_size (int): The step size for the rolling window (default is 5).
    drop_repeated (bool): Whether to drop files with repeated IDs.
    window_method (str): The method for detecting window outliers ('lowest_quantile', 'both_quantiles', 'lowest_points').
    num_lowest_points (int): Number of lowest points to remove if using 'lowest_points' method.

    Returns:
    list: Tuples of (file name, data after the window step, cleaned data, rows removed by
          the window step, first column name, target column name, algorithm label,
          percentage of rows removed).
    """
    counts = count_repeated_files(directory)  # Count repeated files by ID
    repeated_ids = {key for key, count in counts.items() if count > 1}  # IDs seen more than once

    processed_keywords = ['lof', 'abod', 'cluster', 'knn', 'cof', 'iforest']  # Keywords indicating processed files
    # Results are saved as "<stem>_<algo>.csv" by handle_user_choice, so the outputs of
    # 'all' and 'window' runs must be excluded too. They are matched as suffixes because
    # a bare substring such as 'all' would be too easy to hit by accident.
    processed_suffixes = ('_all', '_window')

    def is_input_file(name):
        if 'clear' not in name or not name.endswith('.csv'):
            return False
        if any(keyword in name for keyword in processed_keywords):
            return False
        return not os.path.splitext(name)[0].endswith(processed_suffixes)

    def percent(part, whole):
        # Guard against empty frames instead of raising ZeroDivisionError
        return part / whole * 100 if whole else 0.0

    files = [f for f in os.listdir(directory) if is_input_file(f)]
    if drop_repeated:
        # IDs are extracted from names as "..._<id>_...", so match the delimited token;
        # a bare substring test would make ID "12" also drop "gas_123_...".
        files = [f for f in files if not any(f"_{repeated_id}_" in f for repeated_id in repeated_ids)]
    results = []  # List to store results

    # Define default hyperparameters for each algorithm
    default_hyperparameters = {
        'knn': {'fraction': 0.1, 'n_neighbors': 20},
        'lof': {'fraction': 0.1, 'n_neighbors': 20},
        'abod': {'fraction': 0.1, 'n_neighbors': 3},
        'cof': {'fraction': 0.1, 'n_neighbors': 3},
        'cluster': {'fraction': 0.1, 'n_clusters': 3},
        'iforest': {'fraction': 0.1, 'n_estimators': 200}
    }

    for file in files:
        file_path = os.path.join(directory, file)  # Full path of the file
        data = pd.read_csv(file_path)  # Load data from CSV
        if data.shape[1] < 2:
            print(f"Skipping {file} as it has less than two columns.")
            continue
        target = data.columns[1]  # Assume the second column is the target for anomaly detection
        print(f"Processing file: {file}")
        print(f"Data shape: {data.shape}")
        print(data.head())  # Display the first few rows of the data
        original_count = len(data)  # Row count before any removal; denominator for the window percentage

        if use_window:
            outlier_indices = window_outliers(data[target], window_size, step_size, window_method, num_lowest_points)
            window_outliers_data = data.loc[outlier_indices]  # Rows flagged by the window method
            data = data.drop(index=outlier_indices)  # Later steps only see the remaining rows
            print(f"{file} - Window: Removed {len(outlier_indices)} outliers")
        else:
            outlier_indices = []  # Keeps the 'window' branch below well-defined
            window_outliers_data = pd.DataFrame(columns=data.columns)

        if algorithm_choice == 'window':
            # Window-only run: the cleaned data is simply what survived the window step
            results.append((file, data, data, window_outliers_data, data.columns[0], target, 'window',
                            percent(len(outlier_indices), original_count)))
            continue

        if data.empty:
            print(f"Skipping {file}: no rows left for anomaly detection.")
            continue

        setup_pycaret(data)  # Setup PyCaret for anomaly detection
        selected_algorithms = ['lof', 'abod', 'cluster', 'knn', 'cof', 'iforest'] if algorithm_choice == 'all' else [algorithm_choice]
        detected = []  # (algorithm, set of flagged index labels) for each algorithm that succeeded

        for algo in selected_algorithms:
            # Resolve parameters outside the try block so a missing or None
            # `hyperparameters` cannot masquerade as a model failure.
            if algorithm_choice == 'all':
                params = (hyperparameters or {}).get(algo, default_hyperparameters[algo])
            elif hyperparameters is None:
                params = default_hyperparameters.get(algo, {})
            else:
                params = hyperparameters

            try:
                model = create_model(algo, **params)
                result = assign_model(model)
                flagged = set(result[result['Anomaly'] == 1].index)
            except Exception as e:
                # Best effort: one failing algorithm must not abort the whole batch
                print(f"Error processing {file} with {algo}: {e}")
                continue

            detected.append((algo, flagged))
            print(f"{file} - {algo}: Found {len(flagged)} outliers")

        if algorithm_choice == 'all':
            # Remove only the points that every successful algorithm agrees on
            common_outliers = set.intersection(*(flagged for _, flagged in detected)) if detected else set()
            cleaned_data = data.drop(index=list(common_outliers))
            results.append((file, data, cleaned_data, window_outliers_data, data.columns[0], target, 'all',
                            percent(len(common_outliers), len(data))))
        else:
            for algo, flagged in detected:
                cleaned_data = data.drop(index=list(flagged))
                results.append((file, data, cleaned_data, window_outliers_data, data.columns[0], target, algo,
                                percent(len(flagged), len(data))))

    return results  # Return results

# Function to plot the results of outlier detection and removal
def plot_results(results):
    """
    Plots the original, cleaned, and window-detected outliers for each processed file.

    Three scatter layers are drawn per result: the full data in blue, the rows removed
    by the window method in green, and the cleaned data in red on top. Because the
    cleaned data overdraws the full data, the points that remain visibly blue are the
    ones the algorithm removed.

    Args:
    results (list): List of tuples (file, data, cleaned_data, window_outliers_data,
        x_col, y_col, algo, outlier_percent), one per plot.
    """
    for file, data, cleaned_data, window_outliers_data, x_col, y_col, algo, outlier_percent in results:
        # The blue layer used to be labelled "...by All Algorithms" even for a
        # single-algorithm run; name the actual source of the removed points instead.
        if algo == 'all':
            removed_label = 'Commonly Detected Outliers by All Algorithms'
        else:
            removed_label = f'Outliers Detected by {algo}'

        plt.figure(figsize=(10, 5))  # Set figure size
        plt.scatter(data[x_col], data[y_col], color='blue', label=removed_label)  # Full data; only removed points stay visible
        plt.scatter(window_outliers_data[x_col], window_outliers_data[y_col], color='green', label='Window Outliers')  # Plot window outliers
        plt.scatter(cleaned_data[x_col], cleaned_data[y_col], color='red', label='Cleaned Data')  # Overdraws the kept points
        plt.xlabel(x_col)  # Set x-axis label
        plt.ylabel(y_col)  # Set y-axis label
        plt.yscale("log")  # Set y-axis to logarithmic scale
        plt.title(f'Outlier Removal using {algo}')  # Set plot title
        plt.legend()  # Show legend
        plt.show()  # Display plot
        print(f"File: {file} - {algo}: Removed {outlier_percent:.2f}% of data as outliers")  # Log outlier removal info

# Function to handle user choices for plotting and saving results
def handle_user_choice(choice, results, directory):
    """
    Plots and/or saves the cleaned data according to the user's menu selection.

    Args:
    choice (int): 1 = plot only, 2 = plot and save, 3 = save only. Any other value
        prints a message and does nothing else.
    results (list): List of tuples, each containing information about original data, cleaned data, and outliers.
    directory (str): Path to the directory where cleaned files will be saved.
    """
    if choice not in (1, 2, 3):
        print("Invalid choice. Please select 1, 2, or 3.")
        return

    # Choices 1 and 2 both start with the plots
    if choice != 3:
        plot_results(results)
    if choice == 1:
        return

    # Choices 2 and 3: write each cleaned frame next to the inputs as "<stem>_<algo>.csv"
    for source_name, _data, cleaned_frame, _window_rows, _x_col, _y_col, algo_label, _percent in results:
        stem = os.path.splitext(source_name)[0]
        destination = os.path.join(directory, f"{stem}_{algo_label}.csv")
        cleaned_frame.to_csv(destination, index=False)

# Main execution starts here: an interactive, top-to-bottom driver. It is kept as
# module-level statements so that `directory`, `results`, etc. remain available
# to whatever runs after it.
directory = input("Enter the directory path where the CSV files exist: ")  # Prompt user for directory path

# Count repeated files in the specified directory
counts = count_repeated_files(directory)
total_files = len(os.listdir(directory))  # Total number of entries in the directory
total_repeated_files = sum(count for count in counts.values() if count > 1)  # Files whose ID occurs more than once
total_non_repeated_files = total_files - total_repeated_files  # Everything else

# Display counts of files
print(f"\nTotal files in directory: {total_files}")
print(f"Total repeated files: {total_repeated_files}")
print(f"Total non-repeated files: {total_non_repeated_files}")

# Prompt user for choices regarding file processing.
# A blank answer falls back to 1, matching the default the prompt advertises.
drop_repeated_choice = int(input("\nEnter 1 to drop repeated files or 2 to process all files (default is 1, i.e. drop repeated files): ") or 1)
# A blank answer counts as "yes", matching the default the prompt advertises.
use_window = input("Would you like to use Window outliers before any other algorithms? (yes/no) (default yes): ").strip().lower() in ('', 'y', 'yes')

# If using the window method, prompt for parameters
if use_window:
    window_size = int(input("Enter the window size (default is 15 points): ") or 15)  # Prompt for window size
    step_size = int(input("Enter the step size (default is 5 point): ") or 5)  # Prompt for step size

    print("Choose the method for window outliers detection:")
    print("1. Drop the lowest quantile in the window")
    print("2. Drop both the highest and lowest quantiles in the window")
    print("3. Drop the lowest point(s) in the window")
    window_method_choice = int(input("Enter 1 to drop only the lowest quantile in the window, 2 to drop both the lowest and highest quantiles in the window, or 3 to drop the lowest point/s in the window (default is 3): ") or 3)  # User choice for window method

    if window_method_choice == 1:
        window_method = 'lowest_quantile'
        num_lowest_points = 1  # Default value, not used in this method
    elif window_method_choice == 2:
        window_method = 'both_quantiles'
        num_lowest_points = 1  # Default value, not used in this method
    elif window_method_choice == 3:
        window_method = 'lowest_points'
        num_lowest_points = int(input("Enter the number of lowest points to remove in each window (default is 3): ") or 3)  # Prompt for number of lowest points
    else:
        raise ValueError("Invalid choice for window outliers detection method")  # Handle invalid method choice
else:
    window_size = 15
    step_size = 5
    window_method = 'lowest_quantile'  # Default value, not used if use_window is False
    num_lowest_points = 3  # Default value, not used if use_window is False

# Define default hyperparameters for each algorithm
default_hyperparameters = {
    'knn': {'fraction': 0.10, 'n_neighbors': 20},
    'lof': {'fraction': 0.10, 'n_neighbors': 20},
    'abod': {'fraction': 0.10, 'n_neighbors': 3},
    'cof': {'fraction': 0.10, 'n_neighbors': 3},
    'cluster': {'fraction': 0.10, 'n_clusters': 3},
    'iforest': {'fraction': 0.10, 'n_estimators': 200}
}

# Prompt user for algorithm choice. 'window' (window method only) is supported by
# detect_and_remove_outliers, so it is offered here as well.
algorithm_choice = input("Choose the algorithm from ['knn', 'lof', 'abod', 'cof', 'cluster', 'iforest'], type 'all' to combine them, or 'window' to use only the window method: ").strip().lower()
if algorithm_choice not in default_hyperparameters and algorithm_choice not in ('all', 'window'):
    raise ValueError(f"Invalid algorithm choice: {algorithm_choice!r}")
if algorithm_choice == 'window' and not use_window:
    raise ValueError("The 'window' choice requires the window method to be enabled")

# Get hyperparameters based on user input; the window method has none to tune
if algorithm_choice == 'window':
    hyperparameters = {}
else:
    hyperparameters = get_hyperparameters(algorithm_choice, default_hyperparameters)

# Prompt user for choice of plotting and saving results
user_choice = int(input("Enter 1 for plot only, 2 for plot and save, or 3 for save only (default is 1, i.e. show plots only): ") or 1)

# Determine whether to drop repeated files
drop_repeated = (drop_repeated_choice == 1)

# Detect and remove outliers from the specified directory
results = detect_and_remove_outliers(directory, algorithm_choice, hyperparameters, use_window, window_size, step_size, drop_repeated, window_method, num_lowest_points)

# Handle user choice for plotting and saving results
handle_user_choice(user_choice, results, directory)
