In [None]:
import pandas as pd
import numpy as np
import re
import os
import seaborn as sns
import matplotlib.pyplot as plt
from typing import List
from scipy.interpolate import interp1d, UnivariateSpline



# Function to load the dataset
def load_dataset():
    """Prompt for a dataset path and load it into a DataFrame.

    The reader is chosen from the text after the last '.' in the path
    (csv, xls/xlsx, txt as tab-delimited, json).

    Returns:
        The loaded DataFrame, or None when the type is unsupported or
        loading fails (a message is printed in either case).
    """
    # Readers are wrapped so nothing touches the disk until one is selected.
    readers = {
        'csv': lambda path: pd.read_csv(path),
        'xls': lambda path: pd.read_excel(path),
        'xlsx': lambda path: pd.read_excel(path),
        'txt': lambda path: pd.read_csv(path, delimiter='\t'),  # Assumes tab-delimited text file
        'json': lambda path: pd.read_json(path),
    }

    try:
        file_path = input("Enter the file path of the dataset: ").strip()
        # Text after the last dot; the whole path when there is no dot.
        extension = file_path.rpartition('.')[2].lower()

        reader = readers.get(extension)
        if reader is None:
            print("Unsupported file type. Please provide a CSV, Excel, TXT, or JSON file.")
            return None

        df = reader(file_path)
    except FileNotFoundError:
        print("The specified file was not found. Please check the path and try again.")
        return None
    except pd.errors.EmptyDataError:
        print("The file is empty. Please provide a valid file.")
        return None
    except pd.errors.ParserError:
        print("There was an error parsing the file. Please check the file format and content.")
        return None
    except Exception as err:
        print(f"An error occurred while loading the dataset: {err}")
        return None

    print("Dataset loaded successfully.")
    return df


# Function to create a backup of the original dataset
def create_backup(df):
    """Return an independent copy of ``df``, or None if copying fails."""
    try:
        return df.copy()
    except Exception as err:
        print(f"An error occurred while creating a backup: {err}")
    return None


#Function to display basic info
def display_basic_info(df):
    """Print the DataFrame's column/dtype/memory summary.

    ``DataFrame.info()`` writes its report itself and returns None, so it is
    called directly; wrapping it in ``print`` would add a stray "None" line.
    """
    try:
        print("\nBasic Information about the Dataset:")
        df.info()
    except Exception as e:
        print(f"An error occurred while displaying basic information: {e}")


#Function to display head and tail
def show_head_and_tail(df, rows=5, display_choice='both'):
    """Print the first and/or last ``rows`` rows of ``df``.

    Args:
        df: DataFrame to preview.
        rows: Number of rows to show from each end.
        display_choice: 'head', 'tail', or 'both'; anything else prints a
            hint and shows nothing.
    """
    try:
        if display_choice not in ('head', 'tail', 'both'):
            print("Invalid choice. Please select 'head', 'tail', or 'both'.")
            return

        # 'both' falls through the two independent checks below.
        if display_choice != 'tail':
            print(f"\nFirst {rows} rows of the dataset:")
            print(df.head(rows))
        if display_choice != 'head':
            print(f"\nLast {rows} rows of the dataset:")
            print(df.tail(rows))

    except AttributeError as err:
        print("Error: The provided object is not a valid DataFrame. Ensure it is properly initialized.")
        print(f"Details: {err}")
    except ValueError as err:
        print(f"ValueError: {err}")
    except Exception as err:
        print(f"An unexpected error occurred: {err}")


#Function to display missing value
def check_missing_values(df):
    """Print the per-column null counts for columns that have any nulls.

    Prints nothing when no column contains missing values.
    """
    try:
        null_counts = df.isnull().sum()
        affected = null_counts[null_counts > 0]
        if affected.empty:
            return
        print("\nThe following columns contain missing values:")
        print(affected)
    except Exception as err:
        print(f"An error occurred while checking for missing values: {err}")
 

    
# Function to check numeric and text columns for inconsistencies
def check_inconsistencies(df):
    """Report columns that look suspicious.

    Numeric columns are flagged when they hold negative values or nulls;
    string columns are flagged when any value contains 'error'
    (case-insensitive). A failure on one column is reported and the scan
    continues with the next column.
    """
    try:
        print("\nChecking for inconsistencies in numeric and categorical columns...")
        for col in df.columns:
            try:
                series = df[col]
                if pd.api.types.is_numeric_dtype(series):
                    suspicious = (series < 0).any() or series.isnull().any()
                    if suspicious:
                        print(f"Column '{col}' may have inconsistencies such as negative values or nulls.")
                    continue
                if pd.api.types.is_string_dtype(series):
                    has_error_text = series.str.contains('error', case=False, na=False).any()
                    if has_error_text:
                        print(f"Column '{col}' contains potential errors like 'error' values.")
            except Exception as err:
                print(f"An error occurred while checking column '{col}': {err}")
    except Exception as err:
        print(f"An error occurred while checking for inconsistencies: {err}")
 


#Function to remove columns
def remove_columns_by_index(df):
    """Interactively drop columns from ``df`` in place, chosen by position.

    The current columns are listed with their positions, then the user may
    enter a comma-separated list of positions to remove. Only positions in
    ``0 .. len(df.columns) - 1`` are accepted: a negative number is rejected
    rather than silently counting from the end, and rejected entries are
    reported.

    Args:
        df: DataFrame to modify in place.
    """
    try:
        print("\nCurrent columns in the dataset:")
        for index, col in enumerate(df.columns):
            print(f"{index}: {col}")

        remove_columns_indices = input("Do you want to remove any columns by index? (yes/no): ").strip().lower()
        if remove_columns_indices != 'yes':
            return

        try:
            raw_indices = input("Enter the column indices to remove, separated by commas: ").strip().split(',')
            requested = [int(token.strip()) for token in raw_indices]

            column_count = len(df.columns)
            valid = [i for i in requested if 0 <= i < column_count]
            ignored = [i for i in requested if not 0 <= i < column_count]
            if ignored:
                print(f"Ignoring out-of-range indices: {ignored}")

            columns_to_remove = [df.columns[i] for i in valid]
            if columns_to_remove:
                df.drop(columns=columns_to_remove, errors='ignore', inplace=True)
                print(f"Columns {columns_to_remove} have been removed.")
            else:
                print("No valid column indices provided for removal.")
        except ValueError:
            print("Please enter valid numeric indices.")
        except Exception as e:
            print(f"An error occurred while removing columns: {e}")
    except Exception as e:
        print(f"An error occurred during column removal: {e}")



#Function to drop duplicate
def check_and_handle_duplicates(df1):
    """Report duplicate rows and optionally drop them in place.

    Args:
        df1: DataFrame to inspect; duplicates are removed from this same
            object when the user answers 'yes'.

    Returns:
        The same DataFrame object that was passed in.
    """
    try:
        duplicate_count = df1.duplicated().sum()
        if not duplicate_count > 0:
            print("Your dataset contains no duplicate rows.")
            return df1

        print(f"\nYour dataset contains {duplicate_count} duplicate rows.")

        # Dropping is opt-in: anything other than 'yes' leaves the data alone.
        answer = input("Do you want to drop these duplicate rows? (yes/no): ").strip().lower()
        if answer != 'yes':
            print("No duplicates were dropped.")
            return df1

        print(f"Initial shape of the DataFrame: {df1.shape}")
        df1.drop_duplicates(inplace=True)
        print(f"Shape after dropping duplicates: {df1.shape}")
    except Exception as err:
        print(f"An error occurred while checking or dropping duplicates: {err}")

    return df1


#Function to drop missing rows
def drop_missing_rows(df, subset):
    """Show rows with missing values in ``subset`` and optionally drop them.

    Args:
        df: DataFrame to inspect. It is not modified.
        subset: A column label or a list-like of column labels to check.
            A single label is wrapped into a list; without that,
            ``df[subset]`` is a Series and ``.any(axis=1)`` raises.

    Returns:
        A new DataFrame without the affected rows when the user confirms,
        otherwise ``df`` itself (also returned when an error occurs).
    """
    try:
        if isinstance(subset, str) or not pd.api.types.is_list_like(subset):
            columns = [subset]
        else:
            columns = list(subset)

        # Rows with a null in at least one of the requested columns.
        missing_rows = df[df[columns].isnull().any(axis=1)]

        if missing_rows.empty:
            print("No rows with missing values found in the specified columns.")
            return df

        print("Rows with missing values in specified columns:")
        print(missing_rows)

        print("\nWarning: Dropping rows with missing values may affect other columns.")

        row_indices = missing_rows.index.tolist()
        print(f"Row indices with missing values: {row_indices}")

        user_input = input("Do you want to drop these rows? (yes/no): ").strip().lower()
        if user_input != 'yes':
            print("No rows were dropped.")
            return df

        cleaned_df = df.dropna(subset=columns)
        print(f"Rows with missing values in columns {subset} have been dropped.")
        return cleaned_df

    except Exception as e:
        print(f"An error occurred while dropping rows with missing values: {e}")
        return df



# Functions to visualize, fill, and otherwise handle missing values
def visualize_column_distribution(df: pd.DataFrame, column: str):
    """Show a histogram (with KDE) and a box plot of a column's non-null values.

    Both charts are drawn side by side on one figure. Any failure is printed
    instead of raised.
    """
    try:
        plt.figure(figsize=(14, 6))

        # Left panel: histogram of the observed (non-missing) values.
        plt.subplot(1, 2, 1)
        observed_for_hist = df[column].dropna()
        sns.histplot(observed_for_hist, kde=True, bins=30)
        plt.title(f'Histogram of {column}')
        plt.xlabel(column)
        plt.ylabel('Frequency')

        # Right panel: box plot of the same values.
        plt.subplot(1, 2, 2)
        observed_for_box = df[column].dropna()
        sns.boxplot(x=observed_for_box)
        plt.title(f'Box Plot of {column}')
        plt.xlabel(column)

        plt.tight_layout()
        plt.show()
    except Exception as err:
        print(f"An error occurred while visualizing column distribution: {err}")

def fill_numeric_missing_values(df: pd.DataFrame, column: str, strategy: str) -> pd.DataFrame:
    """Fill missing values of a numeric column in place using ``strategy``.

    Supported strategies: 'mean', 'median', 'mode', 'zero', 'forward',
    'backward', 'whitespace' (fills with an empty string). An unknown
    strategy prints a notice and falls back to the mean. Errors are printed
    and the DataFrame is returned as it stands.
    """
    # (name, filler) pairs; each filler maps the column to its filled version.
    strategies = (
        ('mean', lambda values: values.fillna(values.mean())),
        ('median', lambda values: values.fillna(values.median())),
        ('mode', lambda values: values.fillna(values.mode()[0])),
        ('zero', lambda values: values.fillna(0)),
        ('forward', lambda values: values.ffill()),
        ('backward', lambda values: values.bfill()),
        ('whitespace', lambda values: values.fillna('')),
    )

    try:
        filler = next((fill for name, fill in strategies if strategy == name), None)
        if filler is None:
            print(f"Unknown strategy '{strategy}' for numeric column '{column}'. Using mean as default.")
            filler = strategies[0][1]
        df[column] = filler(df[column])
    except Exception as err:
        print(f"An error occurred while filling missing numeric values in column '{column}': {err}")
    return df

def fill_categorical_missing_values(df: pd.DataFrame, column: str, strategy: str) -> pd.DataFrame:
    """Fill missing values of a categorical/text column in place.

    Supported strategies:
        'fill_value'  prompt for a replacement value and use it,
        'forward'     propagate the previous valid value,
        'backward'    propagate the next valid value,
        'whitespace'  replace with an empty string.
    An unknown strategy prints a notice and behaves like 'whitespace'.

    The column is filled without casting it to ``str`` first: the cast turns
    missing values into the text 'nan'/'None', after which there is nothing
    left for ``ffill``/``bfill``/``fillna`` to fill. With 'forward' or
    'backward', values with no neighbour to copy from stay missing.

    Returns:
        The same DataFrame; errors are printed rather than raised.
    """
    try:
        if strategy == 'fill_value':
            fill_value = input(f"Enter a fill value for column '{column}' (e.g., 'Not available', 'Not disclosed'): ").strip()
            df[column] = df[column].fillna(fill_value)
        elif strategy == 'forward':
            df[column] = df[column].ffill()
        elif strategy == 'backward':
            df[column] = df[column].bfill()
        elif strategy == 'whitespace':
            df[column] = df[column].fillna('')
        else:
            print(f"Unknown strategy '{strategy}' for categorical column '{column}'. Using 'whitespace' as default.")
            df[column] = df[column].fillna('')
    except Exception as e:
        print(f"An error occurred while filling missing categorical values in column '{column}': {e}")
    return df

def drop_column(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Remove ``column`` from ``df`` in place and return ``df``.

    A failure (for example an unknown column) is printed and the DataFrame
    is returned unchanged.
    """
    try:
        df.drop(columns=[column], inplace=True)
    except Exception as err:
        print(f"An error occurred while dropping column '{column}': {err}")
    else:
        print(f"Column '{column}' has been dropped.")
    return df


def handle_missing_values(df: pd.DataFrame) -> pd.DataFrame:
    """Interactively resolve missing values column by column.

    Columns containing nulls are listed with a position, type and null
    count. The user enters the positions to handle manually; for each of
    those a menu offers dropping the column, filling it with a chosen
    strategy, or leaving it untouched ("Exit" only skips the current
    column).

    NOTE: every column with nulls that was NOT selected is filled
    automatically: numeric columns with 0, other columns with a value the
    user is prompted for. Entering no positions therefore applies these
    defaults to all affected columns.

    Returns:
        The processed DataFrame (the same object, modified in place by the
        fill/drop helpers). Errors are printed rather than raised.
    """
    try:
        # Identify columns with missing values
        missing_info = df.isnull().sum()
        columns_with_nan = missing_info[missing_info > 0]

        if columns_with_nan.empty:
            print("No columns with missing values.")
            return df

        print("\nColumns with missing values:")
        for index, (col, count) in enumerate(columns_with_nan.items()):
            column_type = 'Numeric' if pd.api.types.is_numeric_dtype(df[col]) else 'Categorical'
            print(f"{index}: {col} ({column_type}) - count: {count}")

        raw_selection = input("Enter the indices of the columns you want to handle, separated by commas: ").strip().split(',')
        # Non-numeric and out-of-range entries are silently ignored.
        selected_indices = [
            int(token.strip())
            for token in raw_selection
            if token.strip().isdigit() and int(token.strip()) < len(columns_with_nan)
        ]
        selected_columns = [columns_with_nan.index[i] for i in selected_indices]

        for col in columns_with_nan.index:
            try:
                if col in selected_columns:
                    while True:
                        print(f"\nProcessing column: '{col}'")
                        print("1. Drop column")
                        print("2. Handle missing values")
                        print("3. Exit")
                        choice = input("Choose an option: ").strip()
                        if choice == '1':
                            df = drop_column(df, col)
                            break
                        elif choice == '2':
                            if pd.api.types.is_numeric_dtype(df[col]):
                                visualize_column_distribution(df, col)  # Visualize the distribution
                                strategy = input(f"Column '{col}' is numeric with {missing_info[col]} missing values. Choose a strategy (mean/median/mode/zero/forward/backward/whitespace): ").strip().lower()
                                df = fill_numeric_missing_values(df, col, strategy)
                            else:
                                strategy = input(f"Column '{col}' is categorical with {missing_info[col]} missing values. Choose a strategy (fill_value/forward/backward/whitespace): ").strip().lower()
                                df = fill_categorical_missing_values(df, col, strategy)
                            break
                        elif choice == '3':
                            break
                        else:
                            # The menu has exactly three options.
                            print("Invalid choice. Please select 1, 2, or 3.")
                else:
                    # Apply default strategies for unselected columns
                    print(f"Filling unselected column '{col}' with default strategy.")
                    if pd.api.types.is_numeric_dtype(df[col]):
                        df = fill_numeric_missing_values(df, col, 'zero')
                    else:
                        df = fill_categorical_missing_values(df, col, 'fill_value')

            except Exception as e:
                print(f"An error occurred while handling missing values in column '{col}': {e}")

        # Display the count of missing values after processing
        print("\nMissing values count after processing:")
        print(df.isnull().sum())

    except Exception as e:
        print(f"An error occurred while handling missing values: {e}")

    return df



# Functions for visualizing and interpolating missing values
def visualize_data_pattern(df, column):
    """Plot a column against the index and print guidance on data shapes.

    The plot shows the raw values (gaps included); the printed notes
    describe which kind of interpolation suits linear, non-linear and
    smooth data.
    """
    plt.figure(figsize=(10, 6))
    plt.plot(df[column], marker='o', linestyle='--', label='Original Data')
    plt.title(f'Original Data with Missing Values in {column}')
    plt.xlabel('Index')
    plt.ylabel(column)
    plt.legend()
    plt.show()

    guidance = (
        "\nUnderstanding the Nature of Your Data:",
        "1. Linear Data: If your data follows a straight-line trend or you expect a linear relationship between data points, linear interpolation is generally sufficient.",
        "2. Non-Linear Data: If your data has curves, peaks, or non-linear trends, a polynomial or spline interpolation might be more appropriate.",
        "3. Smooth Continuous Data: If your data is expected to be smooth and continuous, such as in physical measurements or time series data, spline interpolation often works well.",
    )
    for line in guidance:
        print(line)

def plot_interpolation_comparison(df, column):
    """Draw a 2x2 comparison of the raw column and three interpolations.

    Panels: original data, linear interpolation (pandas), quadratic and
    cubic interpolation (``scipy.interpolate.interp1d`` fitted on the
    non-null points and evaluated over the whole index, extrapolating at
    the ends). Short guidance is printed afterwards. Errors are printed
    instead of raised.
    """

    def decorate(title):
        # Shared axis labelling for every panel.
        plt.title(title)
        plt.xlabel('Index')
        plt.ylabel(column)
        plt.legend()

    def fitted_curve(kind):
        # Fit on the observed points only, then evaluate on every index value.
        observed_positions = df.index[~df[column].isnull()]
        interpolator = interp1d(observed_positions, df[column].dropna(), kind=kind, fill_value='extrapolate')
        return pd.Series(interpolator(df.index), index=df.index, name=column)

    try:
        # Original Data
        plt.figure(figsize=(14, 10))
        plt.subplot(2, 2, 1)
        plt.plot(df[column], marker='o', linestyle='--', label='Original Data')
        decorate('Original Data with Missing Values')

        # Linear Interpolation
        linear_values = df[column].interpolate(method='linear')
        plt.subplot(2, 2, 2)
        plt.plot(linear_values, marker='o', linestyle='-', color='r', label='Linear Interpolation')
        decorate('Linear Interpolation')

        # Polynomial Interpolation (2nd degree)
        quadratic_values = fitted_curve('quadratic')
        plt.subplot(2, 2, 3)
        plt.plot(quadratic_values, marker='o', linestyle='-', color='g', label='Polynomial Interpolation (2nd Degree)')
        decorate('Polynomial Interpolation')

        # Spline Interpolation
        cubic_values = fitted_curve('cubic')
        plt.subplot(2, 2, 4)
        plt.plot(cubic_values, marker='o', linestyle='-', color='b', label='Spline Interpolation')
        decorate('Spline Interpolation')

        plt.tight_layout()
        plt.show()

        for line in (
            "Based on the visualizations, consider the following:",
            "1. Linear Interpolation: Useful for simple, linear trends.",
            "2. Polynomial Interpolation: Good for capturing more complex trends.",
            "3. Spline Interpolation: Useful for smooth, continuous data.",
        ):
            print(line)

    except Exception as err:
        print(f"An error occurred while plotting and suggesting interpolation methods: {err}")

def interpolate_missing_values(df):
    """Interactively interpolate the gaps of one column.

    Lists the columns with nulls, asks which one to work on, shows its data
    pattern and, once the user confirms, a comparison of interpolation
    methods. The chosen method ('linear', 'polynomial' of order 2, or
    'spline' of order 3) is applied in place.

    Returns:
        The same DataFrame; errors are printed rather than raised.
    """
    # Keyword arguments handed to Series.interpolate for each menu choice.
    interpolation_options = {
        'linear': {'method': 'linear'},
        'polynomial': {'method': 'polynomial', 'order': 2},
        'spline': {'method': 'spline', 'order': 3},
    }

    try:
        null_counts = df.isnull().sum()
        columns_with_nan = null_counts[null_counts > 0]

        if columns_with_nan.empty:
            print("No columns with missing values.")
            return df

        print("\nColumns with missing values:")
        for position, (name, count) in enumerate(columns_with_nan.items()):
            print(f"{position}: {name} - count: {count}")

        selected_index = int(input("Enter the index of the column you want to interpolate: ").strip())
        target = columns_with_nan.index[selected_index]

        # Show the raw pattern first so the user can judge which method fits.
        visualize_data_pattern(df, target)

        proceed = input(f"Do you want to perform interpolation on the column '{target}'? (yes/no): ").strip().lower()
        if proceed != 'yes':
            print("Interpolation skipped.")
            return df

        plot_interpolation_comparison(df, target)
        method = input(f"Choose an interpolation method for column '{target}' (linear, polynomial, spline): ").strip().lower()

        options = interpolation_options.get(method)
        if options is None:
            print("Invalid method selected. No interpolation performed.")
        else:
            df[target] = df[target].interpolate(**options)

    except Exception as err:
        print(f"An error occurred during interpolation: {err}")

    return df



#Detect and Fix inaccurate data
def _fix_inaccurate_column(df, col):
    """Detect and repair suspicious values in one column of ``df`` in place.

    Numeric columns: values below the 1st or above the 99th percentile are
    replaced with the column median. String columns: every occurrence of
    'error' (case-insensitive) is replaced with 'Unknown'. Failures are
    printed so one bad column never stops the caller.
    """
    try:
        if pd.api.types.is_numeric_dtype(df[col]):
            lower_bound = df[col].quantile(0.01)
            upper_bound = df[col].quantile(0.99)
            outlier_mask = (df[col] < lower_bound) | (df[col] > upper_bound)
            if outlier_mask.any():
                print(f"Inaccurate data found in numeric column '{col}': {outlier_mask.sum()} outliers.")
                # The median is evaluated before the assignment, i.e. on the
                # data that still contains the outliers.
                df.loc[outlier_mask, col] = df[col].median()
                print(f"Outliers in '{col}' have been replaced with the median value.")

        elif pd.api.types.is_string_dtype(df[col]):
            inconsistent = df[col].str.contains('error', case=False, na=False)
            if inconsistent.any():
                print(f"Inaccurate data found in string column '{col}': {inconsistent.sum()} potential errors.")
                df[col] = df[col].str.replace('error', 'Unknown', case=False)
                print(f"'Error' entries in '{col}' have been replaced with 'Unknown'.")

    except Exception as e:
        print(f"An error occurred while checking or fixing column '{col}': {e}")


def detect_and_fix_inaccurate_data(df, column_index=None):
    """
    Detect and fix inaccurate data in the DataFrame based on column index.
    If column_index is None, process all columns.

    Numeric columns have values outside the 1st-99th percentile range
    replaced with the median; string columns have 'error' (any case)
    replaced with 'Unknown'. The DataFrame is modified in place.

    Args:
    df: The DataFrame to process.
    column_index (int, optional): The index of the column to process. If None, all columns are processed.

    Returns:
    The DataFrame with inaccurate data fixed. An out-of-range column_index
    prints a message and returns the DataFrame unchanged.
    """
    try:
        if column_index is None:
            # Snapshot the labels; the helper only rewrites values.
            for col in list(df.columns):
                _fix_inaccurate_column(df, col)
        else:
            if column_index < 0 or column_index >= len(df.columns):
                print(f"Column index {column_index} is out of range. Please provide a valid index.")
                return df
            _fix_inaccurate_column(df, df.columns[column_index])

    except Exception as e:
        print(f"An error occurred during data accuracy checking: {e}")

    return df


# Function to remove special characters and extra white spaces based on column index
def show_text_columns(df):
    """List the text (string/object dtype) columns and return their positions."""
    text_columns = []
    for position, dtype in enumerate(df.dtypes):
        if pd.api.types.is_string_dtype(dtype) or pd.api.types.is_object_dtype(dtype):
            text_columns.append(position)

    if not text_columns:
        print("No text columns found in the dataset.")
        return text_columns

    print("Text columns available for cleaning:")
    for position in text_columns:
        print(f"Index: {position}, Column Name: {df.columns[position]}")

    return text_columns

def clean_text_columns_by_index(df, column_indices):
    """Strip special characters and redundant whitespace from text columns.

    For each requested column position the values are converted to text,
    every character other than ASCII letters, digits and whitespace is
    removed, surrounding whitespace is trimmed and inner runs of whitespace
    are collapsed to one space.

    Missing values stay missing: they are excluded from the change count
    and restored after cleaning instead of becoming the literal text
    'nan'/'None'. The cleaned column replaces the old one via
    ``DataFrame.isetitem``, so the previous dtype is never forced onto the
    new text values.

    Args:
        df: DataFrame to clean in place.
        column_indices: A single position or an iterable of positions.
            Negative positions count from the end; positions outside the
            frame are reported and skipped.

    Returns:
        The same DataFrame.
    """
    try:
        # Ensure column_indices is a list even if a single index is provided
        if isinstance(column_indices, int):
            column_indices = [column_indices]

        column_count = len(df.columns)
        for index in column_indices:
            try:
                if not -column_count <= index < column_count:
                    print(f"Index {index} is out of range.")
                    continue

                position = index % column_count  # normalise negative positions
                source = df.iloc[:, position]
                missing = source.isna()

                original_column = source.astype(str)
                cleaned = (
                    original_column
                    .str.replace(r'[^A-Za-z0-9\s]', '', regex=True)  # Remove special characters
                    .str.strip()  # Remove leading and trailing white spaces
                    .str.replace(r'\s+', ' ', regex=True)  # Remove extra white spaces
                )

                # Only real values count as modified.
                changes_made = int(((original_column != cleaned) & ~missing).sum())

                # Put the missing markers back so they are not stored as text.
                original_column = original_column.mask(missing, np.nan)
                cleaned = cleaned.mask(missing, np.nan)
                df.isetitem(position, cleaned)

                if changes_made > 0:
                    print(f"Column at index {index} has been cleaned. {changes_made} rows were modified.")
                else:
                    print(f"Column at index {index} was already clean. No changes were made.")

                # Show a before and after comparison for the first few rows
                print("\nSample of changes:")
                comparison_df = pd.DataFrame({
                    'Original': original_column.head(5),
                    'Cleaned': cleaned.head(5)
                })
                print(comparison_df)
            except Exception as e:
                print(f"An error occurred while cleaning text in column at index {index}: {e}")
    except Exception as e:
        print(f"An error occurred during text cleaning: {e}")

    return df



# Function to format phone numbers
def _normalize_phone_fragment(fragment):
    """Keep only digits and '+', and turn a leading '00' into '+'."""
    digits = re.sub(r'[^\d+]', '', fragment)
    if digits.startswith('00'):
        digits = '+' + digits[2:]
    return digits


def is_valid_phone_number(number):
    """Return True when ``number`` is a local or international phone number.

    After stripping everything except digits and '+' (and rewriting a
    leading '00' as '+'), the number must be exactly 10 digits (local) or
    '+' followed by exactly 12 digits (international). The '+' is accepted
    only as the first character and is not counted as a digit.
    """
    normalized = _normalize_phone_fragment(number)
    return re.fullmatch(r'\d{10}|\+\d{12}', normalized) is not None


def format_phone_number(phone):
    """Extract the primary and alternate phone numbers from free text.

    The text is split on '/', ',', ';' or the word 'or'. Each chunk is
    first tried as one number, so numbers written with spaces or dashes
    ('+91 98765 43210') survive; only if that fails is the chunk split on
    whitespace, which still supports several numbers separated by spaces.

    Returns:
        ``(primary, alternates)`` where ``alternates`` is a ', '-joined
        string of the remaining valid numbers or None; ``(None, None)``
        when no valid number is found.
    """
    formatted_numbers = []
    for chunk in re.split(r'[/,;]|or', phone, flags=re.IGNORECASE):
        candidate = _normalize_phone_fragment(chunk)
        if is_valid_phone_number(candidate):
            formatted_numbers.append(candidate)
            continue

        # Fallback: the chunk may hold several whitespace-separated numbers.
        for piece in chunk.split():
            candidate = _normalize_phone_fragment(piece)
            if is_valid_phone_number(candidate):
                formatted_numbers.append(candidate)

    if not formatted_numbers:
        return None, None
    if len(formatted_numbers) == 1:
        return formatted_numbers[0], None
    return formatted_numbers[0], ', '.join(formatted_numbers[1:])

def process_phone_numbers(df, phone_col_index):
    """Split a phone column into 'Primary Number' and 'Alternate Number'.

    The column at ``phone_col_index`` is read as text and parsed with
    ``format_phone_number``. The two new columns are added to ``df``; the
    returned DataFrame is a copy without the original phone column.
    """
    phone_col_name = df.columns[phone_col_index]

    def to_number_pair(raw_value):
        # One row per input value: (primary, alternates).
        return pd.Series(format_phone_number(raw_value))

    phone_text = df[phone_col_name].astype(str)
    df[['Primary Number', 'Alternate Number']] = phone_text.apply(to_number_pair)

    # The raw column is no longer needed once it has been split.
    return df.drop(columns=[phone_col_name])



# Function to format names into first, middle, last name by column index
def _split_full_name(value):
    """Split a full name into (first, middle, last); missing parts are NaN.

    The first token is the first name and, when there are at least two
    tokens, the last token is the last name. Everything in between is the
    middle name.
    """
    if not isinstance(value, str):
        return np.nan, np.nan, np.nan

    tokens = value.split()
    if not tokens:
        return np.nan, np.nan, np.nan

    first = tokens[0]
    last = tokens[-1] if len(tokens) > 1 else np.nan
    middle = ' '.join(tokens[1:-1]) if len(tokens) > 2 else np.nan
    return first, middle, last


def format_names(df, name_column_index):
    """Split a full-name column into 'First Name', 'Middle Name', 'Last Name'.

    Names are split on whitespace rather than matched against a fixed
    pattern, so a two-word name puts its second word in 'Last Name' (not
    'Middle Name'), one-word names keep their first name, and hyphens or
    apostrophes inside a name are preserved.

    Args:
        df: DataFrame to extend in place.
        name_column_index: Position of the column holding full names.

    Returns:
        The same DataFrame. It is returned unchanged (with a message) when
        the position is out of range or the column is not a text column.
    """
    try:
        # Ensure the column index is within range
        if name_column_index < 0 or name_column_index >= len(df.columns):
            print(f"Column index {name_column_index} is out of range. Please provide a valid index.")
            return df

        name_column = df.columns[name_column_index]
        names = df[name_column]

        # Object columns are accepted too: a text column with missing
        # entries is not always reported as a string dtype.
        is_text = pd.api.types.is_string_dtype(names) or pd.api.types.is_object_dtype(names)
        if not is_text:
            print(f"Column '{name_column}' either does not exist or is not a string type.")
            return df

        parts = pd.DataFrame(
            [_split_full_name(value) for value in names],
            index=df.index,
            columns=['First Name', 'Middle Name', 'Last Name'],
        )
        df[['First Name', 'Middle Name', 'Last Name']] = parts

        print(f"Names in column '{name_column}' have been formatted into 'First Name', 'Middle Name', and 'Last Name'.")

    except Exception as e:
        print(f"An error occurred while formatting names in column index {name_column_index}: {e}")

    return df



# Function to format user-selected date columns
def _to_strftime_format(date_format):
    """Translate a 'YYYY-MM-DD' style pattern into strftime directives.

    A pattern that already contains '%' is assumed to be a strftime format
    and is returned unchanged. Otherwise these tokens are replaced:
    YYYY, YY, MM (month), DD, HH, mm (minute), ss.
    """
    if '%' in date_format:
        return date_format

    directives = {
        'YYYY': '%Y', 'YY': '%y', 'MM': '%m', 'DD': '%d',
        'HH': '%H', 'mm': '%M', 'ss': '%S',
    }
    # 'YYYY' is listed before 'YY' so the longer token wins.
    return re.sub(r'YYYY|YY|MM|DD|HH|mm|ss', lambda match: directives[match.group()], date_format)


def format_date_columns(df):
    """Interactively reformat datetime columns as text.

    Datetime columns (naive and timezone-aware) are listed with a position;
    the user picks positions and a target format, and each selected column
    is replaced in place by its formatted string values.

    The format may be written either with strftime directives
    ('%Y-%m-%d') or with the placeholder style shown in the prompt
    ('YYYY-MM-DD'). Placeholders are translated first; passed to
    ``strftime`` as-is they would be emitted literally for every row.

    Returns:
        The same DataFrame; errors are printed rather than raised.
    """
    try:
        # 'datetime' and 'datetimetz' are the documented select_dtypes
        # selectors for naive and timezone-aware datetime columns.
        date_columns = df.select_dtypes(include=['datetime', 'datetimetz']).columns

        if date_columns.empty:
            print("No date columns found in the dataset.")
            return df

        print("\nDate columns identified:")
        for index, col in enumerate(date_columns):
            print(f"{index}: {col}")

        # Ask user to select columns to format
        indices = input("Enter the indices of the date columns to format, separated by commas: ").strip().split(',')
        indices = [int(index.strip()) for index in indices if index.strip().isdigit()]

        # Validate indices
        invalid_indices = [i for i in indices if not 0 <= i < len(date_columns)]
        if invalid_indices:
            print(f"Invalid indices provided: {', '.join(map(str, invalid_indices))}. These indices will be ignored.")

        valid_indices = [i for i in indices if 0 <= i < len(date_columns)]
        if not valid_indices:
            print("No valid indices provided. No columns will be formatted.")
            return df

        selected_date_columns = [date_columns[i] for i in valid_indices]

        # Ask user for the desired date format
        date_format = input("Enter the desired date format (e.g., 'YYYY-MM-DD', 'MM/DD/YYYY', etc.): ").strip()

        if not date_format:
            print("No date format provided. Skipping date formatting.")
            return df

        strftime_format = _to_strftime_format(date_format)

        # Apply the format to each selected date column
        for col in selected_date_columns:
            df[col] = df[col].dt.strftime(strftime_format)
            print(f"Column '{col}' formatted to '{date_format}'.")

    except Exception as e:
        print(f"An error occurred while formatting date columns: {e}")

    return df



#Function to format address
def parse_address(address):
    """
    Split a comma-separated address into (house_number, area, city).

    Rules:
    - The first component is the house number only when it is all digits.
    - The last remaining component is the city.
    - Everything in between is the area, re-joined with ', '. A non-numeric
      first component is kept as part of the area rather than discarded.
    - An address with no comma yields only a house number (when numeric);
      area and city are None.

    Parameters:
    address (str): The raw address text.

    Returns:
    tuple: (house_number, area, city); each element is a str or None.
    """
    parts = [part.strip() for part in address.split(',')]

    house_number = parts[0] if parts[0].isdigit() else None

    # A single component carries no separate area or city.
    if len(parts) < 2:
        return house_number, None, None

    # Skip the house number if one was found; ignore empty pieces from stray commas.
    remainder = parts[1:] if house_number is not None else parts
    remainder = [part for part in remainder if part]
    if not remainder:
        return house_number, None, None

    city = remainder[-1]
    area = ', '.join(remainder[:-1]) or None  # no middle components -> None, not ''

    return house_number, area, city

def process_addresses(df, address_col_index):
    """
    Replace an address column with 'House Number', 'Area' and 'City' columns.

    Parameters:
    df (pd.DataFrame): The source DataFrame; it is not modified.
    address_col_index (int): Position of the address column in ``df.columns``.

    Returns:
    pd.DataFrame: A new DataFrame without the address column and with the
    three parsed columns appended.
    """
    address_col_name = df.columns[address_col_index]

    # Parse every address once into plain tuples; values are stringified first,
    # so missing entries are parsed as the text 'nan'.
    parsed = [parse_address(value) for value in df[address_col_name].astype(str)]

    # Drop the source column BEFORE adding the new ones, so an address column
    # that happens to be called 'Area' or 'City' does not take the result with it.
    result = df.drop(columns=[address_col_name])
    for position, new_column in enumerate(('House Number', 'Area', 'City')):
        result[new_column] = [parts[position] for parts in parsed]

    return result



# Function to standardize text data
def standardize_text_data(df, remove_special_chars=False):
    """
    Lower-case and trim the string values of every object-dtype column.

    Parameters:
    df (pd.DataFrame): The DataFrame to update in place.
    remove_special_chars (bool): When True, also delete every character that
        is neither a word character nor whitespace.

    Returns:
    pd.DataFrame: The same DataFrame. Non-string cells (numbers, NaN, ...) are
    left untouched instead of being turned into NaN.
    """
    def _clean(value):
        # Only real strings are rewritten; anything else passes through unchanged.
        if not isinstance(value, str):
            return value
        cleaned = value.lower().strip()
        if remove_special_chars:
            cleaned = re.sub(r'[^\w\s]', '', cleaned)
        return cleaned

    for col in df.select_dtypes(include=['object']).columns:
        # Handle errors per column so one bad column does not stop the others.
        try:
            df[col] = df[col].map(_clean)
            print(f"Text data in column '{col}' standardized.")
        except Exception as e:
            print(f"An error occurred while standardizing text data: {e}")
    return df


# Function to normalize numeric data
def normalize_numeric_data(df, method=None):
    """
    Rescale every numeric column with Min-Max scaling or Z-score standardization.

    Parameters:
    df (pd.DataFrame): The DataFrame to update in place.
    method (str, optional): 'min-max' or 'z-score'. When omitted, the user is
        prompted for it.

    Returns:
    pd.DataFrame: The same DataFrame. Columns with no spread (constant, empty
    or all-missing) are skipped, because scaling them would divide by zero.
    """
    try:
        if method is None:
            method = input("Enter the normalization method ('min-max' or 'z-score'): ")
        method = method.strip().lower()

        # Reject an unknown method once, rather than once per column.
        if method not in ('min-max', 'z-score'):
            print(f"Normalization method '{method}' is not recognized. No columns were normalized.")
            return df

        for col in df.select_dtypes(include=[np.number]).columns:
            series = df[col]
            if method == 'min-max':
                lowest = series.min()
                spread = series.max() - lowest
                if pd.isna(spread) or spread == 0:
                    print(f"Column '{col}' has no spread; skipping Min-Max scaling.")
                    continue
                df[col] = (series - lowest) / spread
                print(f"Column '{col}' normalized using Min-Max scaling.")
            else:
                deviation = series.std()
                if pd.isna(deviation) or deviation == 0:
                    print(f"Column '{col}' has no spread; skipping Z-score standardization.")
                    continue
                df[col] = (series - series.mean()) / deviation
                print(f"Column '{col}' normalized using Z-score standardization.")
    except Exception as e:
        print(f"An error occurred while normalizing numeric data: {e}")
    return df


#Function to rename column name
def rename_columns(df):
    """
    Interactively rename columns of the given DataFrame.

    The user is asked for a current column name and a new name, repeatedly,
    until they type 'exit'. A rename is refused when the new name is empty or
    is already used by another column, since duplicate labels make
    ``df[name]`` return several columns.

    Parameters:
    df (pd.DataFrame): The DataFrame whose columns are to be renamed (in place).

    Returns:
    pd.DataFrame: The same DataFrame with renamed columns.
    """
    print("Current columns in the DataFrame:")
    print(list(df.columns))

    while True:
        current_name = input("\nEnter the current column name you want to rename (or type 'exit' to finish): ").strip()

        if current_name.lower() == 'exit':
            break

        if current_name not in df.columns:
            print(f"Error: Column '{current_name}' does not exist in the DataFrame.")
            continue

        new_name = input(f"Enter the new name for column '{current_name}': ").strip()

        if not new_name:
            print("Error: The new column name cannot be empty.")
            continue
        if new_name == current_name:
            print(f"Column '{current_name}' already has that name. Nothing to change.")
            continue
        if new_name in df.columns:
            print(f"Error: A column named '{new_name}' already exists. Choose a different name.")
            continue

        df.rename(columns={current_name: new_name}, inplace=True)

        print(f"Column '{current_name}' has been renamed to '{new_name}'.")
        print("Updated columns in the DataFrame:")
        print(list(df.columns))

    return df


#Function to combine columns 
def get_columns_to_combine(df):
    """
    Ask the user which columns to combine and check that they all exist.

    Returns the list of trimmed column names; raises ValueError naming any
    entry that is not a column of ``df``.
    """
    print(f"Columns in the dataset: {list(df.columns)}")

    raw_selection = input("Enter the columns to combine (comma-separated): ")
    columns_to_combine = list(map(str.strip, raw_selection.split(',')))

    missing_columns = [name for name in columns_to_combine if name not in df.columns]
    if not missing_columns:
        return columns_to_combine

    raise ValueError(f"Columns not found in DataFrame: {missing_columns}")

def get_user_format(columns_to_combine):
    """
    Ask the user for a ``str.format`` template and check it against the columns.

    The template is parsed rather than scanned for '{', so escaped braces
    ('{{', '}}') and repeated indices ('{0} {0}') are counted correctly.

    Parameters:
    columns_to_combine (list): The columns whose values will fill the template.

    Returns:
    str: The template exactly as the user typed it.

    Raises:
    ValueError: If the template is malformed, uses named placeholders, mixes
        '{}' with numbered placeholders, or needs a different number of
        values than there are columns.
    """
    import string  # local import: only this function needs the template parser

    user_format = input("Enter the format for combining the columns (e.g., '{0} {1}'): ")

    automatic_fields = 0
    numbered_fields = set()
    for _, field_name, _, _ in string.Formatter().parse(user_format):
        if field_name is None:  # literal text with no placeholder after it
            continue
        # Drop attribute/index access ('0.real', '0[1]') to get the argument reference.
        base = field_name.split('.')[0].split('[')[0]
        if base == '':
            automatic_fields += 1
        elif base.isdigit():
            numbered_fields.add(int(base))
        else:
            raise ValueError(f"Named placeholder '{field_name}' is not supported; use positional placeholders such as '{{0}}'.")

    if automatic_fields and numbered_fields:
        raise ValueError("The format string mixes automatic '{}' and numbered '{0}' placeholders; use one style.")

    # Numbered templates need arguments up to the highest index they mention.
    if numbered_fields:
        placeholders = max(numbered_fields) + 1
    else:
        placeholders = automatic_fields

    if placeholders != len(columns_to_combine):
        raise ValueError(f"The format string requires {placeholders} placeholders, but {len(columns_to_combine)} columns were provided.")

    return user_format

def combine_columns(df, columns_to_combine, new_column_name, user_format):
    """
    Add ``new_column_name`` to ``df``, built row by row from ``user_format``.

    Each row's values from ``columns_to_combine`` are passed positionally to
    ``user_format.format``. The DataFrame is modified in place and returned.
    """
    def render_row(row):
        # The row's values fill the template's positional placeholders in order.
        return user_format.format(*row)

    selected = df[columns_to_combine]
    df[new_column_name] = selected.apply(render_row, axis=1)
    return df

def user_combine_columns(df):
    """
    Run the interactive "combine columns" flow.

    Collects the columns, the template and the new column's name from the
    user, then builds the combined column. Any failure is printed and the
    DataFrame is returned as it was at that point.
    """
    try:
        chosen_columns = get_columns_to_combine(df)
        template = get_user_format(chosen_columns)
        new_column_name = input("Enter the name for the new combined column: ").strip()

        df = combine_columns(
            df,
            chosen_columns,
            new_column_name,
            template,
        )
        print(f"Columns combined successfully into '{new_column_name}'.")
    except Exception as e:
        print(f"An error occurred: {e}")

    return df



# Apply a filter to a column using a user-supplied condition
import pandas as pd
import os

def filter_dataset_from_file(df):
    """
    Interactively filter rows with conditions such as '> 50000'.

    The user picks a column and types a comparison; the two are joined into a
    ``DataFrame.query`` expression. Filters accumulate until the user stops.

    Parameters:
    df (pd.DataFrame): The DataFrame to filter; it is not modified.

    Returns:
    pd.DataFrame: The filtered DataFrame (the input itself if no filter succeeded).
    """
    print("Available columns:", df.columns.tolist())

    while True:
        typed_name = input("Enter the column name you want to filter by (or 'exit' to finish): ")
        # Prefer an exact match (a column may really contain spaces), else ignore stray whitespace.
        column_name = typed_name if typed_name in df.columns else typed_name.strip()
        if column_name.lower() == 'exit':
            break

        if column_name not in df.columns:
            print(f"Column '{column_name}' not found. Please enter a valid column name.")
            continue

        if '`' in column_name:
            # The name is wrapped in backticks below, so it cannot contain one itself.
            print(f"Column '{column_name}' contains a backtick and cannot be used in a filter.")
            continue

        condition = input("Enter the condition (e.g., > 50000, <= 55000): ").strip()
        if not condition:
            print("No condition entered. Please provide one such as '> 50000'.")
            continue

        # Backtick-quote the name so columns with spaces or punctuation are valid in query().
        quoted_name = f"`{column_name}`"

        try:
            # NOTE(security): query() evaluates the typed text as an expression. That is
            # acceptable for a local interactive tool; do not feed it untrusted input.
            df = df.query(f"{quoted_name} {condition}")
        except Exception as e:
            print(f"Error in applying filter: {e}")
            continue

        print(f"Filtered DataFrame:\n{df}\n")

        another_filter = input("Do you want to apply another filter? (yes/no): ").strip().lower()
        if another_filter != 'yes':
            break

    return df


#Function to convert data type from one format to another
import pandas as pd
from typing import Dict

def convert_data_types(df: pd.DataFrame, conversions: Dict[str, str]) -> pd.DataFrame:
    """
    Interactively convert columns to another data type.

    Parameters:
    df (pd.DataFrame): The DataFrame to update in place.
    conversions (Dict[str, str]): Filled with ``{column: target type}`` for
        every conversion that succeeded, so the caller can inspect what changed.

    Returns:
    pd.DataFrame: The same DataFrame. A failed conversion is reported and the
    column is left as it was.
    """
    try:
        print("Available columns:", df.columns.tolist())

        while True:
            typed_name = input("Enter the column name you want to convert (or 'exit' to finish): ")
            # Prefer an exact match (a column may really contain spaces), else ignore stray whitespace.
            column_name = typed_name if typed_name in df.columns else typed_name.strip()
            if column_name.lower() == 'exit':
                break

            if column_name not in df.columns:
                print(f"Column '{column_name}' not found. Please enter a valid column name.")
                continue

            current_dtype = df[column_name].dtype
            print(f"Current data type of '{column_name}' is: {current_dtype}")

            target_dtype = input("Enter the data type you want to convert to (e.g., 'int', 'float', 'str', 'datetime'): ").strip()

            try:
                if target_dtype.lower() in ('datetime', 'datetime64'):
                    # 'datetime' is advertised in the prompt but is not a dtype name
                    # astype() accepts, so parse the values with to_datetime instead.
                    df[column_name] = pd.to_datetime(df[column_name])
                else:
                    df[column_name] = df[column_name].astype(target_dtype)
                conversions[column_name] = target_dtype
                print(f"Converted '{column_name}' to {target_dtype}.")
            except Exception as e:
                print(f"Error converting '{column_name}' to {target_dtype}: {e}")
                continue

            another_conversion = input("Do you want to convert another column? (yes/no): ").strip().lower()
            if another_conversion != 'yes':
                break

    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    return df


#Function for summary statistics
import pandas as pd
def generate_summary_statistics(df: pd.DataFrame):
    """
    List the columns, ask the user for one by index, and print its describe() output.

    Invalid or out-of-range input is reported as a "Value Error"; any other
    failure is reported generically. Nothing is returned.
    """
    try:
        print("Columns in the dataset:")
        for position, label in enumerate(df.columns):
            print(f"{position}: {label}")

        col_idx = int(input("Enter the column index you want to see summary statistics for: "))

        # Only positions that were listed above are accepted.
        if not 0 <= col_idx < len(df.columns):
            raise ValueError("Invalid column index. Please enter a valid index from the list.")

        col_name = df.columns[col_idx]
        summary_stats = df[col_name].describe(include='all')

        print(f"\nSummary Statistics for '{col_name}':")
        print(summary_stats)

    except ValueError as ve:
        print(f"Value Error: {ve}")
    except Exception as e:
        print(f"An error occurred: {e}")


#Function to aggregate
import pandas as pd
def apply_aggregation(df: pd.DataFrame, group_by: str):
    """
    Group ``df`` by one column and aggregate user-chosen columns.

    The user selects the columns by index and one or more of the supported
    functions (sum, mean, count, min, max); every function is applied to every
    selected column.

    Parameters:
    df (pd.DataFrame): The data to aggregate; it is not modified.
    group_by (str): Name of the column to group by.

    Returns:
    pd.DataFrame or None: The aggregated result, or None when the input was
    invalid or the aggregation failed (the error is printed).
    """
    try:
        if group_by not in df.columns:
            raise ValueError(f"Column '{group_by}' does not exist in the DataFrame.")

        print("Columns in the dataset:")
        for idx, col in enumerate(df.columns):
            print(f"{idx}: {col}")

        raw_indices = input("Enter the column indices you want to aggregate (comma-separated): ")
        selected_columns = []
        for token in raw_indices.split(','):
            position = int(token)  # non-numeric input raises ValueError, reported below
            # Reject negatives explicitly: df.columns[-1] would silently pick the last column.
            if not 0 <= position < len(df.columns):
                raise ValueError(f"Column index {position} is out of range.")
            if df.columns[position] not in selected_columns:
                selected_columns.append(df.columns[position])

        available_functions = {
            'sum': 'Sum',
            'mean': 'Average',
            'count': 'Count',
            'min': 'Minimum',
            'max': 'Maximum'
        }
        print("Available aggregation functions:")
        for key, value in available_functions.items():
            print(f"{key}: {value}")

        raw_functions = input("Enter the aggregation functions you want to apply (comma-separated): ")
        # Trim each name so input like "sum, mean" works, and drop empty entries.
        agg_functions = [name.strip() for name in raw_functions.lower().split(',') if name.strip()]
        agg_functions = list(dict.fromkeys(agg_functions))

        unknown_functions = [name for name in agg_functions if name not in available_functions]
        if unknown_functions:
            raise ValueError(f"Unsupported aggregation functions: {unknown_functions}")
        if not agg_functions:
            raise ValueError("No aggregation functions were provided.")

        aggregation_dict = {col: agg_functions for col in selected_columns}

        aggregated_df = df.groupby(group_by).agg(aggregation_dict)

        print("\nAggregated Data:")
        print(aggregated_df)

        return aggregated_df

    except ValueError as ve:
        print(f"Value Error: {ve}")
    except Exception as e:
        print(f"An error occurred: {e}")


#Function to select row by value
import pandas as pd
def select_row_by_value(df: pd.DataFrame):
    """
    Print the rows whose value in a user-chosen column matches the user's input.

    A row matches when the cell equals the typed value converted by
    ``parse_value`` (so '5' finds the number 5) or when the cell's text form
    equals the typed text (so '5' also finds the string '5').

    Parameters:
    - df (pd.DataFrame): The DataFrame to search within; it is not modified.

    Returns nothing; results are printed.
    """
    column_name = input("Please enter the name of the column you want to search in: ").strip()

    # Check the column first so the user is not asked for a value that cannot be used.
    if column_name not in df.columns:
        print(f"Column '{column_name}' does not exist in the dataset.")
        return

    input_value = input("Please enter the value to search for: ").strip()
    parsed_value = parse_value(input_value)

    column = df[column_name]
    matches = (column == parsed_value) | (column.astype(str) == input_value)
    selected_rows = df[matches]

    if selected_rows.empty:
        print(f"No matching rows found for value: {parsed_value}")
    else:
        print("Matching rows found:")
        print(selected_rows)
        
def parse_value(input_value):
    """
    Convert user-typed text to a number when it looks like one.

    Parameters:
    - input_value (str): The text to interpret.

    Returns:
    - float if the text contains a '.' and parses as a float,
      int if it contains no '.' and parses as an integer,
      otherwise the original text unchanged.
    """
    # The presence of a decimal point decides which conversion is attempted.
    converter = float if '.' in input_value else int
    try:
        return converter(input_value)
    except ValueError:
        return input_value



import pandas as pd

def sort_column(df: pd.DataFrame):
    """
    Ask the user for a column and a direction, then return ``df`` sorted that way.

    Parameters:
    - df (pd.DataFrame): The DataFrame to sort; it is not modified.

    Returns:
    - pd.DataFrame: A sorted copy, which is also printed.

    Raises:
    - ValueError: If the column does not exist or the direction is not 'asc'/'desc'.
    """
    column_name = input("Please enter the name of the column to sort by: ").strip()
    if column_name not in df.columns:
        raise ValueError(f"Column '{column_name}' does not exist in the dataset.")

    sort_order = input("Enter 'asc' for ascending order or 'desc' for descending order: ").strip().lower()

    # Map each accepted keyword to the flag sort_values() expects.
    directions = {'asc': True, 'desc': False}
    if sort_order not in directions:
        raise ValueError("Invalid sort order. Please enter 'asc' for ascending or 'desc' for descending.")
    ascending = directions[sort_order]

    sorted_dataset = df.sort_values(by=column_name, ascending=ascending)

    direction_label = 'ascending' if ascending else 'descending'
    print(f"\nSorted DataFrame by '{column_name}' in {direction_label} order:")
    print(sorted_dataset)

    return sorted_dataset


#Function to format currency
import pandas as pd
import re

def is_valid_currency_symbol(symbol):
    """
    Return True when ``symbol`` looks like a currency code or sign.

    Accepted: 1 to 3 characters, each one either a letter ('INR', 'Rs') or a
    Unicode currency sign ('$', '₹', '€', 'US$'). Currency signs are allowed
    because extract_currency_and_amount() yields them for inputs like '$100'.
    Anything that is not a string is rejected.
    """
    import unicodedata  # local import: only this check needs Unicode categories

    if not isinstance(symbol, str) or not 1 <= len(symbol) <= 3:
        return False
    # 'Sc' is the Unicode general category "Symbol, currency".
    return all(ch.isalpha() or unicodedata.category(ch) == 'Sc' for ch in symbol)

def convert_currency(amount, conversion_rate):
    """
    Multiply ``amount`` by ``conversion_rate``.

    Returns the product, or ``amount`` unchanged (after printing an error)
    when the operands cannot be multiplied as numbers. Text operands are
    rejected up front: in Python ``'5' * 2`` is valid and yields '55', which
    would otherwise pass as a converted amount.
    """
    if isinstance(amount, (str, bytes)) or isinstance(conversion_rate, (str, bytes)):
        print(f"Error: Unable to convert amount '{amount}' with rate '{conversion_rate}'. Ensure the amount is numeric.")
        return amount
    try:
        return amount * conversion_rate
    except TypeError:
        print(f"Error: Unable to convert amount '{amount}' with rate '{conversion_rate}'. Ensure the amount is numeric.")
        return amount

def extract_currency_and_amount(value):
    """
    Split text such as 'INR 30000', '$1,234.50' or '100 USD' into (currency, amount).

    The currency is the run of non-digit, non-space characters at the start of
    the text or, failing that, at its end. A leading '+' or '-' is never taken
    as the currency. The amount is the first number in the text; thousands
    separators (',') are removed before conversion.

    Parameters:
    value: The cell content. Anything that is not a string (None, NaN,
        numbers) cannot be split.

    Returns:
    tuple: (currency, amount) as (str, float), or (None, None) after printing
    a warning when either part is missing.
    """
    if not isinstance(value, str):
        print(f"Warning: Could not extract currency and amount from '{value}'.")
        return None, None

    text = value.strip()
    amount_match = re.search(r"\d[\d,]*(?:\.\d+)?", text)
    # Currency first ('$100', 'INR 30000'), otherwise currency last ('100 USD').
    currency_match = re.match(r"[^\d\s+\-]+", text) or re.search(r"[^\d\s.,+\-]+$", text)

    if currency_match and amount_match:
        currency = currency_match.group(0).strip()
        amount = float(amount_match.group(0).replace(',', ''))
        return currency, amount

    print(f"Warning: Could not extract currency and amount from '{value}'.")
    return None, None

def process_currency_info(df):
    """
    Interactive menu for inspecting and converting the 'Currency' column.

    On entry the 'Currency' column (text such as 'INR 30000') is split into a
    currency symbol, kept in 'Currency', and a numeric 'Amount' column. The
    split is skipped when an 'Amount' column already exists, so running the
    menu again does not re-parse bare symbols and blank both columns.

    Menu options:
    1. Show how many rows use each currency.
    2. Rename a currency symbol.
    3. Multiply the amounts of one currency by a rate and rename its symbol,
       then print the total per currency. The per-row data is kept; the totals
       are only displayed.

    Parameters:
    df (pd.DataFrame): The DataFrame to update in place.

    Returns:
    pd.DataFrame: The same DataFrame with the updated 'Currency'/'Amount' columns.
    """
    if 'Currency' not in df.columns:
        print("Error: The dataset has no 'Currency' column to process.")
        return df

    if 'Amount' in df.columns:
        print("An 'Amount' column already exists; assuming 'Currency' was split earlier and leaving both columns as they are.")
    else:
        # Non-text cells (e.g. missing values) cannot be split; they become empty entries.
        parsed = [
            extract_currency_and_amount(value) if isinstance(value, str) else (None, None)
            for value in df['Currency']
        ]
        df['Currency'] = [currency for currency, _ in parsed]
        df['Amount'] = [float('nan') if amount is None else amount for _, amount in parsed]

    while True:
        print("\nMenu:")
        print("1. Display basic info (count of each value in a particular column)")
        print("2. Convert currency symbol alone")
        print("3. Convert currency value (e.g., EUR to INR) and update the symbol")
        choice = input("Enter your choice (1/2/3) or 'q' to quit: ").strip().lower()

        if choice == 'q':
            break

        if choice == '1':
            currency_count = df['Currency'].nunique()
            print(f"\nThere are {currency_count} unique currencies in the dataset.")

            currency_type_count = df['Currency'].value_counts()
            print("\nCurrency counts:")
            print(currency_type_count)

        elif choice in ('2', '3'):
            old_currency = input("Enter the current currency symbol you want to change: ").strip().upper()
            if not is_valid_currency_symbol(old_currency):
                print("Error: Invalid currency symbol.")
                continue

            new_currency = input("Enter the new currency symbol: ").strip().upper()
            if not is_valid_currency_symbol(new_currency):
                print("Error: Invalid currency symbol.")
                continue

            if choice == '3':
                try:
                    conversion_rate = float(input(f"Enter the conversion rate from {old_currency} to {new_currency}: ").strip())
                except ValueError:
                    print("Error: Invalid conversion rate. Please enter a numeric value.")
                    continue

            rows_to_update = df['Currency'] == old_currency
            if not rows_to_update.any():
                print(f"No rows use the currency '{old_currency}'. Nothing was changed.")
                continue

            if choice == '2':
                df.loc[rows_to_update, 'Currency'] = new_currency
                print(f"\nUpdated currency symbol '{old_currency}' to '{new_currency}'.")
            else:
                df.loc[rows_to_update, 'Amount'] *= conversion_rate
                df.loc[rows_to_update, 'Currency'] = new_currency

                # Show totals without replacing df: the dataset's other columns must survive.
                totals = df.groupby('Currency').agg({'Amount': 'sum'}).reset_index()
                print(f"\nConverted amounts from {old_currency} to {new_currency} and updated the currency symbol.")
                print("\nTotal amount per currency:")
                print(totals)

        else:
            print("Invalid choice. Please enter 1, 2, 3, or 'q' to quit.")

    return df




# Module-level placeholder. clean_dataset() assigns its own local `df` (it has no
# `global df` declaration), so this name is not updated by the cleaning session.
df = None
def clean_dataset():
    """
    Run the interactive data cleaning session.

    Loads a dataset via load_dataset(), takes a backup copy, then loops over a
    main menu with two sub-menus (basic inspection and column-wise operations)
    until the user chooses Exit. Each column-wise operation works on the
    session's current DataFrame, and operations that return a new DataFrame
    replace it.

    Invalid numeric input or a failed lookup inside a column-wise operation is
    reported and the menu is shown again; it does not end the session.

    Returns nothing.
    """
    df = load_dataset()
    if df is None:
        print("Failed to load the dataset. Exiting the data cleaning process.")
        return

    # Kept for the duration of the session; no menu option in this function reads it.
    backup_df = create_backup(df)
    print("Backup of the original dataset created.")

    while True:
        print("\nOptions:")
        print("1. Basic Operations")
        print("2. Column-wise Operations")
        print("3. Show DataFrame")
        print("4. Exit")

        choice = input("Choose an option: ").strip()

        if choice == '1':
            while True:
                print("\nBasic Operations:")
                print("1. Display basic info")
                print("2. Display head and tail of data")
                print("3. Display Summary Statistics")
                print("4. Check missing values")
                print("5. Check inconsistencies")
                print("6. Return to Main Menu")
                print("7. Exit")

                basic_choice = input("Choose an option: ").strip()

                if basic_choice == '1':
                    print(df)
                    display_basic_info(df)
                elif basic_choice == '2':
                    display_choice = input("Would you like to see the first few rows (head), the last few rows (tail), or both? (head/tail/both): ").strip().lower()
                    show_head_and_tail(df, rows=5, display_choice=display_choice)
                elif basic_choice == '3':
                    generate_summary_statistics(df)
                elif basic_choice == '4':
                    check_missing_values(df)
                elif basic_choice == '5':
                    check_inconsistencies(df)
                elif basic_choice == '6':
                    break
                elif basic_choice == '7':
                    print("Exiting the data cleaning process.")
                    return
                else:
                    print("Invalid choice. Please select a valid option.")

        elif choice == '2':
            while True:
                print("\nColumn-wise Operations:")
                print("1. Remove columns")
                print("2. Drop duplicates")
                print("3. Drop missing rows")
                print("4. Handle missing values")
                print("5. Interpolate missing values")
                print("6. Detect and fix inaccurate data")
                print("7. Remove special characters and extra white spaces")
                print("8. Format phone numbers")
                print("9. Format names into first, middle, last name")
                print("10. Format dates")
                print("11. Format addresses")
                print("12. Standardize text data")
                print("13. Normalize numeric data")
                print("14. Renaming Columns Name")
                print("15. Concatenate Column ")
                print("16. Filter Condition For numeric data")
                print("17. Convert Data Types")
                print("18. Apply Aggregation Functions")
                print("19. Search in row by value")
                print("20. Sort column")
                # The numbers shown here must match the handlers below (22 = back, 23 = exit).
                print("21. Format Currency")
                print("22. Return to Main Menu")
                print("23. Exit")

                column_choice = input("Choose an option: ").strip()

                # Several options convert typed text with int() or index into df.columns,
                # and sort_column() raises ValueError on bad input. Report those here
                # instead of letting them end the whole session.
                try:
                    if column_choice == '1':
                        df = remove_columns_by_index(df)
                    elif column_choice == '2':
                        df = check_and_handle_duplicates(df)
                    elif column_choice == '3':
                        columns_to_check = input("Enter the column indices (comma-separated) to check for missing values: ").strip()
                        columns_to_check = [df.columns[int(index)] for index in columns_to_check.split(',')]
                        df = drop_missing_rows(df, columns_to_check)
                    elif column_choice == '4':
                        df = handle_missing_values(df)
                    elif column_choice == '5':
                        df = interpolate_missing_values(df)
                    elif column_choice == '6':
                        column_index = input("Enter the column index to check for inaccuracies (or leave blank to check all columns): ").strip()
                        column_index = int(column_index) if column_index else None
                        df = detect_and_fix_inaccurate_data(df, column_index)
                    elif column_choice == '7':
                        text_columns = show_text_columns(df)
                        if text_columns:
                            column_indices = list(map(int, input("Enter the column indices for text cleaning (comma-separated): ").split(',')))
                            df = clean_text_columns_by_index(df, column_indices)
                        else:
                            print("No text columns available for cleaning.")
                    elif column_choice == '8':
                        phone_col_index = int(input("Enter the index of the phone number column: "))
                        df = process_phone_numbers(df, phone_col_index)
                    elif column_choice == '9':
                        name_column_index = int(input("Enter the column index for names: ").strip())
                        df = format_names(df, name_column_index)
                    elif column_choice == '10':
                        df = format_date_columns(df)
                    elif column_choice == '11':
                        address_column_index = int(input("Enter the column index for addresses: ").strip())
                        df = process_addresses(df, address_column_index)
                    elif column_choice == '12':
                        df = standardize_text_data(df, remove_special_chars=False)
                    elif column_choice == '13':
                        df = normalize_numeric_data(df)
                    elif column_choice == '14':
                        df = rename_columns(df)
                    elif column_choice == '15':
                        df = user_combine_columns(df)
                    elif column_choice == '16':
                        df = filter_dataset_from_file(df)
                    elif column_choice == '17':
                        df = convert_data_types(df, conversions={})
                    elif column_choice == '18':
                        group_by_column = input("Enter the column name to group by: ")
                        # Display only: the aggregated table does not replace the dataset.
                        apply_aggregation(df, group_by_column)
                    elif column_choice == '19':
                        select_row_by_value(df)
                    elif column_choice == '20':
                        # Keep the sorted result; previously it was computed and discarded.
                        df = sort_column(df)
                    elif column_choice == '21':
                        # Updates df in place; the return value is intentionally not bound.
                        process_currency_info(df)
                    elif column_choice == '22':
                        break
                    elif column_choice == '23':
                        print("Exiting the data cleaning process.")
                        return
                    else:
                        print("Invalid choice. Please select a valid option.")
                except (ValueError, IndexError) as e:
                    print(f"Invalid input: {e}")

        elif choice == '3':
            print("\nCurrent DataFrame:")
            print(df.head())

        elif choice == '4':
            print("Exiting the data cleaning process.")
            break

        else:
            print("Invalid choice. Please select a valid option.")

# Run the data cleaning process.
# NOTE: this call executes as soon as the cell/module runs and waits on input()
# prompts; the session lasts until the user picks an Exit option.
clean_dataset()



Enter the file path of the dataset:  C:\\Users\\raxshana.k\\Downloads\\dataAnalystJobsIndia_7th_July_2024.csv


Dataset loaded successfully.
Backup of the original dataset created.

Options:
1. Basic Operations
2. Column-wise Operations
3. Show DataFrame
4. Exit


Choose an option:  1



Basic Operations:
1. Display basic info
2. Display head and tail of data
3. Check missing values
4. Check inconsistencies
5. Return to Main Menu
6. Exit


Choose an option:  1


      Unnamed: 0                                          job_title  \
0              0    JP Morgan Chase - Client Data Analyst (4-8 yrs)   
1              1                           Data Analyst - Bangalore   
2              2  Senior IT ATLAS Data Analyst and Integration S...   
3              3                            Consultant Data Analyst   
4              4           Senior Data Analyst - Retail Liabilities   
...          ...                                                ...   
1556        1556                    Data Analyst / Sr. Data Analyst   
1557        1557                  Artificial intelligence Architect   
1558        1558                                      Data Analysts   
1559        1559                                     Data Architect   
1560        1560                                       Data analyst   

                       company experience  min exp  max exp        salary  \
0              JP Morgan Chase    4-8 Yrs      4.0      8.0           

Choose an option:  2
Would you like to see the first few rows (head), the last few rows (tail), or both? (head/tail/both):  head



First 5 rows of the dataset:
   Unnamed: 0                                          job_title  \
0           0    JP Morgan Chase - Client Data Analyst (4-8 yrs)   
1           1                           Data Analyst - Bangalore   
2           2  Senior IT ATLAS Data Analyst and Integration S...   
3           3                            Consultant Data Analyst   
4           4           Senior Data Analyst - Retail Liabilities   

                    company experience  min exp  max exp        salary  \
0           JP Morgan Chase    4-8 Yrs      4.0      8.0           NaN   
1        Schneider Electric   5-10 Yrs      5.0     10.0  ₹ 8 - 16L/yr   
2  SAP Labs India Pvt. Ltd.   7-11 Yrs      7.0     11.0           NaN   
3                    Pfizer    2-6 Yrs      2.0      6.0           NaN   
4           IDFC FIRST Bank   5-10 Yrs      5.0     10.0           NaN   

   base salary  max salary                                location  \
0          NaN         NaN                Hyde