# General ML Model Pipeline

### Table of contents:
1. Dependencies  
2. Read-in data  
3. Data cleanup  
    3a. Check inconsistencies and determine data types  
    3b. Fix inconsistencies  
    3c. Convert and re-code feature data  
    (3d. Imputation)  
4. Group imbalance

### 1. Dependencies

In [29]:
import pandas as pd
import numpy as np

### 2. Read-in data
Read the data and load it into a dataframe.  
Test case: local data in .csv format.  
Adapt the read-in procedure to the data source.

### 3. Data cleanup

### 3a. Check inconsistencies and determine data types
Custom function to check dataframe for inconsistencies in, e.g., data type and delimiters.  


In [30]:
def checkInconsistencies(df: pd.DataFrame):
    """Print a data-quality report for ``df`` and return the key findings.

    The report covers missing values, column dtypes, non-numeric values in
    numeric columns, cells containing ``,`` or ``;``, repeated column labels,
    repeated rows, mixed-case strings and strings with leading/trailing
    whitespace. ``df`` itself is not modified.

    Args:
        df: The dataframe to inspect.

    Returns:
        A tuple ``(datatypes, duplicatecols, duplicaterows)`` where
        ``datatypes`` is ``df.dtypes``, ``duplicatecols`` contains the columns
        whose label repeats an earlier column label, and ``duplicaterows``
        contains the rows that repeat an earlier row.
    """

    def columns_of(dtypes):
        # Access columns by position: with repeated labels, df[label] would
        # return a DataFrame instead of a Series and break the checks below.
        selected = df.select_dtypes(include=dtypes)
        for pos, label in enumerate(selected.columns):
            yield label, selected.iloc[:, pos]

    def report_flagged(dtypes, predicate, what):
        # Print every value of the selected columns for which predicate holds.
        for label, values in columns_of(dtypes):
            flagged = values.apply(predicate)
            if flagged.any():
                print(f"{what} found in {label}:")
                print(values[flagged])

    def has_delimiter(cell):
        return isinstance(cell, str) and (',' in cell or ';' in cell)

    # Checking for missing values
    print("Missing Values Check:")
    print(df.isna().sum())

    # Check for inconsistent data types
    print("\nData Types Check:")
    datatypes = df.dtypes
    print(datatypes)

    # Check for numeric columns containing non-numeric data.
    # Columns already typed int64/float64 normally hold only numbers, so this
    # is expected to stay silent; it is kept as a sanity check.
    print("\nNon-numeric Data in Numeric Columns:")
    report_flagged(
        ['int64', 'float64'],
        lambda x: not pd.api.types.is_number(x),
        "Non-numeric values",
    )

    # Checking for delimiter issues: any string cell containing ',' or ';'
    print("\nDelimiter Issues Check (commas, semicolons):")
    delimiter_flags = np.array(
        [
            any(has_delimiter(cell) for cell in df.iloc[:, pos])
            for pos in range(df.shape[1])
        ],
        dtype=bool,
    )
    if delimiter_flags.any():
        print("Delimiter issues found in the following columns:")
        print(df.columns[delimiter_flags])

    # Checking for duplicate features (repeated column labels).
    # The mask has one entry per column, so it must be applied to the column
    # axis; df[mask] would apply it to rows.
    print("\nDuplicate Columns Check:")
    duplicatecols = df.loc[:, df.columns.duplicated()]
    n_duplicatecols = duplicatecols.shape[1]
    if n_duplicatecols:
        print(f"{n_duplicatecols} duplicate columns found:")
        print(duplicatecols)
    else:
        print("No duplicate columns found.")

    # Checking for duplicate rows
    print("\nDuplicate Rows Check:")
    duplicaterows = df[df.duplicated()]
    if not duplicaterows.empty:
        print(f"{len(duplicaterows)} duplicate rows found:")
        print(duplicaterows)
    else:
        print("No duplicate rows found.")

    # Checking for inconsistent casing in string columns
    # (a string is flagged when it is neither all-lower nor all-upper case)
    print("\nInconsistent Casing in String Columns:")
    report_flagged(
        ['object'],
        lambda x: isinstance(x, str) and (x != x.lower() and x != x.upper()),
        "Inconsistent casing",
    )

    # Checking for extra whitespace in string columns
    print("\nExtra Whitespace in String Columns:")
    report_flagged(
        ['object'],
        lambda x: isinstance(x, str) and (x != x.strip()),
        "Whitespace issues",
    )

    # Return the dtypes plus the duplicated columns and rows
    return datatypes, duplicatecols, duplicaterows

### 3b. Fix inconsistencies
Custom function to fix the inconsistencies in the dataframe detected in step 3a.  
It relies on a helper function that fills missing cells with a marker value not otherwise used in the data. This makes the dataframe easier to work with and keeps missing-value handling flexible for later steps.

In [31]:
def fillNaNmarker(df: pd.DataFrame, marker=999):
    """Fill all NaNs in ``df`` in place with a marker value unused in the data.

    Starting from ``marker``, a digit 9 is appended (999 -> 9999 -> ...)
    until the candidate does not occur in any cell of ``df``; every missing
    value is then replaced by that candidate.

    Args:
        df: Dataframe to fill; it is modified in place.
        marker: First candidate marker value (numeric).

    Returns:
        The marker value that was actually written into ``df``.
    """
    # Compare against cell values: ``marker in df`` would only test the
    # column labels and miss a marker that already occurs in the data.
    while df.isin([marker]).to_numpy().any():
        # Grow the magnitude in the marker's own sign direction; for negative
        # markers ``marker * 10 + 9`` would never move away from -1.
        marker = marker * 10 + (9 if marker >= 0 else -9)

    df.fillna(marker, inplace=True)
    return marker


In [32]:
def fixInconsistencies(df: pd.DataFrame, handleNaN = 'impute'):

    # Converting non-numeric values in numeric columns to NaN and then filling or dropping
    print("Fixing Non-numeric Data in Numeric Columns...")
    for col in df.select_dtypes(include=['int64', 'float64']).columns:
        df[col] = pd.to_numeric(df[col], errors='coerce')  # Convert non-numeric to NaN

    # Handling missing values based on the handleNaN parameter
    match handleNaN:
        case 'impute':
            # Note that imputation will happen later, here just fills NaNs in-place with known marker
            marker = fillNaNmarker(df)
            print("Filled NaNs with known value %d" % marker)
        case 'drop':
            print("Dropping Rows with Missing Values...")
            df = df.dropna()  # Drop rows with missing values
        case 'mean':
            print("Filling Missing Values with column mean...")
            df = df.fillna(df.mean())  # Fill NaN values in numeric columns with column mean
        case 'last':
            print("Filling Missing Values with last known value...")
            df = df.fillna(method='ffill')  # Forward filling for missing values

    # Fixing delimiter issues by replacing problematic delimiters (commas, semicolons)
    print("Fixing Delimiter Issues...")
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].apply(lambda x: str(x).replace(',', '').replace(';', '') if isinstance(x, str) else x)

    # Dropping duplicate rcolumns
    print("Dropping Duplicate Columns...")
    df = df.loc[:,~df.columns.duplicated()].copy()

    # Standardizing casing in string columns (converting to lower case)
    print("Fixing Inconsistent Casing in String Columns...")
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].apply(lambda x: str(x).lower() if isinstance(x, str) else x)

    # Removing leading and trailing whitespaces from string columns
    print("Fixing Extra Whitespace in String Columns...")
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].apply(lambda x: str(x).strip() if isinstance(x, str) else x)

    print("All fixes applied.")
    return df
