<a href="https://colab.research.google.com/github/yangyadi/Case/blob/main/FlagOutliers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

1. Define the outlier-flagging function

In [None]:
def flag_outliers_iqr(df, columns=None, threshold=1.5):
    """
    Flag outliers in numeric columns using the IQR method.

    For each column the quartiles Q1 and Q3 are computed, and a value is
    treated as an outlier when it lies strictly below
    ``Q1 - threshold * IQR`` or strictly above ``Q3 + threshold * IQR``,
    where ``IQR = Q3 - Q1``.

    Parameters:
        df (DataFrame): Data
        columns (list or str): Column name(s) to check. A single name may be
            given as a plain string. If None, checks all numeric columns.
        threshold (float): IQR multiplier, default 1.5

    Returns:
        dict: Maps each column that has at least one outlier to the list of
            index labels of its outlier rows. Columns without outliers are
            omitted.
    """
    if columns is None:
        # The "number" alias selects the same dtypes as np.number without
        # requiring numpy to be imported in the calling namespace.
        columns = df.select_dtypes(include="number").columns.tolist()
    elif isinstance(columns, str):
        # A bare string would otherwise be iterated character by character.
        columns = [columns]

    outlier_dict = {}

    for col in columns:
        values = df[col]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - threshold * iqr
        upper_bound = q3 + threshold * iqr

        # Filter only this column rather than copying every column of df;
        # NaN values compare False on both sides and are never flagged.
        is_outlier = (values < lower_bound) | (values > upper_bound)
        outlier_labels = values[is_outlier].index.tolist()

        if outlier_labels:
            outlier_dict[col] = outlier_labels

    return outlier_dict


2. Flag outliers

In [None]:
# Run the IQR check with its defaults (all numeric columns, threshold 1.5)
outlier_indices = flag_outliers_iqr(df)

# Report the number of flagged rows for every column that has any
for col in outlier_indices:
    idxs = outlier_indices[col]
    print(f"{col}: {len(idxs)} outliers")


3. Remove rows flagged as outliers in any column

In [None]:
# Merge the per-column index lists into a single set of unique row labels
all_outlier_rows = set().union(*outlier_indices.values())

# Build a copy of the data without any row flagged in at least one column
df_cleaned = df.drop(labels=all_outlier_rows, axis="index")