In [4]:
import pandas as pd
from google.colab import files

# Read the raw export, then de-duplicate it in two passes:
#   1. discard rows that repeat an earlier row in every column;
#   2. among rows sharing a 'File No.', retain only the earliest one.
df = (
    pd.read_csv("data.csv")
    .drop_duplicates()
    .drop_duplicates(subset=['File No.'], keep='first')
)

# 3. Group rare categories in categorical columns under 'Other'
def group_rare_categories(df, threshold=0.02, exclude=None):
    """
    Replace rare categories in object columns with 'Other'.

    A category is rare when its share of the column's non-null values is
    strictly below ``threshold``. Missing values are left untouched and
    non-object columns are ignored.

    Args:
        df: DataFrame to process. It is not modified; a copy is returned.
        threshold: Minimum relative frequency (between 0 and 1) a category
            needs in order to be kept as-is.
        exclude: Optional column name or iterable of column names to skip,
            e.g. identifier columns in which every value is unique and would
            therefore be rewritten to 'Other'.

    Returns:
        A new DataFrame with rare categories replaced by 'Other'.
    """
    if exclude is None:
        skipped = set()
    elif isinstance(exclude, str):
        # A bare string would otherwise be split into single characters.
        skipped = {exclude}
    else:
        skipped = set(exclude)

    # Work on a copy so the caller's DataFrame is never mutated.
    result = df.copy()
    for col in result.select_dtypes(include=['object']).columns:
        if col in skipped:
            continue
        # value_counts ignores NaN, so shares are relative to non-null values.
        freq = result[col].value_counts(normalize=True)
        rare = freq[freq < threshold].index
        if rare.empty:
            continue
        # Hash-based membership test; NaN never matches and stays NaN.
        is_rare = result[col].isin(rare)
        result[col] = result[col].where(~is_rare, 'Other')
    return result

# 4. Convert 'Recovery' to numeric if it's not already.
# This runs BEFORE the rare-category grouping: while 'Recovery' is still an
# object column its values would be counted as categories, rewritten to
# 'Other', and then coerced to NaN by this conversion.
df['Recovery'] = pd.to_numeric(df['Recovery'], errors='coerce')

# Apply step 3 (rare-category grouping). After step 2 each non-missing
# 'File No.' occurs once, so if the column is stored as text every value falls
# below the 2% threshold in any dataset of more than 50 rows. Set the
# identifier aside and restore it so it is not overwritten with 'Other'.
file_numbers = df['File No.'].copy()
df = group_rare_categories(df)
df['File No.'] = file_numbers

# 5. Drop rows that are completely empty (if any)
df = df.dropna(how='all')

# 6. Save the cleaned DataFrame and trigger a browser download (Colab only)
df.to_csv("final_cleaned.csv", index=False)
print("✅ Cleaned CSV saved as 'final_cleaned.csv'")
files.download("final_cleaned.csv")

✅ Cleaned CSV saved as 'final_cleaned.csv'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>