In [6]:
import pandas as pd
import numpy as np

# Load the dataset from the notebook's working directory.
df = pd.read_csv("dataset_normalized.csv")

print("=" * 60)
print("DATA CLEANING PROCESS")
print("=" * 60)
print(f"\n📊 Original dataset: {df.shape}")

# Step 1: drop exact duplicate rows (all columns compared, first occurrence kept).
# The row count is captured first so the summary can report how many were removed.
df_before_dedup = len(df)
df = df.drop_duplicates()
duplicates_removed = df_before_dedup - len(df)
print(f"   Removed {duplicates_removed} duplicate rows")
print(f"   Current shape: {df.shape}")

# Step 2: drop "artificial" rows, i.e. rows whose feature values barely vary.
# A row is kept when the standard deviation across its feature columns is at
# least STD_THRESHOLD; the same constant drives both the filter and the message.
STD_THRESHOLD = 0.5

# Split off the 'Role' column. NOTE: X and y are taken from the deduplicated
# frame *before* the std filter, so they still contain the rows that are
# removed from df_clean below.
X = df.drop(columns='Role')
y = df['Role']

# numeric_only=True makes the handling of non-numeric columns explicit:
# recent pandas versions raise on them instead of silently skipping them.
std_per_row = X.std(axis=1, numeric_only=True)

# A NaN std (fewer than two usable values in a row) compares as False here,
# so such rows are counted and removed together with the low-variance ones.
realistic_mask = std_per_row >= STD_THRESHOLD
artificial_rows = (~realistic_mask).sum()

# .copy() gives an independent frame, so later edits do not touch df.
df_clean = df[realistic_mask].copy()

print(f"   Removed {artificial_rows} artificial rows (std < {STD_THRESHOLD})")
print(f"   Current shape: {df_clean.shape}")

# Summary of both cleaning steps, relative to the row count before deduplication.
# Guard the percentage: an empty input file would otherwise raise
# ZeroDivisionError here, before anything is saved.
rows_removed_total = df_before_dedup - len(df_clean)
retention_rate = len(df_clean) / df_before_dedup * 100 if df_before_dedup else 0.0

print("\n" + "=" * 60)
print("CLEANING SUMMARY")
print("=" * 60)
print(f"Original rows:        {df_before_dedup}")
print(f"After deduplication:  {len(df)} (-{duplicates_removed})")
print(f"After filtering:      {len(df_clean)} (-{artificial_rows})")
print(f"Total removed:        {rows_removed_total}")
print(f"Retention rate:       {retention_rate:.2f}%")

# Persist the cleaned frame. index=False keeps the (now non-contiguous) row
# index out of the CSV. The path is defined once so the confirmation message
# cannot drift from the file actually written.
OUTPUT_PATH = "dataset_normalized_clean.csv"
df_clean.to_csv(OUTPUT_PATH, index=False)

print(f"\n✅ Dataset cleaned and saved to {OUTPUT_PATH}")

DATA CLEANING PROCESS

📊 Original dataset: (9179, 28)
   Removed 2322 duplicate rows
   Current shape: (6857, 28)
   Removed 3 artificial rows (std < 0.5)
   Current shape: (6854, 28)

CLEANING SUMMARY
Original rows:        9179
After deduplication:  6857 (-2322)
After filtering:      6854 (-3)
Total removed:        2325
Retention rate:       74.67%

✅ Dataset cleaned and saved to dataset_normalized_clean.csv
