In [3]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.ensemble import IsolationForest

# Clean the SkinCAP caption CSV by dropping rows whose caption is a semantic
# outlier: captions are embedded with a sentence-transformer model, scored
# with IsolationForest, and only rows predicted as inliers are written out.

# Load the original CSV file
csv_path = "/home/william/dataset/skin/SkinCAP/skincap_v240623.csv"
df = pd.read_csv(csv_path)

# Use only non-empty entries in the text column for analysis
# (rows with a missing or whitespace-only caption are excluded and therefore
# never reach the cleaned file).
text_column = "caption_zh_polish_en"
has_text = df[text_column].notna() & (df[text_column].str.strip() != "")
df_text = df[has_text].copy()
texts = df_text[text_column].tolist()

# Fail early with a clear message instead of loading the model and letting
# the outlier detector choke on an empty embedding matrix.
if not texts:
    raise ValueError(
        f"No non-empty entries found in column '{text_column}' of {csv_path}"
    )

# Load the sentence embedding model and encode texts
print("Encoding sentence embeddings...")
model = SentenceTransformer('all-MiniLM-L6-v2')
embeddings = model.encode(texts, show_progress_bar=True)

# Run semantic outlier detection using IsolationForest.
# contamination=0.1 is the assumed share of outliers; random_state is fixed
# so repeated runs flag the same rows.
print("Detecting outliers with IsolationForest...")
clf = IsolationForest(contamination=0.1, random_state=42)
predictions = clf.fit_predict(embeddings)  # 1 = inlier, -1 = outlier

# Keep only the inlier (valid) entries. df_text kept the original index
# labels of df, so they can be used to select the matching rows from df.
df_text["valid"] = predictions == 1
valid_indices = df_text[df_text["valid"]].index
df_filtered = df.loc[valid_indices].copy().reset_index(drop=True)

# Save the cleaned file with UTF-8 BOM to avoid encoding issues
save_path = "/home/william/dataset/skin/SkinCAP/skincap_v240623_cleaned.csv"
df_filtered.to_csv(save_path, index=False, encoding="utf-8-sig")

# Print filtering summary
total = len(df)
kept = len(df_filtered)
removed = total - kept
# Rows dropped before embedding because they had no usable caption text;
# these are part of `removed` but were not flagged by the outlier detector.
missing_text = total - len(df_text)

print(f"Cleaning complete. Removed {removed} unclear samples, kept {kept} entries.")
if missing_text:
    print(
        f"Of the removed rows, {missing_text} had no caption text and "
        f"{removed - missing_text} were flagged as outliers."
    )
print(f"Cleaned file saved to: {save_path}")


Encoding sentence embeddings...


Batches:   0%|          | 0/125 [00:00<?, ?it/s]

Detecting outliers with IsolationForest...
Cleaning complete. Removed 400 unclear samples, kept 3600 entries.
Cleaned file saved to: /home/william/dataset/skin/SkinCAP/skincap_v240623_cleaned.csv
