In [1]:
import csv
import re
from pathlib import Path

# Input / output paths
input_csv = Path("weighted_score_above_08_translated_patched.csv")   # source file
output_csv = Path("weighted_score_above_08_translated_cleaned.csv")  # cleaned result

# Heading markup tokens to strip from the text.
# The optional "/" makes the pattern match the closing form [/hN] as well as the
# opening form [hN]; without it the closing tag would be left behind in the text.
HEADING_TAG = re.compile(r"\[/?h[0-9]+\]")  # [h1], [/h1], [h2], [/h2], ...
def count_specials(s: str) -> int:
    """Return how many characters of ``s`` are neither alphanumeric nor whitespace.

    Args:
        s: Text to inspect.

    Returns:
        The number of "special" characters (punctuation, symbols, ...) in ``s``.
    """
    n_special = 0
    for char in s:
        if char.isalnum() or char.isspace():
            continue
        n_special += 1
    return n_special

total, dropped = 0, 0

# Read with "utf-8-sig": it decodes BOM-less UTF-8 exactly like "utf-8", but also
# strips a leading BOM so the first header name is not prefixed with "\ufeff"
# (which would otherwise be written back into the output header).
with open(input_csv, "r", encoding="utf-8-sig", newline="") as f_in, \
     open(output_csv, "w", encoding="utf-8-sig", newline="") as f_out:

    reader = csv.DictReader(f_in)
    # DictReader.fieldnames is None for an empty file; fall back to an empty list
    # so the missing-column check below raises the intended KeyError.
    fieldnames = reader.fieldnames or []
    if "translated_en" not in fieldnames:
        raise KeyError("Column 'translated_en' not found in CSV.")

    writer = csv.DictWriter(f_out, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        total += 1
        text = str(row.get("translated_en") or "")
        # 1) Drop the row when the text holds 40 or more special characters
        if count_specials(text) >= 40:
            dropped += 1
            continue
        # 2) Remove the heading tokens matched by HEADING_TAG
        cleaned_text = HEADING_TAG.sub("", text)
        row["translated_en"] = cleaned_text
        writer.writerow(row)

print(f"총 행 수: {total}")
print(f"삭제된 행 수(특수문자 ≥40): {dropped}")
print(f"최종 행 수: {total - dropped}")
print(f"저장 완료 → {output_csv}")


총 행 수: 498094
삭제된 행 수(특수문자 ≥40): 86568
최종 행 수: 411526
저장 완료 → weighted_score_above_08_translated_cleaned.csv


In [3]:
import csv, re, pandas as pd
from pathlib import Path

input_csv = Path("weighted_score_above_08_translated_cleaned.csv")

# One character-class regex per non-English script to detect
# (Hangul, Japanese, Chinese, Cyrillic, Arabic, Devanagari, Thai).
# A text counts as "non-English" as soon as any single pattern matches.
NON_EN_PATTERNS = [
    re.compile(char_class)
    for char_class in (
        r"[\u3130-\u318F\uAC00-\uD7AF]",  # Hangul
        r"[\u3040-\u30FF]",               # Japanese
        r"[\u3400-\u9FFF]",               # Chinese
        r"[\u0400-\u04FF]",               # Cyrillic
        r"[\u0600-\u06FF]",               # Arabic
        r"[\u0900-\u097F]",               # Devanagari
        r"[\u0E00-\u0E7F]",               # Thai
    )
]

total, english_rows, nonenglish_rows = 0, 0, 0
nonenglish_samples = []

# The cleaned CSV was written with "utf-8-sig", so it starts with a BOM.
# Decoding with "utf-8-sig" strips it; plain "utf-8" would glue "\ufeff" onto the
# first column name, and a lookup of that column by name would silently miss.
with open(input_csv, "r", encoding="utf-8-sig", newline="") as f:
    reader = csv.DictReader(f)
    for idx, row in enumerate(reader):
        text = str(row.get("translated_en") or "")
        total += 1
        # Flag the row when any non-English script pattern matches
        if any(p.search(text) for p in NON_EN_PATTERNS):
            nonenglish_rows += 1
            # Keep only the first 30 flagged rows as a preview (first 200 chars each)
            if len(nonenglish_samples) < 30:
                nonenglish_samples.append({"row_index": idx, "text": text[:200]})
        else:
            english_rows += 1

print(f"총 행 수: {total}")
print(f"영어로만 된 행: {english_rows}")
print(f"비영문(번역 안 된 듯한) 행: {nonenglish_rows}")
print("\n비영문 샘플 예시 (최대 30개):")
for s in nonenglish_samples:
    print(s)


총 행 수: 411526
영어로만 된 행: 401660
비영문(번역 안 된 듯한) 행: 9866

비영문 샘플 예시 (최대 30개):
{'row_index': 35, 'text': 'Here I will leave the cat, friends who pass by can pet it and give it a thumbs up \u3000\u3000\u3000 \u3000\u3000／＞\u3000\u3000フ \u3000\u3000\u3000 \u3000\u3000| \u3000_\u3000 _ l \u3000 \u3000\u3000 \u3000／` ミ＿xノ \u3000\u3000 \u3000 /\u3000\u3000\u3000 \u3000 | \u3000\u3000\u3000 /\u3000 ヽ\u3000\u3000 ﾉ \u3000 \u3000 │\u3000\u3000|\u3000|\u3000| \u3000／￣|\u3000\u3000 |\u3000|\u3000| \u3000| (￣ヽ＿_ヽ_)__) \u3000＼二つ'}
{'row_index': 50, 'text': 'Here I will leave the cat, friends who pass by can pet it and give it a thumbs up \u3000\u3000\u3000 \u3000\u3000／＞\u3000\u3000フ \u3000\u3000\u3000 \u3000\u3000| \u3000_\u3000 _ l \u3000 \u3000\u3000 \u3000／` ミ＿xノ \u3000\u3000 \u3000 /\u3000\u3000\u3000 \u3000 | \u3000\u3000\u3000 /\u3000 ヽ\u3000\u3000 ﾉ \u3000 \u3000 │\u3000\u3000|\u3000|\u3000| \u3000／￣|\u3000\u3000 |\u3000|\u3000| \u3000| (￣ヽ＿_ヽ_)__) \u3000＼二つ'}
{'row_index': 237, 'text': "funni

In [4]:
import csv, re
from pathlib import Path

input_csv = Path("weighted_score_above_08_translated_cleaned.csv")
output_csv = Path("weighted_score_above_08_translated_en_ko.csv")

# Only the scripts whose presence should cause a row to be removed
# (Chinese, Japanese, Cyrillic, Arabic, Devanagari, Thai). Hangul is not listed.
REMOVE_PATTERNS = [
    re.compile(char_class)
    for char_class in (
        r"[\u3040-\u30FF]",               # Japanese
        r"[\u3400-\u9FFF]",               # Chinese
        r"[\u0400-\u04FF]",               # Cyrillic
        r"[\u0600-\u06FF]",               # Arabic
        r"[\u0900-\u097F]",               # Devanagari
        r"[\u0E00-\u0E7F]",               # Thai
    )
]

def is_other_language(text: str) -> bool:
    """Tell whether ``text`` contains a character from a script slated for removal.

    Args:
        text: The string to scan.

    Returns:
        True if at least one regex in ``REMOVE_PATTERNS`` finds a match in
        ``text``, False otherwise.
    """
    return any(pattern.search(text) for pattern in REMOVE_PATTERNS)

total, kept, dropped = 0, 0, 0

# The input CSV was written with "utf-8-sig" and therefore starts with a BOM.
# Reading it as plain "utf-8" keeps "\ufeff" inside the first header name, and
# writing that header back with "utf-8-sig" would emit a second BOM. Decoding
# with "utf-8-sig" strips the BOM so the header round-trips unchanged.
with open(input_csv, "r", encoding="utf-8-sig", newline="") as f_in, \
     open(output_csv, "w", encoding="utf-8-sig", newline="") as f_out:

    reader = csv.DictReader(f_in)
    fieldnames = reader.fieldnames
    writer = csv.DictWriter(f_out, fieldnames=fieldnames)
    writer.writeheader()

    for row in reader:
        total += 1
        text = str(row.get("translated_en") or "")
        # Drop the row when it contains Chinese / Japanese / other listed scripts
        if is_other_language(text):
            dropped += 1
            continue
        kept += 1
        writer.writerow(row)

print(f"총 행 수: {total}")
print(f"삭제된 행 수 (중국어/일본어/기타 언어): {dropped}")
print(f"최종 행 수 (영어+한국어만 유지): {kept}")
print(f"저장 완료 → {output_csv}")


총 행 수: 411526
삭제된 행 수 (중국어/일본어/기타 언어): 884
최종 행 수 (영어+한국어만 유지): 410642
저장 완료 → weighted_score_above_08_translated_en_ko.csv


In [9]:
import csv, re
from pathlib import Path
import pandas as pd

input_csv = Path("weighted_score_above_08_translated_en_ko.csv")

# Only Korean + English are allowed; these patterns detect every other script.
# Maps a script label to the compiled regex for its character range; insertion
# order is the order in which labels are reported for a flagged row.
OTHER_PATTERNS = {
    script: re.compile(char_class)
    for script, char_class in (
        ("Japanese", r"[\u3040-\u30FF]"),
        ("Chinese", r"[\u3400-\u9FFF]"),
        ("Cyrillic", r"[\u0400-\u04FF]"),
        ("Arabic", r"[\u0600-\u06FF]"),
        ("Devanagari", r"[\u0900-\u097F]"),
        ("Thai", r"[\u0E00-\u0E7F]"),
    )
}

total, clean, flagged = 0, 0, 0
flagged_samples = []

# The file under test was written with "utf-8-sig"; decode it the same way so the
# BOM is stripped instead of being fused into the first column name.
with open(input_csv, "r", encoding="utf-8-sig", newline="") as f:
    reader = csv.DictReader(f)
    for idx, row in enumerate(reader):
        text = str(row.get("translated_en") or "")
        total += 1
        # Labels of every disallowed script found in this row's text
        bad_langs = [name for name, pat in OTHER_PATTERNS.items() if pat.search(text)]
        if bad_langs:
            flagged += 1
            # Keep at most 3000 flagged rows as samples
            if len(flagged_samples) < 3000:
                flagged_samples.append({
                    "row_index": idx,
                    "bad_langs": ",".join(bad_langs),
                    # first 200 chars, newlines escaped so each preview stays on one line
                    "preview_200": text[:200].replace("\n","\\n")
                })
        else:
            clean += 1

print(f"총 행 수: {total}")
print(f"영어/한국어만 포함된 행: {clean}")
print(f"기타 언어가 여전히 포함된 행: {flagged}")

print("\n[샘플 3000개 - 기타 언어 검출됨]")
df = pd.DataFrame(flagged_samples)
print(df)


총 행 수: 410642
영어/한국어만 포함된 행: 410642
기타 언어가 여전히 포함된 행: 0

[샘플 3000개 - 기타 언어 검출됨]
Empty DataFrame
Columns: []
Index: []


In [None]:
# === Hard-clean pipeline: remove [hN] tokens, then DROP ROWS with specials>=40 in ANY of target columns ===
import re, csv
from pathlib import Path
import pandas as pd
from tqdm.auto import tqdm

# ----------------- Settings -----------------
INPUT_CSV  = "weighted_score_above_08_translated_en_ko.csv"   # CSV to inspect / clean
OUTPUT_CSV = "weighted_score_above_08_hardclean.csv"          # result (row drops applied)
REPORT_DIR = "hardclean_report"                                # report folder
CHUNK_SIZE, SPECIALS_THRESHOLD = 50_000, 40                    # rows per chunk, drop threshold
# --------------------------------------------

# Make sure the report folder exists (parents included; no error if already there)
Path(REPORT_DIR).mkdir(parents=True, exist_ok=True)

# 1) Regex for heading-token removal. Besides [h1] it also covers variants such as
#    [ h2 ], [/h3], full-width brackets like ［h4］, and (h5); matching ignores case.
H_TOKEN = re.compile(
    r"[\[\(\uFF3B\u3010\uFF08]\s*/?\s*h\s*\d+\s*[\]\)\uFF3D\u3011\uFF09]",
    flags=re.I,
)

def strip_h_tokens(s: str) -> str:
    """Normalise ``s`` and delete every heading token matched by ``H_TOKEN``.

    Args:
        s: Input text; any falsy value (``""``, ``None``) yields ``""``.

    Returns:
        The text with ideographic spaces (U+3000) turned into regular spaces,
        zero-width spaces (U+200B) removed, and all ``H_TOKEN`` matches deleted.
    """
    if not s:
        return ""
    # Normalise first so the token regex runs on the cleaned-up text
    normalized = s.replace("\u3000", " ").replace("\u200b", "")
    return H_TOKEN.sub("", normalized)

def specials_count(s: str) -> int:
    """Count the characters of ``s`` that are neither alphanumeric nor whitespace.

    Args:
        s: Text to inspect; a falsy value (``""``, ``None``) counts as empty.

    Returns:
        Number of such "special" characters.
    """
    text = s or ""
    return len([ch for ch in text if not (ch.isalnum() or ch.isspace())])

# Detect which source-text columns exist in the input (only the header row is read)
hdr = list(pd.read_csv(INPUT_CSV, nrows=0).columns)
SOURCE_COLS = list(filter(hdr.__contains__, ("review", "translated_src", "original", "content", "body")))
TARGET_COLS = ["translated_en", *SOURCE_COLS]
print(f"[INFO] 검사 대상 컬럼: {TARGET_COLS}")

# Aggregate counters: "pre" = before rows are dropped, "post" = rows that are kept
pre_total = post_total = dropped_rows = 0
pre_ge40 = dict.fromkeys(TARGET_COLS, 0)
pre_h = dict.fromkeys(TARGET_COLS, 0)
post_ge40 = dict.fromkeys(TARGET_COLS, 0)
post_h = dict.fromkeys(TARGET_COLS, 0)

# Sample buffers (at most 1000 entries)
samples_violation, samples_h_tokens = [], []

# Prepare the output file with the same header as the input
if Path(OUTPUT_CSV).exists():
    Path(OUTPUT_CSV).unlink()
# INPUT_CSV was written with "utf-8-sig" and starts with a BOM. Decoding it with
# "utf-8-sig" strips that BOM; plain "utf-8" would keep "\ufeff" inside the first
# header cell and the writer below would then emit a second BOM in front of it.
with open(INPUT_CSV, "r", encoding="utf-8-sig", newline="") as fi, \
     open(OUTPUT_CSV, "w", encoding="utf-8-sig", newline="") as fo:
    rdr = csv.reader(fi)
    wtr = csv.writer(fo)
    header = next(rdr)   # only the header row is copied; data rows are appended later
    wtr.writerow(header)

# Single pass: collect stats, strip h-tokens, drop rows, and stream the result to disk
pbar = tqdm(desc="hard-cleaning", unit="rows")
for chunk in pd.read_csv(
    INPUT_CSV,
    chunksize=CHUNK_SIZE,
    dtype=str,
    encoding="utf-8",
    on_bad_lines="skip",    # NOTE: malformed lines are skipped silently and not counted
    keep_default_na=False,  # keep literal texts like "NA" / "null" / "None" instead of turning them into NaN
):
    pre_total += len(chunk)
    present_cols = [col for col in TARGET_COLS if col in chunk.columns]

    # Strip h-tokens (target columns only). The token statistic is taken on the
    # raw text BEFORE stripping; counting afterwards would always yield 0.
    for col in present_cols:
        raw = chunk[col].fillna("").astype(str)
        pre_h[col] += int(raw.str.contains(H_TOKEN).sum())
        chunk[col] = raw.map(strip_h_tokens)

    # "pre" specials stats (before row deletion, measured on the stripped text) and
    # the drop mask: a row is dropped when ANY target column reaches the threshold.
    # The per-column count is computed once and reused for both.
    drop_mask = pd.Series(False, index=chunk.index)
    for col in present_cols:
        sc = chunk[col].map(specials_count)
        over_threshold = sc >= SPECIALS_THRESHOLD
        pre_ge40[col] += int(over_threshold.sum())
        drop_mask |= over_threshold

    # Collect samples of dropped rows (capped at 1000 overall)
    viol_idx = chunk.index[drop_mask]
    if len(viol_idx) and len(samples_violation) < 1000:
        take = viol_idx[:max(0, 1000 - len(samples_violation))]
        for i in take:
            row = {"__row_index": int(i)}
            for col in present_cols:
                row[f"{col}__specials"] = specials_count(chunk.at[i, col])
                row[f"{col}__preview200"] = (chunk.at[i, col] or "")[:200].replace("\n","\\n")
            samples_violation.append(row)

    # Apply the deletion
    kept = chunk.loc[~drop_mask].copy()
    dropped_rows += int(drop_mask.sum())
    post_total += len(kept)

    # "post" stats on the remaining rows (sanity check of the cleaning above)
    for col in TARGET_COLS:
        if col in kept.columns:
            sc2 = kept[col].map(specials_count)
            post_ge40[col] += int((sc2 >= SPECIALS_THRESHOLD).sum())
            post_h[col]    += int(kept[col].str.contains(H_TOKEN).sum())

    # Append the kept rows to the output file (its header is already written)
    kept.to_csv(OUTPUT_CSV, mode="a", index=False, header=False, encoding="utf-8-sig")

    pbar.update(len(chunk))
pbar.close()

# Build the report: a one-record JSON summary plus a CSV of sampled dropped rows
summary_path = f"{REPORT_DIR}/hardclean_summary.json"
samples_path = f"{REPORT_DIR}/violations_samples.csv"

summary = {
    # row counts
    "rows_before": pre_total,
    "rows_dropped(specials>=40 in ANY target col)": dropped_rows,
    "rows_after": post_total,
    # per-column statistics
    "per_column_pre_specials_ge40": pre_ge40,
    "per_column_post_specials_ge40": post_ge40,
    "per_column_pre_h_tokens": pre_h,
    "per_column_post_h_tokens": post_h,
    # run parameters
    "threshold": SPECIALS_THRESHOLD,
    "checked_columns": TARGET_COLS,
    "output_csv": OUTPUT_CSV,
}

summary_frame = pd.DataFrame([summary])
summary_frame.to_json(summary_path, orient="records", force_ascii=False)

violations_frame = pd.DataFrame(samples_violation)
violations_frame.to_csv(samples_path, index=False, encoding="utf-8-sig")

print("\n=== SUMMARY ===", summary, sep="\n")
print(f"\n샘플 저장: {samples_path}")
print(f"결과 저장: {OUTPUT_CSV}")


[INFO] 검사 대상 컬럼: ['translated_en', 'review']


KO→EN fixing:   0%|          | 0/410655 [45:04<?, ?it/s]
hard-cleaning: 410642rows [00:51, 7956.67rows/s]


=== SUMMARY ===
{'rows_before': 410642, 'rows_dropped(specials>=40 in ANY target col)': 60323, 'rows_after': 350319, 'per_column_pre_specials_ge40': {'translated_en': 0, 'review': 60323}, 'per_column_post_specials_ge40': {'translated_en': 0, 'review': 0}, 'per_column_pre_h_tokens': {'translated_en': 0, 'review': 0}, 'per_column_post_h_tokens': {'translated_en': 0, 'review': 0}, 'threshold': 40, 'checked_columns': ['translated_en', 'review'], 'output_csv': 'weighted_score_above_08_hardclean.csv'}

샘플 저장: hardclean_report/violations_samples.csv
결과 저장: weighted_score_above_08_hardclean.csv





: 