In [1]:
import os
import re
import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, f1_score

# --- Run configuration -------------------------------------------------------
# Input CSV with the labelled reviews; loaded via read_csv_smar().
CSV_PATH = "d:/data/review/movie_reviews_train.csv"
# Name of the column that holds the review text.
TEXT_COL = "review"
# Name of the column that holds the sentiment label.
LABEL_COL = "label"
# Fraction of the rows held out for evaluation (passed to train_test_split).
TEST_SIZE = 0.2
# File the fitted pipeline is written to with joblib.dump.
MODEL_OUT = "d:/data/review/sentiment_model.joblib"
# Character n-gram range passed to TfidfVectorizer(ngram_range=...).
CHAR_NGRAM = (3, 5)
# Cap on the TF-IDF vocabulary size (TfidfVectorizer(max_features=...)).
MAX_FEATURES = 200_000

def read_csv_smar(path: str) -> pd.DataFrame:
    """Load a CSV file, retrying with the cp949 codec on a decode failure.

    The file is first read with pandas' default encoding. If that raises
    ``UnicodeDecodeError`` the read is repeated once with ``encoding="cp949"``.

    Args:
        path: Location of the CSV file.

    Returns:
        The parsed contents as a DataFrame.

    Raises:
        UnicodeDecodeError: If the cp949 retry cannot decode the file either.
    """
    try:
        frame = pd.read_csv(path)
    except UnicodeDecodeError:
        # Default decoding failed; fall back to the cp949 code page.
        frame = pd.read_csv(path, encoding="cp949")
    return frame

def normalize_text(s: str) -> str:
    """Return a cleaned-up copy of a review string.

    Steps, in order:
      1. Trim leading and trailing whitespace.
      2. Shorten any run of three or more identical characters to two.
      3. Collapse every whitespace run into a single space.

    Args:
        s: The raw text. Any non-``str`` value is accepted.

    Returns:
        The normalized text, or ``""`` when ``s`` is not a string.
    """
    if isinstance(s, str):
        # `.` does not match a newline, so repeated newlines are left to the
        # whitespace-collapsing pass below.
        squeezed = re.sub(r"(.)\1{2,}", r"\1\1", s.strip())
        return re.sub(r"\s+", " ", squeezed)
    return ""

# Load the labelled reviews from CSV_PATH.
df = read_csv_smar(CSV_PATH)
# Validate the schema with an explicit raise rather than `assert`: assertions
# are removed when Python runs with -O, which would let a missing column
# surface later as an unrelated KeyError.
if TEXT_COL not in df.columns or LABEL_COL not in df.columns:
    raise ValueError(f"컬럼을 찾을 수 없습니다: {df.columns.tolist()}")

# Features: missing reviews become "", everything is coerced to str, and each
# value is passed through normalize_text().
X = df[TEXT_COL].fillna("")
X = X.astype(str).map(normalize_text)

# Labels: compare case-insensitively and without surrounding whitespace, map
# the textual labels "neg"/"pos" to "0"/"1", then convert to integers.
y = df[LABEL_COL].astype(str).str.lower().str.strip()
y = y.replace({"neg": "0", "pos": "1"})
# NOTE(review): clip() folds any integer outside 0..1 into 0 or 1 without a
# warning -- confirm that is intended for this dataset's label values.
y = y.astype(int).clip(lower=0, upper=1)

# Report how many rows were loaded.
print(f"총 샘플 수: {len(df)}")

# Hold out TEST_SIZE of the rows for evaluation. The split is stratified on
# the labels and uses a fixed seed so repeated runs give the same partition.
X_tr, X_te, y_tr, y_te = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=42,
)

# Two-step model: character n-gram TF-IDF features feeding a logistic
# regression classifier.
pipe = Pipeline(steps=[
    (
        "tfidf",
        TfidfVectorizer(
            analyzer="char",  # n-grams are built from characters, not words
            ngram_range=CHAR_NGRAM,
            min_df=3,
            max_features=MAX_FEATURES,
            lowercase=False,  # the text is not lower-cased by the vectorizer
        ),
    ),
    (
        "clf",
        LogisticRegression(C=2.0, class_weight="balanced", max_iter=1000),
    ),
])

# Fit on the training split and predict the held-out split in one chain
# (Pipeline.fit returns the pipeline itself).
y_hat = pipe.fit(X_tr, y_tr).predict(X_te)

# Summary metrics on the held-out split.
acc = accuracy_score(y_true=y_te, y_pred=y_hat)
f1w = f1_score(y_true=y_te, y_pred=y_hat, average="weighted")

# NOTE(review): the recorded run of this cell prints 1.0000 for every metric;
# it is worth confirming the train and test splits do not share duplicated
# reviews before trusting these numbers.
print("\n=== Evaluation ===")
print(f"Accuracy       : {acc:.4f}")
print(f"F1 (weighted)  : {f1w:.4f}")
print(classification_report(y_true=y_te, y_pred=y_hat, digits=4))

# Persist the fitted pipeline. The parent directory is created first when
# MODEL_OUT has a directory component; a bare file name needs no directory.
out_dir = os.path.split(MODEL_OUT)[0]
if out_dir != "":
    os.makedirs(out_dir, exist_ok=True)
joblib.dump(pipe, MODEL_OUT)
print(f"\n모델 저장: {MODEL_OUT}")

# Smoke test: score two hand-written reviews with the fitted pipeline.
sample_texts = [
    "연기가 좋아서 몰입감 최고였습니다. 또 보고 싶어요!",
    "지루하고 시간 아깝네요. 다시 보고 싶지 않아요.",
]
# The training text was passed through normalize_text() before fitting, so the
# same preprocessing is applied here; otherwise inference inputs would not
# match what the vectorizer was fitted on.
probas = pipe.predict_proba([normalize_text(text) for text in sample_texts])
for t, p in zip(sample_texts, probas):
    # Labels were reduced to the integers 0/1, so column 1 of predict_proba is
    # the probability of the positive class.
    pos = float(p[1])
    label = "POS" if pos >= 0.5 else "NEG"
    # Print the original (un-normalized) text alongside its prediction.
    print({"text": t, "label": label, "score": round(pos, 4)})

총 샘플 수: 1000

=== Evaluation ===
Accuracy       : 1.0000
F1 (weighted)  : 1.0000
              precision    recall  f1-score   support

           0     1.0000    1.0000    1.0000       100
           1     1.0000    1.0000    1.0000       100

    accuracy                         1.0000       200
   macro avg     1.0000    1.0000    1.0000       200
weighted avg     1.0000    1.0000    1.0000       200


모델 저장: d:/data/review/sentiment_model.joblib
{'text': '연기가 좋아서 몰입감 최고였습니다. 또 보고 싶어요!', 'label': 'POS', 'score': 0.5513}
{'text': '지루하고 시간 아깝네요. 다시 보고 싶지 않아요.', 'label': 'NEG', 'score': 0.2797}
