In [2]:
# 🤗 HuggingFace Datasets, Pandas 불러오기
from datasets import load_dataset
import pandas as pd

# IMDB 데이터셋 로드
dataset = load_dataset("imdb")

# 학습/테스트 셋을 Pandas로 변환
train_df = dataset["train"].to_pandas()
test_df = dataset["test"].to_pandas()

# 기본 구조 확인
print(train_df.head())
print(train_df["label"].value_counts())


                                                text  label
0  I rented I AM CURIOUS-YELLOW from my video sto...      0
1  "I Am Curious: Yellow" is a risible and preten...      0
2  If only to avoid making this type of film in t...      0
3  This film was probably inspired by Godard's Ma...      0
4  Oh, brother...after hearing about this ridicul...      0
label
0    12500
1    12500
Name: count, dtype: int64


In [3]:
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

nltk.download('stopwords')
from nltk.corpus import stopwords

# 정제 함수 정의
def clean_text(text):
    text = text.lower()  # 소문자화
    text = re.sub(r"<br />", " ", text)
    text = re.sub(r"[^a-zA-Z]", " ", text)  # 알파벳만 남기기
    text = re.sub(r"\s+", " ", text)  # 연속된 공백 제거
    return text.strip()

# 전처리 적용
train_df["clean_text"] = train_df["text"].apply(clean_text)

# 불용어 제거 + TF-IDF 벡터화
stop_words = stopwords.words('english')
vectorizer = TfidfVectorizer(max_features=5000, stop_words=stop_words)

X = vectorizer.fit_transform(train_df["clean_text"])
y = train_df["label"]

# 학습/검증 나누기
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"TF-IDF 벡터화 완료! 학습 샘플 수: {X_train.shape[0]}, 검증 샘플 수: {X_val.shape[0]}")


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/relaxman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


TF-IDF 벡터화 완료! 학습 샘플 수: 20000, 검증 샘플 수: 5000


In [5]:
import os
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://localhost:9000"
os.environ["AWS_ACCESS_KEY_ID"] = "minioadmin"
os.environ["AWS_SECRET_ACCESS_KEY"] = "minioadmin"

import mlflow
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

mlflow.set_tracking_uri("http://localhost:5050")
mlflow.set_experiment("imdb-logreg-classification")

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_val)
acc = accuracy_score(y_val, y_pred)

with mlflow.start_run():
    mlflow.log_param("model", "LogisticRegression")
    mlflow.log_param("max_iter", 1000)
    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(model, "model")

    print(f"✅ 모델 학습 완료! Accuracy: {acc:.4f}")




✅ 모델 학습 완료! Accuracy: 0.8844
🏃 View run classy-turtle-869 at: http://localhost:5050/#/experiments/2/runs/fdc088af57e8446a9592e32a647ba336
🧪 View experiment at: http://localhost:5050/#/experiments/2
