In [None]:
# -----------------------------------------
# 1. Library Import
# -----------------------------------------
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, classification_report
from scipy.sparse import hstack
import joblib
import matplotlib.pyplot as plt
import seaborn as sns

# Plot defaults, applied in one call:
#   - font.family: presumably picked so the Korean labels in this dataset
#     render in figures. NOTE(review): AppleGothic normally ships with macOS
#     only -- confirm a fallback is not needed on other platforms.
#   - axes.unicode_minus: draw negative tick labels with the ASCII hyphen
#     instead of the Unicode minus sign.
plt.rcParams.update({
    'font.family': 'AppleGothic',
    'axes.unicode_minus': False,
})


# -----------------------------------------
# 2. Load Data (path updated)
# -----------------------------------------
# Load the schedule dataset; the path is relative to the working directory.
df = pd.read_csv("data/schedule_data.csv")
# Preview of the first rows (a bare expression: a notebook only echoes it
# when it is the last statement of the cell).
df.head()


# -----------------------------------------
# 3. Feature / Label Split
# -----------------------------------------
# Inputs: the free-text column plus the two auxiliary feature columns.
X_text, X_num = df["text"], df[["days_left", "contains_keyword"]]

# Targets: one label column for each of the two classifiers.
y_category, y_priority = df["category"], df["priority"]


# -----------------------------------------
# 4. Train / Validation / Test Split
# -----------------------------------------
# Step 1: hold out 30% of the rows as a temporary pool, keeping 70% for
# training. All four aligned containers are split with the same shuffle, and
# the split is stratified on the category label.
(
    X_text_train, X_text_temp,
    X_num_train, X_num_temp,
    y_cat_train, y_cat_temp,
    y_pri_train, y_pri_temp,
) = train_test_split(
    X_text, X_num, y_category, y_priority,
    test_size=0.30,
    stratify=y_category,
    random_state=42,
)

# Step 2: cut the temporary pool in half, giving validation and test sets of
# roughly 15% each, again stratified on the category label.
(
    X_text_val, X_text_test,
    X_num_val, X_num_test,
    y_cat_val, y_cat_test,
    y_pri_val, y_pri_test,
) = train_test_split(
    X_text_temp, X_num_temp, y_cat_temp, y_pri_temp,
    test_size=0.50,
    stratify=y_cat_temp,
    random_state=42,
)


# -----------------------------------------
# 5. TF-IDF Vectorization + Scaling
# -----------------------------------------
# Text features: uni- and bi-gram TF-IDF capped at 1500 terms. The vectorizer
# is fitted on the training text only, so validation/test text never
# influences the vocabulary or IDF weights.
tfidf = TfidfVectorizer(max_features=1500, ngram_range=(1, 2)).fit(X_text_train)

X_train_tfidf, X_val_tfidf, X_test_tfidf = map(
    tfidf.transform,
    (X_text_train, X_text_val, X_text_test),
)

# Auxiliary features: standardised with statistics taken from the training
# split only, then applied unchanged to the other splits.
scaler = StandardScaler().fit(X_num_train)

X_train_num, X_val_num, X_test_num = map(
    scaler.transform,
    (X_num_train, X_num_val, X_num_test),
)

# Final design matrices: TF-IDF columns first, scaled auxiliary columns last.
X_train, X_val, X_test = (
    hstack(blocks)
    for blocks in (
        (X_train_tfidf, X_train_num),
        (X_val_tfidf, X_val_num),
        (X_test_tfidf, X_test_num),
    )
)


# -----------------------------------------
# 6. Model Training (Logistic Regression)
# -----------------------------------------
# Two independent classifiers are trained on the same feature matrix, one per
# target. `fit` returns the estimator, so construction and training are
# chained. class_weight="balanced" reweights classes inversely to their
# frequency in the training labels.
cat_model = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
).fit(X_train, y_cat_train)

pri_model = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",
).fit(X_train, y_pri_train)


# -----------------------------------------
# 7. Validation Performance
# -----------------------------------------
def evaluate(model, X, y_true, *, zero_division=0):
    """Print a per-class report for ``model`` on ``X`` and return its macro F1.

    Args:
        model: Fitted estimator exposing ``predict``.
        X: Feature matrix passed to ``model.predict``.
        y_true: Ground-truth labels aligned with the rows of ``X``.
        zero_division: Value reported for a metric whose denominator is zero
            (for example the precision of a class that is never predicted).
            The default of 0 produces the same numbers as scikit-learn's
            ``"warn"`` default but without emitting a warning per call.

    Returns:
        The macro-averaged F1 score of the predictions.
    """
    pred = model.predict(X)
    # With few samples some classes may never be predicted; stating
    # zero_division explicitly keeps the report free of undefined-metric
    # warnings while leaving the reported values unchanged.
    print(classification_report(y_true, pred, zero_division=zero_division))
    return f1_score(
        y_true, pred, average="macro", zero_division=zero_division
    )

# Validate both classifiers on the held-out validation split. `evaluate`
# returns the macro-F1; keep it in a named variable and print it so the
# headline metric is visible and reusable instead of being discarded.
print("=== Category Model Validation ===")
cat_val_f1 = evaluate(cat_model, X_val, y_cat_val)
print(f"Category macro-F1: {cat_val_f1:.4f}")

print("\n=== Priority Model Validation ===")
pri_val_f1 = evaluate(pri_model, X_val, y_pri_val)
print(f"Priority macro-F1: {pri_val_f1:.4f}")


# -----------------------------------------
# 8. Save Models & Preprocessors
# -----------------------------------------
# Persist the two fitted classifiers and the two fitted preprocessors as
# separate files in the current working directory, in a fixed order.
for _artifact, _filename in (
    (cat_model, "category_model.pkl"),
    (pri_model, "priority_model.pkl"),
    (tfidf, "tfidf_vectorizer.pkl"),
    (scaler, "feature_scaler.pkl"),
):
    joblib.dump(_artifact, _filename)

print("\nModel files saved successfully.")

=== Category Model Validation ===
              precision    recall  f1-score   support

          과제       0.00      0.00      0.00         2
          기타       0.50      1.00      0.67         2
          발표       1.00      1.00      1.00         1
          시험       1.00      1.00      1.00         2

    accuracy                           0.71         7
   macro avg       0.62      0.75      0.67         7
weighted avg       0.57      0.71      0.62         7


=== Priority Model Validation ===
              precision    recall  f1-score   support

        High       0.50      1.00      0.67         1
         Low       0.75      1.00      0.86         3
      Medium       1.00      0.33      0.50         3

    accuracy                           0.71         7
   macro avg       0.75      0.78      0.67         7
weighted avg       0.82      0.71      0.68         7


Model files saved successfully.


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
