In [34]:
# 1. Import required libraries
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_predict, KFold # Added KFold
from sklearn.metrics import classification_report

In [36]:
# 2. Load the data
# Read the spreadsheet and keep only the rows where both the description
# text and the `nature` label string are present.
file_path = 'C:/flask_project/data/강원도_관광지_20_예시.xlsx'
df = pd.read_excel(file_path)
nature_columns = ['description', 'nature']
df_nature = df.loc[:, nature_columns].dropna(how='any')

In [38]:
# 3. Convert the `nature` column into a list of tags (comma-separated).
def _split_tags(value):
    """Return the cleaned list of tags held in one `nature` cell.

    Args:
        value: Either the raw comma-separated string from the spreadsheet,
            or a list/tuple/set of tags if this cell has already been run.

    Returns:
        list[str]: Tags with surrounding whitespace stripped and empty
        entries removed.
    """
    if isinstance(value, (list, tuple, set)):
        # Already parsed: re-running the cell must not stringify the list
        # (str(['a', 'b']).split(',') would yield labels like "['a'").
        parts = value
    else:
        parts = str(value).split(',')
    # Drop empty tags so a trailing or doubled comma does not create a
    # spurious '' class downstream.
    return [tag for tag in (str(part).strip() for part in parts) if tag]


df_nature['nature'] = df_nature['nature'].apply(_split_tags)

In [40]:
# 4. Define model inputs and outputs
# X: description text; y: binary indicator matrix with one column per tag
# (column order is available afterwards as mlb.classes_).
mlb = MultiLabelBinarizer()
X, y = df_nature['description'], mlb.fit_transform(df_nature['nature'])

In [42]:
# 5. TF-IDF vectorization + classifier setup
# NOTE(review): the vectorizer is fitted on every description before the
# cross-validation step, so the TF-IDF vocabulary/IDF weights are derived
# from rows that later serve as held-out folds. Consider a Pipeline if a
# leakage-free estimate is needed.
tfidf_max_features = 1000
logreg_max_iter = 1000

vectorizer = TfidfVectorizer(max_features=tfidf_max_features)
X_vec = vectorizer.fit_transform(X)

# One independent logistic-regression model per tag.
base_estimator = LogisticRegression(max_iter=logreg_max_iter)
clf = OneVsRestClassifier(base_estimator)

In [44]:
# 6. Predictions via K-Fold cross-validation
# Shuffled 5-fold split with a fixed seed so the fold assignment is
# reproducible between runs.
n_folds = 5
cv_seed = 42
kf = KFold(n_splits=n_folds, shuffle=True, random_state=cv_seed)
y_pred = cross_val_predict(estimator=clf, X=X_vec, y=y, cv=kf)



In [46]:
# 7. Print the performance report
# Several tags are never predicted, which makes their precision undefined.
# zero_division=0 reports those scores as 0.0 explicitly instead of emitting
# an undefined-metric warning for each affected average.
report = classification_report(
    y,
    y_pred,
    target_names=mlb.classes_,
    zero_division=0,
)
print(report)

              precision    recall  f1-score   support

          계곡       0.00      0.00      0.00         1
          바다       0.00      0.00      0.00         6
           산       0.50      0.56      0.53         9
           섬       0.00      0.00      0.00         1
           숲       0.00      0.00      0.00         1
          자연       0.00      0.00      0.00         6
          호수       0.00      0.00      0.00         3

   micro avg       0.50      0.19      0.27        27
   macro avg       0.07      0.08      0.08        27
weighted avg       0.17      0.19      0.18        27
 samples avg       0.29      0.18      0.22        27



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [48]:
# Fit the classifier on the complete TF-IDF matrix and label matrix
# (all samples, no hold-out); this fitted `clf` is what gets persisted.
clf.fit(X_vec, y)

In [50]:
import joblib

# Save the classifier
joblib.dump(clf, 'C:/flask_project/models/nature_model.pkl')

# Save the label encoder (maps output columns back to tag names)
joblib.dump(mlb, 'C:/flask_project/models/nature_encoder.pkl')

# Save the fitted TF-IDF vectorizer as well: `clf` was trained on the
# features it produced, so raw text can only be scored with this same
# vocabulary and IDF weighting.
# NOTE(review): confirm the consuming application loads this file.
joblib.dump(vectorizer, 'C:/flask_project/models/nature_vectorizer.pkl')

print("✅ nature 관련 모델, 인코더 저장 완료!")

✅ nature 관련 모델, 인코더 저장 완료!
