In [1]:
!pip install lightgbm



## 2차

In [11]:
import pandas as pd, numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight
from lightgbm import LGBMClassifier, early_stopping, log_evaluation
from tqdm.auto import tqdm

# 1) Load the data -----------------------------------------------
# The three pre-split CSV files are all read with the 'utf-8-sig' codec.
csv_options = {'encoding': 'utf-8-sig'}
train = pd.read_csv('dataset/train.csv', **csv_options)
val = pd.read_csv('dataset/val.csv', **csv_options)
test = pd.read_csv('dataset/test.csv', **csv_options)

# 2) Encode the consultation outcome (target) ---------------------
# The encoder is fitted once on the string-cast labels of train, val and
# test combined, so all three splits share the same label -> id mapping.
label_columns = [frame['result_label'] for frame in (train, val, test)]
all_targets = pd.concat(label_columns).astype(str)
le_result = LabelEncoder()
le_result.fit(all_targets)

# Add the integer label column to each DataFrame.
for frame in (train, val, test):
    frame['label_id'] = le_result.transform(frame['result_label'].astype(str))

# 3) Encode the remaining categorical features --------------------
categorical_cols = ['sent_label', 'mid_category', 'content_category', 'rec_place']
le_cat = {}
for col in tqdm(categorical_cols, desc='Fitting encoders'):
    # Fit on the string-cast values of train, val and test combined.
    combined_values = pd.concat([part[col] for part in (train, val, test)]).astype(str)
    le_cat[col] = LabelEncoder().fit(combined_values)
    # Store the integer codes in a new '<col>_id' column of every split.
    id_col = f'{col}_id'
    for part in (train, val, test):
        part[id_col] = le_cat[col].transform(part[col].astype(str))

# 4) Assemble the feature columns ---------------------------------
# Columns that are not fed to the model (strings, lists, ids, the target),
# plus the raw categorical columns that were replaced by '<col>_id' codes.
drop_cols = [
    'session_id', 'result_label', 'label_id', 'asr_segments', 'top_nouns',
    *categorical_cols,
]
feature_cols = []
for column in train.columns:
    if column not in drop_cols:
        feature_cols.append(column)

# 5) Build the train / validation / test sets ----------------------
X_train, X_val, X_test = (part[feature_cols] for part in (train, val, test))
y_train, y_val, y_test = (part['label_id'] for part in (train, val, test))

# 6) Compute class weights
# 'balanced' weights are derived from the training labels only.
classes = np.unique(y_train)
weights = compute_class_weight('balanced', classes=classes, y=y_train)
class_weight = {cls: weight for cls, weight in zip(classes, weights)}

# 7) Train LightGBM (class weights applied)
lgbm_params = dict(
    objective='multiclass',
    num_class=len(le_result.classes_),
    n_estimators=200,
    learning_rate=0.05,
    class_weight=class_weight,    # <- class weights are applied here
    random_state=42,
)
model = LGBMClassifier(**lgbm_params)

# Early stopping uses stopping_rounds=10; the metric is logged every 10 iterations.
fit_callbacks = [early_stopping(stopping_rounds=10), log_evaluation(period=10)]
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_metric='multi_logloss',
    callbacks=fit_callbacks,
)

# 8) Predict on the test set and evaluate -------------------------
print("\n▶ 테스트 세트 예측 중...")
y_pred_test = []
# Chunk the row positions rather than the DataFrame itself: np.array_split
# applied to a DataFrame emits a pandas FutureWarning (it goes through the
# deprecated DataFrame.swapaxes).
row_chunks = np.array_split(np.arange(len(X_test)), 10)
for rows in tqdm(row_chunks, desc='LGBM Predict (test)'):
    if rows.size == 0:
        # Fewer than 10 test rows: this chunk has nothing to predict.
        continue
    proba = model.predict_proba(X_test.iloc[rows])
    # predict_proba columns follow model.classes_, so map the argmax through
    # it instead of assuming that the column index equals the label id.
    y_pred_test.extend(model.classes_[np.argmax(proba, axis=1)])

print("\n=== LightGBM Test ===")
print("Accuracy:", accuracy_score(y_test, y_pred_test))
# Pin `labels` to the encoder's ids so target_names always lines up with the
# report rows, and set zero_division explicitly so classes that are never
# predicted report 0.0 without an UndefinedMetricWarning.
print(classification_report(
    y_test,
    y_pred_test,
    labels=np.arange(len(le_result.classes_)),
    target_names=le_result.classes_,
    zero_division=0,
))

Fitting encoders:   0%|          | 0/4 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000758 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3285
[LightGBM] [Info] Number of data points in the train set: 2206, number of used features: 20
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.386294
Training until validation scores don't improve for 10 rounds
[10]	training's multi_logloss: 0.874292	valid_1's multi_logloss: 1.02676
[20]	training's multi_logloss: 0.598673	valid_1's multi_logloss: 0.867784
[30]	training's multi_logloss: 0.429721	valid_1's multi_logloss: 0.781753
[40]	training's multi_logloss: 0.321088	valid_1's multi_logloss: 0.726497
[50]	training's multi_logloss: 0.246436	valid_1's multi_logloss: 0.680916
[60]	training's multi_logloss: 0.192174	valid_1's multi_logloss: 0

  return bound(*args, **kwds)


LGBM Predict (test):   0%|          | 0/10 [00:00<?, ?it/s]


=== LightGBM Test ===
Accuracy: 0.7932489451476793
              precision    recall  f1-score   support

          만족       0.89      0.89      0.89       393
          미흡       0.22      0.11      0.14        19
    추가 상담 필요       0.34      0.45      0.39        53
       해결 불가       0.00      0.00      0.00         9

    accuracy                           0.79       474
   macro avg       0.36      0.36      0.36       474
weighted avg       0.79      0.79      0.79       474



## 3차 텍스트 임베딩 추가

In [12]:
!pip install transformers torch

Collecting sympy==1.13.1 (from torch)
  Downloading sympy-1.13.1-py3-none-any.whl.metadata (12 kB)
Downloading sympy-1.13.1-py3-none-any.whl (6.2 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.2/6.2 MB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m MB/s[0m eta [36m0:00:01[0m
Installing collected packages: sympy
  Attempting uninstall: sympy
    Found existing installation: sympy 1.13.3
    Uninstalling sympy-1.13.3:
      Successfully uninstalled sympy-1.13.3
Successfully installed sympy-1.13.1


In [13]:
# -*- coding: utf-8 -*-
import os
import glob
import json
import re
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# 형태소 분석기
from konlpy.tag import Okt

# TF–IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# 인코딩 / 분할
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# LightGBM
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

# 평가
from sklearn.metrics import classification_report, accuracy_score

# -----------------------------------------------------------------------------
# 1) Walk the JSON files -> DataFrame of consultation text + topic metadata
# -----------------------------------------------------------------------------
FINAL_DIR = 'json_merge/integration_data_v3'  # change to the actual folder name
rows = []
for json_path in tqdm(glob.glob(os.path.join(FINAL_DIR, '*.json')), desc='1. JSON → DataFrame'):
    with open(json_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    # Fall back to the file name (without extension) when the JSON carries no
    # truthy 'session_id'.
    file_stem = os.path.splitext(os.path.basename(json_path))[0]
    sid = str(payload.get('session_id') or file_stem)

    # Two layouts are handled: the fields are either nested under
    # 'classification' or stored at the top level of the document.
    holder = payload['classification'] if 'classification' in payload else payload

    rows.append({
        'session_id': sid,
        'consulting_content': holder.get('consulting_content', ''),
        'consulting_category': holder.get('consulting_category', ''),
    })
df_meta = pd.DataFrame(rows)


# -----------------------------------------------------------------------------
# 2) Load the existing text-feature CSV
# -----------------------------------------------------------------------------
# Both CSVs are read with the same options: session_id is forced to str and
# the files are decoded with the 'utf-8-sig' codec.
csv_opts = {'dtype': {'session_id': str}, 'encoding': 'utf-8-sig'}
# change the path to the actual file name
df_feats = pd.read_csv('column_extraction/text_features_all_v3.csv', **csv_opts)


# -----------------------------------------------------------------------------
# 3) Load the label CSV (session_id, result_label)
# -----------------------------------------------------------------------------
# change the path to the actual file name
df_labels = pd.read_csv('column_extraction/preprocessing/session_labels.csv', **csv_opts)


# -----------------------------------------------------------------------------
# 4) Merge the three DataFrames on session_id
# -----------------------------------------------------------------------------
# The JSON metadata is left-joined onto the features (feature rows without a
# JSON match are kept); the labels are inner-joined (rows without a label are
# dropped).
df = (
    df_feats
      .merge(df_meta,   on='session_id', how='left')
      .merge(df_labels, on='session_id', how='inner')
)
# The left join leaves consulting_content as NaN for sessions that have no
# JSON match (and a JSON null arrives as None). The TF-IDF step needs str
# documents, so missing text is treated as an empty document.
df['consulting_content'] = df['consulting_content'].fillna('')


# -----------------------------------------------------------------------------
# 5) Split into train / validation / test (stratified)
# -----------------------------------------------------------------------------
# 20% is held out as the test set; 25% of the remaining 80% becomes the
# validation set. Both splits are stratified on result_label and seeded.
split_opts = {'random_state': 42}
train_val, test = train_test_split(
    df, test_size=0.2, stratify=df['result_label'], **split_opts
)
train, val = train_test_split(
    train_val, test_size=0.25, stratify=train_val['result_label'], **split_opts
)


# -----------------------------------------------------------------------------
# 6) Encode result_label -> label_id
# -----------------------------------------------------------------------------
# The encoder is fitted on the training labels and applied to every split.
le = LabelEncoder()
le.fit(train['result_label'])
for split_frame in (train, val, test):
    split_frame['label_id'] = le.transform(split_frame['result_label'])


# -----------------------------------------------------------------------------
# 7) TF–IDF vectorisation (morphological nouns only) with tqdm
# -----------------------------------------------------------------------------
okt = Okt()
def noun_tokenizer(text):
    """Return the nouns that the Okt analyser extracts from `text`."""
    return okt.nouns(text)

tfidf = TfidfVectorizer(
    tokenizer=noun_tokenizer,
    # With a custom tokenizer the default token_pattern is never used; passing
    # None states that explicitly and avoids scikit-learn's
    # "token_pattern will not be used" UserWarning.
    token_pattern=None,
    max_features=5000,
    ngram_range=(1,2),
    min_df=5
)

# The vectoriser is fitted on the training texts only; validation and test
# texts are transformed with the already-fitted vectoriser.
train_docs = tqdm(train['consulting_content'], desc='2. TFIDF fit')
X_tfidf_train = tfidf.fit_transform(train_docs)

val_docs = tqdm(val['consulting_content'], desc='3. TFIDF val')
X_tfidf_val = tfidf.transform(val_docs)

test_docs = tqdm(test['consulting_content'], desc='4. TFIDF test')
X_tfidf_test = tfidf.transform(test_docs)


# -----------------------------------------------------------------------------
# 8) Prepare the meta features (categorical -> integer id)
# -----------------------------------------------------------------------------
# 8-1) Encode the consultation topic as an integer id.
# The encoder is fitted on the string-cast column of the full DataFrame.
le_cat = LabelEncoder()
le_cat.fit(df['consulting_category'].astype(str))
for part in (train, val, test):
    part['consulting_category_id'] = le_cat.transform(part['consulting_category'].astype(str))

# 8-2) Keep only the meta columns that actually exist.
star_levels = range(1, 6)
desired_meta_cols = ['speech_count']
# Emotion scores: first the un-prefixed columns (skipped below when they do
# not exist), then the customer ('고객_') and the agent ('상담사_') columns.
for prefix in ('', '고객_', '상담사_'):
    desired_meta_cols += [f'{prefix}emo_{i}_star_score' for i in star_levels]
    desired_meta_cols.append(f'{prefix}sent_score')
# Category id
desired_meta_cols.append('consulting_category_id')

meta_cols = [c for c in desired_meta_cols if c in train.columns]
print("→ 사용할 메타 피처:", meta_cols)

X_meta_train, X_meta_val, X_meta_test = (
    part[meta_cols].values for part in (train, val, test)
)


# -----------------------------------------------------------------------------
# 9) Final features & labels (TF-IDF + meta)
# -----------------------------------------------------------------------------
# Each TF-IDF matrix is converted to a dense array and the meta columns are
# appended to its right.
X_train = np.hstack([X_tfidf_train.toarray(), X_meta_train])
y_train = train['label_id']

X_val = np.hstack([X_tfidf_val.toarray(), X_meta_val])
y_val = val['label_id']

X_test = np.hstack([X_tfidf_test.toarray(), X_meta_test])
y_test = test['label_id']


# -----------------------------------------------------------------------------
# 10) Train the LightGBM baseline (early stopping & logging)
# -----------------------------------------------------------------------------
lgbm_params = {
    'objective': 'multiclass',
    'num_class': len(le.classes_),
    'n_estimators': 200,
    'learning_rate': 0.05,
    'random_state': 42,
}
model = LGBMClassifier(**lgbm_params)

print("▶ 10. LightGBM 학습 중…")
# Early stopping uses stopping_rounds=10; the metric is logged every 20 iterations.
training_callbacks = [
    early_stopping(stopping_rounds=10),
    log_evaluation(period=20),
]
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_metric='multi_logloss',
    callbacks=training_callbacks,
)


# -----------------------------------------------------------------------------
# 11) Evaluate on the validation and test sets
# -----------------------------------------------------------------------------
# Pin `labels` to the encoder's ids so target_names always lines up with the
# report rows, and set zero_division explicitly so classes that are never
# predicted report 0.0 without an UndefinedMetricWarning.
report_kwargs = dict(
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0,
)

print("\n--- 검증 세트 평가 ---")
pred_val = model.predict(X_val)
print(f"Accuracy: {accuracy_score(y_val, pred_val):.4f}")
print(classification_report(y_val, pred_val, **report_kwargs))

print("\n--- 테스트 세트 평가 ---")
pred_test = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, pred_test):.4f}")
print(classification_report(y_test, pred_test, **report_kwargs))

1. JSON → DataFrame:   0%|          | 0/3533 [00:00<?, ?it/s]

2. TFIDF fit:   0%|          | 0/2119 [00:00<?, ?it/s]



3. TFIDF val:   0%|          | 0/707 [00:00<?, ?it/s]

4. TFIDF test:   0%|          | 0/707 [00:00<?, ?it/s]

→ 사용할 메타 피처: ['speech_count', '고객_emo_1_star_score', '고객_emo_2_star_score', '고객_emo_3_star_score', '고객_emo_4_star_score', '고객_emo_5_star_score', '고객_sent_score', '상담사_emo_1_star_score', '상담사_emo_2_star_score', '상담사_emo_3_star_score', '상담사_emo_4_star_score', '상담사_emo_5_star_score', '상담사_sent_score', 'consulting_category_id']
▶ 10. LightGBM 학습 중…
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032928 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 143226
[LightGBM] [Info] Number of data points in the train set: 2119, number of used features: 4136
[LightGBM] [Info] Start training from score -0.189045
[LightGBM] [Info] Start training from score -3.147840
[LightGBM] [Info] Start training from score -2.203378
[LightGBM] [Info] Start training from score -3.969820
Training until validation scores don't improve for 10 rounds
[20]	training's multi_logloss: 0.201046	valid_1's multi_logloss: 0.477044
[40]	train

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# V2 학습 코드

In [1]:
# -*- coding: utf-8 -*-
import os, glob, json
import re
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

# 1) 형태소 명사 추출용
from konlpy.tag import Okt

# 2) TF–IDF & 모델링
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier
import lightgbm as lgb
from sklearn.metrics import classification_report, accuracy_score

# -----------------------------------------------------------------------------
# 1) Walk the JSON files -> DataFrame of consultation text & topic metadata
# -----------------------------------------------------------------------------
FINAL_DIR = 'combination_data'  # change to the actual folder name
rows = []
json_files = glob.glob(os.path.join(FINAL_DIR, '*.json'))

for json_path in tqdm(json_files, desc='1. JSON → DataFrame'):
    with open(json_path, 'r', encoding='utf-8') as handle:
        payload = json.load(handle)

    # Fall back to the file name (without extension) when the JSON carries no
    # truthy 'session_id'.
    file_stem = os.path.splitext(os.path.basename(json_path))[0]
    sid = str(payload.get('session_id') or file_stem)

    # Two layouts are handled: the fields are either nested under
    # 'classification' or stored at the top level of the document.
    holder = payload['classification'] if 'classification' in payload else payload

    rows.append({
        'session_id': sid,
        'consulting_content': holder.get('consulting_content', ''),
        'consulting_category': holder.get('consulting_category', ''),
    })

df_meta = pd.DataFrame(rows)


# -----------------------------------------------------------------------------
# 2) Load the existing text-feature CSV
# -----------------------------------------------------------------------------
# Both CSVs are read with the same options: session_id is forced to str and
# the files are decoded with the 'utf-8-sig' codec.
csv_opts = {'dtype': {'session_id': str}, 'encoding': 'utf-8-sig'}
# change the path to the actual file name
df_feats = pd.read_csv('output/text_features_all_v3.csv', **csv_opts)


# -----------------------------------------------------------------------------
# 3) Load the label CSV (session_id, result_label)
# -----------------------------------------------------------------------------
# change the path to the actual file name
df_labels = pd.read_csv('output/session_labels_v3.csv', **csv_opts)


# -----------------------------------------------------------------------------
# 4) Merge the three DataFrames on session_id
# -----------------------------------------------------------------------------
# The JSON metadata is left-joined onto the features (feature rows without a
# JSON match are kept); the labels are inner-joined (rows without a label are
# dropped).
df = (
    df_feats
      .merge(df_meta,   on='session_id', how='left')
      .merge(df_labels, on='session_id', how='inner')
)
# The left join leaves consulting_content as NaN for sessions that have no
# JSON match (and a JSON null arrives as None). The TF-IDF step needs str
# documents, so missing text is treated as an empty document.
df['consulting_content'] = df['consulting_content'].fillna('')


# -----------------------------------------------------------------------------
# 5) Split into train / validation / test (stratified)
# -----------------------------------------------------------------------------
# 20% is held out as the test set; 25% of the remaining 80% becomes the
# validation set. Both splits are stratified on result_label and seeded.
split_opts = {'random_state': 42}
train_val, test = train_test_split(
    df, test_size=0.2, stratify=df['result_label'], **split_opts
)
train, val = train_test_split(
    train_val, test_size=0.25, stratify=train_val['result_label'], **split_opts
)


# -----------------------------------------------------------------------------
# 6) Encode result_label -> label_id
# -----------------------------------------------------------------------------
# The encoder is fitted on the training labels and applied to every split.
le = LabelEncoder()
le.fit(train['result_label'])
for split_frame in (train, val, test):
    split_frame['label_id'] = le.transform(split_frame['result_label'])


# -----------------------------------------------------------------------------
# 7) TF–IDF vectorisation (morphological nouns only) with tqdm
# -----------------------------------------------------------------------------
okt = Okt()
def noun_tokenizer(text):
    """Return the nouns that the Okt analyser extracts from `text`."""
    return okt.nouns(text)

tfidf = TfidfVectorizer(
    tokenizer=noun_tokenizer,
    # With a custom tokenizer the default token_pattern is never used; passing
    # None states that explicitly and avoids scikit-learn's
    # "token_pattern will not be used" UserWarning.
    token_pattern=None,
    max_features=5000,
    ngram_range=(1,2),
    min_df=5
)

# The vectoriser is fitted on the training texts only; validation and test
# texts are transformed with the already-fitted vectoriser.
train_docs = tqdm(train['consulting_content'], desc='2. TFIDF fit')
X_tfidf_train = tfidf.fit_transform(train_docs)

val_docs = tqdm(val['consulting_content'], desc='3. TFIDF val')
X_tfidf_val = tfidf.transform(val_docs)

test_docs = tqdm(test['consulting_content'], desc='4. TFIDF test')
X_tfidf_test = tfidf.transform(test_docs)


# -----------------------------------------------------------------------------
# 8) Prepare the meta features (including categorical -> integer id)
# -----------------------------------------------------------------------------
# (8-1) Encode the category and place columns as integer ids.
# Each encoder is fitted on the string-cast column of the full DataFrame.
for col in tqdm(['consulting_category', 'rec_place'], desc='5. LabelEncode meta'):
    le_col = LabelEncoder()
    le_col.fit(df[col].astype(str))
    id_col = f'{col}_id'
    for part in (train, val, test):
        part[id_col] = le_col.transform(part[col].astype(str))

# (8-2) The emotion feature names carry a speaker prefix.
# -> check the printed train.columns and adjust the list below to match.
print("→ actual train.columns:\n", train.columns.tolist())

star_levels = range(1, 6)
desired_meta_cols = ['speech_count']
# Emotion scores of the customer ('고객_') and of the agent ('상담사_').
for prefix in ('고객_', '상담사_'):
    desired_meta_cols += [f'{prefix}emo_{i}_star_score' for i in star_levels]
    desired_meta_cols.append(f'{prefix}sent_score')
# Category / place ids
desired_meta_cols += ['consulting_category_id', 'rec_place_id']

# Keep only the columns that are actually present in train.
meta_cols = [c for c in desired_meta_cols if c in train.columns]
print("→ 사용할 메타 피처:", meta_cols)

# Convert to numpy arrays.
X_meta_train, X_meta_val, X_meta_test = (
    part[meta_cols].values for part in (train, val, test)
)


# -----------------------------------------------------------------------------
# 9) Final features & labels (TF-IDF + meta)
# -----------------------------------------------------------------------------
# Each TF-IDF matrix is converted to a dense array and the meta columns are
# appended to its right.
X_train = np.hstack([X_tfidf_train.toarray(), X_meta_train])
y_train = train['label_id']

X_val = np.hstack([X_tfidf_val.toarray(), X_meta_val])
y_val = val['label_id']

X_test = np.hstack([X_tfidf_test.toarray(), X_meta_test])
y_test = test['label_id']


# -----------------------------------------------------------------------------
# 10) Train the LightGBM baseline (early stopping & logging)
# -----------------------------------------------------------------------------
lgbm_params = {
    'objective': 'multiclass',
    'num_class': len(le.classes_),
    'n_estimators': 200,
    'learning_rate': 0.05,
    'random_state': 42,
}
model = LGBMClassifier(**lgbm_params)

print("▶ 6. LightGBM 학습 중…")
# Early stopping uses stopping_rounds=10; the metric is logged every 20 iterations.
training_callbacks = [
    lgb.early_stopping(stopping_rounds=10),
    lgb.log_evaluation(period=20),
]
model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_val, y_val)],
    eval_metric='multi_logloss',
    callbacks=training_callbacks,
)


# -----------------------------------------------------------------------------
# 11) Evaluate on the validation and test sets
# -----------------------------------------------------------------------------
# Pin `labels` to the encoder's ids so target_names always lines up with the
# report rows, and set zero_division explicitly so classes that are never
# predicted report 0.0 without an UndefinedMetricWarning.
report_kwargs = dict(
    labels=np.arange(len(le.classes_)),
    target_names=le.classes_,
    zero_division=0,
)

print("\n--- 검증 세트 평가 ---")
pred_val = model.predict(X_val)
print(f"Accuracy: {accuracy_score(y_val, pred_val):.4f}")
print(classification_report(y_val, pred_val, **report_kwargs))

print("\n--- 테스트 세트 평가 ---")
pred_test = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, pred_test):.4f}")
print(classification_report(y_test, pred_test, **report_kwargs))

1. JSON → DataFrame:   0%|          | 0/3533 [00:00<?, ?it/s]

2. TFIDF fit:   0%|          | 0/2119 [00:00<?, ?it/s]



3. TFIDF val:   0%|          | 0/707 [00:00<?, ?it/s]

4. TFIDF test:   0%|          | 0/707 [00:00<?, ?it/s]

5. LabelEncode meta:   0%|          | 0/2 [00:00<?, ?it/s]

→ actual train.columns:
 ['session_id', 'speech_count', 'asr_segments', 'top_nouns', '고객_emo_1_star_score', '고객_emo_2_star_score', '고객_emo_3_star_score', '고객_emo_4_star_score', '고객_emo_5_star_score', '고객_sent_score', '고객_sent_label', '상담사_emo_1_star_score', '상담사_emo_2_star_score', '상담사_emo_3_star_score', '상담사_emo_4_star_score', '상담사_emo_5_star_score', '상담사_sent_score', '상담사_sent_label', 'mid_category', 'content_category', 'rec_place', 'script_phrase_ratio', 'honorific_ratio', 'positive_word_ratio', 'euphonious_word_ratio', 'confirmation_ratio', 'empathy_ratio', 'apology_ratio', 'request_ratio', 'alternative_suggestion_count', 'conflict_flag', 'manual_compliance_ratio', 'consulting_content', 'consulting_category', 'result_label', 'label_id', 'consulting_category_id', 'rec_place_id']
→ 사용할 메타 피처: ['speech_count', '고객_emo_1_star_score', '고객_emo_2_star_score', '고객_emo_3_star_score', '고객_emo_4_star_score', '고객_emo_5_star_score', '고객_sent_score', '상담사_emo_1_star_score', '상담사_emo_2_star_score

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
