## 텍스트 기반 특성 추출 - 샘플 버전

In [1]:
!pip install transformers


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## json_merge/integration_data 기반으로 데이터셋 제작

In [11]:
import os
import glob
import json
import re
from collections import Counter
from tqdm import tqdm
import pandas as pd
from konlpy.tag import Okt
from transformers import pipeline

# ——— 세팅 ———
okt = Okt()
sentiment = pipeline(
    'sentiment-analysis',
    model='nlptown/bert-base-multilingual-uncased-sentiment',
    top_k=None
)

# 키워드 사전 (예시 — 실제 프로젝트에 맞게 확장하세요)
script_phrases       = ["안녕하세요", "감사합니다", "반갑습니다"]
honorific_endings    = ["습니다", "세요", "니다"]
positive_words       = ["좋다", "만족", "행복", "감사"]
euphonious_words     = ["해주", "드리"]
confirmation_phrases = ["확인", "맞으신", "괜찮을까요"]
empathy_phrases      = ["이해합니다", "공감"]
apology_phrases      = ["죄송", "미안"]
request_phrases      = ["부탁", "요청"]
alternative_phrases  = ["하실 수 있습니다", "제안합니다", "추천드립니다"]
conflict_words       = ["아닙니다", "불가", "불편"]
prohibit_words       = ["욕설1", "욕설2"]

def split_sentences(text):
    return re.split(r'(?<=[\.!\?])\s+', text)

def extract_text_features(record):
    content = record['consulting_content']
    # 1) ASR 세그먼트 & 발화 수
    segments = []
    for line in content.split('\n'):
        m = re.match(r'^(상담사|고객):\s*(.+)', line)
        if m:
            segments.append({'speaker': m.group(1), 'text': m.group(2)})
    speech_count = len(segments)
    # 2) top_nouns
    all_nouns = []
    for seg in segments:
        all_nouns += okt.nouns(seg['text'])
    top_nouns = [w for w,_ in Counter(all_nouns).most_common(10)]
    # 3) 감정 분석 (문장별 평균)
    agg = Counter({i:0.0 for i in range(1,6)})
    n_sents = 0
    for sent in split_sentences(content):
        if not sent.strip(): continue
        scores = sentiment(sent)[0]
        for d in scores:
            star = int(d['label'][0])
            agg[star] += d['score']
        n_sents += 1
    emo = {f'emo_{i}_star_score': (agg[i]/n_sents if n_sents else 0.0) for i in range(1,6)}
    sent_score = sum(i * emo[f'emo_{i}_star_score'] for i in range(1,6))
    if   sent_score >= 3.5: sent_label = "긍정"
    elif sent_score >= 2.5: sent_label = "중립"
    else:                  sent_label = "부정"
    # 4) 분류: 상담 주제 / 상담 내용
    mid_category = None
    content_category = None
    for inst in record.get('instructions', []):
        items = inst.get('data', [inst])
        for d in items:
            if d.get('task_category') == '상담 주제':
                mid_category = d.get('output')
            elif d.get('task_category') == '상담 내용':
                content_category = d.get('output')
    # 5) 대화 장소
    pm = re.search(r'(\w+(센터|매장|지점))', content)
    rec_place = pm.group(1) if pm else None
    # 6) 비율/카운트 헬퍼
    def ratio(keys):
        tot = len(content.split())
        return sum(content.count(k) for k in keys) / tot if tot else 0
    def count(keys):
        return sum(content.count(k) for k in keys)
    # 7) 기타 지표 결합
    features = {
        'session_id':                   record['session_id'],
        'speech_count':                 speech_count,
        'asr_segments':                 segments,
        'top_nouns':                    ','.join(top_nouns),
        **emo,
        'sent_score':                   sent_score,
        'sent_label':                   sent_label,
        'mid_category':                 mid_category,
        'content_category':             content_category,
        'rec_place':                    rec_place,
        'script_phrase_ratio':          ratio(script_phrases),
        'honorific_ratio':              ratio(honorific_endings),
        'positive_word_ratio':          ratio(positive_words),
        'euphonious_word_ratio':        ratio(euphonious_words),
        'confirmation_ratio':           ratio(confirmation_phrases),
        'empathy_ratio':                ratio(empathy_phrases),
        'apology_ratio':                ratio(apology_phrases),
        'request_ratio':                ratio(request_phrases),
        'alternative_suggestion_count': count(alternative_phrases),
        'conflict_flag':                int(any(w in content for w in conflict_words)),
        'manual_compliance_ratio':      1 - (count(prohibit_words) / max(1, speech_count))
    }
    return features

# ——— output_final 내 모든 JSON 파일 순회 & 처리 ———
all_files = glob.glob('output_final_2/**/*.json', recursive=True)
rows = []
for fp in tqdm(all_files, desc='Processing all sessions'):
    with open(fp, 'r', encoding='utf-8') as f:
        rec = json.load(f)
    rows.append(extract_text_features(rec))

# ——— DataFrame 생성 및 CSV 저장 ———
df = pd.DataFrame(rows)
os.makedirs('output_columns_2', exist_ok=True)
df.to_csv('output_columns_2/text_features_all.csv', index=False, encoding='utf-8-sig')

print(f'✅ 모든 세션 처리 완료, CSV → output/text_features_all.csv')

Device set to use mps:0
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Processing all sessions: 100%|████████████████| 379/379 [04:29<00:00,  1.40it/s]

✅ 모든 세션 처리 완료, CSV → output/text_features_all.csv





In [14]:
# ——— DataFrame & CSV 저장 ———
df = pd.DataFrame(rows)

out_dir = 'output'
os.makedirs(out_dir, exist_ok=True)

csv_path = os.path.join(out_dir, 'text_features_all.csv')
df.to_csv(csv_path, index=False, encoding='utf-8-sig')

print(f'✅ 모든 세션 처리 완료, CSV → {csv_path}')

✅ 모든 세션 처리 완료, CSV → output/text_features_all.csv


#### 데이터셋 제작 새 코드 (샘플로 5개 시행)

In [13]:
# -*- coding: utf-8 -*-
import os, glob, json, re
from collections import Counter, defaultdict
from tqdm.auto import tqdm
import pandas as pd
from konlpy.tag import Okt
from transformers import pipeline

# ——— 설정 ———
okt = Okt()
sentiment = pipeline(
    'sentiment-analysis',
    model='nlptown/bert-base-multilingual-uncased-sentiment',
    top_k=None
)

# 키워드 사전 (예시)
script_phrases       = ["안녕하세요", "감사합니다", "반갑습니다"]
honorific_endings    = ["습니다", "세요", "니다"]
positive_words       = ["좋다", "만족", "행복", "감사"]
euphonious_words     = ["해주", "드리"]
confirmation_phrases = ["확인", "맞으신", "괜찮을까요"]
empathy_phrases      = ["이해합니다", "공감"]
apology_phrases      = ["죄송", "미안"]
request_phrases      = ["부탁", "요청"]
alternative_phrases  = ["하실 수 있습니다", "제안합니다", "추천드립니다"]
conflict_words       = ["아닙니다", "불가", "불편"]
prohibit_words       = ["욕설1", "욕설2"]

# ——— 문장 분리 유틸 ———
def split_sentences(text):
    return re.split(r'(?<=[\.!\?])\s+', text)

# ——— 화자별 감정분석 함수 ———
def calc_speaker_emotion(content, speaker_tag):
    lines = [
        line[len(speaker_tag)+1:].strip()
        for line in content.split('\n')
        if line.startswith(f'{speaker_tag}:')
    ]
    sents = []
    for ln in lines:
        for sent in split_sentences(ln):
            if sent.strip():
                sents.append(sent)
    agg = defaultdict(float)
    for sent in tqdm(sents, desc=f'{speaker_tag} 감정분석', leave=False):
        scores = sentiment(sent)[0]
        for d in scores:
            star = int(d['label'][0])
            agg[star] += d['score']
    n = len(sents) or 1
    star_scores = {f'{speaker_tag}_emo_{i}_star_score': agg[i]/n for i in range(1,6)}
    sent_score = sum(i * star_scores[f'{speaker_tag}_emo_{i}_star_score'] for i in range(1,6))
    if   sent_score >= 3.5: label = "긍정"
    elif sent_score >= 2.5: label = "중립"
    else:                  label = "부정"
    star_scores[f'{speaker_tag}_sent_score'] = sent_score
    star_scores[f'{speaker_tag}_sent_label'] = label
    return star_scores

# ——— 세션 특성추출 함수 ———
def extract_text_features(record):
    content = record['consulting_content']
    sid = record['session_id']

    # 1) ASR 세그먼트 & speech_count
    segments = []
    for line in content.split('\n'):
        m = re.match(r'^(상담사|고객):\s*(.+)', line)
        if m:
            segments.append({'speaker': m.group(1), 'text': m.group(2)})
    speech_count = len(segments)

    # 2) 상위 명사
    all_nouns = []
    for seg in segments:
        all_nouns += okt.nouns(seg['text'])
    top_nouns = [w for w,_ in Counter(all_nouns).most_common(10)]

    # 3) 세션 전체 감정 (문장별 평균)
    agg = Counter({i:0.0 for i in range(1,6)})
    n_sents = 0
    for sent in split_sentences(content):
        if not sent.strip(): continue
        scores = sentiment(sent)[0]
        for d in scores:
            star = int(d['label'][0])
            agg[star] += d['score']
        n_sents += 1
    emo = {f'emo_{i}_star_score': (agg[i]/n_sents if n_sents else 0.0) for i in range(1,6)}
    sent_score = sum(i * emo[f'emo_{i}_star_score'] for i in range(1,6))
    if   sent_score >= 3.5: sent_label = "긍정"
    elif sent_score >= 2.5: sent_label = "중립"
    else:                  sent_label = "부정"

    # 4) 지시문(instructions)에서 분류 메타
    mid_cat = None
    cont_cat = None
    for inst in record.get('instructions', []):
        for d in inst.get('data', [inst]):
            if d.get('task_category') == '상담 주제':
                mid_cat = d.get('output')
            elif d.get('task_category') == '상담 내용':
                cont_cat = d.get('output')

    # 5) 비율/카운트 헬퍼
    def ratio(keys):
        tot = len(content.split())
        return sum(content.count(k) for k in keys) / tot if tot else 0
    def count(keys):
        return sum(content.count(k) for k in keys)

    # ——— 기본 피처
    feats = {
        'session_id':                  sid,
        'speech_count':                speech_count,
        'top_nouns':                   ','.join(top_nouns),
        **emo,
        'sent_score':                  sent_score,
        'sent_label':                  sent_label,
        'mid_category':                mid_cat,
        'content_category':            cont_cat,
        'script_phrase_ratio':         ratio(script_phrases),
        'honorific_ratio':             ratio(honorific_endings),
        'positive_word_ratio':         ratio(positive_words),
        'euphonious_word_ratio':       ratio(euphonious_words),
        'confirmation_ratio':          ratio(confirmation_phrases),
        'empathy_ratio':               ratio(empathy_phrases),
        'apology_ratio':               ratio(apology_phrases),
        'request_ratio':               ratio(request_phrases),
        'alternative_suggestion_count':count(alternative_phrases),
        'conflict_flag':               int(any(w in content for w in conflict_words)),
        'manual_compliance_ratio':     1 - (count(prohibit_words)/max(1, speech_count))
    }

    # 6) 고객/상담사별 감정 추가
    feats.update(calc_speaker_emotion(content, '고객'))
    feats.update(calc_speaker_emotion(content, '상담사'))

    return feats

# ——— 실행 예시: 샘플 5개 추출 및 저장 ———
files = glob.glob('json_merge/integration_data_v3/final_merged_*.json')[:5]
rows = []
for fp in tqdm(files, desc='샘플 5개 처리'):
    rec = json.load(open(fp, 'r', encoding='utf-8'))
    rec['consulting_content'] = rec.get('consulting_content') or rec.get('classification',{}).get('consulting_content','')
    rec['instructions'] = rec.get('instructions', [])
    rows.append(extract_text_features(rec))

df = pd.DataFrame(rows)
os.makedirs('coloumns_extracion/sample', exist_ok=True)
df.to_csv('coloumns_extracion/sample/text_features_sample.csv', index=False, encoding='utf-8-sig')
print("✅ 샘플 특성추출 완료 → coloumns_extracion/sample/text_features_sample.csv")

Device set to use mps:0


샘플 5개 처리:   0%|          | 0/5 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/30 [00:00<?, ?it/s]

상담사 감정분석:   0%|          | 0/40 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/28 [00:00<?, ?it/s]

상담사 감정분석:   0%|          | 0/37 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/36 [00:00<?, ?it/s]

상담사 감정분석:   0%|          | 0/37 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/34 [00:00<?, ?it/s]

상담사 감정분석:   0%|          | 0/28 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/18 [00:00<?, ?it/s]

상담사 감정분석:   0%|          | 0/19 [00:00<?, ?it/s]

✅ 샘플 특성추출 완료 → coloumns_extracion/sample/text_features_sample.csv


In [17]:
# -*- coding: utf-8 -*-
import os, glob, json, re
from collections import Counter, defaultdict
from tqdm.auto import tqdm
import pandas as pd
from konlpy.tag import Okt
from transformers import pipeline

# ——— 설정 ———
okt = Okt()
sentiment = pipeline(
    'sentiment-analysis',
    model='nlptown/bert-base-multilingual-uncased-sentiment',
    top_k=None
)

# 키워드 사전 (예시)
script_phrases       = ["안녕하세요", "감사합니다", "반갑습니다"]
honorific_endings    = ["습니다", "세요", "니다"]
positive_words       = ["좋다", "만족", "행복", "감사"]
euphonious_words     = ["해주", "드리"]
confirmation_phrases = ["확인", "맞으신", "괜찮을까요"]
empathy_phrases      = ["이해합니다", "공감"]
apology_phrases      = ["죄송", "미안"]
request_phrases      = ["부탁", "요청"]
alternative_phrases  = ["하실 수 있습니다", "제안합니다", "추천드립니다"]
conflict_words       = ["아닙니다", "불가", "불편"]
prohibit_words       = ["욕설1", "욕설2"]

# ——— 문장 분리 유틸 ———
def split_sentences(text):
    return re.split(r'(?<=[\.!\?])\s+', text)

# ——— 화자별 감정분석 함수 ———
def calc_speaker_emotion(content, speaker_tag):
    lines = [
        line[len(speaker_tag)+1:].strip()
        for line in content.split('\n')
        if line.startswith(f'{speaker_tag}:')
    ]
    sents = []
    for ln in lines:
        for sent in split_sentences(ln):
            if sent.strip():
                sents.append(sent)
    agg = defaultdict(float)
    for sent in tqdm(sents, desc=f'{speaker_tag} 감정분석', leave=False):
        scores = sentiment(sent)[0]
        for d in scores:
            star = int(d['label'][0])
            agg[star] += d['score']
    n = len(sents) or 1
    star_scores = {f'{speaker_tag}_emo_{i}_star_score': agg[i]/n for i in range(1,6)}
    sent_score = sum(i * star_scores[f'{speaker_tag}_emo_{i}_star_score'] for i in range(1,6))
    if   sent_score >= 3.5: label = "긍정"
    elif sent_score >= 2.5: label = "중립"
    else:                  label = "부정"
    star_scores[f'{speaker_tag}_sent_score'] = sent_score
    star_scores[f'{speaker_tag}_sent_label'] = label
    return star_scores

# ——— 세션 특성추출 함수 ———
def extract_text_features(record):
    content = record['consulting_content']
    sid = record['session_id']

    # 1) ASR 세그먼트 & speech_count
    segments = []
    for line in content.split('\n'):
        m = re.match(r'^(상담사|고객):\s*(.+)', line)
        if m:
            segments.append({'speaker': m.group(1), 'text': m.group(2)})
    speech_count = len(segments)

    # 2) 상위 명사
    all_nouns = []
    for seg in segments:
        all_nouns += okt.nouns(seg['text'])
    top_nouns = [w for w,_ in Counter(all_nouns).most_common(10)]

    # 3) 세션 전체 감정 (문장별 평균)
    agg = Counter({i:0.0 for i in range(1,6)})
    n_sents = 0
    for sent in split_sentences(content):
        if not sent.strip(): continue
        scores = sentiment(sent)[0]
        for d in scores:
            star = int(d['label'][0])
            agg[star] += d['score']
        n_sents += 1
    emo = {f'emo_{i}_star_score': (agg[i]/n_sents if n_sents else 0.0) for i in range(1,6)}
    sent_score = sum(i * emo[f'emo_{i}_star_score'] for i in range(1,6))
    if   sent_score >= 3.5: sent_label = "긍정"
    elif sent_score >= 2.5: sent_label = "중립"
    else:                  sent_label = "부정"

    # 4) 지시문(instructions)에서 분류 메타
    mid_cat = None
    cont_cat = None
    for inst in record.get('instructions', []):
        for d in inst.get('data', [inst]):
            if d.get('task_category') == '상담 주제':
                mid_cat = d.get('output')
            elif d.get('task_category') == '상담 내용':
                cont_cat = d.get('output')

    # 5) 비율/카운트 헬퍼
    def ratio(keys):
        tot = len(content.split())
        return sum(content.count(k) for k in keys) / tot if tot else 0
    def count(keys):
        return sum(content.count(k) for k in keys)

    # — 기본 피처
    feats = {
        'session_id':                  sid,
        'speech_count':                speech_count,
        'top_nouns':                   ','.join(top_nouns),
        **emo,
        'sent_score':                  sent_score,
        'sent_label':                  sent_label,
        'mid_category':                mid_cat,
        'content_category':            cont_cat,
        'script_phrase_ratio':         ratio(script_phrases),
        'honorific_ratio':             ratio(honorific_endings),
        'positive_word_ratio':         ratio(positive_words),
        'euphonious_word_ratio':       ratio(euphonious_words),
        'confirmation_ratio':          ratio(confirmation_phrases),
        'empathy_ratio':               ratio(empathy_phrases),
        'apology_ratio':               ratio(apology_phrases),
        'request_ratio':               ratio(request_phrases),
        'alternative_suggestion_count':count(alternative_phrases),
        'conflict_flag':               int(any(w in content for w in conflict_words)),
        'manual_compliance_ratio':     1 - (count(prohibit_words)/max(1, speech_count))
    }

    # 6) 고객/상담사별 감정 추가
    feats.update(calc_speaker_emotion(content, '고객'))
    feats.update(calc_speaker_emotion(content, '상담사'))

    return feats

# ——— 전체 JSON 파일 순회 & 처리 ———
files = glob.glob('json_merge/integration_data_v3/final_merged_*.json')  # 전체 파일
rows = []
for fp in tqdm(files, desc='전체 세션 처리'):
    rec = json.load(open(fp, 'r', encoding='utf-8'))
    # consulting_content / instructions 필드 보정
    rec['consulting_content'] = rec.get('consulting_content') \
        or rec.get('classification',{}).get('consulting_content','')
    rec['instructions'] = rec.get('instructions', [])
    rows.append(extract_text_features(rec))

# ——— DataFrame 생성 및 CSV 저장 ———
df = pd.DataFrame(rows)
os.makedirs('columns_extraction/all', exist_ok=True)
df.to_csv(
    'columns_extraction/all/text_features_all_v3.csv',
    index=False, encoding='utf-8-sig'
)
print("✅ 전체 특성추출 완료 → columns_extraction/all/text_features_all_v3.csv")

Device set to use mps:0


전체 세션 처리:   0%|          | 0/3533 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/30 [00:00<?, ?it/s]

상담사 감정분석:   0%|          | 0/40 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/28 [00:00<?, ?it/s]

상담사 감정분석:   0%|          | 0/37 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/36 [00:00<?, ?it/s]

상담사 감정분석:   0%|          | 0/37 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/34 [00:00<?, ?it/s]

상담사 감정분석:   0%|          | 0/28 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/18 [00:00<?, ?it/s]

상담사 감정분석:   0%|          | 0/19 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/30 [00:00<?, ?it/s]

상담사 감정분석:   0%|          | 0/35 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/26 [00:00<?, ?it/s]

상담사 감정분석:   0%|          | 0/31 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/24 [00:00<?, ?it/s]

상담사 감정분석:   0%|          | 0/36 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/30 [00:00<?, ?it/s]

상담사 감정분석:   0%|          | 0/42 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/28 [00:00<?, ?it/s]

상담사 감정분석:   0%|          | 0/41 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/29 [00:00<?, ?it/s]

상담사 감정분석:   0%|          | 0/43 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/41 [00:00<?, ?it/s]

상담사 감정분석:   0%|          | 0/50 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/32 [00:00<?, ?it/s]

상담사 감정분석:   0%|          | 0/38 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/19 [00:00<?, ?it/s]

상담사 감정분석:   0%|          | 0/26 [00:00<?, ?it/s]

고객 감정분석:   0%|          | 0/24 [00:00<?, ?it/s]

KeyboardInterrupt: 