In [None]:
import json
import re
import ast
import pandas as pd

def replace_strings(text, replacements):
    for old, new in replacements.items():
        text = text.replace(old, new)
        
    return text

def clean_segment(segment):
    """
    Given a segment that is expected to be quoted (i.e. begins and ends with
    the same single or double quote), remove any occurrences of that quote
    from the inner content.
    For example, if segment is:
         "에이닷 T 멤버십 쿠폰함에 "에이닷은통화요약된닷" 입력"
    then the outer quotes are preserved but the inner double quotes are removed.
    """
    segment = segment.strip()
    if len(segment) >= 2 and segment[0] in ['"', "'"] and segment[-1] == segment[0]:
        q = segment[0]
        # Remove inner occurrences of the quote character.
        inner = segment[1:-1].replace(q, '')
        return q + inner + q
    return segment


def split_key_value(text):
    """
    Splits text into key and value based on the first colon that appears
    outside any quoted region.
    If no colon is found outside quotes, the value will be returned empty.
    """
    in_quote = False
    quote_char = ''
    for i, char in enumerate(text):
        if char in ['"', "'"]:
            # Toggle quote state (assumes well-formed starting/ending quotes for each token)
            if in_quote:
                if char == quote_char:
                    in_quote = False
                    quote_char = ''
            else:
                in_quote = True
                quote_char = char
        elif char == ':' and not in_quote:
            return text[:i], text[i+1:]
    return text, ''

def split_outside_quotes(text, delimiter=','):
    """
    Splits the input text on the given delimiter (default comma) but only
    if the delimiter occurs outside of quoted segments.
    Returns a list of parts.
    """
    parts = []
    current = []
    in_quote = False
    quote_char = ''
    for char in text:
        if char in ['"', "'"]:
            # When encountering a quote, toggle our state
            if in_quote:
                if char == quote_char:
                    in_quote = False
                    quote_char = ''
            else:
                in_quote = True
                quote_char = char
            current.append(char)
        elif char == delimiter and not in_quote:
            parts.append(''.join(current).strip())
            current = []
        else:
            current.append(char)
    if current:
        parts.append(''.join(current).strip())
    return parts


def clean_ill_structured_json(text):
    """
    Given a string that is intended to represent a JSON-like structure
    but may be ill-formed (for example, it might contain nested quotes that
    break standard JSON rules), attempt to “clean” it by processing each
    key–value pair.
    
    The function uses the following heuristics:
      1. Split the input text into comma-separated parts (only splitting
         when the comma is not inside a quoted string).
      2. For each part, split on the first colon (that is outside quotes) to separate key and value.
      3. For any segment that begins and ends with a quote, remove any inner occurrences
         of that same quote.
      4. Rejoin the cleaned key and value.
    
    Note: This approach does not build a fully robust JSON parser. For very complex
          or deeply nested ill-structured inputs further refinement would be needed.
    """
    # First, split the text by commas outside of quotes.
    parts = split_outside_quotes(text, delimiter=',')
    
    cleaned_parts = []
    for part in parts:
        # Try to split into key and value on the first colon not inside quotes.
        key, value = split_key_value(part)
        key_clean = clean_segment(key)
        value_clean = clean_segment(value) if value.strip() != "" else ""
        if value_clean:
            cleaned_parts.append(f"{key_clean}: {value_clean}")
        else:
            cleaned_parts.append(key_clean)
    
    # Rejoin the cleaned parts with commas (or you can use another format if desired)
    return ', '.join(cleaned_parts)

def repair_json(broken_json):
    
    # json_str = broken_json.replace("'",'"')
    
    # Fix unquoted values (like NI00001863)
    json_str = re.sub(r':\s*([a-zA-Z0-9_]+)(\s*[,}])', r': "\1"\2', broken_json)
    
    # Fix unquoted keys
    json_str = re.sub(r'([{,])\s*([a-zA-Z0-9_]+):', r'\1 "\2":', json_str)
    
    # Fix trailing commas
    json_str = re.sub(r',\s*}', '}', json_str)
    
    return json_str

def extract_json_objects(text):
    # More sophisticated pattern that tries to match proper JSON syntax
    pattern = r'(\{(?:[^{}]|(?:\{(?:[^{}]|(?:\{[^{}]*\}))*\}))*\})'
    
    result = []
    for match in re.finditer(pattern, text):
        potential_json = match.group(0)
        try:
            # Try to parse and validate
            # json_obj = json.loads(repair_json(potential_json))
            json_obj = ast.literal_eval(clean_ill_structured_json(repair_json(potential_json)))
            result.append(json_obj)
        except json.JSONDecodeError:
            # Not valid JSON, skip
            pass
    
    return result       


In [24]:
%set_env ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
%set_env LANGSMITH_TRACING=true
%set_env LANGSMITH_API_KEY=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
%set_env TAVILY_API_KEY=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


%set_env OPENAI_API_KEY=${OPENAI_API_KEY}
%set_env LANGCHAIN_API_KEY=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

%set_env LANGCHAIN_TRACING_V2=true
%set_env LANGCHAIN_PROJECT="Multi-agent Collaboration"

env: ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
env: LANGSMITH_TRACING=true
env: LANGSMITH_API_KEY=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
env: TAVILY_API_KEY=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
env: OPENAI_API_KEY=${OPENAI_API_KEY}
env: LANGCHAIN_API_KEY=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
env: LANGCHAIN_TRACING_V2=true
env: LANGCHAIN_PROJECT="Multi-agent Collaboration"


In [25]:
from openai import OpenAI

# llm_api_key = "sk-gapk-GHjvOA9hXL8MQ7yNNlR7kLmfA-f8fSl6"  #우리꺼
# llm_api_key = "sk-gapk-1tQCB4O2KnH5GG68DeUKfjAKQ-vJ9kc9" #도현님
llm_api_key = "sk-gapk-Y70vdkPbXPRMWHK0dtaYU30hw-bi7B5C" # 빌린거
llm_api_url = "https://api.platform.a15t.com/v1"


client = OpenAI(
    api_key = llm_api_key,
    base_url = llm_api_url
)

# from langchain.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI
from langchain_anthropic import ChatAnthropic
from langchain.schema import AIMessage, HumanMessage, SystemMessage
import pandas as pd

def ChatAnthropicSKT(model="skt/claude-3-5-sonnet-20241022", max_tokens=100):
    # llm_api_key = "sk-gapk-GHjvOA9hXL8MQ7yNNlR7kLmfA-f8fSl6" #우리꺼 # "sk-gapk-1tQCB4O2KnH5GG68DeUKfjAKQ-vJ9kc9"
    # llm_api_key = "sk-gapk-1tQCB4O2KnH5GG68DeUKfjAKQ-vJ9kc9" #도현님  # "sk-gapk-1tQCB4O2KnH5GG68DeUKfjAKQ-vJ9kc9"
    llm_api_key = "sk-gapk-Y70vdkPbXPRMWHK0dtaYU30hw-bi7B5C" # 빌린거
    
    llm_api_url = "https://api.platform.a15t.com/v1"
    
    # llm_api_url = "https://43.203.77.11:443/v1"

    # model = "anthropic/claude-3-5-sonnet-20240620"

    model = ChatOpenAI(
        temperature=0,  
        openai_api_key=llm_api_key, 
        openai_api_base=llm_api_url, 
        model=model,
        max_tokens=max_tokens
        )
    return model

llm_cld35 = ChatAnthropicSKT()

llm_cld37 = ChatAnthropic(
    api_key=os.getenv("ANTHROPIC_API_KEY"),
    model="claude-3-7-sonnet-20250219",
    max_tokens=3000
)

llm_chat = ChatOpenAI(
        temperature=0,  
        model="gpt-4o",
        openai_api_key=os.getenv("OPENAI_API_KEY"),
        max_tokens=2000,
)



In [75]:
mms_pdf = pd.read_csv("./data/mms_data_250408.csv")
mms_pdf['msg'] = mms_pdf['msg_nm']+"\n"+mms_pdf['mms_phrs']
mms_pdf = mms_pdf.groupby(["msg_nm","mms_phrs","msg"])['offer_dt'].min().reset_index(name="offer_dt")
mms_pdf = mms_pdf.reset_index()
mms_pdf = mms_pdf.astype('str')

pgm_pdf = pd.read_csv("./data/tos_pgm_20250516.csv")

pgm_pdf_v2 = pgm_pdf.copy()

In [65]:
msg_str = "\n".join(mms_pdf.query("offer_dt>='20240101'")[['msg_nm','msg']].drop_duplicates('msg_nm')[['msg']].sample(n=200)['msg'].to_list())

In [30]:
pgm_md = pgm_pdf.rename(columns={'To-Be PGM':'label','인식 정보 (작성필요여부?)':'label_tag'}).to_markdown()

In [70]:
prompt = f"""
나는 광고 메세지를 분류하는 작업을 하고 있습니다.
내 계획은 광고 메세지와 분류 레벨 리스트를 주고 광고 메세지에 분류 label을 부여하는 것입니다.
그러나, 분류 작업에 LLM이 참고하라는 clue word로서 각 label별로 tag를 제공하고 싶다.
일단, 내가 임의로 초기값으로 몇개의 tag를 만들었는데, 당신이 더 추가해주면 좋겠다.
아래에 메세지들의 예시와 각 label별로 tag가 있다.

### 작업 목표 ###
* tag의 목적은 분류 작업에 LLM이 참고하라는 clue word로서 제공하는 것이다. 따라서, 특정 상품이나 서비스 이름이 아니라, 좀 더 일반적인 것으로 제공해주면 좋겠다.
* label별로 더 좋은 tag를 메세지에 나오거나 나올 가능성이 높은 것으로 추가해주면 좋겠다.
* 추가 tag에 초기 tag도 포함되어 있으면 좋겠다.
* Pandas dataframe으로 변환할 수 있도록 JSON 형식으로 반환해주면 좋겠다.

결과 Schema:
{{"label":label, "tag":tag}}

메세지 예시:
{msg_str}

각 label별로 tag:
{pgm_md}
"""

result_cld = llm_cld37.invoke(prompt).content

In [71]:

json_cld = extract_json_objects(result_cld)
cld_pdf = pd.DataFrame(json_cld)
cld_pdf.to_csv("./data/pgm_tag_cld_v2.csv", index=False)
cld_pdf.to_excel(f"./data/pgm_tag_cld_v2.xlsx", index=False, engine='openpyxl')

cld_pdf

Unnamed: 0,label,tag
0,[비마케팅]필수고지사항안내,"114, 고객, 안내, 변경, 업데이트, 정책, 중요, 공지"
1,[비마케팅]고객설문조사,"114, 고객, 설문, 만족도, 조사, 참여, 의견, 피드백, 개선"
2,[비마케팅]대고객사과및오발송정정안내,"사과, 오류, 불편, 죄송, 114, 정정, 오발송, 수정, 해명"
3,[비마케팅]내부구성원대상안내,"구성원, 6100-0000, 내부, 직원, 임직원, 사내, 공지"
4,[비마케팅]상품및부가서비스가입유도_준실시간,"상품, 서비스, 혜택, 가입, 추천, 안내, 업데이트"
5,[비마케팅]기변유도및해지방어_준실시간,"기변, 해지, 약정, 요금제, 단말기, 갱신, 혜택, 연장"
6,[비마케팅]필수고지사항안내_정기일배치대상중복허용,"고지, 필수, 안내, 정기, 공지, 중요, 변경사항"
7,[마케팅_Sales]상품및부가서비스가입유도_단말,"신규, 출시, 갤럭시, Iphone, 사전예약, 할인, 프로모션, 구매, 단말기, 폴더블"
8,[마케팅_Sales]상품및부가서비스가입유도_유선,"결합, 인터넷, 할인, IPTV, TV, 기가, 초고속, 와이파이, 약정"
9,[마케팅_Sales]상품및부가서비스가입유도_요금제,"data, 요금제, 5G, 5GX, 요금, 무제한, 선택약정, 프라임, 슬림, 할인"


In [76]:
# pgm_pdf.merge(cld_pdf.rename(columns={'label':'To-Be PGM','tag':'tag_from_llm'}), on='To-Be PGM', how='left')
pgm_pdf_v2['tag_from_llm'] = cld_pdf['tag']
pgm_pdf_v2.to_excel(f"./data/pgm_tag_ext_250516.xlsx", index=False, engine='openpyxl')

In [61]:
result_gpt = llm_chat.invoke(prompt).content

In [62]:
json_gpt = extract_json_objects(result_gpt)
gpt_pdf = pd.DataFrame(json_gpt)
gpt_pdf

Unnamed: 0,label,tag
0,[비마케팅]필수고지사항안내,"[114, 고객, 안내]"
1,[비마케팅]고객설문조사,"[114, 고객, 설문, 만족도, 조사]"
2,[비마케팅]대고객사과및오발송정정안내,"[사과, 오류, 불편, 죄송, 114]"
3,[비마케팅]내부구성원대상안내,"[구성원, 6100-0000]"
4,[비마케팅]상품및부가서비스가입유도_준실시간,"[미추천, 기획자가 넣음]"
5,[비마케팅]기변유도및해지방어_준실시간,"[미추천, 기획자가 넣음]"
6,[비마케팅]필수고지사항안내_정기일배치대상중복허용,"[미추천, 기획자가 넣음]"
7,[마케팅_Sales]상품및부가서비스가입유도_단말,"[신규, 출시, 갤럭시, Iphone, 사전예약]"
8,[마케팅_Sales]상품및부가서비스가입유도_유선,"[결합, 인터넷, 할인]"
9,[마케팅_Sales]상품및부가서비스가입유도_요금제,"[data, 요금제, 5G, 5GX, 요금]"
