# Team 전장갓겜 Inference

### Environment

OS: Ubuntu 20.04 LTS   
PyTorch: 2.0.1  
CUDA: 11.7  
cuDNN: 8

### Install Library

In [None]:
# Install the PyTorch packages at the versions stated in the Environment section.
!pip install torch==2.0.1 torchvision==0.15.2 torchaudio==2.0.2

In [None]:
# Remaining dependencies. Only transformers, bitsandbytes and accelerate carry a
# version pin; every other package resolves to whatever is current at install time.
!pip install pandas sentence_transformers transformers==4.37.1 tqdm pyarrow wandb spacy matplotlib
!pip install bitsandbytes==0.41.1 accelerate==0.21.0 appdirs loralib black black[jupyter] datasets fire sentencepiece scipy numpy scikit-learn
# NOTE(review): peft is installed straight from the repository with no tag or commit pinned.
!pip install git+https://github.com/huggingface/peft

### Import Library

In [None]:
import pandas as pd
import numpy as np
import torch
import re

import transformers
import datasets
from pathlib import Path
from datasets import Dataset
from transformers import AutoTokenizer
from transformers import pipeline
from sentence_transformers import SentenceTransformer

from tqdm import tqdm
tqdm.pandas()

# Preprocess

In [None]:
# Keyword table used by the cells below; its '키워드' column is split by split_df
# and its '답변_3' column is read by the add_inst_* helpers.
data = pd.read_csv('./train_keyword_DB_JJGG.csv')

def split_kw(keywords):
    """Break a keyword string into a list of tokens, cutting at every single space."""
    tokens = keywords.split(' ')
    return tokens

def split_df(df):
    """Add a '키워드list' column holding each row's '키워드' string split by `split_kw`.

    The frame is modified in place; nothing is returned.
    """
    keyword_lists = df['키워드'].apply(split_kw)
    df['키워드list'] = keyword_lists
    
split_df(data)

test_df = pd.read_csv('./test.csv')

# Turn every test question into one row per sentence, keeping the source id.
new_rows = []
for test_id, question in zip(test_df['id'], test_df['질문']):
    # Append a period when the text has no closing terminator so that the
    # final sentence is matched by the pattern below as well.
    if not question.endswith(('.', '!', '?')):
        question = question + '.'

    new_rows.extend(
        {'id': test_id, '질문': sentence}
        for sentence in re.findall('.+?[?!.]', question)
    )

# new_id numbers the sentence rows consecutively from 0.
split_test_df = (
    pd.DataFrame(new_rows)
    .assign(new_id=lambda frame: range(len(frame)))
    [['id', 'new_id', '질문']]
)

split_test_df.to_csv('./test_splited.csv')

In [None]:
# Reload the sentence-level questions written by the previous cell.
test_df = pd.read_csv("./test_splited.csv")

def check_kw(question):
    """Return the row numbers of `data` whose keywords all occur in `question`.

    A row qualifies when every entry of its '키워드list' is a substring of
    `question`; rows are visited in order 0..len(data)-1.
    """
    matched_rows = []
    for row_idx in range(len(data)):
        row_keywords = data["키워드list"][row_idx]
        if all(keyword in question for keyword in row_keywords):
            matched_rows.append(row_idx)
    return matched_rows

def index_kw(df):
    """Store, in a new 'keyword_match' column, the `check_kw` result for each '질문'.

    The frame is modified in place; nothing is returned.
    """
    matches = df['질문'].apply(check_kw)
    df['keyword_match'] = matches

# Adds the 'keyword_match' column to test_df in place.
index_kw(test_df)

def add_inst_1(df):
    """Prefix each question with the '답변_3' text of every keyword row it matched.

    For row i, each index j in df['keyword_match'][i] prepends
    data['답변_3'][j] + " " to df['질문'][i], so the answer of the last matched
    row ends up first. `df` is modified in place; rows are addressed by the
    labels 0..len(df)-1.

    Cells are written with `.at` instead of `df["질문"][i] = ...`: chained
    assignment updates a temporary object when pandas runs with Copy-on-Write,
    which would leave `df` unchanged.
    """
    for i in range(len(df)):
        for j in df.at[i, "keyword_match"]:
            df.at[i, "질문"] = data["답변_3"][j] + " " + df.at[i, "질문"]
            
# Rewrites test_df['질문'] in place with the matched '답변_3' texts prepended.
add_inst_1(test_df)

In [None]:
# Row count per original question id, computed once (test_df is not modified
# while connected_q is being built).
id_counts = test_df['id'].value_counts()

# Numbers n for which the id "TEST_<n zero-padded to 3>" owns more than one row.
connected_q = [
    i for i in range(len(test_df))
    if id_counts.get(f"TEST_{str(i).zfill(3)}", 0) > 1
]

id_nums = []
questions = []
for j in connected_q:
    test_id = str(j).zfill(3)
    more_than_two = test_df[test_df["id"] == f"TEST_{test_id}"]
    # Only ids whose last sentence matched no keyword row are collapsed.
    if more_than_two["keyword_match"].iloc[-1] == []:
        # Concatenate all sentences of this id, in row order, into the first row.
        merged_question = "".join(more_than_two["질문"])
        # Write through .loc on test_df itself: assigning via
        # more_than_two["질문"].iloc[...] or test_df["질문"][...] is chained
        # assignment and is silently lost under pandas Copy-on-Write.
        test_df.loc[more_than_two.index[0], "질문"] = merged_question
        test_df = test_df.drop(index=more_than_two.index[1:])
        questions.append(merged_question)
        id_nums.append(j)

test_df.to_csv("drop_empty_keyword_splited_test_dab_3_1.csv")

In [None]:
# Start over from the files on disk: reload the keyword table and the
# sentence-level questions, then recompute the derived columns.
data = pd.read_csv('./train_keyword_DB_JJGG.csv')
split_df(data)

test_df = pd.read_csv("./test_splited.csv")
index_kw(test_df)

def add_inst_2(df):
    """Tag each question with "\\n### 질문:" and prepend its matched '답변_3' texts.

    For row i the question first becomes "\\n### 질문:" + question; then each
    index j in df['keyword_match'][i] prepends data['답변_3'][j] + " ", so the
    answer of the last matched row ends up first. `df` is modified in place;
    rows are addressed by the labels 0..len(df)-1.

    Cells are written with `.at` instead of `df["질문"][i] = ...`: chained
    assignment updates a temporary object when pandas runs with Copy-on-Write,
    which would leave `df` unchanged.
    """
    for i in range(len(df)):
        df.at[i, "질문"] = "\n### 질문:" + df.at[i, "질문"]
        for j in df.at[i, "keyword_match"]:
            df.at[i, "질문"] = data["답변_3"][j] + " " + df.at[i, "질문"]
# Rewrites test_df['질문'] in place.
add_inst_2(test_df)

In [None]:
# Row count per original question id, computed once (test_df is not modified
# while connected_q is being built).
id_counts = test_df['id'].value_counts()

# Numbers n for which the id "TEST_<n zero-padded to 3>" owns more than one row.
connected_q = [
    i for i in range(len(test_df))
    if id_counts.get(f"TEST_{str(i).zfill(3)}", 0) > 1
]

id_nums = []
questions = []
for j in connected_q:
    test_id = str(j).zfill(3)
    more_than_two = test_df[test_df["id"] == f"TEST_{test_id}"]
    # Only ids whose last sentence matched no keyword row are collapsed.
    if more_than_two["keyword_match"].iloc[-1] == []:
        # Concatenate all sentences of this id, in row order, into the first row.
        merged_question = "".join(more_than_two["질문"])
        # Write through .loc on test_df itself: assigning via
        # more_than_two["질문"].iloc[...] or test_df["질문"][...] is chained
        # assignment and is silently lost under pandas Copy-on-Write.
        test_df.loc[more_than_two.index[0], "질문"] = merged_question
        test_df = test_df.drop(index=more_than_two.index[1:])
        questions.append(merged_question)
        id_nums.append(j)

test_df.to_csv("drop_empty_keyword_splited_test_dab_3_2.csv")

In [None]:
# Reload the keyword table and the unsplit test questions for the third
# preprocessing variant.
data = pd.read_csv('./train_keyword_DB_JJGG.csv')
split_df(data)

test_df = pd.read_csv('./test.csv')
def split_sentences_with_delimiters_and_concat(text):
    """Split `text` at the listed endings and glue the first ending back on.

    The text is cut wherever one of the endings in `pattern` is followed by a
    character other than '.', '!' or '?'. Because the pattern is a capturing
    group, the matched endings are returned as separate pieces. Pieces are
    stripped and empty ones discarded; if more than one piece remains, the
    second piece is appended to the first. Later pieces are left separate.
    """
    pattern = r'(까요[^\.!\?]|세요[^\.!\?]|건가요[^\.!\?]|한가요[^\.!\?]|나요[^\.!\?]|해줘[^\.!\?]|니까[^\.\?!])'

    pieces = []
    for chunk in re.split(pattern, text):
        trimmed = chunk.strip()
        if trimmed:
            pieces.append(trimmed)

    if len(pieces) <= 1:
        return pieces
    return [pieces[0] + pieces[1]] + pieces[2:]

In [None]:
# Each alternative but the last matches text up to one of the listed endings
# plus one following character that is not '.', '?' or '!'; the last
# alternative matches text up to the next '.', '!' or '?'.
sentence_regex = r'(.+?까요[^.?!]|.+?세요[^.?!]|.+?가요[^.?!]|.+?나요[^.?!]|.+?해줘[^.?!]|.+?니까[^.?!]|.+?[?!.])'

new_rows = []
for test_id, question in zip(test_df['id'], test_df['질문']):
    # Append a period when the text has no closing terminator so that the
    # final piece is matched as well.
    if not question.endswith(('.', '!', '?')):
        question = question + '.'

    new_rows.extend(
        {'id': test_id, '질문': piece}
        for piece in re.findall(sentence_regex, question)
    )

# new_id numbers the resulting rows consecutively from 0.
split_test_df = (
    pd.DataFrame(new_rows)
    .assign(new_id=lambda frame: range(len(frame)))
    [['id', 'new_id', '질문']]
)

split_test_df.to_csv('./test_splited_kkayo.csv')

In [None]:
test_df = pd.read_csv("./test_splited_kkayo.csv")
index_kw(test_df)

# Row count per original question id, computed once (test_df is not modified
# while connected_q is being built).
id_counts = test_df['id'].value_counts()

# Numbers n for which the id "TEST_<n zero-padded to 3>" owns more than one row.
connected_q = [
    i for i in range(len(test_df))
    if id_counts.get(f"TEST_{str(i).zfill(3)}", 0) > 1
]

id_nums = []
questions = []
for j in connected_q:
    test_id = str(j).zfill(3)
    more_than_two = test_df[test_df["id"] == f"TEST_{test_id}"]
    # When the last fragment matched no keyword row, fold it into the fragment
    # directly before it and remove its row.
    if more_than_two["keyword_match"].iloc[-1] == []:
        merged_question = more_than_two["질문"].iloc[-2] + more_than_two["질문"].iloc[-1]
        # Write through .loc on test_df itself: assigning via
        # more_than_two["질문"].iloc[...] or test_df["질문"][...] is chained
        # assignment and is silently lost under pandas Copy-on-Write.
        test_df.loc[more_than_two.index[-2], "질문"] = merged_question
        test_df = test_df.drop(index=more_than_two.index[-1])
        # Record the first fragment of this id as it now stands in test_df
        # (it is the merged text when the id had exactly two fragments).
        questions.append(test_df.at[more_than_two.index[0], "질문"])
        id_nums.append(j)

In [None]:
# Renumber the rows 0..n-1; add_inst_3 below addresses rows by range(len(df)).
test_df = test_df.reset_index(drop=True)
def add_inst_3(df):
    """Rewrite each question as matched reference answers followed by the question.

    For row i the distinct data['답변_3'] texts of the rows listed in
    df['keyword_match'][i] are joined with single spaces. Then:

    * no keyword match:  "" + " ###질문:" + question
    * otherwise:         answers + " " + question, followed twice by
                         "\\n###질문: " + question

    `df` is modified in place; rows are addressed by the labels 0..len(df)-1.

    Duplicates are removed with dict.fromkeys so the answers keep their
    first-seen order. De-duplicating through set() made the join order depend
    on string hashing, which can differ from one interpreter run to the next.
    Cells are written with `.at` because chained assignment
    (`df["질문"][i] = ...`) is lost under pandas Copy-on-Write.
    """
    for i in range(len(df)):
        matched_rows = df.at[i, "keyword_match"]
        answers = list(dict.fromkeys(data["답변_3"][j] for j in matched_rows))
        context = " ".join(answers)
        question = df.at[i, "질문"]
        if len(matched_rows) == 0:
            df.at[i, "질문"] = context + " ###질문:" + question
        else:
            df.at[i, "질문"] = context + " " + question + "\n###질문: " + question + "\n###질문: " + question
# Rewrite test_df['질문'] in place, then save the result for the third inference run.
add_inst_3(test_df)

test_df.to_csv("./kkayo_nokey_24_03_10_drop_empty_keyword_splited_test_dab_3-add-three-sharp-q.csv")

# Inference

In [None]:
# NOTE(review): `device` is assigned here, but the pipeline below is configured
# through device_map rather than this variable.
device = 'auto' 

########################################
# base_LLM_model = './custom_LLM_llama_weight' # use this when the model was downloaded directly via Google Drive
base_LLM_model = 'Chaeseung/exp021' # use this to load the model stored in the team's Hugging Face repository
########################################

# Questions produced by the first preprocessing variant.
test_df = pd.read_csv('drop_empty_keyword_splited_test_dab_3_1.csv')

pipe = pipeline('text-generation', model=base_LLM_model, device_map='auto')

def generate_pipeline(text):
    """Ask the global `pipe` to answer `text` and return the generated text.

    The line breaks and leading spaces inside the prompt literal are part of
    the string that is sent to the pipeline.
    """
    prompt = f'''
    당신은 실내 인테리어 전문 회사에 다니고 있는 친절한 전문가야. 서론을 제외하고 대답해줘.
    ### 질문: {text} 
    
    ### 응답:'''
    generations = pipe(
        prompt,
        max_new_tokens=450,
        eos_token_id=2,
        return_full_text=False,
    )
    first_generation = generations[0]
    return first_generation["generated_text"]

# Generate one answer per question row; progress_apply shows a tqdm progress bar.
test_df['답변'] = test_df['질문'].progress_apply(generate_pipeline) 

In [None]:
def sentence_cut(sentence):
    """Trim generated text to its complete sentences.

    Every run of characters up to and including the next '.', '!' or '?' is
    kept and the runs are joined back together, so trailing text that follows
    the last terminator is dropped. Line breaks are not matched by the pattern
    and therefore do not appear in the result.

    If the text contains no terminator at all it is returned unchanged. The
    original indexed the last match unconditionally and raised IndexError in
    that case. Its follow-up check on the last character of the last match
    could never fire, because every match ends in '.', '!' or '?' by
    construction, so it has been removed.
    """
    complete_sentences = re.findall('.+?[?!.]', sentence)
    if not complete_sentences:
        return sentence
    return "".join(complete_sentences)

test_df["답변"] = test_df["답변"].progress_apply(sentence_cut)

# Row count per original question id, computed once (test_df is not modified
# while connected_a is being built).
id_counts = test_df['id'].value_counts()

# Numbers n for which the id "TEST_<n zero-padded to 3>" still owns several rows.
connected_a = [
    i for i in range(len(test_df))
    if id_counts.get(f"TEST_{str(i).zfill(3)}", 0) > 1
]

id_nums = []
answers = []
for j in connected_a:
    test_id = str(j).zfill(3)
    more_than_two = test_df[test_df["id"] == f"TEST_{test_id}"]
    # Concatenate the answers of all rows of this id, in row order, into the
    # first row and remove the remaining rows.
    merged_answer = "".join(more_than_two["답변"])
    # Write through .loc on test_df itself: assigning via
    # more_than_two["답변"].iloc[...] or test_df["답변"][...] is chained
    # assignment and is silently lost under pandas Copy-on-Write.
    test_df.loc[more_than_two.index[0], "답변"] = merged_answer
    test_df = test_df.drop(index=more_than_two.index[1:])
    answers.append(merged_answer)
    id_nums.append(j)

# Embed the answers and place the vectors in every column after the first
# of the sample submission.
preds = test_df['답변'].tolist()
model2 = SentenceTransformer('distiluse-base-multilingual-cased-v1')
pred_embeddings = model2.encode(preds)

submit = pd.read_csv("sample_submission.csv")
submit.iloc[:,1:] = pred_embeddings
llama_sub = submit
llama_sub_drop = llama_sub.drop(columns=["id"])

In [None]:
import gc

# torch.cuda.empty_cache() only returns cached blocks that no live tensor
# occupies. Drop the reference to the previous pipeline first and collect
# garbage, so the model it holds can be released before the cache is emptied.
pipe = None
gc.collect()
torch.cuda.empty_cache()

In [None]:
class CFG:
    # Settings for the second inference run.
    sentence_num = 4
    max_new_tokens = 450
    preprocessed_test_df = 'drop_empty_keyword_splited_test_dab_3_2.csv'

    ########################################
    # Model to load; switch to the commented-out line when the model was
    # downloaded directly via Google Drive instead of the Hugging Face repository.
    # base_model = './custom_LLM_agiin_weight'
    base_model = 'Chaeseung/dobae_agiin'
    ########################################

In [None]:
# NOTE(review): `device` is assigned here, but the pipeline below is configured
# through device_map rather than this variable.
device = 'auto' 
base_LLM_model = CFG.base_model
# Questions produced by the second preprocessing variant.
test_df = pd.read_csv(CFG.preprocessed_test_df)

pipe = pipeline('text-generation', model=base_LLM_model, device_map='auto')

def generate_pipeline(text):
    """Ask the global `pipe` to answer `text` and return the generated text.

    The generation length comes from CFG.max_new_tokens. The line breaks and
    leading spaces inside the prompt literal are part of the string that is
    sent to the pipeline.
    """
    prompt = f'''
    ### 지침: 당신은 실내 인테리어 전문 회사에 다니고 있는 친절한 전문가야. 질문에 대답할 때는 질문에 관련된 내용에 대해서만 답변을 해줘. 답변은 최소 두 문장 이상을 해야해.
    ### 참고할 내용: {text} 

    ### 답변:'''
    generations = pipe(
        prompt,
        max_new_tokens=CFG.max_new_tokens,
        eos_token_id=2,
        pad_token_id=0,
        return_full_text=False,
    )
    first_generation = generations[0]
    return first_generation["generated_text"]

# Generate one answer per question row, then post-process each answer with sentence_cut.
test_df['답변'] = test_df['질문'].progress_apply(generate_pipeline) 
test_df["답변"] = test_df["답변"].progress_apply(lambda x: sentence_cut(x))

In [None]:
# Row count per original question id, computed once (test_df is not modified
# while connected_a is being built).
id_counts = test_df['id'].value_counts()

# Numbers n for which the id "TEST_<n zero-padded to 3>" still owns several rows.
connected_a = [
    i for i in range(len(test_df))
    if id_counts.get(f"TEST_{str(i).zfill(3)}", 0) > 1
]

id_nums = []
answers = []
for j in connected_a:
    test_id = str(j).zfill(3)
    more_than_two = test_df[test_df["id"] == f"TEST_{test_id}"]
    # Concatenate the answers of all rows of this id, in row order, into the
    # first row and remove the remaining rows.
    merged_answer = "".join(more_than_two["답변"])
    # Write through .loc on test_df itself: assigning via
    # more_than_two["답변"].iloc[...] or test_df["답변"][...] is chained
    # assignment and is silently lost under pandas Copy-on-Write.
    test_df.loc[more_than_two.index[0], "답변"] = merged_answer
    test_df = test_df.drop(index=more_than_two.index[1:])
    answers.append(merged_answer)
    id_nums.append(j)

# Embed the answers and place the vectors in every column after the first
# of the sample submission.
preds = test_df['답변'].tolist()
model2 = SentenceTransformer('distiluse-base-multilingual-cased-v1')
pred_embeddings = model2.encode(preds)

submit = pd.read_csv("sample_submission.csv")
submit.iloc[:,1:] = pred_embeddings
agiin_sub_1 = submit
agiin_sub_1_drop = agiin_sub_1.drop(columns=["id"])

In [None]:
import gc

# torch.cuda.empty_cache() only returns cached blocks that no live tensor
# occupies. Drop the reference to the previous pipeline first and collect
# garbage, so the model it holds can be released before the cache is emptied.
pipe = None
gc.collect()
torch.cuda.empty_cache()

In [None]:
class CFG_2:
    # Settings for the third inference run.
    sentence_num = 4
    max_new_tokens = 450
    preprocessed_test_df = 'kkayo_nokey_24_03_10_drop_empty_keyword_splited_test_dab_3-add-three-sharp-q.csv'
    # File that receives the question/answer table of this run.
    qna_csv = 'TEST_Output_QnA.csv'

    ########################################
    # Model to load; switch to the commented-out line when the model was
    # downloaded directly via Google Drive instead of the Hugging Face repository.
    # base_model = './custom_LLM_agiin_weight'
    base_model = 'Chaeseung/dobae_agiin'
    ########################################

In [None]:
# NOTE(review): `device` is assigned here, but the pipeline below is configured
# through device_map rather than this variable.
device = 'auto' 
base_LLM_model = CFG_2.base_model
# Questions produced by the third preprocessing variant.
test_df = pd.read_csv(CFG_2.preprocessed_test_df)

pipe = pipeline('text-generation', model=base_LLM_model, device_map="auto")
def generate_pipeline(text):
    """Ask the global `pipe` to answer `text` and return the generated text.

    The generation length comes from CFG_2.max_new_tokens. The line breaks and
    leading spaces inside the prompt literal are part of the string that is
    sent to the pipeline.
    """
    prompt = f'''
    ### 지침: 당신은 실내 인테리어 전문 회사에 다니고 있는 친절한 전문가야. 다음 내용에서 질문에 관련한 내용만 답변을 해줘. 답변은 최소 두 문장 이상을 해야해.
    ### 참고할 내용과 질문: {text} 
    
    ### 답변:'''
    generations = pipe(
        prompt,
        max_new_tokens=CFG_2.max_new_tokens,
        eos_token_id=2,
        pad_token_id=0,
        return_full_text=False,
    )
    first_generation = generations[0]
    return first_generation["generated_text"]

# Generate one answer per question row, then post-process each answer with sentence_cut.
test_df['답변'] = test_df['질문'].progress_apply(generate_pipeline) 
test_df["답변"] = test_df["답변"].progress_apply(lambda x: sentence_cut(x))

In [None]:
# Row count per original question id, computed once (test_df is not modified
# while connected_a is being built).
id_counts = test_df['id'].value_counts()

# Numbers n for which the id "TEST_<n zero-padded to 3>" still owns several rows.
connected_a = [
    i for i in range(len(test_df))
    if id_counts.get(f"TEST_{str(i).zfill(3)}", 0) > 1
]

id_nums = []
answers = []
for j in connected_a:
    test_id = str(j).zfill(3)
    more_than_two = test_df[test_df["id"] == f"TEST_{test_id}"]
    # Concatenate the answers of all rows of this id, in row order, into the
    # first row and remove the remaining rows.
    merged_answer = "".join(more_than_two["답변"])
    # Write through .loc on test_df itself: assigning via
    # more_than_two["답변"].iloc[...] or test_df["답변"][...] is chained
    # assignment and is silently lost under pandas Copy-on-Write.
    test_df.loc[more_than_two.index[0], "답변"] = merged_answer
    test_df = test_df.drop(index=more_than_two.index[1:])
    answers.append(merged_answer)
    id_nums.append(j)

preds = test_df['답변'].tolist()
# Keep the merged question/answer table of this run on disk.
test_df.to_csv(CFG_2.qna_csv)
# Embed the answers and place the vectors in every column after the first
# of the sample submission.
model2 = SentenceTransformer('distiluse-base-multilingual-cased-v1')
pred_embeddings = model2.encode(preds)

submit = pd.read_csv("sample_submission.csv")
submit.iloc[:,1:] = pred_embeddings
agiin_sub_2 = submit
agiin_sub_2_drop = agiin_sub_2.drop(columns=["id"])

In [None]:
import gc

# torch.cuda.empty_cache() only returns cached blocks that no live tensor
# occupies. Drop the reference to the previous pipeline first and collect
# garbage, so the model it holds can be released before the cache is emptied.
pipe = None
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Weighted blend of the three embedding tables: 0.3 for the first run,
# 0.5 for the second and 0.2 for the third.
final_sub = llama_sub_drop * 0.3 + agiin_sub_1_drop * 0.5 + agiin_sub_2_drop * 0.2

# Put the id column back in front of the blended embedding columns.
id_column = agiin_sub_2["id"]
final_embeddings = pd.concat([id_column, final_sub], axis=1)
final_embeddings.head()

In [None]:
# Write the blended embeddings without the DataFrame index column.
final_embeddings.to_csv("final_submission.csv", index=False)