In [None]:
# Show the NVIDIA GPU status for this runtime.
!nvidia-smi

In [None]:
# Print the first lines of /proc/cpuinfo to see which CPU this runtime has.
!head /proc/cpuinfo

In [None]:
# Mount Google Drive at /content/drive so files stored there are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install -U sentence-transformers

In [None]:
import random
import os
import re
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import torch
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer

def reset_seeds(seed):
    """Seed every random number source this notebook uses.

    Seeds Python's ``random``, sets ``PYTHONHASHSEED`` in ``os.environ``,
    seeds NumPy's global RNG and torch (generic and CUDA seeding calls),
    and switches cuDNN to deterministic mode.

    Args:
        seed: Seed value handed unchanged to each seeding function; its
            string form is stored in ``PYTHONHASHSEED``.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    # The remaining seeders all take the seed as their only argument.
    for seeder in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True

# Seed value passed to every reset_seeds() call in this notebook.
SEED = 42
reset_seeds(SEED)
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Bare expression: shows the chosen device as the cell output.
device

In [None]:
# Data path prefix. It is prepended to file names by plain string formatting
# (e.g. f'{DATA_PATH}test.csv'), so a non-empty value must end with a path separator.
DATA_PATH = ''

# model load

In [None]:
from transformers import GPT2LMHeadModel

# Load the "skt/kogpt2-base-v2" tokenizer with its special tokens declared
# explicitly; note that BOS and EOS are both '</s>'.
tokenizer = AutoTokenizer.from_pretrained("skt/kogpt2-base-v2",
                                                    bos_token='</s>',
                                                    eos_token='</s>',
                                                    unk_token='<unk>',
                                                    pad_token='<pad>',
                                                    mask_token='<mask>')

# inference function

In [None]:
from sklearn.metrics.pairwise import pairwise_distances
# Sentence-embedding model used by cosine_score() and evaluate_similarity().
embedding_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')

def cosine_similarity(a, b):
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Args:
        a: First vector (anything ``np.dot`` / ``np.linalg.norm`` accept).
        b: Second vector, compatible with ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``, or ``0`` when either vector has zero
        norm (the ratio would be undefined).
    """
    numerator = np.dot(a, b)
    magnitude_a, magnitude_b = np.linalg.norm(a), np.linalg.norm(b)
    # Guard against dividing by zero for all-zero vectors.
    if magnitude_a == 0 or magnitude_b == 0:
        return 0
    return numerator / (magnitude_a * magnitude_b)

def cosine_score(input_text, generated_text):
    """Score how close two texts are in sentence-embedding space.

    Both texts are encoded with the module-level ``embedding_model`` and
    compared with ``cosine_similarity``.

    Args:
        input_text: Reference text.
        generated_text: Text to compare against the reference.

    Returns:
        The cosine similarity of the two embeddings, with negative values
        clamped to 0.
    """
    embeddings = [embedding_model.encode(piece) for piece in (input_text, generated_text)]
    # Negative similarity is treated as "no similarity".
    return max(cosine_similarity(*embeddings), 0)
def evaluate_similarity(input_text, generated_text, alpha=0.95):
    """Blend embedding cosine similarity with token-level Jaccard similarity.

    Args:
        input_text: Reference text (str).
        generated_text: Candidate text (str) to score against the reference.
        alpha: Weight of the cosine term; the Jaccard term gets ``1 - alpha``.

    Returns:
        ``alpha * cosine_sim + (1 - alpha) * jaccard_sim``.
    """
    input_embedding = embedding_model.encode(input_text)
    generated_embedding = embedding_model.encode(generated_text)

    # Cosine similarity = 1 - cosine distance between the two embeddings.
    cosine_sim = 1 - pairwise_distances([input_embedding], [generated_embedding], metric='cosine')[0][0]

    # Jaccard similarity over whitespace-separated tokens.
    input_tokens = set(input_text.split())
    generated_tokens = set(generated_text.split())
    all_tokens = input_tokens | generated_tokens
    if all_tokens:
        jaccard_sim = len(input_tokens & generated_tokens) / len(all_tokens)
    else:
        # Neither text contains a token: score the overlap as 0 instead of
        # raising ZeroDivisionError on the empty union.
        jaccard_sim = 0.0

    # Weighted average of the two similarities.
    weighted_sim = alpha*cosine_sim + (1 - alpha)*jaccard_sim

    return weighted_sim

In [None]:
def post_processing(text):
    """Clean up decoded model output.

    Cuts the text at the last ``'</s>'`` marker (if present), strips
    surrounding whitespace, removes newlines and collapses runs of spaces
    into a single space.

    Args:
        text: Decoded text, possibly containing ``'</s>'``.

    Returns:
        The cleaned text; an empty string if nothing precedes the marker.
    """
    import re
    end = text.rfind('</s>')
    # rfind() returns -1 when the marker is absent and 0 when the text starts
    # with it, so compare against -1 instead of relying on truthiness.
    if end != -1:
        text = text[:end]
    text = text.strip()
    text = text.replace('\n', '')
    text = re.sub(' +', ' ', text)
    return text

# 실제 테스트

In [None]:
# Load the test set; its '질문' column is read later as the chatbot input.
test = pd.read_csv(f'{DATA_PATH}test.csv')
test

In [None]:
def Chatbot(input_text, model, tokenizer=tokenizer, device=device,
            max_length=200, temperature=0.87, top_k=27, top_p=0.7, num_samples=5, generated=True):
    """Sample several answers to a question and keep the most similar one.

    The question is wrapped as ``'<q>' + question + '</s><a>'``, the model
    samples ``num_samples`` continuations, each is cleaned with
    ``post_processing`` and scored against the question with
    ``evaluate_similarity``.

    Args:
        input_text: The question; surrounding whitespace and double quotes
            are removed before use.
        model: Model exposing ``eval()`` and ``generate()``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        device: Device the prompt tensor is moved to.
        max_length: ``max_length`` passed to ``generate``.
        temperature: Sampling temperature passed to ``generate``.
        top_k: ``top_k`` passed to ``generate``.
        top_p: ``top_p`` passed to ``generate``.
        num_samples: Number of candidates to sample.
        generated: When true, print each ``(score, text)`` candidate.

    Returns:
        ``(best_generated_text, score_lst)``: the highest-scoring candidate
        (``None`` if no candidate scored above -1.0, e.g. ``num_samples`` is
        0) and the list of all candidate scores in sampling order.
    """
    model.eval()
    input_text = input_text.strip().replace('"', '')
    text = '<q>' + input_text + '</s><a>'
    input_ids = tokenizer.encode(text, return_tensors='pt').to(device)
    # generate() returns the prompt ids followed by the continuation. Cutting
    # at the prompt's token count isolates the answer exactly; cutting the
    # decoded string by character count goes wrong whenever decoding does not
    # reproduce the prompt character-for-character (e.g. '<unk>', spacing).
    prompt_len = input_ids.shape[-1]

    best_generated_text = None
    best_similarity_score = -1.0
    score_lst = []
    for _ in range(num_samples):
        result_ids = model.generate(input_ids,
                                    max_length=max_length,
                                    temperature=temperature,
                                    top_k=top_k,
                                    top_p=top_p,
                                    do_sample=True,
                                    num_return_sequences=1,
                                    )

        generated_text = tokenizer.decode(result_ids[0][prompt_len:])
        generated_text = post_processing(generated_text)

        similarity_score = evaluate_similarity(input_text, generated_text)
        score_lst.append(similarity_score)
        if generated:
            print((similarity_score, generated_text))

        if similarity_score > best_similarity_score:
            best_similarity_score = similarity_score
            best_generated_text = generated_text

    return best_generated_text, score_lst

In [None]:
model_name = 'kogpt2_120000_3epoch_0.21946loss'

# Fine-tuned checkpoint stored under <DATA_PATH>model/<model_name>.
model = GPT2LMHeadModel.from_pretrained(f'{DATA_PATH}model/{model_name}')
model = model.to(device)
tokenizer = AutoTokenizer.from_pretrained(
    "skt/kogpt2-base-v2",
    bos_token='</s>',
    eos_token='</s>',
    unk_token='<unk>',
    pad_token='<pad>',
    mask_token='<mask>',
)

# mean_score collects the per-question list of candidate scores returned by
# Chatbot(); answer_list collects the selected answer for each question.
mean_score = []
answer_list = []
for _, row in tqdm(test.iterrows(), total=len(test)):
    # Re-seed before every question so each answer is sampled from the same RNG state.
    reset_seeds(SEED)
    answer, score = Chatbot(row['질문'], model=model, tokenizer=tokenizer, generated=False,
                            max_length=200, temperature=0.5, top_k=20, top_p=0.95)
    answer_list.append(answer)
    mean_score.append(score)

In [None]:
# Print every question with its selected answer, followed by a spacer line.
for a, b in zip(test['질문'].tolist(), answer_list):
    print(f'질문 : {a}', f'답변 : {b}', ' ', sep='\n')

In [None]:
# Attach the selected answers as a new '답변' column (answer_list is in the same row order as test).
test['답변'] = answer_list

# Write the result without the index column. 'utf-8-sig' prepends a BOM,
# presumably so spreadsheet tools detect the encoding of the Korean text (TODO confirm).
test.to_csv(f'{DATA_PATH}inference_test.csv', index=False, encoding='utf-8-sig')