In [None]:
# File: nlp-final-evaluation.ipynb
# Author: Jack Lam
# Date: April 21, 2024
# Purpose: This notebook generates "result.txt" and evaluates models (Llama2 + BART).
# Usage: Please install the require packages. Please modify the loading dataset path.
# Run the required code one by one to generates "result.txt" and and evaluates models (Llama2 + BART). Please ensure to have the loading dataset and the path has modified correctly.

In [1]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
# Install libraries
!pip install ctransformers accelerate peft keybert jieba chinese-synonym-word



**Load Llama2 for RAG**

In [3]:
from ctransformers import AutoModelForCausalLM
from transformers import AutoTokenizer, pipeline

# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
# Model
model_llama = AutoModelForCausalLM.from_pretrained(
    "SinpxAI/Llama2-Chinese-7B-Chat-GGUF",
    model_file="llama2-chinese-7b-chat.Q4_K_M.gguf",
    model_type="llama",
    gpu_layers=110,
    hf=True
)

# Tokenizer
tokenizer_llama = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-chat-hf", use_fast=True)

2024-04-21 03:24:45.386131: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-21 03:24:45.386186: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-21 03:24:45.387665: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

ggml_cuda_set_main_device: using device 0 (Tesla T4) as main device


In [4]:
# Pipeline for Llama2
pipe_llama = pipeline(task="text-generation", model=model_llama, tokenizer=tokenizer_llama)

**Load Fine-tuned Bart Model & Tokenizer**

In [5]:
## Load Fine-tuned Bart Model & Tokenizer

from transformers import BartForConditionalGeneration, AutoTokenizer, pipeline
from peft import PeftModel
import torch

base_model = "fnlp/bart-base-chinese"
new_model = "tonyma163/bart_v1"

device="cuda:0"

base_model_reload = BartForConditionalGeneration.from_pretrained(
        base_model,
        return_dict=True,
        low_cpu_mem_usage=True,
        torch_dtype=torch.float16,
        device_map=device,
        #trust_remote_code=True,
)
base_model_reload.half()

model_bart = PeftModel.from_pretrained(base_model_reload, new_model)

In [6]:
## Load Fine-tuned Bart Model & Tokenizer

from transformers import BertTokenizer

tokenizer_bart = BertTokenizer.from_pretrained(base_model, trust_remote_code=True)

tokenizer_bart.pad_token = tokenizer_bart.eos_token
tokenizer_bart.padding_side = "right"

In [7]:
## Load Fine-tuned Bart Model & Tokenizer

from transformers import Text2TextGenerationPipeline

pipe_bart = Text2TextGenerationPipeline(model=model_bart, tokenizer=tokenizer_bart)

The model 'PeftModelForSeq2SeqLM' is not supported for . Supported models are ['BartForConditionalGeneration', 'BigBirdPegasusForConditionalGeneration', 'BlenderbotForConditionalGeneration', 'BlenderbotSmallForConditionalGeneration', 'EncoderDecoderModel', 'FSMTForConditionalGeneration', 'GPTSanJapaneseForConditionalGeneration', 'LEDForConditionalGeneration', 'LongT5ForConditionalGeneration', 'M2M100ForConditionalGeneration', 'MarianMTModel', 'MBartForConditionalGeneration', 'MT5ForConditionalGeneration', 'MvpForConditionalGeneration', 'NllbMoeForConditionalGeneration', 'PegasusForConditionalGeneration', 'PegasusXForConditionalGeneration', 'PLBartForConditionalGeneration', 'ProphetNetForConditionalGeneration', 'SeamlessM4TForTextToText', 'SeamlessM4Tv2ForTextToText', 'SwitchTransformersForConditionalGeneration', 'T5ForConditionalGeneration', 'UMT5ForConditionalGeneration', 'XLMProphetNetForConditionalGeneration'].


**Load Knowledge Set**

In [8]:
## Load Knowledge Set

import pandas as pd
import ast

file_path = "/kaggle/input/nlp-knowledgeset/knowledge_set.txt"

data = []

# Open the file and parse each line from string to tuple
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        if line.strip():  # Ensure the line is not empty
            try:
                # Convert string representation of tuple to actual tuple
                tuple_data = ast.literal_eval(line.strip())
                data.append(tuple_data)
            except SyntaxError:
                print(f"Skipping malformed line: {line.strip()}")

# Load the data into a DataFrame
df = pd.DataFrame(data, columns=['Entity', 'Category', 'Answer'])

**Clean Knowledge Set**

In [9]:
## Check Category Data
distinct_categories = df['Category'].unique()
distinct_categories

array(['2018-11-14', '喜好', '评论', '特色菜', '简介', '适合听', '评分', '人均价格', '喜欢',
       '适合吃', '2018-10-13', '主演', '2018-10-16', '演唱', '2018-10-15',
       '2018-12-25', '2018-12-27', '类型', '获奖', '成就', '嘿玛口碑', '(live)评论',
       '身高', '2018-11-12', '无持续风向,最高气温:12℃,最低气温:-4℃适合吃', '2018-12-24',
       '2018-11-19', '订单量', '导演', '2018-11-2', '喜欢的新闻',
       '东风,最高气温:13℃,最低气温:6℃适合吃', '东北风,最高气温:6℃,最低气温:0℃适合吃',
       '西南风,最高气温:4℃,最低气温:-5℃适合吃', '人间道评论', '2018-10-26', '新闻', '口碑',
       '东南风,最高气温:5℃,最低气温:-3℃适合吃', '星座', '2018-11-11', '2018-12-21',
       '2018-10-11', '国家地区', '2018-12-12', '2018-10-23', '2018-11-8',
       '东南风,最高气温:11℃,最低气温:1℃适合吃', '2018-10-1', '情绪零碎评论', '成分', '我的姓氏评论',
       '3040评论', '地址', '2018-12-26', '嘿玛导演', '无持续风向,最高气温:19℃,最低气温:4℃适合吃',
       '东南风,最高气温:16℃,最低气温:11℃适合吃', '西北风,最高气温:8℃,最低气温:0℃适合吃', '2018-10-12',
       '2018-12-17', '时间', '2018-10-4', '道道道评论', '(00Live)(live版)评论',
       '无持续风向,最高气温:15℃,最低气温:11℃适合吃', '2018-11-1',
       '无持续风向,最高气温:13℃,最低气温:5℃适合吃', '2018-12-16', '

In [12]:
## Data Clean and proprocess on the Entity and Category

def categoryClean(df , targetCategory):
        
        targetCategoryLen = len(targetCategory)
        #print("\n Target Category Length " + targetCategory + " :" , targetCategoryLen)
        #print("------------------------------------------------------------------------------------------------------------------------")
        
        # Count the "Category" value containing "targetCategory"
        comment_count = df[df['Category']== targetCategory].shape[0]
        #print("\nNumber of rows where 'Category' contains " + targetCategory + " :", comment_count)
        
        #print("------------------------------------------------------------------------------------------------------------------------")
        filtered_data = df[df['Category'].str.contains(targetCategory) & (df['Category'].str.len() > targetCategoryLen)]
        #print(filtered_data)
        
        #print("------------------------------------------------------------------------------------------------------------------------")
        targetRow_ids = df[df['Category'].str.contains(targetCategory) & (df['Category'].str.len() > targetCategoryLen)].index
        #print(targetRow_ids)
        
        #print("------------------------------------------------------------------------------------------------------------------------")
        distinct_categories = df[df['Category'].str.contains(targetCategory)]['Category'].unique()
        #print("Distinct categories containing " + targetCategory + " :", distinct_categories)
        
        ## update 
        df.loc[df['Category'].str.contains(targetCategory), 'Entity'] = df['Entity'] + df['Category'].str.split(targetCategory).str[0]
        df['Category'] = df['Category'].apply(lambda x: targetCategory if targetCategory in x else x)
        
        # Count the "Category" value containing "targetCategory"
        comment_count = df[df['Category'] == targetCategory].shape[0]
        #print("\nNumber of rows where 'Category' contains " + targetCategory + " :", comment_count)
        
        #print("------------------------------------------------------------------------------------------------------------------------")
        filtered_data = df[df['Category'].str.contains(targetCategory) & (df['Category'].str.len() > targetCategoryLen)]
        #print(filtered_data)
        
        #print("------------------------------------------------------------------------------------------------------------------------")
        distinct_categories = df[df['Category'].str.contains(targetCategory)]['Category'].unique()
        #print("Distinct categories containing " + targetCategory + " :", distinct_categories)
        
        return df

In [13]:
## Data Clean and proprocess on the Category

cateList = ['评论', '主演','口碑','导演','类型','评分','国家地区','适合吃','特色菜','人均价格', '订单量','地址',]

for cat in cateList:
    df = categoryClean(df, cat)

In [14]:
## Check Category is cleaned
distinct_categories = df['Category'].unique()
distinct_categories

array(['2018-11-14', '喜好', '评论', '特色菜', '简介', '适合听', '评分', '人均价格', '喜欢',
       '适合吃', '2018-10-13', '主演', '2018-10-16', '演唱', '2018-10-15',
       '2018-12-25', '2018-12-27', '类型', '获奖', '成就', '口碑', '身高',
       '2018-11-12', '2018-12-24', '2018-11-19', '订单量', '导演', '2018-11-2',
       '喜欢的新闻', '2018-10-26', '新闻', '星座', '2018-11-11', '2018-12-21',
       '2018-10-11', '国家地区', '2018-12-12', '2018-10-23', '2018-11-8',
       '2018-10-1', '成分', '地址', '2018-12-26', '2018-10-12', '2018-12-17',
       '时间', '2018-10-4', '2018-11-1', '2018-12-16', '2018-12-7',
       '2018-12-3', '2018-12-13', '血型', '2018-12-6', '2018-11-25',
       '2018-11-26', '2018-12-14', '2018-12-9', '2018-12-19', '2018-10-8',
       '2018-10-27', '2018-10-24', '2018-12-8', '2018-12-2', '2018-11-5',
       '2018-10-19', '体重', '2018-12-15', '2018-10-25', '2018-10-21',
       '2018-10-5', '2018-11-18', '2018-10-20', '2018-10-18',
       '2018-12-23', '2018-10-3', '2018-10-9', '2018-10-6', '2018-11-23',
       '2018-11-3'

**Knowledge Tree**

In [15]:
## Construct Knowledge Tree Function

import networkx as nx
import pandas as pd

# Assuming 'df' is your DataFrame containing the data
G = nx.DiGraph()  # Directed graph can still function as a tree

# Add nodes and edges based on the DataFrame
for index, row in df.iterrows():
    entity_id = f"Entity: {row['Entity']}"
    category_id = f"Category: {row['Category']} ({row['Entity']})"
    answer_id = f"Answer: {row['Answer']} ({row['Category']})"

    # Ensure that nodes for each level (entity, category) are unique per entity-category pair
    if entity_id not in G:
        G.add_node(entity_id, type='Entity', name=row['Entity'])
    if category_id not in G:
        G.add_node(category_id, type='Category', name=row['Category'])
    
    # Answers can be multiple per category, so they are always added
    G.add_node(answer_id, type='Answer', content=row['Answer'])
    
    # Connect nodes hierarchically
    G.add_edge(entity_id, category_id)
    G.add_edge(category_id, answer_id)

**Part of Speech Function**

In [16]:
## Extract the special keyword in brackets, which will not be processed by the extractPOS function.

def checkSentence(sentence):
    separated_sentences = []
    temp_sentence = ""
    in_bracket = False

    for char in sentence:
        if char == "『" or char == "《":
            in_bracket = True
            if temp_sentence:
                separated_sentences.append(temp_sentence)
                temp_sentence = ""
            temp_sentence += char
        elif char == "』" or char == "》":
            in_bracket = False
            temp_sentence += char
            separated_sentences.append(temp_sentence)
            temp_sentence = ""
        elif char == "。" and not in_bracket:
            separated_sentences.append(temp_sentence)
            temp_sentence = ""
        else:
            temp_sentence += char

    if temp_sentence:
        separated_sentences.append(temp_sentence)

    return separated_sentences

In [17]:
## For knowledge Tree searching, extract part of the speech from the sentence.

import jieba.posseg as pseg

def extractPOS(sentences):

    result = []
    special_words = []

    for sentence in sentences:
        if '『' in sentence or '《' in sentence:
            
            # Remove the brackets
            sentence = sentence.replace('『', '@').replace('』', '@').replace('《', '@').replace('》', '@')
            
            #result.append(sentence)
            special_words.append(sentence)
        else:
            allowSentencePOS = ("a", "ad", "ag", "an", ## 形容词 
                                "b", # 区别词
                                "c", # 连词
                                "f", 
                                "g", 
                                "h",
                                "i", 
                                "j", 
                                "k", 
                                "l", 
                                "m", "mg", "mq", 
                                "n", "ng", "nr", "nrfg", "nrt","ns", "nt", "nz", 
                                "o", 
                                "q", 
                                "s", 
                                "t", "tg",
                                "v", "vd", "vg", "vi", "vn","vq" ## 动词
                                )
            
            """"
            ## This part POS will not be extracted
            exclude = ( "p", ## 介词
                "d", "df", "dg", # 副词
                "e", # 叹词 
                "r", "rg", "rr", "rz",  ## 代词
                "u",  "ud", "ug", "uj", "ul", "uv", "uz",  ## 助词
                "x",  ## 非语素词（包含标点符号
                "y", ## 语气词
                "z","zg" ## 助词
              )
            """
            
            sentence = pseg.lcut(sentence)
            seg_words = [word for word, pos in sentence if pos in allowSentencePOS]
            result.extend(seg_words)
            
    return special_words+result

**Retrieve Data from Knowledge Tree**

In [18]:
## Retrieve Data from Knowledge Tree

def has_common_character(keyword, category):
    return any(char in category for char in keyword)

In [19]:
## Knowledge Tree Search function

import jieba
import random

def retrieve_answers(graph, query):
    segmented_keywords = extractPOS(checkSentence(query))
    special_keywords = [kw.strip('@') for kw in segmented_keywords if kw.startswith('@')]
    regular_keywords = [kw for kw in segmented_keywords if not kw.startswith('@')]

    category_answers = {}  # Store answers grouped by categories
    special_categories = {'新闻', '评论'}

    # Top-Down Search: From entities to categories to answers
    for entity_node in (n for n in graph.nodes if graph.nodes[n]['type'] == 'Entity'):
        entity_keywords = [kw for kw in regular_keywords if kw == graph.nodes[entity_node]['name']]
        if entity_keywords:
            for category_node in graph.successors(entity_node):
                category_keywords = [kw for kw in regular_keywords if has_common_character(kw, graph.nodes[category_node]['name'])]
                if category_keywords:
                    for answer_node in graph.successors(category_node):
                        category_name = graph.nodes[category_node]['name']
                        answer_content = graph.nodes[answer_node]['content']
                        if category_name not in category_answers:
                            category_answers[category_name] = []
                        if category_name not in special_categories:
                            category_answers[category_name].append(category_name+answer_content)
                        else:
                            category_answers[category_name].append(answer_content)

    # Bottom-Up Search: From answers to categories to entities
    for answer_node in (n for n in graph.nodes if graph.nodes[n].get('type') == 'Answer'):
        if any(kw == graph.nodes[answer_node]['content'] for kw in special_keywords):
            for category_node in graph.predecessors(answer_node):
                category_name = graph.nodes[category_node]['name']
                if any(has_common_character(kw, category_name) for kw in regular_keywords):
                    for entity_node in graph.predecessors(category_node):
                        if category_name not in category_answers:
                            category_answers[category_name] = []
                        category_answers[category_name].append(graph.nodes[entity_node]['name'] + category_name)

    final_answers = []
    require_llama = False
    for category, answers in category_answers.items():
        if category in special_categories:
            # Randomly pick one answer if category is special
            final_answers.append(random.choice(answers))
        else:
            # Collect all answers for other categories
            final_answers.extend(answers)
            require_llama = True  # Indicate further processing may be needed for non-special categories

    return final_answers, require_llama

In [20]:
## Function Testing
input_query = "你知道张柏芝的生日是什么时候吗？"
answers = retrieve_answers(G, input_query)
print(f"Question: {input_query} Response: {answers}")

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 1.021 seconds.
Prefix dict has been built successfully.


Question: 你知道张柏芝的生日是什么时候吗？ Response: (['生日1980-5-24', '出生地中国香港'], True)


**Llama2(Knowledge Tree) + Fine-tuned Bart**

In [21]:
## System Prompt for Llama2

system_prompt = """
Please answer the following question based on the provided context.
Provide only the direct answers without any additional explanations or context.
Please only output answers only.
Please ensure that your responses are socially unbiased and positive in nature.
If a question does not make any sense or is not factually coherent, explain why instead of answering something not correct.
If you don't know the answer, please don't share false information.
"""

In [22]:
## Llama2 Query Function

def query_llama(query, context):
    # Test
    prompt = f"""
    <s>[INST] <<SYS>>
    {system_prompt}
     <</SYS>> [/INST]</s>

    <s>[INST]
    Question: {query}
    Context: {context}
    Answer:
    [/INST]
        """

    output = pipe_llama(
        prompt,
        do_sample=True,
        max_new_tokens=256,
        top_k=40,
        top_p=0.95,
        temperature=0.75,
        return_full_text=False,
        repetition_penalty=1
    )
    return output[0]['generated_text']

In [23]:
## Enquiry on Llama2 or BART

def query_system(graph, query):
    # Attempt to retrieve answers from the knowledge graph
    answers, require_llama = retrieve_answers(graph, query)
    
    # If answers are found in the graph, return
    if answers and require_llama:
        # pass to llama
        return "Answer from knowledge graph:", query_llama(query, answers)
    elif answers and not require_llama:
        # return directly
        return "Answer from knowledge graph:", answers
    # If no answers are found, -> language model
    else:
        generated_answer = pipe_bart(query)
        return "Answer from language model:", generated_answer[0]['generated_text']

In [24]:
## Function Testing
query = "帮我点一首《想你》吧"
result = query_system(G, query)
print(result)



('Answer from language model:', '正 在 为 你 播 放 《 想 你 》')


In [25]:
## Function Testing
query = "张国荣哪里出生的？"
result = query_system(G, query)
print(result)

('Answer from knowledge graph:', '张国荣出生地是香港九龙。')


In [26]:
## Function Testing
query = "说说张学友的新闻"
result = query_system(G, query)
print(result)

('Answer from knowledge graph:', ['7月8日，河南洛阳民警在张学友演唱会现场附近执勤时，发现一车辆存在交通违法，遂上前了解情况。经查，驾车男子提供的个人信息与其本人不符，身份可疑。经进一步核实，原来该男子从563公里远的武汉赶到洛阳，就是为看张学友演唱会。目前，该男子对冒用他人信息，无证驾驶行为供认不讳，案件正在进一步调查中。据悉，洛阳演唱会现场蜀黍还抓了一波黄牛党，正在核实中。'])


**Import test.txt**

In [27]:
## import test.txt Dataset
from datasets import Dataset, load_dataset

dataset = load_dataset(
    'json',
    data_files={
        'test':
        '/kaggle/input/durecdial/test.txt',
    }
)

**Generate result.txt**

In [28]:
## Preprocess test.txt Dataset

conversation_list = dataset['test']['conversation']
conversation_list

# Format each turn in the context
formatted_context = []

for conversations in conversation_list:
    
    ## Label the last user inquiry for Llama2(Knowledge Tree) inquiry approach
    conversations[-1] = conversations[-1].split(':')[0] + ':' + '##@' + conversations[-1].split(':')[1] + '@##'
    
    ## Formatted the whole data for BART inquiry approach
    formatted_conversation = ''.join(conv.replace('user:', '\nInstruction:\n').replace('bot:','\nResponse:\n').replace(' ','') for conv in conversations)
    
    ## Append into a list 
    formatted_context.append(formatted_conversation.lstrip('\n'))

In [29]:
## This Function is the backend response API to generate response to use.

import re

def generate_text(user_input):
    # Retrieve the latest user input
    match = re.search(r'##@(.*?)@##', user_input)
    
    if match:
        # Extracts text found between the markers
        query = match.group(1)
        full_input_without_markers = re.sub(r'##@.*?@##', query, user_input)
    else:
        query = user_input
        full_input_without_markers = user_input
    
    # Attempt to retrieve from the knowledge set for Llama2
    answers, require_llama = retrieve_answers(G, query)

    # If answers are found in the graph, and further processing with Llama2 is required
    if answers and require_llama:
        return query_llama(query, answers)
    elif answers and not require_llama:
        return answers
    else:
        # If no answers are found, use BART model to generate an answer from the full context without markers
        inputs = tokenizer_bart.encode(full_input_without_markers, return_tensors="pt")
        outputs = model_bart.generate(input_ids=inputs.to(device), max_new_tokens=126)
        response_text = tokenizer_bart.decode(outputs[0], skip_special_tokens=True)
        return response_text

In [None]:
## Generate Response on the test.txt dataset
total_result = []

for con in formatted_context:
    con_result = generate_text(con)
    total_result.append(con_result)

In [None]:
# Create a new list to store the results
# total_result contains list elements, needs to convert into string for Evaluation

modified_total_result = []

# Iterate through candidate_result and convert inner lists to strings
for item in total_result:
    if isinstance(item, list):
        modified_total_result.append(' '.join(item).replace(' ',''))
    else:
        modified_total_result.append(item.replace(' ',''))

In [None]:
# Create a DataFrame to store prediction results for each record.
df_response = pd.DataFrame(modified_total_result, columns=['Response'])

## Generate result.txt for submission
df_response['Response'].to_csv('result.txt', index=False, header=False)

In [None]:
"""
## Result Checking
df_formatted_context = pd.DataFrame(formatted_context)
df_total_result = pd.DataFrame(total_result)

merged_df = pd.concat([df_formatted_context, df_total_result], axis=1)
merged_df.to_csv('result_combine.csv', index=False, header=False)
"""

**Evaluation**

In [30]:
## Get test
conversation_list = dataset['test']['conversation']
conversation_list

[['user: 周迅的星座是什么？',
  'bot: 周迅是天秤座。',
  'user: 你真棒！知道的真多。',
  'bot: 我还知道周迅可是亚洲电影大奖最佳女主角呢！',
  'user: 周迅可是我的偶像，她的演技超好。',
  'bot: 她是真正的为演艺而生，而且还获得过华语电影传媒大奖最佳女演员。',
  'user: 我非常喜欢她演的电影。'],
 ['user: 能告我一下黄渤的生日么？',
  'bot: 好的，1974年8月26日。',
  'user: 好的，谢谢。',
  'bot: 看你挺喜欢黄渤的，告诉你哦，他曾经获得过中国电影表演艺术学会金凤凰奖。',
  'user: 果然是我喜欢的明星。',
  'bot: 好呀，北京大学生电影节最佳男演员，中国电影导演协会年度男演员。',
  'user: 你知道的不少呀。',
  'bot: 那当然，既然说到你的偶像，那为你推荐这部他主演的《疯狂的赛车》，电影对观众情绪的调动，对笑料包袱的运用，对影迷智商和注意力的挑战都是异常出色的，即有趣又高端。',
  'user: 看来这个电影不错哦，喜剧片吗？',
  'bot: 对，喜剧动作片。',
  'user: 好的，那我就看看吧。',
  'bot: 相信我你不会失望的。',
  'user: 我还有事。先忙去啦。'],
 ['user: 你好！今天天气怎么样呢？',
  'bot: 多云,无持续风向,最高气温:24℃,最低气温:20℃',
  'user: 好的，谢谢你告诉我',
  'bot: 不客气呢，今天的天气适合吃酸菜鱼哦',
  'user: 酸菜鱼是我最喜欢',
  'bot: 鱼是很有营养的呢，多吃点比较好',
  'user: 正好呀，可以暖暖身子',
  'bot: 推荐你去《品蜀国精品川菜（科学城店）》，酸菜鱼可是他们店的特色菜',
  'user: 好的呀，人均价格是多少呢？',
  'bot: 58元，不算特别高',
  'user: 地址在哪？',
  'bot: 黄埔区科学城科学大道193号高德汇购物中心3楼',
  'user: 评分怎么样呢？',
  'bot: 4.2呢',
  'user: 好的，明天中午12点2个人，帮我预定一下。'],
 ['user: 今天几号了',
  'bot: 5月24

In [31]:
## Evaluate dataset, which is a conversation before the last bot response
eval_test_set = []

## Evaluate dataset reference, which is the last bot response in a conversation
eval_reference = []

In [32]:
## Data Preprocessing for spliting evaluation conversation and reference
for conversations in conversation_list:
    eval_data = conversations[:-2]
    reference = conversations[-2]
    
    ## Label the last user inquiry for Llama2(Knowledge Tree) inquiry approach
    eval_data[-1] = eval_data[-1].split(':')[0] + ':' + '##@' + eval_data[-1].split(':')[1] + '@##'
    
    ## Formatted the whole data for BART inquiry approach
    formatted_eval_data = ''.join(eval_data_conv.replace('user:', '\nInstruction:\n').replace('bot:','\nResponse:\n').replace(' ','') for eval_data_conv in eval_data)
    
    ## Append into eval_test_set list 
    eval_test_set.append(formatted_eval_data.lstrip('\n'))
    
    ## Append into eval_reference list 
    eval_reference.append(reference)

In [33]:
## Check that all references are responded to by "bot"
sum_bot_res = 0

for reference in eval_reference:
    if reference.__contains__('bot: '):
        sum_bot_res = sum_bot_res + 1
    
print('bot count' ,sum_bot_res)

bot count 2626


In [34]:
## Remove irrelevant string in eval_reference
eval_reference = [s.replace("bot: ", "").replace(" ", "") for s in eval_reference]

In [37]:
## Generate candidate on the eval_test_set(test.txt)
candidate_result = []

for con in eval_test_set:
    con_result = generate_text(con)
    candidate_result.append(con_result)

In [195]:
# Create a new list to store the modified candidate results
# candidate_result contains list element, needs to convert into string for Evaluation

modified_candidate_result = []

# Iterate through candidate_result and convert inner lists to strings
for item in candidate_result:
    if isinstance(item, list):
        modified_candidate_result.append(' '.join(item).replace(' ',''))
    else:
        modified_candidate_result.append(item.replace(' ',''))

In [196]:
## Store into dataframe
df_eval_reference = pd.DataFrame(eval_reference, columns=['Reference'])

## Store into dataframe
df_eval_candidate = pd.DataFrame(modified_candidate_result, columns=['Candidate'])

## Generate Candidate csv
df_eval_candidate['Candidate'].to_csv('eval_candidate.txt', index=False, header=False)

In [197]:
# Calculate BLEU individual scores
bleu_individual_1_gram = [sentence_bleu(ref, cand, weights=(1, 0, 0, 0)) for ref, cand in  zip(df_eval_reference['Reference'], df_eval_candidate['Candidate'] )]
bleu_individual_2_gram = [sentence_bleu(ref, cand, weights=(0, 1, 0, 0)) for ref, cand in  zip(df_eval_reference['Reference'], df_eval_candidate['Candidate'] )]
bleu_individual_3_gram = [sentence_bleu(ref, cand, weights=(0, 0, 1, 0)) for ref, cand in  zip(df_eval_reference['Reference'], df_eval_candidate['Candidate'] )]
bleu_individual_4_gram = [sentence_bleu(ref, cand, weights=(0, 0, 0, 1)) for ref, cand in  zip(df_eval_reference['Reference'], df_eval_candidate['Candidate'] )]

In [199]:
## Get bleu individual avg for all record
bleu_individual_1_gram_avg = sum(bleu_individual_1_gram) / len(bleu_individual_1_gram)
bleu_individual_2_gram_avg = sum(bleu_individual_2_gram) / len(bleu_individual_2_gram)
bleu_individual_3_gram_avg = sum(bleu_individual_3_gram) / len(bleu_individual_3_gram)
bleu_individual_4_gram_avg = sum(bleu_individual_4_gram) / len(bleu_individual_4_gram)

# Print bleu reslut
print("bleu_individual_1_gram_avg: ", bleu_individual_1_gram_avg)
print("bleu_individual_2_gram_avg: ", bleu_individual_2_gram_avg)
print("bleu_individual_3_gram_avg: ", bleu_individual_3_gram_avg)
print("bleu_individual_4_gram_avg: ", bleu_individual_4_gram_avg)

bleu_individual_1_gram_avg:  0.32041782779910594
bleu_individual_2_gram_avg:  0.9345011424219345
bleu_individual_3_gram_avg:  0.9345011424219345
bleu_individual_4_gram_avg:  0.9345011424219345


In [200]:
# bleu individual avge result
bleu_individual_avg = (bleu_individual_1_gram_avg+bleu_individual_2_gram_avg+bleu_individual_3_gram_avg+bleu_individual_4_gram_avg) / 4
print("bleu_individual_avg: ", bleu_individual_avg)

bleu_individual_avg:  0.7809803137662275


In [201]:
# Calculate BLEU cumulative scores
bleu_cumulative_1_gram = [sentence_bleu(ref, cand, weights=(1, 0, 0, 0)) for ref, cand in  zip(df_eval_reference['Reference'], df_eval_candidate['Candidate'] )]
bleu_cumulative_2_gram = [sentence_bleu(ref, cand, weights=(0.5, 0.5, 0, 0)) for ref, cand in  zip(df_eval_reference['Reference'], df_eval_candidate['Candidate'] )]
bleu_cumulative_3_gram = [sentence_bleu(ref, cand, weights=(0.33, 0.33, 0.33, 0)) for ref, cand in  zip(df_eval_reference['Reference'], df_eval_candidate['Candidate'] )]
bleu_cumulative_4_gram = [sentence_bleu(ref, cand, weights=(0.25, 0.25, 0.25, 0.25)) for ref, cand in  zip(df_eval_reference['Reference'], df_eval_candidate['Candidate'] )]

In [202]:
## Get bleu avg for all record
bleu_cumulative_1_gram_avg = sum(bleu_cumulative_1_gram) / len(bleu_cumulative_1_gram)
bleu_cumulative_2_gram_avg = sum(bleu_cumulative_2_gram) / len(bleu_cumulative_2_gram)
bleu_cumulative_3_gram_avg = sum(bleu_cumulative_3_gram) / len(bleu_cumulative_3_gram)
bleu_cumulative_4_gram_avg = sum(bleu_cumulative_4_gram) / len(bleu_cumulative_4_gram)

# Print bleu reslut
print("bleu_cumulative_1_gram_avg: ", bleu_cumulative_1_gram_avg)
print("bleu_cumulative_2_gram_avg: ", bleu_cumulative_2_gram_avg)
print("bleu_cumulative_3_gram_avg: ", bleu_cumulative_3_gram_avg)
print("bleu_cumulative_4_gram_avg: ", bleu_cumulative_4_gram_avg)

bleu_cumulative_1_gram_avg:  0.32041782779910594
bleu_cumulative_2_gram_avg:  0.5095262131966574
bleu_cumulative_3_gram_avg:  0.614697667159339
bleu_cumulative_4_gram_avg:  0.6756516714181057


In [203]:
# bleu avge result
bleu_avg = (bleu_cumulative_1_gram_avg+bleu_cumulative_2_gram_avg+bleu_cumulative_3_gram_avg+bleu_cumulative_4_gram_avg) / 4
print("bleu_avg: ", bleu_avg)

bleu_avg:  0.530073344893302


In [219]:
## Evaluation using rouge
!pip install rouge-chinese

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge-chinese
  Downloading rouge_chinese-1.0.3-py3-none-any.whl.metadata (7.6 kB)
Downloading rouge_chinese-1.0.3-py3-none-any.whl (21 kB)
Installing collected packages: rouge-chinese
Successfully installed rouge-chinese-1.0.3


In [298]:
## Split by char

from rouge import Rouge

rouge = Rouge()

## rouge.get_scores(hyps=' '.join(df_eval_candidate['Candidate'][3]) , refs=' '.join(df_eval_reference['Reference'][3]) )

# Calculate rouge F1 scores
rouge_1_f1 = [rouge.get_scores(hyps=' '.join(cand) , refs=' '.join(ref) )[0]['rouge-1']['f'] for ref, cand in zip(df_eval_reference['Reference'], df_eval_candidate['Candidate'] )]
rouge_2_f1 = [rouge.get_scores(hyps=' '.join(cand) , refs=' '.join(ref) )[0]['rouge-2']['f'] for ref, cand in zip(df_eval_reference['Reference'], df_eval_candidate['Candidate'] )]
rouge_l_f1 = [rouge.get_scores(hyps=' '.join(cand) , refs=' '.join(ref) )[0]['rouge-l']['f'] for ref, cand in zip(df_eval_reference['Reference'], df_eval_candidate['Candidate'] )]

## Get rouge F1  avg for all record
rouge_1_f1_avg = sum(rouge_1_f1) / len(rouge_1_f1)
rouge_2_f1_avg = sum(rouge_2_f1) / len(rouge_2_f1)
rouge_l_f1_avg = sum(rouge_l_f1) / len(rouge_l_f1)

# Print rouge F1  reslut
print("rouge_1_f1_avg: ", rouge_1_f1_avg)
print("rouge_2_f1_avg: ", rouge_2_f1_avg)
print("rouge_l_f1_avg: ", rouge_l_f1_avg)

rouge_1_f1_avg:  0.30251087144514777
rouge_2_f1_avg:  0.14826409383097866
rouge_l_f1_avg:  0.27428766825521295


In [299]:
## Split by word

from rouge import Rouge
import jieba

rouge = Rouge()

## rouge.get_scores(hyps=' '.join(df_eval_candidate['Candidate'][3]) , refs=' '.join(df_eval_reference['Reference'][3]) )

# Calculate rouge F1 scores
rouge_1_word_f1 = [rouge.get_scores(hyps=' '.join(jieba.cut(cand, HMM=False)) , refs=' '.join(jieba.cut(ref, HMM=False)) )[0]['rouge-1']['f'] for ref, cand in zip(df_eval_reference['Reference'], df_eval_candidate['Candidate'] )]
rouge_2_word_f1 = [rouge.get_scores(hyps=' '.join(jieba.cut(cand, HMM=False)) , refs=' '.join(jieba.cut(ref, HMM=False)) )[0]['rouge-2']['f'] for ref, cand in zip(df_eval_reference['Reference'], df_eval_candidate['Candidate'] )]
rouge_l_word_f1 = [rouge.get_scores(hyps=' '.join(jieba.cut(cand, HMM=False)) , refs=' '.join(jieba.cut(ref, HMM=False)) )[0]['rouge-l']['f'] for ref, cand in zip(df_eval_reference['Reference'], df_eval_candidate['Candidate'] )]

         
## Get rouge F1  avg for all record
rouge_1_word_f1_avg = sum(rouge_1_word_f1) / len(rouge_1_word_f1)
rouge_2_word_f1_avg = sum(rouge_2_word_f1) / len(rouge_2_word_f1)
rouge_l_word_f1_avg = sum(rouge_l_word_f1) / len(rouge_l_word_f1)

# Print rouge F1  reslut
print("rouge_1_word_f1_avg: ", rouge_1_word_f1_avg)
print("rouge_2_word_f1_avg: ", rouge_2_word_f1_avg)
print("rouge_l_word_f1_avg: ", rouge_l_word_f1_avg)

rouge_1_word_f1_avg:  0.2735797823796772
rouge_2_word_f1_avg:  0.10356798766268521
rouge_l_word_f1_avg:  0.25414763256277284


**## ROUGE and BLEU Metric**

In [311]:
# Print bleu reslut
print("bleu_individual_1_gram_avg: ", bleu_individual_1_gram_avg)
print("bleu_individual_2_gram_avg: ", bleu_individual_2_gram_avg)
print("bleu_individual_3_gram_avg: ", bleu_individual_3_gram_avg)
print("bleu_individual_4_gram_avg: ", bleu_individual_4_gram_avg)

# Print bleu reslut
print("bleu_cumulative_1_gram_avg: ", bleu_cumulative_1_gram_avg)
print("bleu_cumulative_2_gram_avg: ", bleu_cumulative_2_gram_avg)
print("bleu_cumulative_3_gram_avg: ", bleu_cumulative_3_gram_avg)
print("bleu_cumulative_4_gram_avg: ", bleu_cumulative_4_gram_avg)

# Print bleu reslut
print("rouge_1_f1_avg (Char): ", rouge_1_f1_avg)
print("rouge_2_f1_avg (Char): ", rouge_2_f1_avg)
print("rouge_l_f1_avg (Char): ", rouge_l_f1_avg)

# Print bleu reslut
print("rouge_1_word_f1_avg: ", rouge_1_word_f1_avg)
print("rouge_2_word_f1_avg: ", rouge_2_word_f1_avg)
print("rouge_l_word_f1_avg: ", rouge_l_word_f1_avg)

bleu_individual_1_gram_avg:  0.32041782779910594
bleu_individual_2_gram_avg:  0.9345011424219345
bleu_individual_3_gram_avg:  0.9345011424219345
bleu_individual_4_gram_avg:  0.9345011424219345
bleu_cumulative_1_gram_avg:  0.32041782779910594
bleu_cumulative_2_gram_avg:  0.5095262131966574
bleu_cumulative_3_gram_avg:  0.614697667159339
bleu_cumulative_4_gram_avg:  0.6756516714181057
rouge_1_f1_avg (Char):  0.30251087144514777
rouge_2_f1_avg (Char):  0.14826409383097866
rouge_l_f1_avg (Char):  0.27428766825521295
rouge_1_word_f1_avg:  0.2735797823796772
rouge_2_word_f1_avg:  0.10356798766268521
rouge_l_word_f1_avg:  0.25414763256277284


In [315]:
# Create a dictionary with the results and output eval_Llama2_BART.csv
eval_results = {
    "Metric": ["bleu_individual_1_gram_avg", "bleu_individual_2_gram_avg", "bleu_individual_3_gram_avg", "bleu_individual_4_gram_avg",
               "bleu_cumulative_1_gram_avg", "bleu_cumulative_2_gram_avg", "bleu_cumulative_3_gram_avg", "bleu_cumulative_4_gram_avg",
               "rouge_1_f1_avg (Char)", "rouge_2_f1_avg (Char)", "rouge_l_f1_avg (Char)",
               "rouge_1_word_f1_avg", "rouge_2_word_f1_avg", "rouge_l_word_f1_avg"],
    
    "Value": [bleu_individual_1_gram_avg, bleu_individual_2_gram_avg, bleu_individual_3_gram_avg, bleu_individual_4_gram_avg,
              bleu_cumulative_1_gram_avg, bleu_cumulative_2_gram_avg, bleu_cumulative_3_gram_avg, bleu_cumulative_4_gram_avg,
              rouge_1_f1_avg, rouge_2_f1_avg, rouge_l_f1_avg,
              rouge_1_word_f1_avg, rouge_2_word_f1_avg, rouge_l_word_f1_avg]
}


eval_df = pd.DataFrame([eval_results['Value']], columns=eval_results['Metric'])

eval_df.to_csv('eval_Llama2_BART.csv', index=False, header=True)
