### save models

In [1]:
from transformers import AutoTokenizer, AutoModel

tokenizer = AutoTokenizer.from_pretrained("Writer/camel-5b-hf")
model = AutoModel.from_pretrained("Writer/camel-5b-hf")

tokenizer.save_pretrained("../models/camel-5b-hf")
model.save_pretrained("../models/camel-5b-hf")


tokenizer = AutoTokenizer.from_pretrained("BAAI/bge-small-en-v1.5")
model = AutoModel.from_pretrained("BAAI/bge-small-en-v1.5")


tokenizer.save_pretrained("../models/bge-small-en-v1.5")
model.save_pretrained("../models/bge-small-en-v1.5")

# load model: LlamaIndex

In [28]:
import torch
from llama_index.llms.huggingface import HuggingFaceLLM

llm = HuggingFaceLLM(
    tokenizer_name="../models/camel-5b-hf",
    model_name="../models/camel-5b-hf",
    context_window=2048,
    max_new_tokens=512,
    generate_kwargs={"temperature": 0.25, "do_sample": True},
    device_map="auto",
    tokenizer_kwargs={"max_length": 2048},
)


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

In [2]:
llm.complete("What is Mistral AI?")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


CompletionResponse(text='\nMistral AI is an AI-driven weather forecasting system that provides real-time, accurate, and up-to-date weather forecasts for various regions worldwide.', additional_kwargs={}, raw={'model_output': tensor([[ 2061,   318, 15078,  1373,  9552,    30,   198, 49370,  1373,  9552,
           318,   281,  9552,    12, 15808,  6193, 41164,  1080,   326,  3769,
          1103,    12,  2435,    11,  7187,    11,   290,   510,    12,  1462,
            12,  4475,  6193, 26119,   329,  2972,  7652,  8688,    13, 50256]],
       device='cuda:0')}, delta=None)

In [3]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

embed_model = HuggingFaceEmbedding(model_name="../models/bge-small-en-v1.5")


In [4]:
embeddings = embed_model.get_text_embedding("머신 러닝은 재미있어요")
embeddings[:3]

[0.037243302911520004, -0.0008555697277188301, 0.08908320963382721]

# llamaindex from local model

In [5]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader


documents = SimpleDirectoryReader("../dataset/llamaindex_data").load_data()
vector_index = VectorStoreIndex.from_documents(documents, embed_model=embed_model, similarity_top_k=1)
query_engine = vector_index.as_query_engine(llm=llm)


In [6]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader


documents = SimpleDirectoryReader("../dataset/llamaindex_data").load_data()

vector_index = VectorStoreIndex.from_documents(documents, embed_model="local:../models/bge-small-en-v1.5", similarity_top_k=1)
query_engine = vector_index.as_query_engine(llm=llm)


In [7]:
response = query_engine.query("What is Mistral AI?")
response.response

Token indices sequence length is longer than the specified maximum sequence length for this model (1508 > 512). Running this sequence through the model will result in indexing errors
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


' Mistral AI is a French company founded in April 2023, co-founded by Arthur Mensch, Guillaume Lample, and Timothée Lacroix. The company produces open-source large language models, including Mistral 7B, Mixtral 8x7B, and Mistral Medium. The company also sells AI-powered multilingual conversation assistants, known as "Le Chat". The company\'s language processing model, Mistral 7B, has 7 billion parameters, while the "instruct" model has 7.3 billion parameters. The company also sells AI-powered multilingual conversation assistants, known as "Le Chat".'

# LLM QA system

## generate dataset

In [1]:
import os

api_key = "sk-..."
os.environ["OPENAI_API_KEY"] = api_key

os.environ.get("OPENAI_API_KEY")

'sk-...'

In [9]:
import requests

def get_wikipedia_page_links(title):
    S = requests.Session()

    URL = "https://en.wikipedia.org/w/api.php"

    params = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "links",
        "pllimit": "max"
    }

    all_links = []

    while True:
        response = S.get(url=URL, params=params).json()
        pages = response.get('query', {}).get('pages', {})
        for page_id, page_content in pages.items():
            links = page_content.get('links', [])
            for link in links:
                all_links.append(link['title'])

        if 'continue' in response:
            params['plcontinue'] = response['continue']['plcontinue']
        else:
            break

    return all_links

In [13]:
links_lm = get_wikipedia_page_links('Large language model')
len(links_lm)

400

In [14]:
from openai import OpenAI

client = OpenAI(api_key=api_key)

def get_response(prompt):
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt},],
    )
    return response.choices[0].message.content.strip()

In [15]:
def get_article_abstract(title):
    response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            "exintro": True,
            "explaintext": True,
        },
    ).json()
    page = next(iter(response['query']['pages'].values()))
    return page.get("extract", "")



def is_ml_related(title, abstract):
    prompt = f"""
    Given the abstract of the Wikipedia article, determine if the article is machine-learning or language model related.
    [Rules]
    - Exclude articles about specific individuals
    - Exclude articles that are about general knowledge and not related to machine-learning or langauge model.
    - Include articles about the developer, company.
    - Answer True if the article is related to machine-learning or language model, else False

    [title]
    {title}
    [abstract]
    {abstract}
    [answer]
    """

    return (get_response(prompt).lower().find("true") > -1)


In [18]:
idx = 0
links_lm[idx]

'AI-complete'

In [19]:
title = links_lm[idx]

In [20]:
abstract = get_article_abstract(title)
abstract

'In the field of artificial intelligence, the most difficult problems are informally known as AI-complete or AI-hard, implying that the difficulty of these computational problems, assuming intelligence is computational, is equivalent to that of solving the central artificial intelligence problem—making computers as intelligent as people, or strong AI.  To call a problem AI-complete reflects an attitude that it would not be solved by a simple specific algorithm.  \nAI-complete problems are hypothesised to include computer vision, natural language understanding, and dealing with unexpected circumstances while solving any real-world problem.Currently, AI-complete problems cannot be solved with modern computer technology alone, but would also require human computation.  This property could be useful, for example, to test for the presence of humans as CAPTCHAs aim to do, and for computer security to circumvent brute-force attacks.'

In [21]:
is_ml_related(title, abstract)

True

In [26]:
title = "ISBN"
abstract = get_article_abstract(title)
abstract

'The International Standard Book Number (ISBN) is a numeric commercial book identifier that is intended to be unique. Publishers purchase or receive ISBNs from an affiliate of the International ISBN Agency.An ISBN is assigned to each separate edition and variation (except reprintings) of a publication. For example, an e-book, a paperback and a hardcover edition of the same book must each have a different ISBN. The ISBN is ten digits long if assigned before 2007, and thirteen digits long if assigned on or after 1 January 2007. The method of assigning an ISBN is nation-specific and varies between countries, often depending on how large the publishing industry is within a country.\nThe initial ISBN identification format was devised in 1967, based upon the 9-digit Standard Book Numbering (SBN) created in 1966. The 10-digit ISBN format was developed by the International Organization for Standardization (ISO) and was published in 1970 as international standard ISO 2108 (the 9-digit SBN code 

In [27]:
is_ml_related(title, abstract)

False

In [28]:
from pathlib import Path

data_path = Path("../dataset/lm_texts")

if not data_path.exists():
    Path.mkdir(data_path)

In [29]:
def save_full_article(title):
    response = requests.get(
        "https://en.wikipedia.org/w/api.php",
        params={
            "action": "query",
            "format": "json",
            "titles": title,
            "prop": "extracts",
            "explaintext": True,
        },
    ).json()
    page = next(iter(response['query']['pages'].values()))
    full_text = page.get("extract", "")
    
    with open(f"../dataset/lm_texts/{title}.txt", "w", encoding="utf-8") as file:
        file.write(full_text)

In [30]:
title_valid = []

In [32]:
## 강의 현장 진행을 위한 tmp code
def is_ml_related(title, abstract):
    return False

In [33]:
from tqdm import tqdm

for title in tqdm(links_lm):
    abstract = get_article_abstract(title)
    if is_ml_related(title, abstract):
        save_full_article(title)
        title_valid.append(title)

100%|█████████████████████████████████████████| 400/400 [01:03<00:00,  6.33it/s]


In [38]:
len(title_valid)

297

## setting llamaindex pipeline

In [2]:
from llama_index.core import Settings 

Settings.chunk_size = 256

In [41]:
%%time
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader


documents_lm = SimpleDirectoryReader("../dataset/lm_texts").load_data()
vector_index_lm = VectorStoreIndex.from_documents(documents_lm, embed_model="local:../models/bge-small-en-v1.5", similarity_top_k=1)


CPU times: user 2min 46s, sys: 9.49 s, total: 2min 56s
Wall time: 2min 12s


### saving vector index

In [4]:
vector_index_lm.storage_context.persist(persist_dir="../models/vector_index_lm_text_bge")

#### loading vector index

In [5]:
from llama_index.core import StorageContext, load_index_from_storage

storage_context = StorageContext.from_defaults(persist_dir="../models/vector_index_lm_text_bge")

vector_index_lm = load_index_from_storage(storage_context, embed_model="local:../models/bge-small-en-v1.5")

In [88]:
query_engine_lm = vector_index_lm.as_query_engine(llm=llm)

# Evaluate
## Evaluate - manual

In [9]:
query = "What is TD learning?"

print(llm.complete(query))

for engine in [query_engine, query_engine_lm]:
    print("========")
    response = engine.query(query)
    
    resp_text = response.response
    resp_nodes = response.source_nodes
    
    print(resp_text)
    print("===Sources====")
    for node in resp_nodes:
        print(node)
    print("=====\n\n")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Token indices sequence length is longer than the specified maximum sequence length for this model (575 > 512). Running this sequence through the model will result in indexing errors
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.




TD learning is a teaching method that involves teaching a subject through a series of tasks or exercises, rather than using a textbook. It emphasizes active learning, self-directed learning, and the development of practical skills.

TD (TensorFlow Dynamic) learning is a type of machine learning algorithm used in supervised learning, where the goal is to learn a mapping from input to output by iteratively adjusting the weights of a neural network. It is particularly useful for tasks like classification and regression, where the output is continuous and can take any value.
===Sources====
Node ID: f9207060-01d3-4dc6-8834-9185e974dee6
Text: OpenAI stated that full version of GPT-3 contains 175 billion
parameters, two orders of magnitude larger than the 1.5 billion
parameters in the full version of GPT-2 (although GPT-3 models with as
few as 125 million parameters were also trained).OpenAI stated that
GPT-3 succeeds at certain "meta-learning" tasks. It can generalize the
purpose of ...
Sc

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


 TD learning is a model-free reinforcement learning algorithm that learns by adjusting predictions based on current estimates, like dynamic programming methods, using bootstrapping.
===Sources====
Node ID: 290ca16a-84b0-4d39-b58c-63ed4aae00ef
Text: Temporal difference (TD) learning refers to a class of model-
free reinforcement learning methods which learn by bootstrapping from
the current estimate of the value function. These methods sample from
the environment, like Monte Carlo methods, and perform updates based
on current estimates, like dynamic programming methods.While Monte
Carlo meth...
Score:  0.789

Node ID: 89782e35-669d-4e30-b205-84623ab89781
Text: S_{t+1}}    are the current and next states, respectively. The
value                                    R                        t
+             1                             +         γ         V
(                    S                        t             +
1                             )                 {\displa...
Score:  0.773

In [46]:
query = "What is Claude3?"

print(llm.complete(query))

for engine in [query_engine, query_engine_lm]:
    print("========")
    response = engine.query(query)
    
    resp_text = response.response
    resp_nodes = response.source_nodes
    
    print(resp_text)
    print("===Sources====")
    for node in resp_nodes:
        print(node)
    print("=====\n\n")

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Claude3 is a French term for "classical" or "high" art, referring to works created during the Classical era (around the 16th-19th centuries) in Europe.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Sora, a text-to-video model, can generate high-quality videos up to one minute long. It uses a 12-billion-parameter GPT-3 model to interpret natural language inputs and generate corresponding images. OpenAI trained the system using publicly-available videos as well as copyrighted videos licensed for the purpose. The technology behind Sora is an adaptation of the DALL·E 3 text-to-image model. OpenAI demonstrated a few high-quality videos to the general public on February 15, 2024, stating that the technology was able to generate videos up to one minute long. It also shared a technical report highlighting the methods used to train the model and the model's capabilities. It acknowledged some shortcomings of the system, including struggles in simulating complex physics. Will Douglas Heaven of the MIT Technology Review called the demonstration videos "impressive", but noted that they must have been cherry-picked and may not be representative of Sora's typical output.
===Sources====
Node ID

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Claude 3 is a language model developed by Anthropic that was released in March 4, 2024. It is known for its ability to perform meta-cognitive reasoning, such as recognizing it is being tested during needle in a haystack tasks.
===Sources====
Node ID: 09d7ccc3-8ae0-4f2a-8397-7888eb501c42
Text: === Claude 2 === Claude 2 was the next major iteration of
Claude, which was released in July 11 2023 and available to the
general public, whereas the Claude 1 was only available to selected
users approved by Anthropic.Claude 2 expanded its context window from
9,000 tokens to 100,000 tokens. Features included ability to upload
PDFs and other docu...
Score:  0.786

Node ID: d4bda3a7-d3b3-4a78-b364-2beb87d9c053
Text: === Claude 3 === Claude 3 was released on March 4, 2024 with
claims in the press release to have set new industry benchmarks across
a wide range of cognitive tasks. The Claude 3 family includes three
state-of-the-art models in ascending order of capability: Haiku,
Sonnet, and Opus. The 

## Evaluate- using GPT call
### Generate query from chunks

In [21]:
import random

nodes = list(vector_index_lm.docstore.docs.values())
nodes_sampled = [node.text for node in random.sample(nodes, 800)]

In [7]:
def gen_qa(context):
    prompt = f"""
    Given the context, generate the question-answer set
    
    [Rules]
    - The question and answer must be sufficiently related.
    - The question should be answerable by referring to the content of the context
    - The question must be in one sentence.
    - The answer must be three sentences or fewer.
    
    [Example A]
    [context]
    Perplexity
    The most commonly used measure of a language model's performance is its perplexity on a given text corpus. Perplexity is a measure of how well a model is able to predict the contents of a dataset; the higher the likelihood the model assigns to the dataset, the lower the perplexity. Mathematically, perplexity is defined as the exponential of the average negative log likelihood per token:
    Because language models may overfit to their training data, models are usually evaluated by their perplexity on a test set of unseen data.[38] This presents particular challenges for the evaluation of large language models. As they are trained on increasingly large corpora of text largely scraped from the web, it becomes increasingly likely that models' training data inadvertently includes portions of any given test set.[6
    
    [question]
    What is perplexity?
    
    [answer]
    Perplexity is a metric for assessing how effectively a language model can forecast the contents of a dataset, commonly used as a measure of a language model's
    
    
    [Task]
    [context]
    {context}
    [question]
    """

    return get_response(prompt)


def parse_qa(qa):
    question, answer = qa.split("[answer]")
    question = question.strip()
    answer = answer.strip()

    return question, answer

In [35]:
context = nodes[0]
qa = gen_qa(context)
q, a = parse_qa(qa)

In [None]:
context

In [37]:
qa

'What do the terms AI-complete or AI-hard refer to in the field of artificial intelligence? \n\n[answer]\nIn the field of artificial intelligence, the terms AI-complete or AI-hard refer to the most challenging computational problems, suggesting that solving these problems is equivalent in difficulty to making computers as intelligent as humans.'

In [38]:
q, a

('What do the terms AI-complete or AI-hard refer to in the field of artificial intelligence?',
 'In the field of artificial intelligence, the terms AI-complete or AI-hard refer to the most challenging computational problems, suggesting that solving these problems is equivalent in difficulty to making computers as intelligent as humans.')

In [12]:
import pandas as pd
from tqdm import tqdm

qa_pairs = []
questions = []
answers = []
contexts = []
for node in tqdm(nodes_sampled[:]):
    pass
    qa = gen_qa(node)
    try:
        q, a = parse_qa(qa)

        contexts.append(node)
        qa_pairs.append(qa)
        questions.append(q)
        answers.append(a)
    except Exception as exp:
        print(exp)


In [None]:
df_qa = pd.DataFrame({'context': contexts, 'qa_pairs': qa_pairs, 'question': questions, 'answer': answers})

In [15]:
df_qa = pd.read_csv("../dataset/ml_qa_raw.csv")
df_qa[:20].to_csv("../dataset/ml_qa_test.csv", index=False)
df_qa[20:].to_csv("../dataset/ml_qa_train.csv", index=False)

In [None]:
df_qa

### Generate query from selected models

In [30]:
new_models = ["Claude 3", "Gemini 1.5", "Mixtral8x7B", "Llama2", "PanGu"]

In [33]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader

documents = SimpleDirectoryReader("../dataset/lm_texts").load_data()

vector_index = VectorStoreIndex.from_documents(documents)
vector_index.storage_context.persist(persist_dir="../models/vector_index_lm_text_chatGPT")

In [12]:
from llama_index.core.retrievers import VectorIndexRetriever

retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=1,
)

In [41]:
model_contexts = []
for model in new_models:
    result = retriever.retrieve(model)[0].text
    model_contexts.append(result)

In [None]:
model_contexts

In [78]:
def gen_qa_new_models(context, model):
    prompt = f"""
    Given the context and model, generate the question-answer set
    
    [Rules]
    - Generate questions about the model, and create answers to them based on the given context.
    - Create a simple one-sentence question focusing on one aspect of the model's characteristics.
    - Keep the answer to no more than two sentences.
    
    [Example A]
    [context]
    GPT-J or GPT-J-6B is an open-source large language model (LLM) developed by EleutherAI in 2021.[1] As the name suggests, it is a generative pre-trained transformer model designed to produce human-like text that continues from a prompt. The optional "6B" in the name refers to the fact that it has 6 billion parameters.[2]
    [question]
    When is the initial relase of GPT-J?
    
    [answer]
    The initial release of GPT-J, an open-source large language model developed by EleutherAI, was in June 2021.
    
    [Task]
    [context]
    {context}
    [model]
    {model}
    [question]
    """

    return get_response(prompt)

In [81]:
qa_pairs = []
questions = []
answers = []
contexts = []
for model, context in zip(new_models, model_contexts):
    qa = gen_qa_new_models(model, context)
    try:
        q, a = parse_qa(qa)
    
        contexts.append(context)
        qa_pairs.append(qa)
        questions.append(q)
        answers.append(a)
    except Exception as exp:
        pritn(exp)


In [None]:
df_qa_new_model = pd.DataFrame({'context': contexts, 'qa_pairs': qa_pairs, 'question': questions, 'answer': answers})
df_qa_new_model.to_csv("../dataset/ml_qa_new_models.csv")

In [84]:
df_qa_new_model

Unnamed: 0,context,qa_pairs,question,answer
0,Claude is a family of large language models de...,What is Claude 3?\n \n [answer]\n Cla...,What is Claude 3?,Claude 3 is a software application used for de...
1,== Technical specifications ==\nThe first gene...,What is the latest version of Gemini? \n \n...,What is the latest version of Gemini?,The latest version of Gemini is 1.5.
2,==== Mixtral 8x7B ====\nMuch like Mistral's fi...,What is the key feature of Mixtral8x7B?\n \...,What is the key feature of Mixtral8x7B?,Mixtral8x7B is known for its 7 billion paramet...
3,=== Fine-tuning ===\nLlama 1 models are only a...,What is the purpose of Llama2?\n\n[answer]\nLl...,What is the purpose of Llama2?,Llama2 is a language model designed to generat...
4,"Huawei PanGu, PanGu, PanGu-Σ or PanGu-π is a m...",What is PanGu? \n \n[answer]\nPanGu is a Ch...,What is PanGu?,PanGu is a Chinese natural language processing...


## generate response

In [4]:
from pathlib import Path

data_path = Path("../results/inference_result")
eval_data_path = Path("../results/eval_result")


if not data_path.exists():
    Path.mkdir(data_path)

if not data_path.exists():
    Path.mkdir(eval_data_path)

In [19]:
df_test = df_qa[:10]
llm_answer = []
qe_answer = []

for idx, row in tqdm(df_test.iterrows()):
    query = row['question']
    resp_llm = llm.complete(query)
    resp_qe = query_engine_lm.query(query)
    llm_answer.append(resp_llm)
    qe_answer.append(resp_qe)


0it [00:00, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
1it [00:06,  6.01s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
2it [00:11,  5.85s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
3it [00:16,  5.50s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
4it [00:22,  5.46s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
5it [00:27,  5.32s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
6it [00:

In [None]:
df_eval = pd.concat([df_test, df_test])
df_eval['answer_llm'] = llm_answer + qe_answer
df_eval['model'] = ['llm']*len(llm_answer) + ['qe']*len(qe_answer)
df_eval.to_csv(f"{data_path}/qa_inferenced.csv", index=False)

In [89]:
llm_answer = []
qe_answer = []

for idx, row in tqdm(df_qa_new_model.iterrows()):
    query = row['question']
    resp_llm = llm.complete(query)
    resp_qe = query_engine_lm.query(query)
    llm_answer.append(resp_llm)
    qe_answer.append(resp_qe)


0it [00:00, ?it/s]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Token indices sequence length is longer than the specified maximum sequence length for this model (634 > 512). Running this sequence through the model will result in indexing errors
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
1it [00:06,  6.30s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
2it [00:10,  4.87s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
3it [00:14,  4.85s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
4it [00:20,  5.22s/it]Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

In [None]:
df_eval = pd.concat([df_qa_new_model, df_qa_new_model])
df_eval['answer_llm'] = llm_answer + qe_answer
df_eval['model'] = ['llm']*len(llm_answer) + ['qe']*len(qe_answer)
df_eval.to_csv(f"{data_path}/qa_models_inferenced.csv", index=False)

## result

In [11]:
df_result = pd.read_csv(f"{eval_data_path}/qa_inferenced_eval.csv")
df_result

Unnamed: 0,context,qa_pairs,question,answer,answer_llm,model,result,score,feedback
0,Fuzzy clustering (also referred to as soft clu...,What is fuzzy clustering and how does it diffe...,What is fuzzy clustering and how does it diffe...,Fuzzy clustering allows data points to belong ...,\n\nFuzzy clustering is a data analysis techni...,llm,query='What is fuzzy clustering and how does i...,4.5,The generated answer correctly explains what f...
1,== Training ==\nAn RNN using LSTM units can be...,What problem does the use of gradient descent ...,What problem does the use of gradient descent ...,Using gradient descent for standard RNNs can l...,\n\nGradient descent poses a problem for stand...,llm,query='What problem does the use of gradient d...,4.5,The generated answer is relevant and mostly co...
2,,What is the purpose of evaluating language mod...,What is the purpose of evaluating language mod...,Evaluating language models using perplexity he...,\nPerplexity is a measure of how difficult it ...,llm,query='What is the purpose of evaluating langu...,3.5,The generated answer is relevant to the user q...
3,== Definition ==\nLet \n \n \n \n ...,What is a hidden Markov model?\n \n[answer]...,What is a hidden Markov model?,A hidden Markov model consists of a pair of di...,\nA hidden Markov model (HMM) is a statistical...,llm,query='What is a hidden Markov model?' context...,4.5,The generated answer is relevant and mostly co...
4,== Inception Program ==\nNvidia's Inception Pr...,What was the controversy surrounding Nvidia an...,What was the controversy surrounding Nvidia an...,Nvidia faced controversy when they decided to ...,\n\nHardware Unboxed was a video game review s...,llm,query='What was the controversy surrounding Nv...,1.0,The generated answer is not relevant to the us...
5,== Deep learning ==\nThe previous section desc...,What is the key design desideratum for MoE in ...,What is the key design desideratum for MoE in ...,The key design desideratum for MoE in deep lea...,\n\nThe key design desideratum for MoE in deep...,llm,query='What is the key design desideratum for ...,2.0,The generated answer is relevant to the user q...
6,"In computing, the term text processing refers ...",What is text processing in computing?\n\n[answ...,What is text processing in computing?,Text processing in computing refers to automat...,\nText processing in computing refers to the p...,llm,query='What is text processing in computing?' ...,4.5,The generated answer is relevant and mostly co...
7,== Limitations ==\nRLHF suffers from challenge...,What are some limitations of Reinforcement Lea...,What are some limitations of Reinforcement Lea...,RLHF faces challenges in gathering human feedb...,\n1. Limited Feedback: RLHF relies on user fee...,llm,query='What are some limitations of Reinforcem...,4.5,The generated answer is highly relevant to the...
8,=== Training an autoencoder ===\nAn autoencode...,What defines the task for judging the quality ...,What defines the task for judging the quality ...,The task for assessing the quality of an autoe...,\nThe task for judging the quality of an autoe...,llm,query='What defines the task for judging the q...,3.5,The generated answer is relevant to the user q...
9,{\displaystyle {\begin{aligned}Q(\theta \mid \...,What is the significance of the M step in the ...,What is the significance of the M step in the ...,The M step in the expectation maximization (EM...,\n\nThe M step in the context of expectation m...,llm,query='What is the significance of the M step ...,4.5,The generated answer is relevant and mostly co...


In [12]:
df_result[['model', 'score']].groupby('model').mean()

Unnamed: 0_level_0,score
model,Unnamed: 1_level_1
llm,3.7
qe,4.25


In [13]:
df_result_models = pd.read_csv(f"{eval_data_path}/qa_models_inferenced_eval.csv")
df_result_models

Unnamed: 0.1,Unnamed: 0,context,qa_pairs,question,answer,answer_llm,model,result,score,feedback
0,0,Claude is a family of large language models de...,What is Claude 3?\n \n [answer]\n Cla...,What is Claude 3?,Claude 3 is a software application used for de...,\n\nClaude 3 is a French comedy film directed ...,llm,query='What is Claude 3?' contexts=None respon...,1.0,The generated answer is not relevant to the us...
1,1,== Technical specifications ==\nThe first gene...,What is the latest version of Gemini? \n \n...,What is the latest version of Gemini?,The latest version of Gemini is 1.5.,\n\nThe latest version of Gemini is 0.13.1.,llm,query='What is the latest version of Gemini?' ...,2.0,The generated answer is relevant to the user q...
2,2,==== Mixtral 8x7B ====\nMuch like Mistral's fi...,What is the key feature of Mixtral8x7B?\n \...,What is the key feature of Mixtral8x7B?,Mixtral8x7B is known for its 7 billion paramet...,\nThe key feature of Mixtral8x7B is its abilit...,llm,query='What is the key feature of Mixtral8x7B?...,1.0,The generated answer is not relevant to the us...
3,3,=== Fine-tuning ===\nLlama 1 models are only a...,What is the purpose of Llama2?\n\n[answer]\nLl...,What is the purpose of Llama2?,Llama2 is a language model designed to generat...,\nLlama2 is a free and open-source software pr...,llm,query='What is the purpose of Llama2?' context...,1.0,The generated answer is not relevant to the us...
4,4,"Huawei PanGu, PanGu, PanGu-Σ or PanGu-π is a m...",What is PanGu? \n \n[answer]\nPanGu is a Ch...,What is PanGu?,PanGu is a Chinese natural language processing...,"\nPanGu is a popular Chinese herbal medicine, ...",llm,query='What is PanGu?' contexts=None response=...,1.0,The generated answer is not relevant to the us...
5,5,Claude is a family of large language models de...,What is Claude 3?\n \n [answer]\n Cla...,What is Claude 3?,Claude 3 is a software application used for de...,\nClaude 3 is a language model developed by An...,qe,query='What is Claude 3?' contexts=None respon...,1.0,The generated answer is not relevant to the us...
6,6,== Technical specifications ==\nThe first gene...,What is the latest version of Gemini? \n \n...,What is the latest version of Gemini?,The latest version of Gemini is 1.5.,"\nGemini 1.0 was announced on December 6, 2023...",qe,query='What is the latest version of Gemini?' ...,1.0,The generated answer does not provide the info...
7,7,==== Mixtral 8x7B ====\nMuch like Mistral's fi...,What is the key feature of Mixtral8x7B?\n \...,What is the key feature of Mixtral8x7B?,Mixtral8x7B is known for its 7 billion paramet...,\nThe key feature of Mixtral8x7B is its use of...,qe,query='What is the key feature of Mixtral8x7B?...,2.5,The generated answer is relevant to the user q...
8,8,=== Fine-tuning ===\nLlama 1 models are only a...,What is the purpose of Llama2?\n\n[answer]\nLl...,What is the purpose of Llama2?,Llama2 is a language model designed to generat...,\nLlama 2 is a chatbot and conversational agen...,qe,query='What is the purpose of Llama2?' context...,4.5,The generated answer is relevant and mostly co...
9,9,"Huawei PanGu, PanGu, PanGu-Σ or PanGu-π is a m...",What is PanGu? \n \n[answer]\nPanGu is a Ch...,What is PanGu?,PanGu is a Chinese natural language processing...,\nPanGu is a large learning language model dev...,qe,query='What is PanGu?' contexts=None response=...,4.5,The generated answer is relevant and correct. ...


In [14]:
df_result_models[['model', 'score']].groupby('model').mean()

Unnamed: 0_level_0,score
model,Unnamed: 1_level_1
llm,1.2
qe,2.7


## results - finetuning model

In [15]:
df_result_ft = pd.read_csv(f"{eval_data_path}/qa_inferenced_ft_eval.csv")
df_result_all = pd.concat([df_result, df_result_ft])
df_result_all[['model', 'score']].groupby('model').mean()

Unnamed: 0_level_0,score
model,Unnamed: 1_level_1
finetune,3.3
llm,3.7
qe,4.25


In [16]:
for idx, row in df_result_ft.iterrows():
    print(idx)
    print("==query==")
    print(row['question'])
    print("==answer_llm==")
    print(row['answer_llm'])
    print("==score==")
    print(row['score'])   
    print("==feedback==")
    print(row['feedback'])
    print("\n\n")

0
==query==
What is fuzzy clustering and how does it differ from hard clustering?
==answer_llm==
Fuzzy clustering is a clustering method that uses a fuzzy membership function to assign a cluster label to each data point. It differs from hard clustering in that it allows for multiple clusters and does not require the data points to be in a specific order. Fuzzy clustering is often used in scenarios where the data points are not ordered or have a high degree of uncertainty.
==score==
3.0
==feedback==
The generated answer is relevant to the user query, but it contains some inaccuracies. It correctly explains that fuzzy clustering assigns a cluster label to each data point using a fuzzy membership function, but it incorrectly states that fuzzy clustering allows for multiple clusters and does not require the data points to be in a specific order. The key difference between fuzzy and hard clustering is that in fuzzy clustering, data points can belong to multiple clusters with varying degrees

In [17]:
df_result_models_ft = pd.read_csv(f"{eval_data_path}/qa_models_inferenced_ft_eval.csv")
df_result_models_all = pd.concat([df_result_models, df_result_models_ft])
df_result_models_all[['model', 'score']].groupby('model').mean()

Unnamed: 0_level_0,score
model,Unnamed: 1_level_1
finetune,1.5
llm,1.2
qe,2.7


In [18]:
for idx, row in df_result_models_all.sort_values(['question', 'model']).iterrows():
    print(idx, row['model'])
    print("==query==")
    print(row['question'])
    print("==answer_llm==")
    print(row['answer_llm'])
    print("==score==")
    print(row['score'])   
    print("==feedback==")
    print(row['feedback'])
    print("\n\n")

0 finetune
==query==
What is Claude 3?
==answer_llm==
Claude 3 is a machine learning algorithm that uses a combination of supervised learning and reinforcement learning to train a model to predict the output of a system, such as a computer or a human, based on the input provided by the system. The algorithm is designed to learn the system's output by interacting with it and receiving feedback from the system. The algorithm is used to train models for various tasks such as image recognition, natural language processing, and speech recognition.
==score==
1.0
==feedback==
The generated answer is not relevant to the user query. The user asked about Claude 3, a software application used for design and analysis in the field of environmental engineering, but the generated answer describes it as a machine learning algorithm, which is incorrect.



0 llm
==query==
What is Claude 3?
==answer_llm==


Claude 3 is a French comedy film directed by Claude Chabrol and starring Jean-Paul Belmondo, Simo