In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel
import json 

base_model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_id,  # Llama 2 7b, same as before
    quantization_config=bnb_config,  # Same quantization config as before
    device_map="auto",
    trust_remote_code=True,
)

eval_tokenizer = AutoTokenizer.from_pretrained(
    base_model_id,
    add_bos_token=True,
    trust_remote_code=True,
)



ft_model = PeftModel.from_pretrained(base_model, "/home/featurize/work/TinyLLaMA/src/outputs/tinyllama-finetune-2024-04-16-04-06/checkpoint-1000")
tokenizer = AutoTokenizer.from_pretrained(base_model_id)
tokenizer.pad_token = tokenizer.eos_token

with open("/home/featurize/work/TinyLLaMA/src/data/query_test.json", "r") as f:
    query = json.load(f)
    
eval_prompt = f""" You are an expert at extracting useful information from user queries. I need you to extract meta information from the user's query.  The extraction reults contain 'year', 'month', 'day', 'file content', 'file type' information for file retriever to locate the file. The extracted information should exclusively contain key-value pairs. Additionally, please generate 5 synonyms for the extracted 'file content'. Below are 5 examples that meet these requirements:
Example1
### query: Project documentation from January 15, 2024, to February 20, 2024
### information: {{'year': [2024, 2024], 'month': [1, 2], 'day': [15, 20], 'file content': ['Project Documentation', 'Project Files', 'Project Overview', 'Project Details', 'Project Progress Documentation'], 'file type': ['pdf', 'doc', 'docx']}}

Example2
### query: Find my photos from New York last summer
### information: {{'year': [-1, -1], 'month': [6, 8], 'day': [0, 0], 'file content': ['Photo taken in New York', 'New York Image', 'New York Snapshot', 'New York Picture', 'New York Photograph'], 'file type': ['jpg', 'jpeg', 'png', 'heif', 'tiff']}}

Example3
### query: How is AI transforming healthcare diagnostics?
### information: {{'year': [], 'month': [], 'day': [], 'file content': ['AI in Healthcare Diagnostics', 'Artificial Intelligence and Medical Imaging', 'Machine Learning for Early Detection', 'AI Applications in Healthcare', 'Innovations in AI-based Diagnostics'], 'file type': ['pdf', 'docx', 'pptx', 'mp4', 'mp3']}}

Example4
### query: Conference materials from the Global Tech Summit held from 2023/10/10 to 2023/10/12
### information: {{'year': [2023, 2023], 'month': [10, 10], 'day': [10, 12], 'file content' : ['Global Tech Summit Materials', 'Tech Summit Presentations', 'Tech Conference Docs', 'Tech Summit Slides', 'Tech Summit Proceedings'], 'file type': ['pdf', 'pptx', 'doc', 'docx']}}

Example5
### query: The best ways to introduce coding to children
### information: {{'year': [], 'month': [], 'day': [], 'file content': ['Coding for Kids', 'Children\'s Programming Basics', 'Fun Coding Projects for Kids', 'Learning to Code Through Games', 'Introduction to Programming for Young Learners'], 'file type': ['pdf', 'docx', 'pptx', 'mp4']}}

Now, please extract meta information from this user query:
### query: {"photos do not contain cats"}
### information: """    
#     return full_prompt


# {query[4]["input"]}

eval_prompt = ""
model_input = tokenizer(eval_prompt, return_tensors="pt").to("cuda")
ft_model.eval()
with torch.no_grad():
    print(eval_tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=150)[0], skip_special_tokens=True))

SyntaxError: invalid syntax (2651671034.py, line 66)

In [None]:
import re
from pprint import pprint
import ast

def extract_answer(prediction):

    # Extracting the part of the text after the specified prompt
    relevant_part = prediction.split("Now, please extract meta information from this user query:")[-1].strip()

    # Updated regex pattern to accommodate potential variations in formatting
    pattern = r"### query:\s*(.*?)\n### information:\s*(\{.*?\})\s*"

    # Search for the pattern in the relevant part of the text
    match = re.search(patt0ern, relevant_part, re.DOTALL)

    extracted_information = {}

    if match:
        # Extract query and information string
        query, information_str = match.groups()
        # Convert information string into a Python dictionary
        information = ast.literal_eval(information_str)
        extracted_information["query"] = query
        extracted_information["information"] = information
    return extracted_information


In [None]:
%env OPENAI_API_KEY= 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

def evaluate_by_gpt(truth, pred):
    pred = str(extract_answer(pred)["information"])
    truth = str(truth["information"])
    
    prompt = """ 
    You are an expert at evaluating LLMs predictions. The output contains 'year', 'month', 'day', 'file content', 'file type' information.
    'year', 'month', 'day' should be exactly the same. For 'file content' and 'file type', you can just qualitatively evaluate it by measuring the
    semantic similarty without being too strict. Based on the above evaluation metric, if you think the prediction is good, return 1, otherwise, return 0.
    your response should be only one numer: 0 or 1. Here is the ground truth label and prediction results.
    
    ### Ground Truth: {truth}
    ### Prediction: {pred}
        """
    client = OpenAI()
    response = client.chat.completions.create(
        model='gpt-4', #gpt-4
        messages=[
            {
                'role': 'user',
                'content': [
                    {'type': 'text', 'text': prompt},
                ],
            }
        ],
        max_tokens=2000,
    )
    query_tasks = response.choices[0].message.content
    return query_tasks
    

In [22]:
import re
from pprint import pprint
import ast

# Input text, potentially long, containing various sections
input_text = """
Example1
### query: Project documentation from January 15, 2024, to February 20, 2024
### information: {'year': [2024, 2024], 'month': [1, 2], 'day': [15, 20], 'file content': ['Project Documentation', 'Project Files', 'Project Overview', 'Project Details', 'Project Progress Documentation'], 'file type': ['pdf', 'doc', 'docx']}

Now, please extract meta information from this user query:
### query: How to use LLMs or leverage them for recommendation engine
### information:  {'year': [2024, 2024], 'month': [1, 2], 'day': [15, 20], 'file content': ['Recommendation Engine', 'Recommendation System', 'Recommendation Algorithm', 'Recommendation System', 'Recommendation Engine'], 'file type': ['pdf', 'doc', 'docx', 'txt', 'md']}

### query: Find my photos from New York last summer
### information: {'year': [-1, -1], 'month': [6, 8], 'day': [0, 0], 'file content': ['Photo taken in New
"""

# Isolate the portion of the text after the specific prompt
relevant_part = input_text.split("Now, please extract meta information from this user query:")[-1].strip()

# Regular expression to match the query and its corresponding information
pattern = r"### query:\s*(.*?)\n### information:\s*(\{.*?\})\s*"

# Find the first match in the relevant part of the text
match = re.search(pattern, relevant_part, re.DOTALL)

# Initialize result dictionary
extracted_information = {}

if match:
    query, information_str = match.groups()
    # Safely evaluate the information string into a Python dictionary
    information = ast.literal_eval(information_str)
    extracted_information["query"] = query
    extracted_information["information"] = information

# Display the extracted information
pprint(extracted_information)


{'information': {'day': [15, 20],
                 'file content': ['Recommendation Engine',
                                  'Recommendation System',
                                  'Recommendation Algorithm',
                                  'Recommendation System',
                                  'Recommendation Engine'],
                 'file type': ['pdf', 'doc', 'docx', 'txt', 'md'],
                 'month': [1, 2],
                 'year': [2024, 2024]},
 'query': 'How to use LLMs or leverage them for recommendation engine'}


In [23]:
import re
from pprint import pprint
import ast

# Updated input text as an example
input_text = """
Example1
### query: Project documentation from January 15, 2024, to February 20, 2024
### information: {'year': [2024, 2024], 'month': [1, 2], 'day': [15, 20], 'file content': ['Project Documentation', 'Project Files', 'Project Overview', 'Project Details', 'Project Progress Documentation'], 'file type': ['pdf', 'doc', 'docx']}

Now, please extract meta information from this user query:
### query: How to use LLMs or leverage them for recommendation engine
### information:  {'year': [2024, 2024], 'month': [1, 2], 'day': [15, 20], 'file content': ['Recommendation Engine', 'Recommendation System', 'Recommendation Algorithm', 'Recommendation System', 'Recommendation Engine'], 'file type': ['pdf', 'doc', 'docx', 'txt', 'md']}

### query: Find my photos from New York last summer
### information: {'year': [-1, -1], 'month': [6, 8], 'day': [0, 0], 'file content': ['Photo taken in New
"""

# Extracting the part of the text after the specified prompt
relevant_part = input_text.split("Now, please extract meta information from this user query:")[-1].strip()

# Updated regex pattern to accommodate potential variations in formatting
pattern = r"### query:\s*(.*?)\n### information:\s*(\{.*?\})\s*"

# Search for the pattern in the relevant part of the text
match = re.search(pattern, relevant_part, re.DOTALL)

extracted_information = {}

if match:
    # Extract query and information string
    query, information_str = match.groups()
    # Convert information string into a Python dictionary
    information = ast.literal_eval(information_str)
    extracted_information["query"] = query
    extracted_information["information"] = information

# Print the extracted information
pprint(extracted_information)


{'information': {'day': [15, 20],
                 'file content': ['Recommendation Engine',
                                  'Recommendation System',
                                  'Recommendation Algorithm',
                                  'Recommendation System',
                                  'Recommendation Engine'],
                 'file type': ['pdf', 'doc', 'docx', 'txt', 'md'],
                 'month': [1, 2],
                 'year': [2024, 2024]},
 'query': 'How to use LLMs or leverage them for recommendation engine'}


In [24]:
input_text = eval_tokenizer.decode(ft_model.generate(**model_input, max_new_tokens=150)[0], skip_special_tokens=True)

# Isolate the portion of the text after the specific prompt
relevant_part = input_text.split("Now, please extract meta information from this user query:")[-1].strip()

# Regular expression to match the query and its corresponding information
pattern = r"### query:\s*(.*?)\n### information:\s*(\{.*?\})\s*"

# Find the first match in the relevant part of the text
match = re.search(pattern, relevant_part, re.DOTALL)

# Initialize result dictionary
extracted_information = {}

if match:
    query, information_str = match.groups()
    # Safely evaluate the information string into a Python dictionary
    information = ast.literal_eval(information_str)
    extracted_information["query"] = query
    extracted_information["information"] = information

# Display the extracted information
pprint(extracted_information)

{'information': {'day': [15, 20],
                 'file content': ['Recommendation Engine',
                                  'Recommendation System',
                                  'Recommendation Algorithm',
                                  'Recommendation System',
                                  'Recommendation Engine'],
                 'file type': ['pdf', 'doc', 'docx', 'txt', 'md'],
                 'month': [1, 2],
                 'year': [2024, 2024]},
 'query': 'How to use LLMs or leverage them for recommendation engine'}
