# Load in model

First, load the model and the embedding model, then build the vector index over the documentation.

In [1]:
#!/usr/bin/env python
# coding: utf-8

# In[2]:


import logging
import sys

from llama_index.callbacks import CallbackManager, LlamaDebugHandler
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import messages_to_prompt, completion_to_prompt

from llama_index import VectorStoreIndex, SimpleDirectoryReader
from llama_index import ServiceContext
from llama_index import set_global_tokenizer
from transformers import AutoTokenizer
from llama_index.embeddings import HuggingFaceEmbedding

# Locations of the local model file and of the documentation directory to index.
MODEL_PATH = "/global/scratch/users/lgoldnercohentzedek/llm-docs/models/7B/ggml-model-q4_0.bin"
DOCS_DIR = "/global/scratch/users/lgoldnercohentzedek/llm-docs/flat-rit-docs"

# Send INFO-level log records to stdout (change INFO to DEBUG for more extensive logging).
# basicConfig() already attaches a stdout StreamHandler to the root logger, so no
# second handler is added here: an extra one makes every record print twice.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)

# Debug handler that prints a trace when an operation ends.
# NOTE(review): callback_manager is not passed to the LLM or to the service
# context below, so as written it does not appear to be wired in -- confirm intent.
llama_debug = LlamaDebugHandler(print_trace_on_end=True)
callback_manager = CallbackManager([llama_debug])

llm = LlamaCPP(
    # path to a pre-downloaded model (used instead of model_url)
    model_path=MODEL_PATH,

    temperature=0.1,
    max_new_tokens=256,

    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room.
    # This sets n_ctx in the model kwargs, so it does not need to be passed in model_kwargs.
    # NOTE(review): the loader output recorded in this notebook reports
    # llama.context_length = 2048 for this model file, so 3900 may exceed the
    # context the model was trained with -- confirm.
    context_window=3900,

    # kwargs to pass to __call__()
    generate_kwargs={},

    # kwargs to pass to __init__(); set n_gpu_layers to at least 1 to use GPU
    # TODO: experiment with n_gpu_layers and check whether it actually helps
    model_kwargs={"n_gpu_layers": 40},

    # transform inputs into Llama2 format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Service context bundling the LLM, the embedding model and the chunk size.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    chunk_size=512,
)

documents = SimpleDirectoryReader(DOCS_DIR).load_data()
print("Docs Loaded!")

index = VectorStoreIndex.from_documents(
    documents, service_context=service_context
)
print("Index Created!")

# set up query engine
query_engine = index.as_query_engine(similarity_top_k=2)

#response = query_engine.query("What is Savio?")
#print(response)



# In[ ]:



  from .autonotebook import tqdm as notebook_tqdm
ggml_init_cublas: GGML_CUDA_FORCE_MMQ:   no
ggml_init_cublas: CUDA_USE_TENSOR_CORES: yes
ggml_init_cublas: found 4 CUDA devices:
  Device 0: Tesla K80, compute capability 3.7, VMM: yes
  Device 1: Tesla K80, compute capability 3.7, VMM: yes
  Device 2: Tesla K80, compute capability 3.7, VMM: yes
  Device 3: Tesla K80, compute capability 3.7, VMM: yes
llama_model_loader: loaded meta data with 16 key-value pairs and 291 tensors from /global/scratch/users/lgoldnercohentzedek/llm-docs/models/7B/ggml-model-q4_0.bin (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = llama
llama_model_loader: - kv   2:                       llama.context_length u32              = 2048
llama_model_load

Docs Loaded!
Index Created!


# create pipeline for retrieving source text

In [2]:
from llama_index import (
    VectorStoreIndex,
    SimpleDirectoryReader,
    Response,
)
from llama_index.evaluation import FaithfulnessEvaluator
from llama_index.node_parser import SentenceSplitter
import pandas as pd





# Faithfulness evaluator using the same LLM / embedding configuration as the index.
# The service context built in the first cell already has exactly these settings
# (llm, embed_model, chunk_size=512), so it is reused instead of constructing an
# identical second one.
evaluator_llama = FaithfulnessEvaluator(service_context=service_context)

# NOTE(review): presumably meant to stop pandas truncating long text columns --
# confirm that 0 has that effect on the installed pandas version.
pd.set_option("display.max_colwidth", 0)

# NOTE(review): splitter is not used by anything in this cell; it is kept only
# in case later cells reference it.
splitter = SentenceSplitter(chunk_size=512)

# The first cell already built an index from the same documents with the same
# service context; reuse it rather than embedding the whole corpus a second time.
vector_index = index



from llama_index.evaluation import EvaluationResult


# define jupyter display function
def display_eval_df(response: Response, eval_result: EvaluationResult) -> None:
    """Render a one-row table with a response, its first source and the evaluation.

    Args:
        response: Query response; only its string form and the text of its
            first source node are shown.
        eval_result: Evaluation of that response; ``passing`` is rendered as
            "Pass"/"Fail" and ``feedback`` as the reasoning.

    Returns:
        None. The styled table is shown with ``display``; when the response has
        no source nodes, "no response!" is printed instead.
    """
    # `not` also covers None and other empty sequences, not only a literal [].
    if not response.source_nodes:
        print("no response!")
        return

    source_text = response.source_nodes[0].node.text
    # Append the ellipsis only when the text really was cut, so a short source
    # is not presented as if it had been truncated.
    if len(source_text) > 1000:
        source_text = source_text[:1000] + "..."

    eval_df = pd.DataFrame(
        {
            "Response": str(response),
            "Source": source_text,
            "Evaluation Result": "Pass" if eval_result.passing else "Fail",
            "Reasoning": eval_result.feedback,
        },
        index=[0],
    )
    # Fix the width of the two long text columns and let their content wrap.
    styled_df = eval_df.style.set_properties(
        **{
            "inline-size": "600px",
            "overflow-wrap": "break-word",
        },
        subset=["Response", "Source"]
    )
    display(styled_df)
    
# Query engine over vector_index with default settings; used below to answer the question that gets evaluated.
query_engine_eval = vector_index.as_query_engine()


## ask a question and get the response

In [3]:
import nest_asyncio
# NOTE(review): presumably needed so async code used by the evaluator can run
# inside the notebook's already-running event loop -- confirm.
nest_asyncio.apply()

def get_response_and_source(response: Response) -> (str, str):
    """Return the response text and an excerpt of its first source node.

    When the response has no source nodes, the placeholder pair
    ("No response!", "No source available.") is returned. Otherwise the
    excerpt is the first source node's text, cut to 1000 characters and
    suffixed with "..." only if the text is longer than that.
    """
    if not response.source_nodes:
        return "No response!", "No source available."

    answer = str(response)

    # Only the first source node is considered, for simplicity.
    full_source = response.source_nodes[0].node.text
    if len(full_source) > 1000:
        excerpt = full_source[:1000] + "..."
    else:
        excerpt = full_source

    return answer, excerpt

# Example usage
# Ask one question, evaluate the answer for faithfulness, and keep both the
# answer text and the source excerpt (the excerpt is used to look up the URL).
question = "what security levels of data can savio be used with?"
response_vector = query_engine_eval.query(question)
eval_result = evaluator_llama.evaluate_response(response=response_vector)

response_text, source_text = get_response_and_source(response_vector)
print(f"Response: {response_text}")







llama_print_timings:        load time =    4530.43 ms
llama_print_timings:      sample time =      45.83 ms /    89 runs   (    0.51 ms per token,  1942.04 tokens per second)
llama_print_timings: prompt eval time =    7379.51 ms /   697 tokens (   10.59 ms per token,    94.45 tokens per second)
llama_print_timings:        eval time =   21831.65 ms /    88 runs   (  248.09 ms per token,     4.03 tokens per second)
llama_print_timings:       total time =   29480.28 ms /   785 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =    4530.43 ms
llama_print_timings:      sample time =       1.55 ms /     3 runs   (    0.52 ms per token,  1939.24 tokens per second)
llama_print_timings: prompt eval time =    7491.45 ms /   881 tokens (    8.50 ms per token,   117.60 tokens per second)
llama_print_timings:        eval time =     524.79 ms /     2 runs   (  262.39 ms per token,     3.81 tokens per second)
llama_print_timings:       total time =    8078.34 ms /   883 

Response:   Based on the provided context information, Savio can be used with moderately sensitive data (P2/P3 data) as defined by UC Policy (IS-3) and documented by the campus Information Security Office. Savio is not appropriate for highly sensitive (P4) data, and researchers should consult the Secure Research Data and Compute web page for information on the service that supports working with highly sensitive data.


# Retrieve URL of source

In [4]:
import os
import difflib

def calculate_similarity(file_text, source_text):
    """Return difflib's similarity ratio between ``file_text`` and ``source_text``.

    The ratio is a float between 0.0 (nothing in common) and 1.0 (identical).
    """
    matcher = difflib.SequenceMatcher(isjunk=None, a=file_text, b=source_text)
    return matcher.ratio()

def search_markdown_files(root_dir, source_text, starting_similarity_threshold=0.05, threshold_increment=0.01):
    """Find the markdown file most similar to ``source_text`` and return its docs URL.

    Every ``.md`` file under ``root_dir`` is read once and scored against
    ``source_text`` with ``difflib.SequenceMatcher``; the highest-scoring file
    wins (the first one encountered wins ties).

    Args:
        root_dir: Directory that is walked recursively for ``.md`` files.
        source_text: Text to locate, e.g. the excerpt of a retrieved source node.
        starting_similarity_threshold: Minimum similarity ratio the best file
            must reach to count as a match.
        threshold_increment: Accepted for backward compatibility only. The
            previous implementation raised the threshold step by step until at
            most two files were left and then returned the best of them, which
            is simply the overall best file; that loop never terminated when
            no file (or, after a step, no file any more) met the threshold.

    Returns:
        ``"https://docs-research-it.berkeley.edu/services"`` followed by the part
        of the best file's path after the first "services", without the ``.md``
        extension; or ``"No matching URL found."`` when no file reaches the
        threshold or the best file's path contains no "services" segment.
    """
    # SequenceMatcher caches its analysis of the *second* sequence, so set
    # source_text there once and swap only the first sequence for each file.
    matcher = difflib.SequenceMatcher(None, "", source_text)

    best_path = None
    best_similarity = -1.0
    for subdir, _dirs, files in os.walk(root_dir):
        for filename in files:
            if not filename.endswith('.md'):
                continue
            filepath = os.path.join(subdir, filename)
            with open(filepath, 'r', encoding='utf-8') as file:
                matcher.set_seq1(file.read())
            similarity = matcher.ratio()
            # Strict '>' keeps the first file encountered when scores tie.
            if similarity > best_similarity:
                best_path, best_similarity = filepath, similarity

    if best_path is None or best_similarity < starting_similarity_threshold:
        return "No matching URL found."

    # Keep only what follows the first "services" in the path.
    _, marker, tail = best_path.partition("services")
    if not marker:
        return "No matching URL found."

    # Drop just the trailing extension (not every ".md" in the path) and use
    # URL-style separators regardless of platform.
    page = tail[:-len(".md")].replace(os.sep, "/")
    return "https://docs-research-it.berkeley.edu/services" + page

# Example usage

# Look up the documentation page whose markdown best matches the source excerpt.
root_directory = '/global/scratch/users/jejacob/rit-docs-main/docs'
matched_url = search_markdown_files(root_dir=root_directory, source_text=source_text)
print(f"Matched URL: {matched_url}")



Matched URL: https://docs-research-it.berkeley.edu/services/high-performance-computing/getting-account/sensitive-accounts


In [5]:
# Final answer as it would be shown to a user: the response text followed by the matched documentation link.
# NOTE(review): the text is pasted in by hand rather than built from response_text / matched_url -- confirm this is intentional.
print("""

Savio can be used with moderately sensitive data (P2/P3 data) as defined by UC Policy (IS-3) and documented by the campus Information Security Office. Savio is not appropriate for highly sensitive (P4) data, and researchers should consult the Secure Research Data and Compute web page for information on the service that supports working with highly sensitive data. Here is a link to further documentation: https://docs-research-it.berkeley.edu/services/high-performance-computing/getting-account/sensitive-accounts 


"""
     )





Savio can be used with moderately sensitive data (P2/P3 data) as defined by UC Policy (IS-3) and documented by the campus Information Security Office. Savio is not appropriate for highly sensitive (P4) data, and researchers should consult the Secure Research Data and Compute web page for information on the service that supports working with highly sensitive data. Here is a link to further documentation: https://docs-research-it.berkeley.edu/services/high-performance-computing/getting-account/sensitive-accounts 



