In [2]:
import torch
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("No GPU available. Training will run on CPU.")
    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

GPU: NVIDIA GeForce RTX 2070 SUPER is available.


In [16]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline

# TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T
# TinyLlama/TinyLlama-1.1B-Chat-v1.0
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"


tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)


pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, device=device)


In [26]:
messages = [
    {"role": "user", "content": "Do you want to build snowman?"}
]
generated_text = pipe(messages)[0]['generated_text']

In [29]:
print(generated_text[-1]['content'])

Yes, I want to build snowman!


RAG

![image.png](MarkDownImage//RAG.png)

In [5]:
import os
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores  import Chroma
from langchain.prompts import PromptTemplate
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.prompts import ChatPromptTemplate

model_kwargs = {'device': 'cuda'}
embedding_model_name = "sentence-transformers/all-MiniLM-L6-v2"
embedding = HuggingFaceEmbeddings(
    model_name=embedding_model_name,
    model_kwargs=model_kwargs
    )
    

pdf_folder = "documents/"

pdf_files = [f for f in os.listdir(pdf_folder) if f.endswith('.pdf')]

documents = []

for pdf_file in pdf_files:
    document = os.path.join(pdf_folder, pdf_file)
    pdf_loader = PyPDFLoader(document)
    documents.extend(pdf_loader.load())
    
vectordb= Chroma.from_documents(
    documents, 
    embedding,
    persist_directory="./knowledge-base"
)

In [39]:
def retrieve_documents(query, num_results=1):
    query_embedding = embedding.embed_query(query)
    results = vectordb.similarity_search_by_vector(query_embedding, k=num_results)
    retrieved_docs = [result.page_content for result in results]
    return retrieved_docs

def generate_answer(query, retrieved_docs):
    
    documents_text = "\n".join(retrieved_docs)
    
    prompt = f"""
    Given the following documents, answer the user's question as accurately as possible:

    Documents:
    {documents_text}

    Question:
    {query}

    Answer:
    """

    result = pipe(prompt, max_new_tokens=128, num_return_sequences=1)
    answer = result[0]['generated_text']
    
    return answer


def process_query(query, num_results=1):

    retrieved_docs = retrieve_documents(query, num_results=num_results)
    
    answer = generate_answer(query, retrieved_docs)
    
    return answer

query = "What are the main findings of the research?"
answer = process_query(query)
print(answer)

Token indices sequence length is longer than the specified maximum sequence length for this model (2499 > 2048). Running this sequence through the model will result in indexing errors
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (2048). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.
