In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import PyPDF2
from sentence_transformers import SentenceTransformer
import torch

model_name = "Qwen/Qwen2.5-7B"
CHECKPOINT_DIR = "qwen2.5_7B_sft/checkpoint-1-8000"


tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    CHECKPOINT_DIR,
    trust_remote_code=True,
    device_map="auto"
)



Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

In [32]:
eos_token_id = tokenizer.eos_token_id

file_path = 'halterman17.pdf' #halterman17
text = ""
with open(file_path, 'rb') as f:
    reader = PyPDF2.PdfReader(f)
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:
            text += page_text + "\n"

def chunk_text(text, chunk_size=100, overlap=30):
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        chunk = words[start : start + chunk_size]
        chunks.append(" ".join(chunk))
        start += (chunk_size - overlap)
    return chunks

chunks = chunk_text(text)

embedder = SentenceTransformer('all-MiniLM-L6-v2')
chunk_embeddings = embedder.encode(chunks, convert_to_tensor=True)

def retrieve_chunks(query, chunks, chunk_embeddings, top_k=1):
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    cos_scores = torch.nn.functional.cosine_similarity(query_embedding, chunk_embeddings)
    top_results = torch.topk(cos_scores, k=top_k)
    top_chunks = [chunks[idx] for idx in top_results.indices]
    return top_chunks

def generate_answer(query, max_new_tokens=200):
    retrieved_chunks = retrieve_chunks(query, chunks, chunk_embeddings, top_k=1)
    context = "\n\n".join(retrieved_chunks)
    prompt = f"<|im_start|>user\n{query}<|im_end|>\n<|im_start|>context\n{context}<|im_end|>\n<|im_start|>assistant\n"
    # print(prompt)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, temperature=0.3)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # answer = answer.split("Question: ")[1]
    return answer

# query = "whats name of the model? How many parameters its use?"
# answer = generate_answer(query)
# print("\nGenerated Answer:")
# print(answer)


In [35]:
query = "Write a function length_of_last_word(text) that takes as an argument a string text, and returns the length of the last word in the string. You may assume text is a non-empty English text where words are separated by whitespace characters (including \n \r \t \f and spaces). There is no need to handle punctuations differently from other characters."
answer = generate_answer(query)
# print("\nGenerated Answer:")
print(answer)

user
Write a function length_of_last_word(text) that takes as an argument a string text, and returns the length of the last word in the string. You may assume text is a non-empty English text where words are separated by whitespace characters (including 
 	 
 and spaces). There is no need to handle punctuations differently from other characters.
context
it in a different way. The following shows the preferred way of determining a string’s length: >>> s /quotesingle.VarABCEFGHI /quotesingle.Var >>> s = /quotesingle.VarABCEFGHI /quotesingle.Var >>> s /quotesingle.VarABCEFGHI /quotesingle.Var >>> len(s) 8 >>> s.__len__() 8 The expressions len(s) ands.__len__() are functionally equivalent. Instead of calling the __len__ method directly, clients should use the global lenfunction. Listing 9.4 ( printcharacters.py ) uses the len function and []index operator to print the individual characters that make up a string. Listing 9.4: printcharacters.py s = /quotedbl.VarABCDEFGHIJK/quotedbl.Var pri

In [None]:
import os
import pickle
from transformers import AutoTokenizer, AutoModelForCausalLM
import PyPDF2
from sentence_transformers import SentenceTransformer
import torch

model_name = "Qwen/Qwen2.5-3B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,
    device_map="auto"
)
eos_token_id = tokenizer.eos_token_id

CACHE_FILE = 'tokenized_medium.pkl'
CHUNK_SIZE = 500
OVERLAP = 50


def load_or_tokenize_pdf(pdf_path: str, cache_path: str = CACHE_FILE, chunk_size: int = CHUNK_SIZE, overlap: int = OVERLAP):
    if os.path.exists(cache_path):
        with open(cache_path, 'rb') as f:
            chunks = pickle.load(f)
        print(f"Loaded {len(chunks)} chunks from cache '{cache_path}'")
    
    else:
        text = ""
        with open(pdf_path, 'rb') as f:
            reader = PyPDF2.PdfReader(f)
            for page in reader.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"

        words = text.split()
        chunks = []
        start = 0
        while start < len(words):
            chunk = words[start : start + chunk_size]
            chunks.append(" ".join(chunk))
            start += (chunk_size - overlap)

        with open(cache_path, 'wb') as f:
            pickle.dump(chunks, f)
        print(f"Tokenized and saved {len(chunks)} chunks to cache '{cache_path}'")
    return chunks

chunks = load_or_tokenize_pdf('medium.pdf')

embedder = SentenceTransformer('all-MiniLM-L6-v2')
chunk_embeddings = embedder.encode(chunks, convert_to_tensor=True)


def retrieve_chunks(query: str, chunks: list, chunk_embeddings: torch.Tensor, top_k: int = 5) -> list:
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    cos_scores = torch.nn.functional.cosine_similarity(query_embedding, chunk_embeddings)
    top_results = torch.topk(cos_scores, k=top_k)
    return [chunks[idx] for idx in top_results.indices]


def generate_answer(query, max_new_tokens=200):
    retrieved_chunks = retrieve_chunks(query, chunks, chunk_embeddings, top_k=3)
    context = "\n\n".join(retrieved_chunks)
    prompt = f"<|im_start|>user\n{query}<|im_end|>\n<|im_start|>context\n{context}<|im_end|>\n<|im_start|>assistant\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=True, temperature=0.7)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # answer = answer.split("Question: ")[1]
    return answer

if __name__ == '__main__':
    query = "What's the name of the model? How many parameters does it use?"
    answer = generate_answer(query)
    print("\nGenerated Answer:")
    print(answer)
