In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores.chroma import Chroma
from langchain.chains import ConversationalRetrievalChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_community.llms.ollama import Ollama

In [2]:
# Read the source paper (arXiv 2402.00905) from disk.
# NOTE(review): this is an absolute path into one user's home directory, so the
# cell only runs on that machine — consider a path relative to the notebook.
loader = PyPDFLoader(
    '/home/aravind-pt7506/college/project/ollama-examples/gcse/scientific papers/papers/2402.00905.pdf'
)
# `document` holds whatever loader.load() returns for the PDF; it is the input
# to the text splitter in the next cell.
document = loader.load()

In [3]:
# Break the loaded PDF into chunks for embedding: target size 1000 characters,
# no overlap between neighbouring chunks.
# NOTE(review): CharacterTextSplitter splits on a single separator only, so text
# without that separator may produce chunks longer than chunk_size — confirm the
# resulting chunk lengths, or consider RecursiveCharacterTextSplitter.
text_splitter = CharacterTextSplitter(
    chunk_overlap=0,
    chunk_size=1000,
)
texts = text_splitter.split_documents(document)

In [4]:
# Embedding model used to vectorise the chunks for the Chroma index.
# The model is named explicitly instead of relying on the library default, so
# the notebook keeps producing the same vectors if that default ever changes.
# NOTE(review): "sentence-transformers/all-mpnet-base-v2" is believed to be the
# default this cell previously resolved to (the recorded download of a ~438 MB
# pytorch_model.bin is consistent with it) — confirm against the installed version.
# The first run downloads the model files; later runs use the local cache.
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

  from .autonotebook import tqdm as notebook_tqdm
.gitattributes: 100%|██████████| 1.18k/1.18k [00:00<00:00, 8.93MB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 875kB/s]
README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 16.3MB/s]
config.json: 100%|██████████| 571/571 [00:00<00:00, 4.21MB/s]
config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 713kB/s]
data_config.json: 100%|██████████| 39.3k/39.3k [00:00<00:00, 6.35MB/s]
pytorch_model.bin: 100%|██████████| 438M/438M [02:15<00:00, 3.22MB/s] 
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 450kB/s]
special_tokens_map.json: 100%|██████████| 239/239 [00:00<00:00, 1.77MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.08MB/s]
tokenizer_config.json: 100%|██████████| 363/363 [00:00<00:00, 2.13MB/s]
train_script.py: 100%|██████████| 13.1k/13.1k [00:00<00:00, 41.9MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 4.12MB/s]
modules.json: 100%|██████████| 349/34

In [5]:
# Embed every chunk and index it in a Chroma vector store.
db = Chroma.from_documents(documents=texts, embedding=embeddings)

# Expose the store as a retriever that returns the 2 best-matching chunks
# per query (k=2).
retriever = db.as_retriever(
    search_kwargs={'k': 2},
)

In [6]:
# LLM served through Ollama; the tag selects the Mistral 7B instruct v0.2 build
# with q4_0 quantisation.
# NOTE(review): presumably requires a running Ollama server with this model
# already pulled — confirm before running.
llm = Ollama(
    model="mistral:7b-instruct-v0.2-q4_0",
)

In [7]:
# Combine the LLM and the retriever into a conversational question-answering
# chain. return_source_documents=True asks the chain to include the retrieved
# chunks in its result alongside the answer.
qa_chain = ConversationalRetrievalChain.from_llm(
    llm=llm,
    retriever=retriever,
    return_source_documents=True,
)

In [11]:
# Ask the chain for a summary of the paper, starting from an empty conversation.
# NOTE(review): the retriever in this notebook is configured with k=2, so the
# chain only sees the two chunks closest to the query — the "whole document"
# summary is based on that excerpt, not on every page.
query = "Summarize the whole document with key points and actual technical words. include any formulas and tables and images deemed necessary"
chat_history = []

# Use invoke() rather than calling the chain object directly: Chain.__call__ is
# deprecated in LangChain 0.1+, and invoke() takes the same input dict and
# returns the same result dict.
res = qa_chain.invoke({'question': query, 'chat_history': chat_history})
print(res)

# import sys
# chat_history = []
# while True:
#     query = input('Prompt: ')
#     # To exit: use 'exit', 'quit', 'q', or Ctrl-D.
#     if query.lower() in ["exit", "quit", "q"]:
#         print('Exiting')
#         sys.exit()
#     result = qa_chain({'question': query, 'chat_history': chat_history})
#     print('Answer: ' + result['answer'] + '\n')
#     chat_history.append((query, result['answer']))

{'question': 'Summarize the whole document with key points and actual technical words. include any formulas and tables and images deemed necessary', 'chat_history': [], 'answer': ' Title: Improving Code Quality using GPT-3.5 for Zero-Shot and Few-Shot Learning\n\nKey Points:\n1. The study explores the application of GPT-3.5, a large language model from OpenAI, to improve software code quality through zero-shot and few-shot learning.\n2. The authors provide prompt templates for both zero-shot and few-shot learning, which consist of persona, instruction, input (submitted code and reviewer comment), and output (improved code).\n3. For zero-shot learning, the model is given a single input prompt consisting of the submitted code with a reviewer comment if available, and its task is to generate improved code based on that.\n4. For few-shot learning, the model is provided with three example pairs of input (submitted code with a reviewer comment) and output (improved code), and its task is to 

In [12]:
# Display the full result dict from the previous cell as the cell's output.
res

{'question': 'Summarize the whole document with key points and actual technical words. include any formulas and tables and images deemed necessary',
 'chat_history': [],
 'answer': ' Title: Improving Code Quality using GPT-3.5 for Zero-Shot and Few-Shot Learning\n\nKey Points:\n1. The study explores the application of GPT-3.5, a large language model from OpenAI, to improve software code quality through zero-shot and few-shot learning.\n2. The authors provide prompt templates for both zero-shot and few-shot learning, which consist of persona, instruction, input (submitted code and reviewer comment), and output (improved code).\n3. For zero-shot learning, the model is given a single input prompt consisting of the submitted code with a reviewer comment if available, and its task is to generate improved code based on that.\n4. For few-shot learning, the model is provided with three example pairs of input (submitted code with a reviewer comment) and output (improved code), and its task is t

In [14]:
# Print only the generated answer, so its embedded newlines render as real
# line breaks instead of the escaped "\n" shown in the dict repr.
print(res["answer"])

 Title: Improving Code Quality using GPT-3.5 for Zero-Shot and Few-Shot Learning

Key Points:
1. The study explores the application of GPT-3.5, a large language model from OpenAI, to improve software code quality through zero-shot and few-shot learning.
2. The authors provide prompt templates for both zero-shot and few-shot learning, which consist of persona, instruction, input (submitted code and reviewer comment), and output (improved code).
3. For zero-shot learning, the model is given a single input prompt consisting of the submitted code with a reviewer comment if available, and its task is to generate improved code based on that.
4. For few-shot learning, the model is provided with three example pairs of input (submitted code with a reviewer comment) and output (improved code), and its task is to learn from these examples and generate improved code for new inputs based on that knowledge.
5. The authors use BM25 from the gensim package for selecting relevant examples to help the m