In [13]:
!pip install -q faiss-cpu langchain ollama langchain-community langchain-core pypdf tiktoken langchain-groq langchain_cohere rank_bm25

In [16]:
import re
from langchain_groq import ChatGroq
from langchain.chains import RetrievalQA
from langchain_cohere import CohereRerank
from langchain.vectorstores import FAISS
from langchain_community.llms import Ollama
from langchain.prompts import PromptTemplate
from langchain.docstore.document import Document
from langchain.document_loaders import PyPDFLoader
from langchain.retrievers import EnsembleRetriever
from langchain.text_splitter import TokenTextSplitter
from langchain_community.retrievers import BM25Retriever
from langchain.chains.summarize import load_summarize_chain
from langchain_community.embeddings import OllamaEmbeddings
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever

In [6]:
# Chat model used for both question generation and answering.
# The Groq API key is deliberately NOT written in the notebook: with `api_key`
# omitted, ChatGroq picks the key up from the GROQ_API_KEY environment variable.
# Export GROQ_API_KEY before starting the kernel.
llm = ChatGroq(
    temperature=0,
    model="llama3-70b-8192",
)

In [7]:
# Load the source PDF; PyPDFLoader returns one document per page.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
file_path = "/home/balamurali/QAGenerator/data/stats.pdf"
loader = PyPDFLoader(file_path)
data = loader.load()

# Concatenate the text of every page into a single string
# (one join instead of repeated string +=).
question_gen = "".join(page.page_content for page in data)

# Split the full text into large chunks for question generation.
# Each chunk is sent to the LLM inside one prompt, together with the
# instructions and (in refine steps) the questions produced so far, so a chunk
# must leave headroom inside the model's context window. The model name
# "llama3-70b-8192" indicates an 8192-token window, which a 10000-token chunk
# cannot fit; 6000 keeps room for the prompt text and the reply.
# Token counts use the gpt-3.5-turbo tokenizer, so they are only approximate
# for the Llama model.
splitter_ques_gen = TokenTextSplitter(
    model_name="gpt-3.5-turbo",
    chunk_size=6000,
    chunk_overlap=200,
)
chunk_ques_gen = splitter_ques_gen.split_text(question_gen)

In [8]:
# Wrap every question-generation chunk in a Document so the chains can consume it.
document_ques_gen = [Document(page_content=chunk) for chunk in chunk_ques_gen]

# Re-split those documents into smaller, overlapping pieces; these are the
# units that get indexed and retrieved when answering.
splitter_ans_gen = TokenTextSplitter(
    model_name="gpt-3.5-turbo",
    chunk_size=1000,
    chunk_overlap=100,
)
document_answer_gen = splitter_ans_gen.split_documents(document_ques_gen)

In [9]:
# Prompt for the first chunk: asks the model to write exam-style questions
# about the text substituted for {text}.
prompt_template = """
You are an expert bot at creating questions based on materials and documentation.
Your goal is to prepare the user for their exam and coding tests.
You do this by asking questions about the text below:

------------
{text}
------------

Create questions that will prepare the user for their tests.
Make sure not to lose any important information. Do not create MCQ. 

QUESTIONS:
"""
PROMPT_QUESTIONS = PromptTemplate(
    template=prompt_template,
    input_variables=["text"],
)

# Prompt for every later chunk: receives the questions produced so far
# ({existing_answer}) plus the next piece of text ({text}) and asks the model
# to refine or extend the question list.
refine_template = """
You are an expert at creating practice questions based on material and documentation.
Your goal is to help the user prepare for a test.
We have received some practice questions to a certain extent: {existing_answer}.
We have the option to refine the existing questions or add new ones.
(only if necessary) with some more context below.
------------
{text}
------------

Given the new context, refine the original questions in English.
If the context is not helpful, please provide the original questions.

Provide me the questions in the form of list datatype.

QUESTIONS:
"""
REFINE_PROMPT_QUESTIONS = PromptTemplate(
    template=refine_template,
    input_variables=["existing_answer", "text"],
)

In [10]:
# Refine-style chain: the first chunk is processed with PROMPT_QUESTIONS and
# every following chunk updates the running result via REFINE_PROMPT_QUESTIONS.
ques_gen_chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    verbose=True,  # echoes each formatted prompt into the cell output
    question_prompt=PROMPT_QUESTIONS,
    refine_prompt=REFINE_PROMPT_QUESTIONS,
)

# `Chain.run` is deprecated; `invoke` takes the inputs as a dict and returns a
# dict, from which the final question text is read under "output_text".
ques = ques_gen_chain.invoke({"input_documents": document_ques_gen})["output_text"]

  warn_deprecated(




[1m> Entering new RefineDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
You are an expert bot at creating questions based on materials and documentation.
Your goal is to prepare the user for their exam and coding tests.
You do this by asking questions about the text below:

------------
Importance
and
the
use
of
correlation
in
Statistics
Introduction
Correlation
is
a
statistical
measure
that
expresses
the
extent
to
which
two
variables
are
linearly
related.
It
is
a
common
tool
for
describing
simple
relationships
without
making
a
statement
about
cause
and
effect.
Correlation
coefficients
range
from
-1
to
1,
with
a
value
of
0
indicating
no
linear
relationship
between
the
two
variables,
a
value
of
1
indicating
a
perfect
positive
linear
relationship,
and
a
value
of
-1
indicating
a
perfect
negative
linear
relationship.
Correlation
is
important
in
statistics
because
it
can
be
used
to
1.
Identify
relationships
between
variables:


In [11]:
# Print the question text returned by the question-generation chain.
print(ques)

Here are the questions based on the provided text:

1. What is correlation in statistics, (define correlation)?

2. What is the range of correlation coefficients, and what do the values -1, 0, and 1 indicate?

3. What are the three main uses of correlation in statistics?

4. How can correlation be used to identify relationships between variables?

5. What is the difference between correlation and causation, and why is it important to understand this distinction?

6. Provide an example of how correlation can be used in business to make better decisions.

7. How can correlation be used in finance to build diversified portfolios that reduce risk?

8. What are some examples of how correlation is used in medicine to improve the prevention, diagnosis, and treatment of diseases?

9. How can correlation be used in psychology to develop better assessments and treatments?

10. Provide an example of how correlation is used in the real world, such as in marketing, finance, medicine, or psychology.

In [17]:
# Dense retrieval: embed the answer chunks with an Ollama-served embedding
# model and index them in FAISS.
embeddings = OllamaEmbeddings(model='mxbai-embed-large')
vector_store = FAISS.from_documents(document_answer_gen, embeddings)

# Keyword (BM25) retrieval over the same chunks.
retriever = BM25Retriever.from_documents(document_answer_gen)

# Hybrid retriever: BM25 and vector search weighted equally.
ensemble_retriever = EnsembleRetriever(
    retrievers=[retriever, vector_store.as_retriever()],
    weights=[0.5, 0.5],
)

# Rerank the hybrid results with Cohere. The API key is deliberately NOT
# written in the notebook: with `cohere_api_key` omitted, CohereRerank reads it
# from the COHERE_API_KEY environment variable.
# NOTE(review): recent langchain-cohere releases appear to require an explicit
# `model=` argument — confirm against the installed version.
compressor = CohereRerank()
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor,
    base_retriever=ensemble_retriever,
)

In [18]:
# The LLM reply mixes an introductory sentence with the numbered questions.
# Keep only lines that START with a list number ("1. ..." or "1) ...").
# Merely containing a digit somewhere is not enough: a header such as
# "Here are 10 questions:" would otherwise be treated as a question.
numbered_line = re.compile(r"^\s*\d+[.)]\s*\S")
sentence_list = ques.replace("\n\n", "\n").split("\n")
ques_list = [
    sentence.strip()
    for sentence in sentence_list
    if numbered_line.match(sentence)
]

In [19]:
# Bare expression: the notebook displays the parsed question list for inspection.
ques_list

['1. What is correlation in statistics, (define correlation)?',
 '2. What is the range of correlation coefficients, and what do the values -1, 0, and 1 indicate?',
 '3. What are the three main uses of correlation in statistics?',
 '4. How can correlation be used to identify relationships between variables?',
 '5. What is the difference between correlation and causation, and why is it important to understand this distinction?',
 '6. Provide an example of how correlation can be used in business to make better decisions.',
 '7. How can correlation be used in finance to build diversified portfolios that reduce risk?',
 '8. What are some examples of how correlation is used in medicine to improve the prevention, diagnosis, and treatment of diseases?',
 '9. How can correlation be used in psychology to develop better assessments and treatments?',
 '10. Provide an example of how correlation is used in the real world, such as in marketing, finance, medicine, or psychology.',
 '11. What is the im

In [20]:
# Question-answering chain: answers a query with `llm`, using the documents
# returned by the reranking retriever as context.
answer_generation_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=compression_retriever,
)

In [21]:
# Answer every generated question and print the question/answer pairs.
for question in ques_list:
    print("Question: ", question)
    # `Chain.run` is deprecated; `invoke` takes the query under the "query"
    # key and returns a dict whose "result" entry holds the answer text.
    answer = answer_generation_chain.invoke({"query": question})["result"]
    print("Answer: ", answer)
    # Real newlines: the previous "\\n\\n" printed a literal backslash-n pair
    # after the dashes instead of blank lines.
    print("--------------------------------------------------\n\n")

Question:  1. What is correlation in statistics, (define correlation)?
Answer:  According to the provided context, correlation is defined as "a statistical measure that expresses the extent to which two variables are linearly related."
--------------------------------------------------\n\n
Question:  2. What is the range of correlation coefficients, and what do the values -1, 0, and 1 indicate?
Answer:  According to the text, correlation coefficients range from -1 to 1. The values -1, 0, and 1 indicate the following:

* A value of -1 indicates a perfect negative linear relationship between the two variables.
* A value of 0 indicates no linear relationship between the two variables.
* A value of 1 indicates a perfect positive linear relationship between the two variables.
--------------------------------------------------\n\n
Question:  3. What are the three main uses of correlation in statistics?
Answer:  According to the text, the three main uses of correlation in statistics are:

1. 