In [15]:
from operator import itemgetter

from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough, RunnableLambda
from langchain.vectorstores import FAISS
from langchain.vectorstores.pgvector import PGVector
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document
from langchain.vectorstores import DocArrayInMemorySearch
import langchain
import supabase
from sentence_transformers import SentenceTransformer
from langchain.retrievers import BM25Retriever, EnsembleRetriever

from langchain.llms import OpenAI
import json




In [16]:
import os
import openai
from dotenv import load_dotenv

load_dotenv()

True

In [17]:
import vecs

DB_CONNECTION = "postgresql://postgres:supa-jupyteach@192.168.0.77:54328/postgres"

# create vector store client
vx = vecs.create_client(DB_CONNECTION)

In [18]:
# Loading sentence embedding model
embedding_model = SentenceTransformer('all-MiniLM-L6-v2') 

user_question = input("What is your question? ")

# Creating embedding for user's question
user_embedding = embedding_model.encode(user_question)

2023-11-08 18:57:02,476:INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2023-11-08 18:57:02,647:INFO - Use pytorch device: cpu


What is your question?  What did the professor say are the core reshaping operations in pandas andexplain how the unstack method works? 


Batches: 100%|██████████| 1/1 [00:00<00:00, 76.15it/s]


In [19]:
COLLECTION_NAME = "documents"

In [20]:
def get_vectorstore():
    embeddings = OpenAIEmbeddings()

    db = PGVector(embedding_function=embeddings,
        collection_name=COLLECTION_NAME,
        connection_string=DB_CONNECTION,
    )
    return db

In [21]:
vector_store = get_vectorstore()
retriever = vector_store.as_retriever()

In [17]:
db = get_vectorstore()
output = db.similarity_search_with_score(user_question)
with open("results.txt", "w") as f:
    for vector_id, similarity_score in output:
        f.write(f"{vector_id}\t{similarity_score}\n")

In [18]:
# db = get_vectorstore()
# docs_with_score = db.similarity_search_with_score(user_question)
# docs_with_score

In [22]:
from langchain.chat_models import ChatOpenAI
from langchain.schema import(
     AIMessage,
     HumanMessage,
     SystemMessage
)


In [23]:
from langchain import PromptTemplate
from langchain.chains import LLMChain
from langchain.chains.summarize import load_summarize_chain
from langchain.docstore.document import Document

In [247]:
with open('results.txt',encoding = 'utf-8') as f:
          text= f.read()
docs = [Document(page_content = text)]

llm = ChatOpenAI()

template = ''' Explain the concept like teaching a 6-year-old
TEXT: `{text}`
'''

prompt = PromptTemplate(
    input_variables = ['text'],
    template =  template
)        

In [248]:
chain = load_summarize_chain(
    llm,
    chain_type = 'stuff',
    prompt = prompt,
    verbose = True
)    

output_summary = chain.run(docs)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m Explain the concept like teaching a 6-year-old
TEXT: `page_content="So this will find us the minimum one, and the maximum one could be done just like it. And then unemployment.detite tells us float 64. So if we look at our unemployment series, just notice that the values themselves are float 64, so unemployment.detite is just telling us what kind of values are being stored inside of our series. Great. So now we've talked about what a series is, and we'll now talk about what a data frame is. So a data frame is just going to be how pandas will store multiple columns of data. You could think about data frames as simply just multiple series stacked side by side. You'll notice that we still have an index, and now the index is zero, we'll just call this column zero, one and two. So index zero is associated with the A in columns zero, the A in column one and 

In [249]:
print(output_summary)

A series is like a list of numbers, but each number has a special type called "float 64". This just means that the numbers can have a decimal point. 

A data frame is like a table with columns and rows. It's like having multiple series stacked side by side. Each column has a name, like "column zero", "column one", and "column two". And each row has an index, like "zero", "one", and "two". So you can think of a data frame as a way to store lots of numbers in a table format. 

On the next slide, we're going to make a data frame that shows the unemployment rate.


In [250]:
## Summarizing documents using map reduce

In [251]:
from langchain import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [23]:
with open('results.txt',encoding = 'utf-8') as f:
          text= f.read()

llm = ChatOpenAI()

In [253]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=50)
chunks = text_splitter.create_documents([text])


In [254]:
chain = load_summarize_chain(
    llm,
    chain_type = 'map_reduce',
    verbose = True
)
output_summary = chain.run(chunks)



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mWrite a concise summary of the following:


"page_content="So this will find us the minimum one, and the maximum one could be done just like it. And then unemployment.detite tells us float 64. So if we look at our unemployment series, just notice that the values themselves are float 64, so unemployment.detite is just telling us what kind of values are being stored inside of our series. Great. So now we've talked about what a series is, and we'll now talk about what a data frame is. So a data frame is just going to be how pandas will store multiple columns of data. You could think about data frames as simply just multiple series stacked side by side. You'll notice that we still have an index, and now the index is zero, we'll just call this column zero, one and two. So index zero is associated with the A in columns zero, the A in column one and the A 

In [255]:
print(output_summary)

The passage explains the concepts of a series and a data frame in the pandas library. It describes a series as a one-dimensional labeled array and a data frame as a way to store multiple columns of data. The passage also discusses the relationship between index values and columns in a data frame. It concludes by mentioning the creation of a data frame for storing the unemployment rate.


In [256]:
chain.llm_chain.prompt.template

'Write a concise summary of the following:\n\n\n"{text}"\n\n\nCONCISE SUMMARY:'

In [257]:
chain.combine_document_chain.llm_chain.prompt.template

'Write a concise summary of the following:\n\n\n"{text}"\n\n\nCONCISE SUMMARY:'

In [258]:
## map_reduce with custom prompts

In [259]:
map_prompt = '''
Explain in detail the concept mentioned in the text with an example:
TEXT: '{text}'
'''
map_prompt_template = PromptTemplate(
    input_variables = ['text'],
    template = map_prompt
    )   

In [260]:
combine_prompt = '''
Write a detailed explanation of the following text that covers the key points.
Add a title to the explanation.
Start the explanation with an INTRODUCTION PARAGRAPH that gives an overview  of the
topic FOLLOWED by BULLET POINTS giving a detailed overview with an example code if necessary AND end the explanation with a CONCLUSION
PHRASE.
TEXT: `{text}`
'''

combine_prompt_template = PromptTemplate(template = combine_prompt,
                                        input_variables = ['text'])

In [261]:
summary_chain = load_summarize_chain(
    llm = llm,
    chain_type = 'map_reduce',
    map_prompt = map_prompt_template,
    combine_prompt = combine_prompt_template,
    verbose = True
)

output = summary_chain.run(chunks)




[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
Explain in detail the concept mentioned in the text with an example:
TEXT: 'page_content="So this will find us the minimum one, and the maximum one could be done just like it. And then unemployment.detite tells us float 64. So if we look at our unemployment series, just notice that the values themselves are float 64, so unemployment.detite is just telling us what kind of values are being stored inside of our series. Great. So now we've talked about what a series is, and we'll now talk about what a data frame is. So a data frame is just going to be how pandas will store multiple columns of data. You could think about data frames as simply just multiple series stacked side by side. You'll notice that we still have an index, and now the index is zero, we'll just call this column zero, one and two. So index zero is associated with the A in columns zero

In [230]:
print(output)

Explanation: Using Pandas for Data Manipulation and Analysis

Introduction:
The concept mentioned in the text is the use of pandas, a Python library, to store and manipulate data using two main data structures: series and data frames. Pandas provides powerful functionalities for data analysis and manipulation tasks.

Key Points:
1. Series:
   - A series is a one-dimensional labeled array in pandas.
   - Similar to a column in a spreadsheet or a database table.
   - Can store any data type.
   - Each value in the series has an associated index for referencing and accessing specific values.
   - Example code: `unemployment = pd.Series([3.2, 4.1, 3.8, 5.0])`

2. Data Frames:
   - A data frame is a two-dimensional labeled data structure in pandas.
   - Can be thought of as a table with rows and columns.
   - Each column in a data frame is a series.
   - Used to store and manipulate multiple columns of data.
   - The index in a data frame represents the row labels, and each column has its o

In [185]:
# Refine with custom prompts

In [24]:
prompt_template = """ Write a concise summary of the following extracting the key
information: 
Text:`{text}`
CONCISE SUMMARY:""" 
initial_prompt = PromptTemplate(template = prompt_template,input_variables=['text'])

refine_template = '''
Your job is to produce a final summary.
I have provided an existing summary upto a certain point: {existing_answer}
Please refine the existing summary with some more context below. 
---------------
{text}
---------------
Start the final summary with an INTRODUCTION PARAGRAPH that gives an overview 
of the topic FOLLOWED by BULLET POINTS if possible AND  end the summary 
with a CONCLUSION PHRASE.
'''
refine_prompt = PromptTemplate(template=refine_template,
                               input_variables=['existing_answer','text']
                              )

In [25]:
llm = ChatOpenAI()

In [26]:
chain = load_summarize_chain(
    llm = llm,
    chain_type = 'refine',
    question_prompt = initial_prompt,
    refine_prompt = refine_prompt,
    return_intermediate_steps = False
)


In [27]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=10000, chunk_overlap=50)
chunks = text_splitter.create_documents([text])


NameError: name 'text' is not defined

In [None]:
output_summary = chain.run(chunks)



In [None]:
print(output_summary)