In [None]:
# https://github.com/mendableai/QA_clustering/blob/main/notebooks/clustering_approach.ipynb

In [1]:
import os
import ast
import pandas as pd 
from langchain import LLMChain
from langchain_community.chat_models import ChatOllama
from langchain.prompts import PromptTemplate
from langchain.chains.mapreduce import MapReduceChain
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import (
                StuffDocumentsChain,
                LLMChain,
                ReduceDocumentsChain,
                MapReduceDocumentsChain,
            )
import nest_asyncio
import phoenix as px
from phoenix.session.evaluation import get_qa_with_reference, get_retrieved_documents
from phoenix.trace.langchain import LangChainInstrumentor


nest_asyncio.apply()

In [2]:
# Start the Phoenix app for this kernel; the cell output prints the local URL to open it.
session = px.launch_app()

🌍 To view the Phoenix app in your browser, visit http://localhost:6006/
📖 For more information on how to use Phoenix, check out https://docs.arize.com/phoenix


In [3]:
# Chat model used by run_mr for both the map and the reduce step.
# NOTE(review): assumes an Ollama server with the 'phi3' model is available locally — confirm.
llm = ChatOllama(model='phi3', temperature=0.0)

In [4]:
# Turn on Phoenix's LangChain instrumentation before any chain is run
# (presumably so that chain calls are traced into the Phoenix app launched above — confirm).
LangChainInstrumentor().instrument()

In [5]:
# Read data: one row per conversation; the `messages` column holds a
# stringified Python list of questions (parsed with ast.literal_eval in the next cell).
df = pd.read_csv('data/mock_data.csv')
df.head()

Unnamed: 0,messages
0,"['What is the purpose of LangChain?','How does..."
1,['Can I use LangChain for natural language und...
2,"['How do I install LangChain on Windows?','Are..."
3,['Can I integrate LangChain with my existing N...
4,"['Is LangChain compatible with TensorFlow?','C..."


In [6]:
# Parse each stringified list into a real Python list of questions
df['messages_unpack'] = df['messages'].apply(ast.literal_eval)
# Keep the opening question of every conversation in its own column
df['message_first'] = [conversation[0] for conversation in df['messages_unpack']]
df.head()

Unnamed: 0,messages,messages_unpack,message_first
0,"['What is the purpose of LangChain?','How does...","[What is the purpose of LangChain?, How does L...",What is the purpose of LangChain?
1,['Can I use LangChain for natural language und...,[Can I use LangChain for natural language unde...,Can I use LangChain for natural language under...
2,"['How do I install LangChain on Windows?','Are...","[How do I install LangChain on Windows?, Are t...",How do I install LangChain on Windows?
3,['Can I integrate LangChain with my existing N...,[Can I integrate LangChain with my existing NL...,Can I integrate LangChain with my existing NLP...
4,"['Is LangChain compatible with TensorFlow?','C...","[Is LangChain compatible with TensorFlow?, Can...",Is LangChain compatible with TensorFlow?


In [11]:
# Flatten the list of lists and convert to lowercase
combined_messages = [msg.lower() for sublist in df['messages_unpack'] for msg in sublist]
first_messages = [msg.lower() for msg in df['message_first']]

# Remove duplicates while keeping first-seen order.
# A plain set() would also deduplicate, but its iteration order for strings
# changes between kernel restarts (hash randomization), which would reorder
# the text later sent to the LLM and make runs non-reproducible.
unique_messages = list(dict.fromkeys(combined_messages))
unique_messages_first = list(dict.fromkeys(first_messages))

# Count total characters and estimate tokens (rough heuristic: ~4 characters per token)
total_chars = sum(len(question) for question in unique_messages)
total_chars_first = sum(len(question) for question in unique_messages_first)

# Log
print("Total questions: ", len(unique_messages))
print("Total first questions: ", len(unique_messages_first))
print("Total characters: ", total_chars)
print("Total tokens: ", total_chars/4)
print("Total characters first message: ", total_chars_first)
print("Total tokens first message: ", total_chars_first/4)

# Remove questions that should be excluded from the analysis (matched in lowercase)
remove_strings=["what is a prompt template?","how to cache llm calls?"]
filtered_unique_messages_first = [s for s in unique_messages_first if s not in remove_strings]
print("Total first questions final: ", len(filtered_unique_messages_first))

Total questions:  40
Total first questions:  20
Total characters:  2101
Total tokens:  525.25
Total characters first message:  1069
Total tokens first message:  267.25
Total first questions final:  20


In [12]:
# CharacterTextSplitter is already imported in the notebook's import cell,
# so it is not re-imported here.

# One document with every retained first question, separated by blank lines
input_doc = '\n\n'.join(filtered_unique_messages_first)

# Splitter that breaks on blank lines and measures chunk size with a tiktoken encoder
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=4000,
    chunk_overlap=0,
    separator="\n\n",
)
# Sanity check: how many chunks the input is split into
len(text_splitter.split_text(input_doc))

1

In [20]:
# Prompt for the map step: summarise the themes found in one chunk of questions.
# Spelling in the prompt text is corrected ("keywords", "representative") so the
# model is not handed misspelled instructions.
map_template_string = """The following is a list of questions, commands, and keywords that have been entered into a Q+A system:\n
{questions}

Based on this list of questions, please do 3 things: 
(1) identify the main themes 
(2) give a representative example question in each theme
(3) estimate the proportion of questions that fall into each theme

Helpful Answer:"""

# Prompt for the reduce step: merge the per-chunk summaries into one final list.
# NOTE(review): the extra `template` alias is kept only for compatibility;
# no visible cell reads it.
reduce_template_string = template = """The following is a list of summaries for questions entered into a Q+A system:
{question_summaries}

Take these and distill it into a final, consolidated list with: 
(1) the main question themes 
(2) two representative example questions in each theme
(3) estimate the proportion of questions that fall into each theme

Helpful Answer:"""

MAP_PROMPT = PromptTemplate(input_variables=["questions"], template=map_template_string)
REDUCE_PROMPT = PromptTemplate(input_variables=["question_summaries"], template=reduce_template_string)

In [21]:
# Map Reduce Chain
def run_mr(input_doc, MAP_PROMPT, REDUCE_PROMPT, llm_map=None, llm_reduce=None,
           splitter=None, token_max=4000):
    """Run a map-reduce summarisation of `input_doc` and return the final answer.

    The text is split into chunks, `MAP_PROMPT` is applied to every chunk, and
    the per-chunk outputs are then combined with `REDUCE_PROMPT`.

    Args:
        input_doc: Text to analyse (a single string).
        MAP_PROMPT: Prompt for the map step; must expose a `questions` variable.
        REDUCE_PROMPT: Prompt for the reduce step; must expose a
            `question_summaries` variable.
        llm_map: Model for the map step. Defaults to the notebook-level `llm`.
        llm_reduce: Model for the reduce step. Defaults to the notebook-level `llm`.
        splitter: Text splitter for the initial split. Defaults to the
            notebook-level `text_splitter`.
        token_max: Maximum number of tokens to group documents into during
            the reduce step.

    Returns:
        The value returned by `MapReduceChain.run` for `input_doc`, i.e. the
        final consolidated answer.
    """
    # Fall back to the notebook globals so existing three-argument calls keep working.
    if llm_map is None:
        llm_map = llm
    if llm_reduce is None:
        llm_reduce = llm
    if splitter is None:
        splitter = text_splitter

    map_llm_chain = LLMChain(llm=llm_map, prompt=MAP_PROMPT)
    reduce_llm_chain = LLMChain(llm=llm_reduce, prompt=REDUCE_PROMPT)

    # Takes a list of documents and combines them into a single string
    combine_documents_chain = StuffDocumentsChain(
        llm_chain=reduce_llm_chain,
        document_variable_name="question_summaries")

    # Combines and iteratively reduces the mapped documents
    reduce_documents_chain = ReduceDocumentsChain(
        # This is the final chain that is called.
        combine_documents_chain=combine_documents_chain,
        # Used if documents exceed context for `combine_documents_chain`
        collapse_documents_chain=combine_documents_chain,
        # The maximum number of tokens to group documents into.
        token_max=token_max)

    # Combining documents by mapping a chain over them, then combining results
    combine_documents = MapReduceDocumentsChain(
        # Map chain
        llm_chain=map_llm_chain,
        # Reduce chain
        reduce_documents_chain=reduce_documents_chain,
        # The variable name in the llm_chain to put the documents in
        document_variable_name="questions",
        # NOTE: returning the map-step outputs was flagged by the original
        # author as not working here, so it stays disabled.
        return_intermediate_steps=False)

    # Define Map-Reduce: split the input text, then run the chain above on the chunks
    map_reduce = MapReduceChain(
        # Chain to combine documents
        combine_documents_chain=combine_documents,
        # Splitter to use for initial split
        text_splitter=splitter)

    return map_reduce.run(input_text=input_doc)

In [22]:
# Run the map-reduce theme analysis over the retained first questions
result = run_mr(input_doc, MAP_PROMPT, REDUCE_PROMPT)
# Print the answer so it is visible in the notebook output
print(result)

In [None]:
# Deep Dive

In [23]:
# Deep-dive reduce prompt: focus the consolidated list on data/document handling questions.
# NOTE: this rebinds `reduce_template_string` and `REDUCE_PROMPT`, so any cell
# that passes REDUCE_PROMPT and is re-run after this one will use the deep-dive prompt.
# NOTE(review): the extra `template` alias is kept only for compatibility;
# no visible cell reads it.
reduce_template_string = template = """The following is a list of summaries for questions entered into a Q+A system:\n
{question_summaries}

Take these and distill it into a final, consolidated list with: 
(1) the top 10 questions related to loading, processing, and manipulating different types of data and documents.
(2) estimate the proportion of each question

Helpful Answer:"""
REDUCE_PROMPT = PromptTemplate(input_variables=["question_summaries"], template=reduce_template_string)

In [24]:
# Re-run the analysis with the current (deep-dive) REDUCE_PROMPT
result = run_mr(input_doc, MAP_PROMPT, REDUCE_PROMPT)
# Print the answer so it is visible in the notebook output
print(result)