# LangChain+GPT with Product data

In [5]:
# Install/upgrade the LangChain packages, faiss-cpu and tiktoken (quietly) via the %pip magic.
# NOTE(review): `--upgrade` with no version pins pulls the newest langchain, which can conflict with the
# langchain-community pin installed in the next cell (see the resolver error in that cell's output).
%pip install --upgrade --quiet langchain langchain-openai faiss-cpu tiktoken

Note: you may need to restart the kernel to use updated packages.


In [6]:
!pip install jq
!pip install rouge
!pip install langchain-community==0.0.24 # more recent versions throw errors

Collecting langchain-community==0.0.24
  Using cached langchain_community-0.0.24-py3-none-any.whl (1.7 MB)
Installing collected packages: langchain-community
  Attempting uninstall: langchain-community
    Found existing installation: langchain-community 0.0.25
    Uninstalling langchain-community-0.0.25:
      Successfully uninstalled langchain-community-0.0.25
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
langchain 0.1.10 requires langchain-community<0.1,>=0.0.25, but you have langchain-community 0.0.24 which is incompatible.[0m[31m
[0mSuccessfully installed langchain-community-0.0.24


In [8]:
from operator import itemgetter

from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain.memory import ConversationBufferWindowMemory
import json
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from rouge import Rouge
import numpy as np
from pathlib import Path
from pprint import pprint
import tiktoken
import os


[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Utils

In [9]:
# Following QMSum
def tokenize(sent):
    """Lower-case `sent`, split it with NLTK's `word_tokenize`, and re-join the tokens with single spaces."""
    words = word_tokenize(sent.lower())
    return ' '.join(words)

# For openai
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` encodes to under the tiktoken encoding named `encoding_name`."""
    return len(tiktoken.get_encoding(encoding_name).encode(string))

#num_tokens_from_string(str, "cl100k_base")

# Filter out noise markers introduced by speech recognition
def clean_data(text):
    """Remove transcription markers from `text` and collapse spelled-out acronyms.

    The substitutions are applied one after another, in the order listed below.
    """
    substitutions = (
        ('{ vocalsound }', ''),
        ('{ disfmarker }', ''),
        ('a_m_i_', 'ami'),
        ('l_c_d_', 'lcd'),
        ('p_m_s', 'pms'),
        ('t_v_', 'tv'),
        ('{ pause }', ''),
        ('{ nonvocalsound }', ''),
        ('{ gap }', ''),
    )
    cleaned = text
    for marker, replacement in substitutions:
        cleaned = cleaned.replace(marker, replacement)
    return cleaned

## Data Processing

In [None]:
def process_file(file, path):
    """Load one meeting JSON file and extract its transcript, queries and reference answers.

    The file is expected to contain the keys 'meeting_transcripts', 'topic_list',
    'general_query_list' and 'specific_query_list'.

    Args:
        file: Name of the JSON file inside `path`.
        path: Directory that contains `file`.

    Returns:
        A 5-tuple `(text, topic_list, general_query_list, specific_query_list, ref_list)`:
        - text: one cleaned, tokenized, lower-cased "speaker : content" string per transcript turn.
        - topic_list: the "topic" values of data['topic_list'].
        - general_query_list / specific_query_list: the "query" values of the respective lists.
        - ref_list: the tokenized "answer" values, general queries first, then specific ones.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
        KeyError: if one of the expected top-level keys (or a turn's 'speaker'/'content') is missing.
    """
    file_path = os.path.join(path, file)
    # The `with` block closes the handle; explicit UTF-8 keeps decoding independent of the platform default.
    with open(file_path, "r", encoding="utf-8") as handle:
        data = json.load(handle)

    # tokenize() lower-cases before tokenizing, so speaker and content are lowered together.
    text = [
        clean_data(tokenize(turn['speaker'] + ': ' + turn['content']))
        for turn in data['meeting_transcripts']
    ]

    topic_list = [entry["topic"] for entry in data['topic_list'] if "topic" in entry]

    general_query_list = []
    specific_query_list = []
    ref_list = []

    # General queries are processed before specific ones so that ref_list lines up with
    # general answers followed by specific answers.
    query_sources = (
        ('general_query_list', general_query_list),
        ('specific_query_list', specific_query_list),
    )
    for source_key, queries in query_sources:
        for entry in data[source_key]:
            if "query" in entry:
                queries.append(entry["query"])
            if "answer" in entry:
                ref_list.append(tokenize(entry["answer"]))

    return text, topic_list, general_query_list, specific_query_list, ref_list

## Creating the model

In [None]:
def create_model(text, k=20, model_name="gpt-3.5-turbo-16k"):
    """Build a retrieval-augmented question-answering chain over one meeting transcript.

    The OpenAI API key is deliberately not hard-coded: the OpenAI clients read it from the
    OPENAI_API_KEY environment variable, so set that variable before calling this function.

    Args:
        text: List of transcript strings to index (the caller passes one string per turn).
        k: Number of chunks the retriever returns per question.
        model_name: Name of the OpenAI chat model used to generate answers.

    Returns:
        `(chain, retriever)`: `chain` takes `{"question": ...}` and returns the answer as a string;
        `retriever` is the FAISS-backed retriever that the chain uses for its context.

    Raises:
        ValueError: if `text` yields no chunks to index.
    """
    # create_documents() already chunks every input string, so a second split_documents() pass is not needed.
    text_splitter = RecursiveCharacterTextSplitter()
    documents = text_splitter.create_documents(text)
    if not documents:
        raise ValueError("create_model() needs at least one non-empty transcript string to index")

    # Embed every chunk exactly once; the vectors are reused for the FAISS index below instead of
    # letting FAISS.from_documents() request the same embeddings a second time.
    embeddings = OpenAIEmbeddings()
    chunk_texts = [document.page_content for document in documents]
    embedding_list = embeddings.embed_documents(chunk_texts)

    print(f"You have {len(embedding_list)} embeddings")
    print(f"Here's a sample of one: {embedding_list[0][:3]}...")

    # Store the precomputed embeddings in a FAISS vectorstore
    vectorstore = FAISS.from_embeddings(
        text_embeddings=list(zip(chunk_texts, embedding_list)),
        embedding=embeddings,
        metadatas=[document.metadata for document in documents],
    )

    # Use vectorstore as retriever
    retriever = vectorstore.as_retriever(search_kwargs={"k": k})

    # Config model (temperature 0.0, as in the original runs)
    model = ChatOpenAI(model=model_name, temperature=0.0)

    # Prompt template. Kept verbatim (including the stray closing quote on the first line) so that
    # results stay comparable with earlier runs.
    template = """You're the assistant during the process of designing a new remote control. Use the below context from a meeting transcript to answer all questions."

    Context: {context}

    Question: {question}
    """

    prompt = ChatPromptTemplate.from_template(template)

    # Create LangChain chain: retrieve context for the question, fill the prompt, call the model, return text
    chain = (
        {
            "context": itemgetter("question") | retriever,
            "question": itemgetter("question")
        }
        | prompt
        | model
        | StrOutputParser()
    )

    return chain, retriever

## Querying the model

In [None]:
s_scores = []           # per-question ROUGE scores: generated answer vs. reference
r_scores = []           # per-question ROUGE scores: retrieved passages vs. reference
n_tok_ret = []          # per-file mean token count of the retrieved passages
n_tok_gen = []          # per-file mean token count of the generated answers
all_hyps_and_refs = []  # every {'hyp', 'ref'} pair, written to disk after the loop
PATH = '../Data/QMSum/data/Product/test/' #change /test to /all to run on entire dataset


def _answer_questions(chain, retriever, questions):
    """Ask every question; return (tokenized answers, concatenated retrieved passages), one entry per question."""
    answers = []
    sources = []
    for question in questions:
        response = chain.invoke({"question": question})
        # The chain retrieves its context internally; the retriever is queried again here only to
        # capture the passages so they can be scored and counted.
        docs = retriever.get_relevant_documents(question)
        sources.append(''.join(doc.page_content for doc in docs))
        answers.append(tokenize(response))
    return answers, sources


rouge = Rouge()

# sorted() makes the processing order (and thus the output file) reproducible; os.listdir() order is arbitrary.
for file in sorted(os.listdir(PATH)):
    if not file.endswith('.json'):
        continue

    text, topic_list, general_query_list, specific_query_list, ref_list = process_file(file, PATH)

    # Nothing to evaluate: skip before spending API calls on building the index.
    if not general_query_list and not specific_query_list:
        print(f"Skipping {file}: no queries found")
        continue

    chain, retriever = create_model(text)

    # Sample query printed for every file as a quick sanity check of the chain
    print(chain.invoke({"question": "What did the group discuss about remote control style and design optimization?"}))

    general_answers, general_sources = _answer_questions(chain, retriever, general_query_list)
    specific_answers, specific_sources = _answer_questions(chain, retriever, specific_query_list)

    # Same order as ref_list: general queries first, then specific ones
    hyp_list = general_answers + specific_answers
    retrieved = general_sources + specific_sources
    assert len(hyp_list) == len(ref_list), f"{file}: {len(hyp_list)} answers but {len(ref_list)} references"

    # Evaluating the outputs for each meeting transcript
    all_hyps_and_refs.extend({'hyp': hyp, 'ref': ref} for hyp, ref in zip(hyp_list, ref_list))

    # Summary evaluation
    s_scores += rouge.get_scores(hyp_list, ref_list)

    # Retriever evaluation
    tok_retrieved = [tokenize(passages) for passages in retrieved]
    r_scores += rouge.get_scores(tok_retrieved, ref_list)

    # Number of tokens
    retrieved_tok = [num_tokens_from_string(passages, "cl100k_base") for passages in retrieved]
    gen_tok = [num_tokens_from_string(hyp, "cl100k_base") for hyp in hyp_list]
    n_tok_ret.append(np.mean(retrieved_tok))
    n_tok_gen.append(np.mean(gen_tok))

# Single open via `with`: the handle is closed automatically and never leaked.
with open('prod_langchain_output.txt', 'w', encoding='utf-8') as f:
    for line in all_hyps_and_refs:
        f.write(f"{line}\n")

## Evaluation for entire dataset

In [11]:
def dict_mean(dict_list):
    """Average a list of dicts key by key.

    Args:
        dict_list: List of dicts with numeric values. The keys of the first dict decide
            which keys are averaged, so every dict must contain them.

    Returns:
        A dict mapping each key to the arithmetic mean of its values across `dict_list`;
        an empty dict when `dict_list` is empty.

    Raises:
        KeyError: if a later dict lacks one of the first dict's keys.
    """
    # Guard the empty case, which previously failed with an IndexError on dict_list[0].
    if not dict_list:
        return {}
    count = len(dict_list)
    return {key: sum(d[key] for d in dict_list) / count for key in dict_list[0]}

Summary evaluation

In [12]:
# Collect the per-question ROUGE entries of the generated answers, then print the mean of each metric.
rouge_1 = [entry['rouge-1'] for entry in s_scores]
rouge_2 = [entry['rouge-2'] for entry in s_scores]
rouge_l = [entry['rouge-l'] for entry in s_scores]

for heading, metric_scores in (
    ('rouge-1 summariser', rouge_1),
    ('rouge-2', rouge_2),
    ('rouge-l', rouge_l),
):
    print(heading)
    pprint(dict_mean(metric_scores))


rouge-1 summariser
{'f': 0.2948041544738696, 'p': 0.3493053466510047, 'r': 0.29319319530631666}
rouge-2
{'f': 0.0900664476607133, 'p': 0.10704261057110534, 'r': 0.09570058151497235}
rouge-l
{'f': 0.25260941313943397, 'p': 0.2970537819587526, 'r': 0.25299690129493096}


Retriever evaluation

In [13]:
# Collect the per-question ROUGE entries of the retrieved passages, then print the mean of each metric.
rouge_1 = [entry['rouge-1'] for entry in r_scores]
rouge_2 = [entry['rouge-2'] for entry in r_scores]
rouge_l = [entry['rouge-l'] for entry in r_scores]

for heading, metric_scores in (
    ('rouge-1 retriever', rouge_1),
    ('rouge-2', rouge_2),
    ('rouge-l', rouge_l),
):
    print(heading)
    pprint(dict_mean(metric_scores))

rouge-1 retriever
{'f': 0.22198814856725982, 'p': 0.17029521280968482, 'r': 0.46547392144334776}
rouge-2
{'f': 0.04205691349921885, 'p': 0.02993006017291858, 'r': 0.13952224705720365}
rouge-l
{'f': 0.2023090612778814, 'p': 0.1552680683953529, 'r': 0.42675458413629785}


Number of tokens

In [14]:
# Print the mean of the collected retrieved/generated token counts.
for label, token_means in (
    ('average retrieved tokens: ', n_tok_ret),
    ('average generated tokens: ', n_tok_gen),
):
    print(label, np.mean(token_means))

average retrieved tokens:  513.2662408424909
average generated tokens:  70.37067307692308
