# LangChain+GPT with Aveni data

In [None]:
%pip install --quiet  langchain langchain-openai faiss-cpu tiktoken

In [None]:
!pip install jq
!pip install langchain-community==0.0.24 # more recent versions throw errors
!pip install rouge

In [None]:
from operator import itemgetter

from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda, RunnablePassthrough
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain.memory import ConversationBufferWindowMemory
import json
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from rouge import Rouge
import numpy as np
from pathlib import Path
from pprint import pprint
import tiktoken
import os
from rouge import Rouge
import tiktoken
import numpy as np
import json

## Utils

In [None]:
# Following QMSum
def tokenize(sent):
    """Lower-case ``sent``, split it with ``word_tokenize`` and return the
    tokens re-joined by single spaces."""
    lowered = sent.lower()
    return ' '.join(word_tokenize(lowered))


def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens ``string`` encodes to under the tiktoken encoding
    named ``encoding_name``."""
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)

#num_tokens_from_string(str, "cl100k_base")

# filter some noises caused by speech recognition
def clean_data(text):
    """Strip transcription markers and collapse spelled-out acronyms.

    The substitutions are applied one after another, in the order listed,
    each to the result of the previous one. Returns the cleaned string.
    """
    substitutions = (
        ('{ vocalsound }', ''),
        ('{ disfmarker }', ''),
        ('a_m_i_', 'ami'),
        ('l_c_d_', 'lcd'),
        ('p_m_s', 'pms'),
        ('t_v_', 'tv'),
        ('{ pause }', ''),
        ('{ nonvocalsound }', ''),
        ('{ gap }', ''),
    )
    cleaned = text
    for marker, replacement in substitutions:
        cleaned = cleaned.replace(marker, replacement)
    return cleaned

## Data loading

In [None]:
# Load the annotated meeting file once. The same JSON document holds both the
# meeting transcript and the evaluation queries, so there is no need to open
# and parse it twice.
ANNOTATION_PATH = Path("../Data/Aveni/all/annotated_demo_08_11.json")

# The `with` block closes the file itself; no explicit close() is required.
with open(ANNOTATION_PATH, "r", encoding="utf-8") as f:
    transcript = json.load(f)

# Both names are kept because later cells may refer to either of them; they
# point at the same parsed document, which is only read, never modified.
demo_queries = transcript

# Extract only the meeting content: one lower-cased, tokenised and cleaned
# "speaker : utterance" string per turn.
text = [
    clean_data(' '.join(word_tokenize(
        turn['speaker'].lower() + ': ' + turn['content'].lower()
    )))
    for turn in transcript['meeting_transcripts']
]

# print(text)

# Make lists with the queries. Entries that lack the expected key are skipped.
topic_list = [
    entry["topic"] for entry in demo_queries['topic_list'] if "topic" in entry
]
general_query_list = [
    entry["query"] for entry in demo_queries['general_query_list'] if "query" in entry
]
specific_query_list = [
    entry["query"] for entry in demo_queries['specific_query_list'] if "query" in entry
]

## Creating the model

In [None]:
# Split meeting transcript into chunks with the splitter's default settings.
text_splitter = RecursiveCharacterTextSplitter()
documents = text_splitter.create_documents(text)
split_documents = text_splitter.split_documents(documents)
assert split_documents, "The transcript produced no documents to embed."

# Read the OpenAI API key from the environment instead of hard-coding it in
# the notebook, so the secret never ends up in a saved or shared copy.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if not OPENAI_API_KEY:
    raise RuntimeError(
        "Set the OPENAI_API_KEY environment variable before running this cell."
    )

# Embed split documents (one embedding per chunk; presumably one chunk per
# conversation turn as long as a turn fits in a single chunk).
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)
embedding_list = embeddings.embed_documents(
    [document.page_content for document in split_documents]
)

print(f"You have {len(embedding_list)} embeddings")
print(f"Here's a sample of one: {embedding_list[0][:3]}...")

# Store embeddings in FAISS vectorstore. The index is built from the same
# `split_documents` that were embedded above, so the printed statistics and
# the searchable index describe the same set of chunks.
vectorstore = FAISS.from_documents(split_documents, embeddings)

# Use vectorstore as retriever (top 5 chunks per query)
retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Config model
model = ChatOpenAI(model="gpt-3.5-turbo-16k", temperature=0.0, openai_api_key=OPENAI_API_KEY)

# Create prompt template
template = """You're the assistant for a financial advisor. Use the below context from a meeting transcript to answer all questions. If the answer cannot be found, write "n/a"

Context: {context}

Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)


def format_docs(docs):
    """Join the text of the retrieved documents into one context string.

    Without this step the prompt would receive the list of Document objects
    and render their Python repr instead of the plain transcript text.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# Create LangChain chain: retrieve context for the question, fill the prompt,
# call the chat model and return the answer as a plain string.
chain = (
    {
        "context": itemgetter("question") | retriever | RunnableLambda(format_docs),
        "question": itemgetter("question")
    }
    | prompt
    | model
    | StrOutputParser()
)

In [None]:
# Sanity check: send one ad-hoc question through the chain to confirm that
# retrieval and generation work end to end before running the query lists.
# NOTE(review): "risk adverse" is probably a typo for "risk averse" — confirm
# before changing it, since this exact wording is sent to the retriever/model.
chain.invoke({"question": "is the client risk adverse?"})

## Querying the model

In [None]:
# Answer every general query and record, in the same order, the text of the
# chunks the retriever returns for it.
general_answers, retrieved = [], []
for question in general_query_list:
    answer = chain.invoke({"question": question})
    hits = retriever.get_relevant_documents(question)
    # Concatenate the retrieved chunks back to back, without a separator.
    retrieved.append(''.join(hit.page_content for hit in hits))
    general_answers.append(tokenize(answer))


In [None]:
# Answer every specific query and collect the retrieved context for each one
# in a list owned by this cell.
specific_answers = []
specific_retrieved = []
for question in specific_query_list:
    response = chain.invoke({"question": question})
    docs = retriever.get_relevant_documents(question)
    # Concatenate the retrieved chunks back to back, without a separator.
    specific_retrieved.append(''.join(doc.page_content for doc in docs))
    specific_answers.append(tokenize(response))

# Rebuild `retrieved` as "general contexts followed by specific contexts".
# Appending to the shared list directly would add a second copy of the
# specific contexts every time this cell is re-run, which would put
# `retrieved` out of step with the answers and references used for evaluation.
retrieved = retrieved[:len(general_query_list)] + specific_retrieved

In [None]:
# Check: show the context that was retrieved for the first query.
first_context = retrieved[0]
print(first_context)

## Evaluation

In [None]:
# Read the annotated file and gather the tokenised reference answers in the
# same order as the hypotheses: general queries first, then specific queries.
with open('../Data/Aveni/all/annotated_demo_08_11.json') as refs:
    refs_data = json.load(refs)

ref_list = [
    tokenize(entry["answer"])
    for section in ('general_query_list', 'specific_query_list')
    for entry in refs_data[section]
    if "answer" in entry
]

# Every generated answer must have exactly one reference answer.
hyp_list = general_answers + specific_answers
assert len(hyp_list) == len(ref_list)

Summary evaluation

In [None]:
# Pair every generated answer with its reference answer.
data = [{'hyp': hyp, 'ref': ref} for hyp, ref in zip(hyp_list, ref_list)]

hyps = [pair['hyp'] for pair in data]
refs = [pair['ref'] for pair in data]
rouge = Rouge()

# Per-pair ROUGE scores, kept for inspection ...
scores = rouge.get_scores(hyps, refs)
# ... and the scores averaged over all pairs.
avg_scores = rouge.get_scores(hyps, refs, avg=True)

pprint(avg_scores)


# Write prediction and reference to file for qualitative evaluation.
# The file is opened exactly once; the `with` block closes it, so no handle
# is left open and no explicit close() is needed.
with open('output.txt', 'w', encoding='utf-8') as f:
    for line in data:
        f.write(f"{line}\n")

Retriever evaluation

In [None]:
# Tokenise each retrieved context with the same tokenizer used for the
# answers and references, then print the result.
tok_retrieved = list(map(tokenize, retrieved))
print(tok_retrieved)

In [None]:
# Pair the tokenised retrieved context of each query with its reference
# answer; pairing is by position and covers every entry of `ref_list`.
data = [
    {'hyp': tok_retrieved[idx], 'ref': ref_list[idx]}
    for idx in range(len(ref_list))
]

# Transpose the pairs into a list of hypotheses and a list of references.
hyps, refs = (list(column) for column in zip(*((d['hyp'], d['ref']) for d in data)))
rouge = Rouge()

# Per-pair ROUGE scores
scores = rouge.get_scores(hyps, refs)
# or the average over all pairs
avg_scores = rouge.get_scores(hyps, refs, avg=True)

pprint(avg_scores)

Number of tokens

In [None]:
# Token counts (cl100k_base encoding) of each retrieved context and of the
# generated answer at the same position in `hyp_list`.
retrieved_tok = []
gen_tok = []
for idx, context in enumerate(retrieved):
    retrieved_tok.append(num_tokens_from_string(context, "cl100k_base"))
    gen_tok.append(num_tokens_from_string(hyp_list[idx], "cl100k_base"))

print('average retrieved tokens Langchain: ', np.mean(retrieved_tok))
print('average generated tokens GPT: ', np.mean(gen_tok))
