# GPT end-to-end with Product data

In [1]:
!pip install --upgrade openai
!pip install jq
!pip install rouge

Collecting openai
  Using cached openai-1.13.3-py3-none-any.whl (227 kB)
Installing collected packages: openai
  Attempting uninstall: openai
    Found existing installation: openai 0.28.0
    Uninstalling openai-0.28.0:
      Successfully uninstalled openai-0.28.0
Successfully installed openai-1.13.3
Collecting rouge
  Using cached rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [2]:
import json
import nltk
nltk.download('punkt')
from nltk import word_tokenize
from rouge import Rouge
import numpy as np
from pathlib import Path
from pprint import pprint
import tiktoken
import os
from sys import displayhook
from openai import OpenAI

[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Utils

In [3]:
# Following QMSum
def tokenize(sent):
    """Lower-case ``sent``, split it with NLTK's ``word_tokenize`` and re-join the tokens with single spaces."""
    lowered = sent.lower()
    return ' '.join(word_tokenize(lowered))

# For openai
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count how many tokens ``string`` becomes under the tiktoken encoding called ``encoding_name``."""
    token_ids = tiktoken.get_encoding(encoding_name).encode(string)
    return len(token_ids)

#num_tokens_from_string(str, "cl100k_base")

# Substrings produced by speech recognition and what each one is replaced with.
# The pairs are applied in the listed order; an empty replacement deletes the marker.
_NOISE_REPLACEMENTS = (
    ('{ vocalsound }', ''),
    ('{ disfmarker }', ''),
    ('a_m_i_', 'ami'),
    ('l_c_d_', 'lcd'),
    ('p_m_s', 'pms'),
    ('t_v_', 'tv'),
    ('{ pause }', ''),
    ('{ nonvocalsound }', ''),
    ('{ gap }', ''),
)


def clean_data(text):
    """Return ``text`` with transcription markers removed and spelled-out acronyms collapsed."""
    cleaned = text
    for noisy, replacement in _NOISE_REPLACEMENTS:
        cleaned = cleaned.replace(noisy, replacement)
    return cleaned

In [4]:
# Take the key from the OPENAI_API_KEY environment variable instead of pasting it into the
# notebook, so the secret is never saved or committed together with the cells.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

## Data processing

In [None]:
def process_file(file, path):
    """Load one meeting JSON file and split it into transcript turns, queries and references.

    Parameters
    ----------
    file : str
        Name of the JSON file inside ``path``.
    path : str
        Directory that contains ``file``.

    Returns
    -------
    tuple
        ``(text, topic_list, general_query_list, specific_query_list, ref_list)``:

        * ``text`` - one cleaned, tokenised ``"speaker : content"`` string per entry of
          ``data['meeting_transcripts']``.
        * ``topic_list`` - the ``"topic"`` value of every entry in ``data['topic_list']``.
        * ``general_query_list`` / ``specific_query_list`` - the ``"query"`` values of the
          corresponding lists in the file.
        * ``ref_list`` - the tokenised ``"answer"`` values, general queries first and
          specific queries after them.
    """
    # Keep the path and the file handle under separate names; the with-block closes the handle.
    file_path = os.path.join(path, file)
    with open(file_path, "r", encoding="utf-8") as handle:
        data = json.load(handle)

    # Tokenise first, then clean: the noise markers are matched in their tokenised,
    # space-separated form (e.g. '{ vocalsound }').
    text = []
    for turn in data['meeting_transcripts']:
        cur_turn = turn['speaker'].lower() + ': '
        text.append(clean_data(' '.join(word_tokenize(cur_turn + turn['content'].lower()))))

    topic_list = [entry['topic'] for entry in data['topic_list'] if 'topic' in entry]

    general_queries = data['general_query_list']
    specific_queries = data['specific_query_list']
    general_query_list = [entry['query'] for entry in general_queries if 'query' in entry]
    specific_query_list = [entry['query'] for entry in specific_queries if 'query' in entry]

    # References follow the same order in which the queries are asked: general, then specific.
    ref_list = [tokenize(entry['answer']) for entry in general_queries if 'answer' in entry]
    ref_list += [tokenize(entry['answer']) for entry in specific_queries if 'answer' in entry]

    return text, topic_list, general_query_list, specific_query_list, ref_list

## Querying the model

In [5]:
def ask(query: str, info,
    print_message: bool = False,
    model: str = "gpt-3.5-turbo-16k",
    temperature: float = 0) -> str:
    """Send ``query`` to the chat model with ``info`` as system prompt and return the reply text.

    Parameters
    ----------
    query : str
        Question sent as the user message.
    info : str
        Instructions plus context sent as the system message.
    print_message : bool, optional
        When true, print the messages before they are sent.
    model : str, optional
        Name of the chat model to call.
    temperature : float, optional
        Sampling temperature passed to the API.

    Returns
    -------
    str
        Content of the first choice in the response.
    """
    messages = [
        {"role": "system", "content": info},
        {"role": "user", "content": query},
    ]
    if print_message:
        print(messages)
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature
    )
    return response.choices[0].message.content

In [8]:
s_scores = []
r_scores = []
meeting_tok = []
n_tok_gen = []
all_hyp_ref = []
PATH = '../Data/QMSum/data/Product/test/' #change /test to /all to run on entire dataset

# The scorer is the same for every meeting, so it is created once outside the loop.
rouge = Rouge()

# os.listdir returns entries in arbitrary order; sorting keeps the order of all_hyp_ref
# (and of the file written from it) the same between runs.
for file in sorted(os.listdir(PATH)):
    if file != '.ipynb_checkpoints':
        text, topic_list, general_query_list, specific_query_list, ref_list = process_file(file, PATH)

        # NOTE(review): `text` is a list of turns, so its list repr is what ends up in the prompt.
        # The prompt is kept byte-for-byte because it determines the reported scores and token counts.
        info = f"""You're the assistant during the process of designing a new remote control. Use the below meeting transcript to answer all questions. If the answer cannot be found, write "n/a"

        Transcript:
        \"\"\"
        {text}
        \"\"\"

        """

        # General queries first, then specific ones - the order process_file uses for ref_list.
        general_answers = [tokenize(ask(question, info)) for question in general_query_list]
        specific_answers = [tokenize(ask(question, info)) for question in specific_query_list]

        hyp_list = general_answers + specific_answers
        assert len(hyp_list) == len(ref_list), f"{file}: {len(hyp_list)} answers but {len(ref_list)} references"
        if not hyp_list:
            # A meeting without queries has nothing to score; skipping it avoids adding a NaN
            # to the per-meeting token averages.
            continue

        # Evaluating the outputs for each meeting transcript
        # Summary evaluation: one ROUGE score dict per (hypothesis, reference) pair.
        data = [{'hyp': hyp, 'ref': ref} for hyp, ref in zip(hyp_list, ref_list)]
        scores = rouge.get_scores(hyp_list, ref_list)

        s_scores.extend(scores)
        all_hyp_ref.extend(data)

        # Number of tokens in the context
        meeting_tok.append(num_tokens_from_string(info, "cl100k_base"))

        # Number of tokens generated (mean over the answers of this meeting)
        gen_tok = [num_tokens_from_string(hyp, "cl100k_base") for hyp in hyp_list]
        n_tok_gen.append(np.mean(gen_tok))

# Write every hypothesis/reference pair to disk, one dict repr per line.
# A single with-block opens the file once and closes it on exit; UTF-8 is set explicitly so
# non-ASCII characters in model answers do not depend on the platform default encoding.
with open('prod_gpt_output.txt', 'w', encoding='utf-8') as out_file:
    for line in all_hyp_ref:
        out_file.write(f"{line}\n")

## Evaluation for entire dataset

In [9]:
def dict_mean(dict_list):
    """Average a collection of dicts key by key.

    Parameters
    ----------
    dict_list : iterable of dict
        Dicts with numeric values. The keys of the first dict decide which keys are
        averaged, and every dict must contain them.

    Returns
    -------
    dict
        Maps each key of the first dict to the mean of its values over all dicts.
        An empty input gives an empty dict.

    Raises
    ------
    KeyError
        If a later dict lacks a key that the first dict has.
    """
    # Materialise once so generators work and the length is known.
    dicts = list(dict_list)
    if not dicts:
        return {}
    count = len(dicts)
    return {key: sum(d[key] for d in dicts) / count for key in dicts[0]}

Summary evaluation

In [10]:
# Split the per-answer scores by ROUGE variant, then print the mean f/p/r of each variant.
rouge_1 = [entry['rouge-1'] for entry in s_scores]
rouge_2 = [entry['rouge-2'] for entry in s_scores]
rouge_l = [entry['rouge-l'] for entry in s_scores]

for heading, variant_scores in (
    ('rouge-1 summariser', rouge_1),
    ('rouge-2', rouge_2),
    ('rouge-l', rouge_l),
):
    print(heading)
    pprint(dict_mean(variant_scores))


rouge-1 summariser
{'f': 0.313803413591332, 'p': 0.3380606402683946, 'r': 0.33276109428684886}
rouge-2
{'f': 0.09844713702081342, 'p': 0.10663581083659154, 'r': 0.11016142824236123}
rouge-l
{'f': 0.2670658366640899, 'p': 0.28606474442514934, 'r': 0.28501079814009156}


Number of tokens

In [11]:
#print('average retrieved tokens: ', np.mean(n_tok_ret))
# Each list holds one value per meeting, so these are means over meetings.
for label, per_meeting_values in (
    ('average generated tokens: ', n_tok_gen),
    ('average tokens in info (instructions + transcript): ', meeting_tok),
):
    print(label, np.mean(per_meeting_values))

average generated tokens:  89.77298534798534
average tokens in info (instructions + transcript):  9366.5
