In [1]:
import os

import arxiv
import ast
import concurrent
from csv import writer
from IPython.display import display, Markdown, Latex
import json
import openai
import os
import pandas as pd
from PyPDF2 import PdfReader
import requests
from scipy import spatial
from tenacity import retry, wait_random_exponential, stop_after_attempt
import tiktoken
from tqdm import tqdm
from termcolor import colored
from dotenv import load_dotenv

load_dotenv()

openai.api_key = os.getenv("OPENAI_API_KEY")
openai.api_base = os.getenv("OPENAI_API_BASE") 
openai.api_type = 'azure'
openai.api_version = '2023-07-01-preview'

GPT_MODEL = "gpt-35-16k"
EMBEDDING_MODEL = "text-embedding-ada-002"

In [2]:
directory = './data/papers'

# Check if the directory already exists
if not os.path.exists(directory):
    # If the directory doesn't exist, create it and any necessary intermediate directories
    os.makedirs(directory)
    print(f"Directory '{directory}' created successfully.")
else:
    # If the directory already exists, print a message indicating it
    print(f"Directory '{directory}' already exists.")

Directory './data/papers' already exists.


In [3]:
# Set a directory to store downloaded papers
data_dir = os.path.join(os.curdir, "data", "papers")
paper_dir_filepath = "./data/arxiv_library.csv"

# Generate a blank dataframe where we can store downloaded files
df = pd.DataFrame(list())
df.to_csv(paper_dir_filepath)

In [4]:
@retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(3))
def embedding_request(text):
    response = openai.Embedding.create(input=text, engine=EMBEDDING_MODEL)
    return response


def get_articles(query, library=paper_dir_filepath, top_k=5):
    """This function gets the top_k articles based on a user's query, sorted by relevance.
    It also downloads the files and stores them in arxiv_library.csv to be retrieved by the read_article_and_summarize.
    """
    search = arxiv.Search(
        query=query, max_results=top_k, sort_by=arxiv.SortCriterion.Relevance
    )
    result_list = []
    for result in search.results():
        result_dict = {}
        result_dict.update({"title": result.title})
        result_dict.update({"summary": result.summary})

        # Taking the first url provided
        result_dict.update({"article_url": [x.href for x in result.links][0]})
        result_dict.update({"pdf_url": [x.href for x in result.links][1]})
        result_list.append(result_dict)

        # Store references in library file
        response = embedding_request(text=result.title)
        file_reference = [
            result.title,
            result.download_pdf(data_dir),
            response["data"][0]["embedding"],
        ]

        # Write to file
        with open(library, "a") as f_object:
            writer_object = writer(f_object)
            writer_object.writerow(file_reference)
            f_object.close()
    return result_list



In [5]:
# Test that the search is working
result_output = get_articles("GPT Transformer Orignal Papers")
result_output[0]

{'title': 'Beyond Generating Code: Evaluating GPT on a Data Visualization Course',
 'summary': "This paper presents an empirical evaluation of the performance of the\nGenerative Pre-trained Transformer (GPT) model in Harvard's CS171 data\nvisualization course. While previous studies have focused on GPT's ability to\ngenerate code for visualizations, this study goes beyond code generation to\nevaluate GPT's abilities in various visualization tasks, such as data\ninterpretation, visualization design, visual data exploration, and insight\ncommunication. The evaluation utilized GPT-3.5 and GPT-4 to complete\nassignments of CS171, and included a quantitative assessment based on the\nestablished course rubrics, a qualitative analysis informed by the feedback of\nthree experienced graders, and an exploratory study of GPT's capabilities in\ncompleting border visualization tasks. Findings show that GPT-4 scored 80% on\nquizzes and homework, and TFs could distinguish between GPT- and\nhuman-gene

In [6]:
str(result_output)

'[{\'title\': \'Beyond Generating Code: Evaluating GPT on a Data Visualization Course\', \'summary\': "This paper presents an empirical evaluation of the performance of the\\nGenerative Pre-trained Transformer (GPT) model in Harvard\'s CS171 data\\nvisualization course. While previous studies have focused on GPT\'s ability to\\ngenerate code for visualizations, this study goes beyond code generation to\\nevaluate GPT\'s abilities in various visualization tasks, such as data\\ninterpretation, visualization design, visual data exploration, and insight\\ncommunication. The evaluation utilized GPT-3.5 and GPT-4 to complete\\nassignments of CS171, and included a quantitative assessment based on the\\nestablished course rubrics, a qualitative analysis informed by the feedback of\\nthree experienced graders, and an exploratory study of GPT\'s capabilities in\\ncompleting border visualization tasks. Findings show that GPT-4 scored 80% on\\nquizzes and homework, and TFs could distinguish betwee

In [7]:
def strings_ranked_by_relatedness(
    query: str,
    df: pd.DataFrame,
    relatedness_fn=lambda x, y: 1 - spatial.distance.cosine(x, y),
    top_n: int = 100,
) -> list[str]:
    """Returns a list of strings and relatednesses, sorted from most related to least."""
    query_embedding_response = embedding_request(query)
    query_embedding = query_embedding_response["data"][0]["embedding"]
    strings_and_relatednesses = [
        (row["filepath"], relatedness_fn(query_embedding, row["embedding"]))
        for i, row in df.iterrows()
    ]
    strings_and_relatednesses.sort(key=lambda x: x[1], reverse=True)
    strings, relatednesses = zip(*strings_and_relatednesses)
    return strings[:top_n]

In [8]:
def read_pdf(filepath):
    """Takes a filepath to a PDF and returns a string of the PDF's contents"""
    # creating a pdf reader object
    reader = PdfReader(filepath)
    pdf_text = ""
    page_number = 0
    for page in reader.pages:
        page_number += 1
        pdf_text += page.extract_text() + f"\nPage Number: {page_number}"
    return pdf_text


# Split a text into smaller chunks of size n, preferably ending at the end of a sentence
def create_chunks(text, n, tokenizer):
    """Returns successive n-sized chunks from provided text."""
    tokens = tokenizer.encode(text)
    i = 0
    while i < len(tokens):
        # Find the nearest end of sentence within a range of 0.5 * n and 1.5 * n tokens
        j = min(i + int(1.5 * n), len(tokens))
        while j > i + int(0.5 * n):
            # Decode the tokens and check for full stop or newline
            chunk = tokenizer.decode(tokens[i:j])
            if chunk.endswith(".") or chunk.endswith("\n"):
                break
            j -= 1
        # If no end of sentence found, use n tokens as the chunk size
        if j == i + int(0.5 * n):
            j = min(i + n, len(tokens))
        yield tokens[i:j]
        i = j


def extract_chunk(content, template_prompt):
    """This function applies a prompt to some input content. In this case it returns a summarized chunk of text"""
    prompt = template_prompt + content
    response = openai.ChatCompletion.create(
        engine=GPT_MODEL, messages=[{"role": "user", "content": prompt}], temperature=0
    )
    return response["choices"][0]["message"]["content"]


def summarize_text(query):
    """This function does the following:
    - Reads in the arxiv_library.csv file in including the embeddings
    - Finds the closest file to the user's query
    - Scrapes the text out of the file and chunks it
    - Summarizes each chunk in parallel
    - Does one final summary and returns this to the user"""

    # A prompt to dictate how the recursive summarizations should approach the input paper
    summary_prompt = """Summarize this text from an academic paper. Extract any key points with reasoning.\n\nContent:"""

    # If the library is empty (no searches have been performed yet), we perform one and download the results
    library_df = pd.read_csv(paper_dir_filepath).reset_index()
    if len(library_df) == 0:
        print("No papers searched yet, downloading first.")
        get_articles(query)
        print("Papers downloaded, continuing")
        library_df = pd.read_csv(paper_dir_filepath).reset_index()
    library_df.columns = ["title", "filepath", "embedding"]
    library_df["embedding"] = library_df["embedding"].apply(ast.literal_eval)
    strings = strings_ranked_by_relatedness(query, library_df, top_n=2)
    print("Chunking text from paper")
    pdf_text = read_pdf(strings[0])

    # Initialise tokenizer
    tokenizer = tiktoken.get_encoding("cl100k_base")
    results = ""

    # Chunk up the document into 1500 token chunks
    chunks = create_chunks(pdf_text, 1500, tokenizer)
    text_chunks = [tokenizer.decode(chunk) for chunk in chunks]
    print("Summarizing each chunk of text")

    # Parallel process the summaries
    with concurrent.futures.ThreadPoolExecutor(
        max_workers=len(text_chunks)
    ) as executor:
        futures = [
            executor.submit(extract_chunk, chunk, summary_prompt)
            for chunk in text_chunks
        ]
        with tqdm(total=len(text_chunks)) as pbar:
            for _ in concurrent.futures.as_completed(futures):
                pbar.update(1)
        for future in futures:
            data = future.result()
            results += data

    # Final summary
    print("Summarizing into overall summary")
    response = openai.ChatCompletion.create(
        engine=GPT_MODEL,
        messages=[
            {
                "role": "user",
                "content": f"""Write a summary collated from this collection of key points extracted from an academic paper.
                        The summary should highlight the core argument, conclusions and evidence, and answer the user's query.
                        User query: {query}
                        The summary should be structured in bulleted lists following the headings Core Argument, Evidence, and Conclusions.
                        Key points:\n{results}\nSummary:\n""",
            }
        ],
        temperature=0,
    )
    return response


In [9]:
# Test the summarize_text function works
chat_test_response = summarize_text("GPT working principles")

Chunking text from paper
Summarizing each chunk of text


100%|██████████| 7/7 [00:36<00:00,  5.22s/it]


Summarizing into overall summary


In [10]:
print(chat_test_response["choices"][0]["message"]["content"])

Core Argument:
This academic paper explores the potential of using GPT-based agents in strategic game experiments, specifically the ultimatum game and the prisoner's dilemma game. The author develops prompts to enable GPT agents to understand and play the games. The results indicate that GPT can generate realistic outcomes and exhibit behavior consistent with human behavior in certain aspects. However, there are differences between GPT and human behavior, particularly in the evolution of choices over rounds.

Evidence:
- The author conducted experiments using GPT agents in the ultimatum and prisoner's dilemma games.
- GPT agents exhibited behavior consistent with human behavior in certain aspects, but there were differences, especially in the evolution of choices over rounds.
- The presence of social preference prompts had treatment effects on the behavior of GPT agents in both games.
- GPT agents had difficulty with strategic reasoning and understanding the game structure without spec

In [18]:
# @retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(3))
# def chat_completion_request(messages, functions=None, model="gpt-3.5-turbo-0613"):
#     headers = {
#         "Content-Type": "application/json",
#         "Authorization": "Bearer " + os.getenv("OPENAI_KEY"),
#     }
#     json_data = {"model": model, "messages": messages}
#     if functions is not None:
#         json_data.update({"functions": functions})
#     try:
#         response = requests.post(
#             "https://api.openai.com/v1/chat/completions",
#             headers=headers,
#             json=json_data,
#         )
#         return response
#     except Exception as e:
#         print("Unable to generate ChatCompletion response")
#         print(f"Exception: {e}")
#         return e

# @retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(3))
# def chat_completion_request(messages, functions=None, model=GPT_MODEL):
#     headers = {
#         "Content-Type": "application/json",
#         "Authorization": "Bearer " + os.getenv("OPENAI_KEY"),
#         # "api-key": openai.api_key,
#     }
#     json_data = {"model": model, "messages": messages}
#     if functions is not None:
#         json_data.update({"functions": functions})
#     try:
#         response = requests.post(
#             "https://api.openai.com/v1/chat/completions",
#             # "https://gptvivi.openai.azure.com/openai/deployments/gpt-35-16k/chat/completions?api-version=2023-07-01-preview",
#             headers=headers,
#             json=json_data,
#         )
#         return response
#     except Exception as e:
#         print("Unable to generate ChatCompletion response")
#         print(f"Exception: {e}")
#         return e

@retry(wait=wait_random_exponential(min=1, max=40), stop=stop_after_attempt(3))
def chat_completion_request(messages, functions=None, model=GPT_MODEL):
    headers = {
        "Content-Type": "application/json",
        "Authorization": "Bearer " + openai.api_key,
    }
    json_data = {"model": model, "messages": messages}
    if functions is not None:
        json_data.update({"functions": functions})
    try:
        response = openai.ChatCompletion.create(
        engine=GPT_MODEL,
        messages=messages,
        functions=functions,
        function_call="auto",  # auto is default, but we'll be explicit
    )
        return response
    except Exception as e:
        print("Unable to generate ChatCompletion response")
        print(f"Exception: {e}")
        return e


In [12]:
class Conversation:
    def __init__(self):
        self.conversation_history = []

    def add_message(self, role, content):
        message = {"role": role, "content": content}
        self.conversation_history.append(message)

    def display_conversation(self, detailed=False):
        role_to_color = {
            "system": "red",
            "user": "green",
            "assistant": "blue",
            "function": "magenta",
        }
        for message in self.conversation_history:
            print(
                colored(
                    f"{message['role']}: {message['content']}\n\n",
                    role_to_color[message["role"]],
                )
            )

In [13]:
# Initiate our get_articles and read_article_and_summarize functions
arxiv_functions = [
    {
        "name": "get_articles",
        "description": """Use this function to get academic papers from arXiv to answer user questions.""",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": f"""
                            User query in JSON. Responses should be summarized and should include the article URL reference
                            """,
                }
            },
            "required": ["query"],
        }
    },{
        "name": "read_article_and_summarize",
        "description": """Use this function to read whole papers and provide a summary for users.
        You should NEVER call this function before get_articles has been called in the conversation.""",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": f"""
                            Description of the article in plain text based on the user's query
                            """,
                }
            },
            "required": ["query"],
        },
    }
]


In [14]:
# # Initiate our get_articles and read_article_and_summarize functions
# arxiv_functions = [
#     {
#         "name": "get_articles",
#         "description": """Use this function to get academic papers from arXiv to answer user questions.""",
#         "parameters": {
#             "type": "object",
#             "properties": {
#                 "query": {
#                     "type": "string",
#                     "description": f"""
#                             User query in JSON. Responses should be summarized and should include the article URL reference
#                             """,
#                 }
#             },
#             "required": ["query"],
#         },
#         "name": "read_article_and_summarize",
#         "description": """Use this function to read whole papers and provide a summary for users.
#         You should NEVER call this function before get_articles has been called in the conversation.""",
#         "parameters": {
#             "type": "object",
#             "properties": {
#                 "query": {
#                     "type": "string",
#                     "description": f"""
#                             Description of the article in plain text based on the user's query
#                             """,
#                 }
#             },
#             "required": ["query"],
#         },
#     }
# ]

In [15]:
def chat_completion_with_function_execution(messages, functions=[None]):
    """This function makes a ChatCompletion API call with the option of adding functions"""
    response = chat_completion_request(messages, functions)
    print(response.json())
    full_message = response.json()["choices"][0]
    if full_message["finish_reason"] == "function_call":
        print(f"Function generation requested, calling function")
        return call_arxiv_function(messages, full_message)
    else:
        print(f"Function not required, responding to user")
        return response.json()

def call_arxiv_function(messages, full_message):
    """Function calling function which executes function calls when the model believes it is necessary.
    Currently extended by adding clauses to this if statement."""

    if full_message["message"]["function_call"]["name"] == "get_articles":
        try:
            parsed_output = json.loads(
                full_message["message"]["function_call"]["arguments"]
            )
            print("Getting search results")
            results = get_articles(parsed_output["query"])
        except Exception as e:
            print(parsed_output)
            print(f"Function execution failed")
            print(f"Error message: {e}")
        messages.append(
            {
                "role": "function",
                "name": full_message["message"]["function_call"]["name"],
                "content": str(results),
            }
        )
        try:
            print("Got search results, summarizing content")
            response = chat_completion_request(messages)
            return response.json()
        except Exception as e:
            print(type(e))
            raise Exception("Function chat request failed")

    elif (
        full_message["message"]["function_call"]["name"] == "read_article_and_summarize"
    ):
        parsed_output = json.loads(
            full_message["message"]["function_call"]["arguments"]
        )
        print("Finding and reading paper")
        summary = summarize_text(parsed_output["query"])
        return summary

    else:
        raise Exception("Function does not exist and cannot be called")

In [16]:
# Start with a system message
paper_system_message = """You are arXivGPT, a helpful assistant pulls academic papers to answer user questions.
You summarize the papers clearly so the customer can decide which to read to answer their question.
You always provide the article_url and title so the user can understand the name of the paper and click through to access it.
Begin!"""
paper_conversation = Conversation()
paper_conversation.add_message("system", paper_system_message)

In [19]:
# Add a user message
paper_conversation.add_message("user", "Hi, how does GPT and Transformer work?")
chat_response = chat_completion_with_function_execution(
    paper_conversation.conversation_history, functions=arxiv_functions
)
assistant_message = chat_response["choices"][0]["message"]["content"]
paper_conversation.add_message("assistant", assistant_message)
display(Markdown(assistant_message))

Unable to generate ChatCompletion response
Exception: Unrecognized request argument supplied: functions


AttributeError: 'InvalidRequestError' object has no attribute 'json'

In [20]:
# Add another user message to induce our system to use the second tool
paper_conversation.add_message(
    "user",
    "Can you read the Language Models for Speech Recognition paper for me and give me a summary",
)

updated_response = chat_completion_with_function_execution(
    paper_conversation.conversation_history, functions=arxiv_functions
)
display(Markdown(updated_response["choices"][0]["message"]["content"]))

Function generation requested, calling function
Finding and reading paper
Chunking text from paper
Summarizing each chunk of text


100%|██████████| 6/6 [00:07<00:00,  1.29s/it]


Summarizing into overall summary


Core Argument:
- This academic paper discusses the adaptation of language models (LMs) such as GPT, GPT-2, and BERT for automatic speech recognition (ASR).
- The paper presents experimental results showing that fine-tuned GPT, GPT-2, and their combination outperform neural LMs trained from scratch on in-domain text.
- The paper proposes a conversion method to compute the correct language prior probability based on bidirectional LM outputs, enabling BERT to achieve further improvements in ASR performance.
- The paper highlights the limited number of studies on the use of GPT and BERT in ASR and provides a comprehensive review of the Transformer-based LM structures used in these models.

Evidence:
- Experimental results show that fine-tuned GPT, GPT-2, and their combination outperform neural LMs trained from scratch on in-domain text.
- The paper presents a conversion method to compute the correct language prior probability based on bidirectional LM outputs, enabling BERT to achieve further improvements in ASR performance.
- The paper provides results of word level LMs trained from scratch on the AMI corpus and evaluated on different datasets.
- The experimental setup includes training data, acoustic models, and 100-best lists for rescoring.

Conclusions:
- Combining fine-tuned GPT, GPT-2, and BERT models achieves the lowest word error rates (WERs) in ASR.
- Fine-tuning pre-trained LMs on in-domain datasets outperforms training LMs from scratch with only in-domain data.
- The paper suggests that it may be more effective and efficient to fine-tune existing pre-trained LMs rather than building new LMs from scratch.
- Optimization methods during inference should be considered for large pre-trained models.
- The paper highlights the limited number of studies on the use of GPT and BERT in ASR and provides a comprehensive review of the Transformer-based LM structures used in these models.