In [18]:
# No need to use APE.
import os
# Move the working directory two levels up so that the relative paths used by
# later cells ("data/...", "result/...") resolve.
# NOTE(review): presumably '../..' is the repository root relative to this
# notebook's folder — confirm.
#
# The flag makes this cell idempotent. A bare relative chdir climbs two MORE
# directories every time the cell is re-run in the same kernel, which silently
# breaks every relative path below.
if not globals().get("_CWD_MOVED_TO_PROJECT_ROOT", False):
    os.chdir('../..')
    _CWD_MOVED_TO_PROJECT_ROOT = True
import dspy
from dotenv import load_dotenv
from dspy.datasets import DataLoader
from llm_api import LLM_API

# Prompt templates for the summarisation experiments. Each one is a plain
# string with a single `{text}` placeholder that is filled in with
# `str.format(text=...)` before the prompt is sent to a model.

# Single-paragraph summary. The prompt asks the model to finish with an
# "[End of Notes, Message #X]" trailer; the cell that consumes the response
# strips that trailer before storing the summary.
overall_summary = """
{text}
As a professional summarizer, create a concise and comprehensive summary of the provided text, be it an article, post, conversation, or passage, while adhering to these guidelines:
Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and conciseness.
Incorporate main ideas and essential information, eliminating extraneous language and focusing on critical aspects.
Rely strictly on the provided text, without including external information.
Format the summary in paragraph form for easy understanding.
Conclude your notes with [End of Notes, Message #X] to indicate completion, where "X" represents the total number of messages that I have sent. In other words, include a message counter where you start with #1 and add 1 to the message counter every time I send a message.
By following this optimized prompt, you will generate an effective summary that encapsulates the essence of the given text in a clear, concise, and reader-friendly manner.
"""

# Summary from up to five distinct perspectives, returned as a list.
# NOTE(review): the prompt text contains the typo "orderded" (and "a" instead
# of "an"). It is sent to the model verbatim, so fixing it changes the prompt;
# left untouched here so existing outputs stay comparable.
different_perspectives = """
{text}
As a professional summarizer, summarize this text from at most 5 different perspectives. Each perspective you pick should be content-rich and reflect specific insights or themes found in the original text. Avoid generic perspective like content overview.
The summaries should be a orderded list, insightful, and tailored to the text's nuances and themes.
"""

# Series of 3-5 self-contained summaries, each roughly twice as long as the
# previous one.
# NOTE(review): this template is not referenced by any of the cells visible
# in this notebook section — confirm whether it is still needed.
gradually_expanding = """
{text}
Summarize TEXT by producing a series of summaries, starting with a one-sentence summary and then creating subsequent summaries that are each about twice as long as their predecessor. It is essential that each summary is a complete and thorough representation of TEXT, independent of the other summaries, so that the reader can understand the content without needing to refer to any of the other summaries for context or clarification. Create a total of 3-5 independent summaries of progressively increasing size.
"""

In [5]:
# Load environment variables (OPENAI_API_KEY is read from the environment
# just below) before any client is constructed.
load_dotenv()

# OpenAI-backed language model handle used by the later generation cells.
turbo = dspy.OpenAI(
    model='gpt-3.5-turbo-0125',
    api_key=os.getenv('OPENAI_API_KEY'),
)

# Read the CSV into dspy examples; "text" is the only field and is also the
# only input key.
dl = DataLoader()
dataset = dl.from_csv(
    "data/wikibooks_splited.csv",
    fields=("text",),
    input_keys=("text",),
)

# 80/20 train/dev split.
# NOTE(review): no seed is passed here, so the split is presumably different
# on every run — confirm whether reproducibility matters for this experiment.
splits = dl.train_test_split(dataset, train_size=0.8)
trainset, devset = splits['train'], splits['test']

# Sanity check: split sizes, then the first example of each split.
print(len(trainset), len(devset))
print(trainset[0])
print(devset[0])

Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 10205.12it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 48.19it/s]
Generating train split: 1923 examples [00:00, 15738.71 examples/s]


1538 385
Example({'text': "['Examples\\nprint /d:lpt2 \\nprint /d:\\\\\\\\copyroom\\\\printer1 c:\\\\accounting\\\\\\nAdditional referencesAdditional references\\nTo send the file  in the current directory to a printer connected to LPT2 on the local computer, type:\\nTo send the file  in the c:\\\\Accounting directory to the Printer1 print queue on the \\\\\\\\CopyRoom server, type:\\nCommand-Line Syntax Key\\nPrint Command Reference\\nMode\\n']"}) (input_keys={'text'})
Example({'text': "['Ada Deutschland informiert seit der Entwicklung von Ada 83 über diese Programmiersprache, ihre Weiterentwicklung über Ada 95, Ada 2005 und Ada 2012 und die zugrunde liegenden Konzepte zur Entwicklung sicherer und zuverlässiger Software-Systeme.\\nSoftware-Systeme haben sich im Laufe der Zeit funktional massiv erweitert und vergrößert. Die Fehlerproblematik, insbesondere bei syntaktisch und semantisch nicht vollständig definierten Sprachen, kann durch Fehler vermeidende Sprachen wie Ada deutlich besse

In [19]:
import re

# The overall_summary prompt asks the model to end with
# "[End of Notes, Message #X]", where X is a running message counter. Match
# any counter token (digits, or a literal "X" echoed back by the model)
# instead of only "#1", so the trailer never leaks into the stored summary.
END_OF_NOTES_TRAILER = re.compile(r"\[End of Notes, Message #\w+\]")

# Accumulates one record per generated summary; a later cell appends to this
# list and writes it to disk.
output_list = []
llm = LLM_API()

# Only the first two examples are processed here (small smoke-test run).
for data in dataset[:2]:
    prompt = overall_summary.format(text=data['text'])
    response = llm.generate_text(prompt)
    # Keep everything before the first trailer; if no trailer is present the
    # whole response is kept.
    summary = END_OF_NOTES_TRAILER.split(response, maxsplit=1)[0].strip()
    # NOTE(review): "transformed_test" looks like a typo for
    # "transformed_text", but it is part of the written JSON schema, so it is
    # kept unchanged here.
    output_list.append({"original_text": data['text'], "transformed_test": summary, "type" : "overall_summary", "tag" : []})


# ["Ada Deutschland has been informing about the programming language Ada since its development in Ada 83, progressing through Ada 95, Ada 2005, and Ada 2012, emphasizing the creation of secure and reliable software systems. Despite the expansion and complexity of software systems over time, Ada remains a top choice for safety-critical systems due to its error-preventing nature. Ada 2012 introduced new reliability impulses with formal specification through Spark. While the focus has shifted towards IoT and Cyber Security challenges, Ada Deutschland continues to educate through conferences, workshops, online resources, and historical insights into Ada's evolution. The organization acknowledges AdaCore for their financial support in 2022 and 2023. [End of Notes, Message #1]"]

In [20]:
import json
import re

# Make the cell safe to re-run: drop any "different_perspectives" records left
# in output_list by a previous execution, so re-running does not write
# duplicate entries to the JSON file. Records of other types are untouched.
output_list[:] = [
    record for record in output_list
    if record.get("type") != "different_perspectives"
]

# Only the first two examples are processed here (small smoke-test run).
for data in dataset[:2]:
    prompt = different_perspectives.format(text=data['text'])
    response = turbo(prompt)
    # turbo(...) returns a sequence of completions; only the first is used.
    # The different_perspectives prompt does not request an
    # "[End of Notes, ...]" trailer, so this strip is purely defensive. It
    # accepts any counter value rather than only "#1".
    summary = re.split(r"\[End of Notes, Message #\w+\]", response[0], maxsplit=1)[0].strip()
    # NOTE(review): "transformed_test" looks like a typo for
    # "transformed_text", but it is part of the written JSON schema, so it is
    # kept unchanged here.
    output_list.append({"original_text": data['text'], "transformed_test": summary, "type" : "different_perspectives", "tag" : []})

# Serialize BEFORE opening the file: open(..., "w") truncates immediately, so a
# serialization error would otherwise destroy the previous results.
serialized = json.dumps(output_list, indent=4)

output_path = "result/genre_transformation/summary.json"
# Create the output folder if needed so a fresh checkout does not fail here.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, "w") as f:
    f.write(serialized)