In [3]:
import os
import openai
import tiktoken

In [4]:
openai.api_key = os.environ.get("OPENAI_API_KEY")

In [5]:
#TODO
# improved imports using os

import json

file1 = open(r'C:\Users\resmi\rwth\kglab\CEUR-WS-Event-Series--SS23--Group-2\eventseries\src\main\resources\openai\event_series.json')

event_series = json.load(file1)

file1.close()

file2 = open(r'C:\Users\resmi\rwth\kglab\CEUR-WS-Event-Series--SS23--Group-2\eventseries\src\main\resources\openai\events.json', encoding="utf8")

events = json.load(file2)

file2.close()

In [6]:
print("Number of events present in wikidata: ", len(events))
print("Number of event series present in wikidata: ", len(event_series))

Number of events present in wikidata:  3556
Number of event series present in wikidata:  134


In [7]:
# Drop events without title value
events_with_title = []

for event in events:
    if 'title' in event:
        events_with_title.append(event)

print("number of events with a title value: ", len(events_with_title))



# Drop out event series without title
event_series_with_title = []

for binding in event_series:
    if 'title' in binding:
        event_series_with_title.append(binding)

print("number of event series with a title value: ", len(event_series_with_title))

number of events with a title value:  3473
number of event series with a title value:  119


In [8]:
# subset events with entry "series"
events_with_title_and_series = []

for binding in events:
    if 'title' in binding and 'series' in binding:
        events_with_title_and_series.append(binding)
    
print("number of events with a title value, and linked to an event series: ", len(events_with_title_and_series))

# compare linked events with event series, drop the events if there is no match, making sure that corresponding event series exist
# "series" : type uri, value wikidata link

event_series_dummy = [event['series'] for event in event_series_with_title]

events_with_title_and_series_bijective = [event for event in events_with_title_and_series if event['series'] in event_series_dummy]

print("number of events with a title value, and with respective event series present in event series list: ",len(events_with_title_and_series_bijective))

number of events with a title value, and linked to an event series:  280
number of events with a title value, and with respective event series present in event series list:  255


In [9]:
# Split data into training and testing datasets

from sklearn.model_selection import train_test_split

train, test = train_test_split(events_with_title_and_series_bijective, test_size=0.1)

In [10]:
import copy

# drop series related data from test dataset
test_dropped = copy.deepcopy(test)

for item in test_dropped:
    if "series" in item:
        del item["series"]

    if "seriesLabel" in item:
        del item["seriesLabel"]

    if "ordinal" in item:
        del item["ordinal"]

In [10]:
# prompt for context

conversation = [
    {'role': 'system', 'content': 'You are a human'},

    {'role': 'user', 'content': 'In wikidata, there are about 3400 entries which are interesting to me. Lets call these as "events". Additionally, there are different and fewer group of entries, lets call these as "event series". Almost all events are a part of event series. I will provide real examples later on, but for context, we can draw similarities to this example: if each star wars movies are "events", then the star wars itself is the "event series".'},
    {'role': 'user', 'content': 'In wikidata, the property connecting the events to event series are missing, and my task is to deduct from the title of the event which event series does this event belong to. For humans it is an easy task for sure, but noone wants to edit thousands of entries by hand. This is where you step in.'},
    {'role': 'user', 'content': 'I want you to deduct which event series does the following events belong to. To help you out, i will provide titles of some random events, and their corresponding event series to help you out with the pattern recognition. Then i will provide more events for you to find out the event series for these.'},
]

# feed training events & corresponding event series into conversation

for count, item in enumerate(train, start=1):
    conversation.append({'role': 'user', 'content': 'Event ' + str(count) + " is named '" + item["title"] + "'"})
    conversation.append({'role': 'assistant', 'content': 'The event series for Event ' + str(count) + " is '" + item["seriesLabel"] + "'"})


In [12]:
for line in conversation[4:14]:
    print(line)

{'role': 'user', 'content': "Event 1 is named '2nd INCOSE Italia Conference on Systems Engineering'"}
{'role': 'assistant', 'content': "The event series for Event 1 is 'INCOSE Italia Conference on Systems Engineering'"}
{'role': 'user', 'content': "Event 2 is named '8th Joint Workshop on Interfaces and Human Decision Making for Recommender Systems'"}
{'role': 'assistant', 'content': "The event series for Event 2 is 'Joint Workshop on Interfaces and Human Decision Making for Recommender Systems'"}
{'role': 'user', 'content': "Event 3 is named '10th International Workshop on Enterprise Modeling and Information Systems Architectures'"}
{'role': 'assistant', 'content': "The event series for Event 3 is 'International Workshop on Enterprise Modeling and Information Systems Architectures'"}
{'role': 'user', 'content': "Event 4 is named '5th International Conference on Biomedical Ontology'"}
{'role': 'assistant', 'content': "The event series for Event 4 is 'International Conference on Biomedic

In [12]:
# feed test dataset titles
deduced_event_series = []

for event in test_dropped:
    conversation.append({'role': 'user', 'content': "The event for you to find event series is " + event["title"]})


    # Send the conversation to ChatGPT
    response = openai.ChatCompletion.create(
        model='gpt-3.5-turbo-16k',
        messages=conversation,
        stop=None,
        temperature=0.7
    )

# Extract the deduced event series from the response
    deduced_event_series.append(response['choices'][0]['message']['content'])

# Add the assistant's response to the conversation history
    conversation.append({'role': 'assistant', 'content': response['choices'][0]['message']['content']})


In [20]:
print(deduced_event_series)

["The event series for the 5th International Conference on the Quality of Information and Communications Technology is 'International Conference on the Quality of Information and Communications Technology'.", "The event series for the Third Workshop on Bibliometric-enhanced Information Retrieval is 'Workshop on Bibliometric-enhanced Information Retrieval'.", "The event series for the 3rd Workshop on Managing the Evolution and Preservation of the Data Web is 'Workshop on Managing the Evolution and Preservation of the Data Web'.", "The event series for the Thirteenth International Conference on Concept Lattices and Their Applications is 'International Conference on Concept Lattices and their Applications'.", 'I\'m sorry, but I couldn\'t find a specific event series with the name "Conference on Information Technologies - Applications and Theory". It\'s possible that the event series might have a different name or it is not widely known.', "The event series for the Latin-American Workshop 

In [13]:
for event in test_dropped:
    print(event["title"])

5th International Conference on the Quality of Information and Communications Technology
Third Workshop on Bibliometric-enhanced Information Retrieval
3rd Workshop on Managing the Evolution and Preservation of the Data Web
Thirteenth International Conference on Concept Lattices and Their Applications
Conference on Information Technologies - Applications and Theory
Latin-American Workshop on Non-Monotonic Reasoning, 1st Intl. LA-NMR04 Workshop, Antiguo Colegio de San Ildefonso
4th Workshop on Managing the Evolution and Preservation of the Data Web
2nd Joint Workshop on Bibliometric-enhanced Information Retrieval and Natural Language Processing for Digital Libraries
9th Italian Convention on Computational Logic
3rd International GamiFIN Conference
International Workshop on the Semantic Web, Hawaii
9th International Workshop on Software Ecosystems
7th Workshop on Ontologies and Data in Life Sciences, ODLS 2016, organized by the GI Workgroup Ontologies in Biomedicine and Life Sciences (OBM

In [23]:
pattern = "'(.*?)'"
import re
    
extracted_texts = [re.search(pattern, s).group(1) if re.search(pattern, s) else '' for s in deduced_event_series]

In [24]:
for line in extracted_texts:
    print(line)

International Conference on the Quality of Information and Communications Technology
Workshop on Bibliometric-enhanced Information Retrieval
Workshop on Managing the Evolution and Preservation of the Data Web
International Conference on Concept Lattices and their Applications
m sorry, but I couldn
Latin-American Workshop on Non-Monotonic Reasoning
Workshop on Managing the Evolution and Preservation of the Data Web
Joint Workshop on Bibliometric-enhanced Information Retrieval and Natural Language Processing for Digital Libraries
m sorry, but I couldn
GamiFIN Conference
International Workshop on the Semantic Web
International Workshop on Software Ecosystems
Workshop on Ontologies and Data in Life Sciences
International i* Workshop
Workshop on Bridging the Gap between Human and Automated Reasoning
International Workshop on Artificial Intelligence and Cognition
European Summer School in Information Retrieval
Evaluating Semantic Textual Similarity and Textual Entailment in Portuguese
Joint 

In [11]:
print(len(test_dropped))

26


In [15]:
for org in test:
    print(org["seriesLabel"])

Quality of Information and Communications Technology
International Workshop on Bibliometric-enhanced Information Retrieval
Workshop on Managing the Evolution and Preservation of the Data Web
International Conference on Concept Lattices and their Applications
Conference on Theory and Practice of Information Technologies
Latin-American Workshop on Non-Monotonic Reasoning
Workshop on Managing the Evolution and Preservation of the Data Web
Joint Workshop on Bibliometric-enhanced Information Retrieval and Natural Language Processing for Digital Libraries
Italian Conference on Computational Logic
GamiFIN Conference
International Workshop on the Semantic Web
International Workshop on Software Ecosystems
Workshop on Ontologies and Data in Life Sciences
International i* Workshop
Workshop on Bridging the Gap between Human and Automated Reasoning
International Workshop on Artificial Intelligence and Cognition
BCS-IRSG Symposium on Future Directions in Information Access
Brazilian Symposium in Inf

In [25]:
sum = 0

for line, org in zip(extracted_texts, test):
    if line == org["seriesLabel"]:
        sum += 1

print("accuracy: ", sum/len(test))


accuracy:  0.6923076923076923
