# Deploy

In [1]:
import configparser

config = configparser.ConfigParser()
config.read('../config.ini')

username = config['credentials']['username']
password = config['credentials']['password']

bot_name = config['credentials']['bot_name']
bot_pass = config['credentials']['bot_pass']

DEFAULT_HOST_URL = 'https://speakeasy.ifi.uzh.ch'

## Login

In [2]:
from speakeasypy import Speakeasy
speakeasy = Speakeasy(host=DEFAULT_HOST_URL, username=username, password=password)
speakeasy.login()  
# This framework will help you log out automatically when the program terminates.

ModuleNotFoundError: No module named 'speakeasypy'

## Chatrooms

In [7]:
# Only check active chatrooms (i.e., remaining_time > 0) if active=True.
rooms = speakeasy.get_rooms(active=True)

In [8]:
rooms

[]

## Data

In [1]:
# import networkx
# import plotly

from rdflib.namespace import Namespace, RDF, RDFS, XSD
from rdflib.term import URIRef, Literal
import csv
import json
import pandas as pd
import rdflib
from collections import defaultdict, Counter
import locale
_ = locale.setlocale(locale.LC_ALL, '')

In [2]:
graph = rdflib.Graph()
graph.parse('../data/14_graph.nt', format='turtle')

<Graph identifier=N0e15d744b4b2414aba2f223d486ee3eb (<class 'rdflib.graph.Graph'>)>

In [3]:
WD = Namespace('http://www.wikidata.org/entity/')
WDT = Namespace('http://www.wikidata.org/prop/direct/')
SCHEMA = Namespace('http://schema.org/')
DDIS = Namespace('http://ddis.ch/atai/')

In [39]:
Q = """ PREFIX ddis: <http://ddis.ch/atai/>
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX schema: <http://schema.org/>

SELECT ?lbl WHERE {
            VALUES ?movieLabel { "Harry Potter"@en }
            ?movie rdfs:label ?movieLabel .
            ?movie wdt:P6886 ?writing language .
            ?writing language rdfs:label ?lbl .
            }


"""
graph.query(Q)

ParseException: Expected SelectQuery, found 'language'  (at char 325), (line:9, col:39)

In [37]:
import numpy as np
np.random.choice([1, 2, 3])

2

In [40]:
ent2lbl = {ent: str(lbl) for ent, lbl in graph.subject_objects(RDFS.label)}
lbl2ent = {lbl: ent for ent, lbl in ent2lbl.items()}

entity_emb = np.load('D:/Project/ATiAI-speakeasy/data/ddis-graph-embeddings/entity_embeds.npy')
relation_emb = np.load('D:/Project/ATiAI-speakeasy/data/ddis-graph-embeddings/relation_embeds.npy')

with open('D:/Project/ATiAI-speakeasy/data/ddis-graph-embeddings/entity_ids.del', 'r') as ifile:
    ent2id = {rdflib.term.URIRef(ent): int(idx) for idx, ent in csv.reader(ifile, delimiter='\t')}
    id2ent = {v: k for k, v in ent2id.items()}
with open('D:/Project/ATiAI-speakeasy/data/ddis-graph-embeddings/relation_ids.del', 'r') as ifile:
    rel2id = {rdflib.term.URIRef(rel): int(idx) for idx, rel in csv.reader(ifile, delimiter='\t')}
    id2rel = {v: k for k, v in rel2id.items()}

df_rel_extended = pd.read_pickle("D:/Project/ATiAI-speakeasy/data/df_rel_extend.pkl")
# self.lbl2rel = {row['relation']: row['relation_label'] for idx, row in df_rel_extended.iterrows()}
# self.rel2lbl = {row['relation_label']: row['relation'] for idx, row in df_rel_extended.iterrows()}

lbl2rel = {row['label']: row['id'] for idx, row in df_rel_extended.iterrows()}
rel2lbl = {row['id']: row['label'] for idx, row in df_rel_extended.iterrows()}

In [45]:
"Harry Potter" in lbl2ent.keys()

True

In [50]:
lbl2rel

{'director': 'P57',
 'directed by': 'P57',
 'directors': 'P57',
 'directed': 'P57',
 'director of': 'P57',
 'the director': 'P57',
 'screenwriter': 'P58',
 'publication date': 'P577',
 'release date': 'P577',
 'was released': 'P577',
 'genre': 'P136',
 'type': 'P136',
 'kind': 'P136',
 'category': 'P136',
 'theme': 'P136',
 'MPAA film rating': 'P1657',
 'score': 'P1657',
 'stars': 'P1657',
 'rate': 'P1657',
 'the MPAA film rating': 'P1657',
 'cast': 'P161',
 'cast member': 'P161',
 'actor': 'P161',
 'actress': 'P161',
 'played': 'P161',
 'image': 'image',
 'picture': 'image',
 'photo': 'image',
 'poster': 'image',
 'look like': 'image',
 'portrait': 'image',
 'depiction': 'image',
 'illustration': 'image',
 'drawing': 'image',
 'sketch': 'image',
 'snapshot': 'image',
 'recommend': 'recommend',
 'suggest': 'recommend',
 'advice': 'recommend',
 'advise': 'recommend',
 'propose': 'recommend',
 'recommendation': 'recommend',
 'suggestion': 'recommend',
 'proposal': 'recommend',
 'recommen

In [46]:
ent2id

{rdflib.term.URIRef('http://www.wikidata.org/entity/Q548978'): 0,
 rdflib.term.URIRef('http://www.wikidata.org/entity/Q239453'): 1,
 rdflib.term.URIRef('http://www.wikidata.org/entity/Q218960'): 2,
 rdflib.term.URIRef('http://www.wikidata.org/entity/Q1164083'): 3,
 rdflib.term.URIRef('http://www.wikidata.org/entity/Q1185321'): 4,
 rdflib.term.URIRef('http://www.wikidata.org/entity/Q9610'): 5,
 rdflib.term.URIRef('http://www.wikidata.org/entity/Q20666646'): 6,
 rdflib.term.URIRef('http://www.wikidata.org/entity/Q168383'): 7,
 rdflib.term.URIRef('http://www.wikidata.org/entity/Q65971152'): 8,
 rdflib.term.URIRef('http://www.wikidata.org/entity/Q33999'): 9,
 rdflib.term.URIRef('http://www.wikidata.org/entity/Q100595898'): 10,
 rdflib.term.URIRef('http://www.wikidata.org/entity/Q104839741'): 11,
 rdflib.term.URIRef('http://www.wikidata.org/entity/Q429700'): 12,
 rdflib.term.URIRef('http://www.wikidata.org/entity/Q96474589'): 13,
 rdflib.term.URIRef('http://www.wikidata.org/entity/Q19656156

In [61]:
# ent2id[WDT[lbl2ent['Harry Potter']]]
ent2id[lbl2ent['Harry Potter']]

1517

In [51]:
WDT[lbl2ent['Harry Potter']]

rdflib.term.URIRef('http://www.wikidata.org/prop/direct/http://www.wikidata.org/entity/Q8337')

In [None]:
        if label in self.lbl2rel.keys():
            return relation_emb[rel2id[WDT[self.lbl2rel[label]]]]
        elif label in self.lbl2ent.keys():
            return entity_emb[ent2id[self.lbl2ent[label]]]

In [None]:
ent_embs = [entity_emb[ent2id[lbl2ent[label]]] for ent in entities if ent in lbl2ent.keys()]


In [59]:
WDT[lbl2ent['Harry Potter']]

rdflib.term.URIRef('http://www.wikidata.org/prop/direct/http://www.wikidata.org/entity/Q8337')

In [60]:
'Q8337' in [str(e).split('/')[-1] for e in ent2id]

True

In [63]:
lbl2rel

{'director': 'P57',
 'directed by': 'P57',
 'directors': 'P57',
 'directed': 'P57',
 'director of': 'P57',
 'the director': 'P57',
 'screenwriter': 'P58',
 'publication date': 'P577',
 'release date': 'P577',
 'was released': 'P577',
 'genre': 'P136',
 'type': 'P136',
 'kind': 'P136',
 'category': 'P136',
 'theme': 'P136',
 'MPAA film rating': 'P1657',
 'score': 'P1657',
 'stars': 'P1657',
 'rate': 'P1657',
 'the MPAA film rating': 'P1657',
 'cast': 'P161',
 'cast member': 'P161',
 'actor': 'P161',
 'actress': 'P161',
 'played': 'P161',
 'image': 'image',
 'picture': 'image',
 'photo': 'image',
 'poster': 'image',
 'look like': 'image',
 'portrait': 'image',
 'depiction': 'image',
 'illustration': 'image',
 'drawing': 'image',
 'sketch': 'image',
 'snapshot': 'image',
 'recommend': 'recommend',
 'suggest': 'recommend',
 'advice': 'recommend',
 'advise': 'recommend',
 'propose': 'recommend',
 'recommendation': 'recommend',
 'suggestion': 'recommend',
 'proposal': 'recommend',
 'recommen

In [64]:
relation_emb[rel2id[WDT[lbl2rel['director']]]]

array([-1.33676361e+02, -9.00684128e+01,  9.15138168e+01,  1.41997632e+03,
       -5.46068764e+01, -5.30493450e+00, -4.55188894e+00, -1.84354210e+01,
       -6.24425964e+01,  2.04698067e+01, -3.32123108e+01,  7.82825661e+00,
        4.86940346e+01, -2.93750057e+01,  1.68282146e+01, -8.12460876e+02,
       -4.25059166e+01,  6.51136856e+01, -1.83176613e+01, -5.81603050e+01,
       -2.38190575e+01,  1.30369604e+03,  2.09224594e+02, -8.91351013e+01,
       -2.61498280e+01,  9.48086304e+02,  3.19745331e+01, -9.69241257e+01,
       -2.52839108e+01, -5.61198654e+01,  1.15285370e+02,  1.37968960e+01,
        4.77943611e+00,  8.22500305e+01,  2.25675049e+01, -3.98287811e+01,
       -8.24141788e+00, -2.73283478e+02, -6.28208542e+01, -1.01843243e+03,
       -2.32064800e+01,  2.65104103e+01,  5.19264603e+00,  1.13361763e+02,
       -1.10693121e+01, -5.51100121e+01, -2.02899475e+01, -1.77912827e+01,
        1.01491280e+02,  3.51695862e+01,  1.70231003e+02,  1.96907101e+01,
       -5.05333130e+02,  

In [65]:
relation_emb[rel2id[WDT[lbl2rel['director']]]]

array([-1.33676361e+02, -9.00684128e+01,  9.15138168e+01,  1.41997632e+03,
       -5.46068764e+01, -5.30493450e+00, -4.55188894e+00, -1.84354210e+01,
       -6.24425964e+01,  2.04698067e+01, -3.32123108e+01,  7.82825661e+00,
        4.86940346e+01, -2.93750057e+01,  1.68282146e+01, -8.12460876e+02,
       -4.25059166e+01,  6.51136856e+01, -1.83176613e+01, -5.81603050e+01,
       -2.38190575e+01,  1.30369604e+03,  2.09224594e+02, -8.91351013e+01,
       -2.61498280e+01,  9.48086304e+02,  3.19745331e+01, -9.69241257e+01,
       -2.52839108e+01, -5.61198654e+01,  1.15285370e+02,  1.37968960e+01,
        4.77943611e+00,  8.22500305e+01,  2.25675049e+01, -3.98287811e+01,
       -8.24141788e+00, -2.73283478e+02, -6.28208542e+01, -1.01843243e+03,
       -2.32064800e+01,  2.65104103e+01,  5.19264603e+00,  1.13361763e+02,
       -1.10693121e+01, -5.51100121e+01, -2.02899475e+01, -1.77912827e+01,
        1.01491280e+02,  3.51695862e+01,  1.70231003e+02,  1.96907101e+01,
       -5.05333130e+02,  

In [68]:
entities = [{'entity': 'harry potter', 'ent_start': 10, 'ent_end': 21, 'ent_type': 'CHARACTER', 'confidence': '0.84095', 'mapping': {'uri': rdflib.term.URIRef('http://www.wikidata.org/entity/Q3244512'), 'label': 'Harry Potter', 'match_score': 100.0}}]
entities[0]
relation = 'director'

In [82]:
def get_embedding(entities, relation):
    for ent in entities:
        label = ent['mapping']['label']
        print(f"Entity label: {label}")
        try:
            ent_embs = [entity_emb[ent2id[lbl2ent[label]]] for ent in entities if ent['mapping']['label'] in lbl2ent.keys()]
        except KeyError:
            ent_embs = []
    ent_emb = np.mean(ent_embs, axis=0) if len(ent_embs) > 0 else np.zeros(256)
    print(f"len(ent_embs): {len(ent_embs)}")
    relation_emb[rel2id[WDT[lbl2rel[relation]]]] if relation in lbl2rel.keys() else np.zeros(256)
    print("Got embeddings.")
    return ent_emb, rel_emb

In [85]:
rel_emb = relation_emb[rel2id[WDT[lbl2rel[relation]]]] if relation in lbl2rel.keys() else np.zeros(256)
rel_emb

array([-1.33676361e+02, -9.00684128e+01,  9.15138168e+01,  1.41997632e+03,
       -5.46068764e+01, -5.30493450e+00, -4.55188894e+00, -1.84354210e+01,
       -6.24425964e+01,  2.04698067e+01, -3.32123108e+01,  7.82825661e+00,
        4.86940346e+01, -2.93750057e+01,  1.68282146e+01, -8.12460876e+02,
       -4.25059166e+01,  6.51136856e+01, -1.83176613e+01, -5.81603050e+01,
       -2.38190575e+01,  1.30369604e+03,  2.09224594e+02, -8.91351013e+01,
       -2.61498280e+01,  9.48086304e+02,  3.19745331e+01, -9.69241257e+01,
       -2.52839108e+01, -5.61198654e+01,  1.15285370e+02,  1.37968960e+01,
        4.77943611e+00,  8.22500305e+01,  2.25675049e+01, -3.98287811e+01,
       -8.24141788e+00, -2.73283478e+02, -6.28208542e+01, -1.01843243e+03,
       -2.32064800e+01,  2.65104103e+01,  5.19264603e+00,  1.13361763e+02,
       -1.10693121e+01, -5.51100121e+01, -2.02899475e+01, -1.77912827e+01,
        1.01491280e+02,  3.51695862e+01,  1.70231003e+02,  1.96907101e+01,
       -5.05333130e+02,  

## NER

In [1]:
import sys
sys.path.append('../')
from models.entity import MovieEntityProcessor, EntityProcessor



In [2]:
q = "What's the rating of Harry Potter and Chamber of secrets?"

In [3]:
mep = MovieEntityProcessor()
mep.process(q)
mep.entities

ner_mit_movie_simple_distilbert_base_cased download started this may take some time.
Approximate size to download 15.5 MB
[OK!]


[{'entity': 'Harry Potter and Chamber of secrets',
  'ent_start': 21,
  'ent_end': 55,
  'ent_type': 'TITLE',
  'confidence': '0.88115007'}]

In [4]:
ep = EntityProcessor()
ep.process(q)
ep.entities

bert_base_token_classifier_conll03 download started this may take some time.
Approximate size to download 385.4 MB
[OK!]


[{'entity': 'Harry Potter',
  'ent_start': 21,
  'ent_end': 32,
  'ent_type': 'MISC',
  'confidence': '0.896088'},
 {'entity': 'Chamber of secrets',
  'ent_start': 38,
  'ent_end': 55,
  'ent_type': 'ORG',
  'confidence': '0.6660025'}]

In [5]:
from models.postprocess import PostProcessor

In [6]:
pp = PostProcessor(q, mep.entities, ep.entities)

In [7]:
pp.process()

{'movie_entities': [{'entity': 'Harry Potter and Chamber of secrets',
   'ent_start': 21,
   'ent_end': 55,
   'ent_type': 'TITLE',
   'confidence': '0.88115007',
   'mapping': {'uri': rdflib.term.URIRef('http://www.wikidata.org/entity/Q1148981'),
    'label': 'Harry Potter and the Chamber of Secrets',
    'match_score': 94.5945945945946}}],
 'all_entities': [{'entity': 'Harry Potter',
   'ent_start': 21,
   'ent_end': 32,
   'ent_type': 'MISC',
   'confidence': '0.896088',
   'mapping': {'uri': rdflib.term.URIRef('http://www.wikidata.org/entity/Q3244512'),
    'label': 'Harry Potter'}},
  {'entity': 'Chamber of secrets',
   'ent_start': 38,
   'ent_end': 55,
   'ent_type': 'ORG',
   'confidence': '0.6660025',
   'mapping': False}],
 'merged_entities': [{'entity': 'Harry Potter and Chamber of secrets',
   'ent_start': 21,
   'ent_end': 55,
   'ent_type': 'TITLE',
   'confidence': '0.88115007',
   'mapping': {'uri': rdflib.term.URIRef('http://www.wikidata.org/entity/Q1148981'),
    'lab

In [8]:
pp.all_entities, pp.movie_entities

([{'entity': 'Harry Potter',
   'ent_start': 21,
   'ent_end': 32,
   'ent_type': 'MISC',
   'confidence': '0.896088',
   'mapping': {'uri': {155379: rdflib.term.URIRef('http://www.wikidata.org/entity/Q3244512'),
     155380: rdflib.term.URIRef('http://www.wikidata.org/entity/Q8337')},
    'label': {155379: 'Harry Potter', 155380: 'Harry Potter'}}},
  {'entity': 'Chamber of secrets',
   'ent_start': 38,
   'ent_end': 55,
   'ent_type': 'ORG',
   'confidence': '0.6660025',
   'mapping': False}],
 [{'entity': 'Harry Potter and Chamber of secrets',
   'ent_start': 21,
   'ent_end': 55,
   'ent_type': 'TITLE',
   'confidence': '0.88115007',
   'mapping': {'uri': rdflib.term.URIRef('http://www.wikidata.org/entity/Q1148981'),
    'label': 'Harry Potter and the Chamber of Secrets',
    'match_score': 94.5945945945946}}])

In [9]:
from rapidfuzz import fuzz
def _entity_match(entity):
    """fuzzy matching of entity"""
    match_scores = pp.df_ent['label'].apply(lambda x: fuzz.ratio(x.lower(), entity.lower()))
    match_score = match_scores.max()
    match_idx = match_scores.idxmax()

    print(match_score, pp.df_ent.iloc[match_idx].to_dict())

    if match_score > 80:
        match_idx = match_scores.idxmax()
        match = pp.df_ent.iloc[match_idx].to_dict()
        match['match_score'] = match_score
        return match
    return False

In [10]:
match_res = _entity_match(mep.entities[0]['entity'])
# if match_res:
#     entity['mapping'] = match_res
# else:
#     entity['mapping'] = False
match_res

94.5945945945946 {'uri': rdflib.term.URIRef('http://www.wikidata.org/entity/Q1148981'), 'label': 'Harry Potter and the Chamber of Secrets'}


{'uri': rdflib.term.URIRef('http://www.wikidata.org/entity/Q1148981'),
 'label': 'Harry Potter and the Chamber of Secrets',
 'match_score': 94.5945945945946}

In [11]:
pp.movie_entities

[{'entity': 'Harry Potter and Chamber of secrets',
  'ent_start': 21,
  'ent_end': 55,
  'ent_type': 'TITLE',
  'confidence': '0.88115007',
  'mapping': {'uri': rdflib.term.URIRef('http://www.wikidata.org/entity/Q1148981'),
   'label': 'Harry Potter and the Chamber of Secrets',
   'match_score': 94.5945945945946}}]

# Query

In [10]:
import rdflib
graph = rdflib.Graph()
graph.parse('../data/14_graph.nt', format='turtle')

query_string = """# Who directed the movie Apocalypse Now?  

PREFIX ddis: <http://ddis.ch/atai/>   
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX schema: <http://schema.org/>   

SELECT ?director WHERE {  
    ?movie rdfs:label "Apocalypse Now"@en .  
        ?movie wdt:P57 ?directorItem . 
    ?directorItem rdfs:label ?director . 
}  

LIMIT 1  """

[str(s) for s, in graph.query(query_string)]

['Francis Ford Coppola']

In [15]:
# top user-rated movies
[str(s) for s, in graph.query('''
    PREFIX ddis: <http://ddis.ch/atai/> 
PREFIX ddis: <http://ddis.ch/atai/>   
PREFIX wd: <http://www.wikidata.org/entity/>   
PREFIX wdt: <http://www.wikidata.org/prop/direct/>   
PREFIX schema: <http://schema.org/>   

SELECT ?lbl WHERE {  
    ?movie wdt:P31 wd:Q11424 .  
    ?movie ddis:rating ?rating .  
    ?movie rdfs:label ?lbl .  
}  

ORDER BY DESC(?rating)   
LIMIT 1 
    ''')]

['Forrest Gump']

In [33]:
# top user-rated movies
[s for s, in graph.query('''
    PREFIX ddis: <http://ddis.ch/atai/> 
PREFIX ddis: <http://ddis.ch/atai/>   
PREFIX wd: <http://www.wikidata.org/entity/>   
PREFIX wdt: <http://www.wikidata.org/prop/direct/>   
PREFIX schema: <http://schema.org/>   

SELECT ?lbl WHERE {  
    ?movie wdt:P31 wd:Q11424 .  
    ?movie ddis:rating ?rating .  
    ?movie rdfs:label ?lbl .  
}  

ORDER BY DESC(?rating)   
LIMIT 1 
    ''')]

[rdflib.term.Literal('Forrest Gump', lang='en')]

## Agent

In [14]:
from speakeasypy import Chatroom
from typing import List
import time

DEFAULT_HOST_URL = 'https://speakeasy.ifi.uzh.ch'
listen_freq = 2


class Agent:
    def __init__(self, username, password):
        self.username = username
        # Initialize the Speakeasy Python framework and login.
        self.speakeasy = Speakeasy(host=DEFAULT_HOST_URL, username=username, password=password)
        self.speakeasy.login()  # This framework will help you log out automatically when the program terminates.

    def listen(self):
        while True:
            # only check active chatrooms (i.e., remaining_time > 0) if active=True.
            rooms: List[Chatroom] = self.speakeasy.get_rooms(active=True)
            for room in rooms:
                if not room.initiated:
                    # send a welcome message if room is not initiated
                    room.post_messages(f'Hello! This is a welcome message from {room.my_alias}.')
                    room.initiated = True
                # Retrieve messages from this chat room.
                # If only_partner=True, it filters out messages sent by the current bot.
                # If only_new=True, it filters out messages that have already been marked as processed.
                for message in room.get_messages(only_partner=True, only_new=True):
                    print(
                        f"\t- Chatroom {room.room_id} "
                        f"- new message #{message.ordinal}: '{message.message}' "
                        f"- {self.get_time()}")

                    # Implement your agent here #
                    # top user-rated movies

                    # Send a message to the corresponding chat room using the post_messages method of the room object.
                    room.post_messages(f"Received your message: '{message.message}' ")
                    # Mark the message as processed, so it will be filtered out when retrieving new messages.
                    room.mark_as_processed(message)

                # Retrieve reactions from this chat room.
                # If only_new=True, it filters out reactions that have already been marked as processed.
                for reaction in room.get_reactions(only_new=True):
                    print(
                        f"\t- Chatroom {room.room_id} "
                        f"- new reaction #{reaction.message_ordinal}: '{reaction.type}' "
                        f"- {self.get_time()}")

                    # Implement your agent here #

                    room.post_messages(f"Received your reaction: '{reaction.type}' ")
                    room.mark_as_processed(reaction)

            time.sleep(listen_freq)

    @staticmethod
    def get_time():
        return time.strftime("%H:%M:%S, %d-%m-%Y", time.localtime())


if __name__ == '__main__':
    demo_bot = Agent(bot_name, bot_pass)
    demo_bot.listen()


Login successful. Session token: 4Jf5mEfPTN_mQSnyr2diXUQuyDZ_lybJ
	- Chatroom 05910dd3-ad3a-41d1-992a-196979290da1 - new message #1: 'PREFIX ddis: <http://ddis.ch/atai/>   
PREFIX wd: <http://www.wikidata.org/entity/>   
PREFIX wdt: <http://www.wikidata.org/prop/direct/>   
PREFIX schema: <http://schema.org/>   

SELECT ?lbl WHERE {  
    ?movie wdt:P31 wd:Q11424 .  
    ?movie ddis:rating ?rating .  
    ?movie rdfs:label ?lbl .  
}  

ORDER BY DESC(?rating)   
LIMIT 1 ' - 13:44:46, 02-10-2024


KeyboardInterrupt: 

# Extract

In [None]:
import re
from typing import Dict, List, Tuple, Optional

class MovieQueryAnalyzer:
    def __init__(self, roles_ner: Dict[str, str], actions_ner: Dict[str, str], 
                 predicates_ner: Dict[str, str], numbers_ner: List[str]):
        self.roles_ner = roles_ner
        self.actions_ner = actions_ner
        self.predicates_ner = predicates_ner
        self.numbers_ner = numbers_ner
        
        # Question words and common phrases to remove
        self.question_patterns = [
            r"Who|What|When|How many|can you|Tell me|given that|does|do|did|let|us|me",
            r"is|are|I like|movie like\b|\bof\b|\?|\.",
            r"\bthe\b|\ba\b|\ban\b|\bin\b|\bon\b|\bat\b|\bto\b|\bfor\b",
            r"please|could you|would you|show|list|find|get",
            r"movies?|films?|pictures?|productions?|features?",
        ]
        
    def clean_text(self, text: str, entities: List[str] = None) -> str:
        """
        Clean text by removing entities and common question words/phrases
        
        Args:
            text: Input text to clean
            entities: List of entities to remove from text
            
        Returns:
            Cleaned text string
        """
        # Remove entities if provided
        if entities:
            entity_pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, entities)) + r')\b', 
                                      re.IGNORECASE)
            text = entity_pattern.sub("", text)
        
        # Remove question words and common phrases
        for pattern in self.question_patterns:
            text = re.sub(pattern, "", text, flags=re.IGNORECASE)
            
        # Clean up whitespace and punctuation
        text = re.sub(r'\s+', ' ', text)
        text = text.strip().replace('"', '').replace('  ', ' ')
        return text
        
    def extract_entities(self, text: str) -> Dict[str, List[str]]:
        """
        Extract named entities from text using predefined dictionaries
        """
        entities = {
            'people': [],
            'movies': [],
            'dates': [],
            'other': []
        }
        
        # Extract dates (years)
        year_pattern = r'\b(?:19|20)\d{2}\b'
        entities['dates'] = re.findall(year_pattern, text)
        
        # Extract potential movie titles (capitalized phrases)
        movie_pattern = r'\b(?:[A-Z][a-z]+\s*)+\b'
        potential_movies = re.findall(movie_pattern, text)
        entities['movies'] = [m for m in potential_movies if len(m.split()) > 1]
        
        return entities
        
    def get_relation_type(self, cleaned_text: str) -> Tuple[str, float]:
        """
        Identify the type of relation being queried
        
        Returns:
            Tuple of (relation_type, confidence_score)
        """
        relation_patterns = {
            'DIRECTED_BY': (r'direct|filmmaker|made by', 0.9),
            'ACTED_IN': (r'acted|starred|appeared|played|cast', 0.9),
            'RELEASED_IN': (r'released|came out|premiered|debut', 0.8),
            'GENRE_IS': (r'genre|type|category|kind of', 0.8),
            'RATED_AS': (r'rated|rating|score|review', 0.7),
            'COLLABORATED_IN': (r'together|both|collaborated|work together', 0.9),
            'PRODUCED_BY': (r'produced|made|created|developed', 0.7)
        }
        
        matches = []
        for rel_type, (pattern, base_score) in relation_patterns.items():
            if re.search(pattern, cleaned_text, re.IGNORECASE):
                # Adjust confidence based on pattern specificity
                confidence = base_score * len(re.findall(pattern, cleaned_text, re.IGNORECASE))
                matches.append((rel_type, confidence))
                
        return max(matches, key=lambda x: x[1]) if matches else ('UNKNOWN', 0.0)
        
    def generate_sparql_query(self, question: str) -> Dict[str, any]:
        """
        Generate complete SPARQL query with entities and relations
        """
        # Extract entities first
        entities = self.extract_entities(question)
        
        # Clean the text
        cleaned_text = self.clean_text(question, 
                                     entities=[e for sublist in entities.values() for e in sublist])
        
        # Get relation type
        relation_type, confidence = self.get_relation_type(cleaned_text)
        
        # Build query based on relation type
        query_template = self.get_query_template(relation_type)
        
        return {
            'original_question': question,
            'cleaned_text': cleaned_text,
            'entities': entities,
            'relation': relation_type,
            'confidence': confidence,
            'query_template': query_template
        }
        
    def get_query_template(self, relation_type: str) -> str:
        """
        Get SPARQL query template based on relation type
        """
        templates = {
            'DIRECTED_BY': """
                SELECT DISTINCT ?movie ?director
                WHERE {
                  ?movie wdt:P31 wd:Q11424;
                         wdt:P57 ?director.
                  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                }
            """,
            'COLLABORATED_IN': """
                SELECT DISTINCT ?movie
                WHERE {
                  ?movie wdt:P31 wd:Q11424;
                         wdt:P161 ?actor1;
                         wdt:P161 ?actor2.
                  FILTER(?actor1 != ?actor2)
                  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                }
            """,
            'RELEASED_IN': """
                SELECT DISTINCT ?movie ?date
                WHERE {
                  ?movie wdt:P31 wd:Q11424;
                         wdt:P577 ?date.
                  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
                }
            """
        }
        
        return templates.get(relation_type, """
            SELECT DISTINCT ?movie
            WHERE {
              ?movie wdt:P31 wd:Q11424.
              SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
            }
        """)

# Example usage:
def process_question(question: str) -> None:
    analyzer = MovieQueryAnalyzer(roles_ner, actions_ner, predicates_ner, numbers_ner)
    result = analyzer.generate_sparql_query(question)
    
    print(f"Original: {result['original_question']}")
    print(f"Cleaned: {result['cleaned_text']}")
    print(f"Entities: {result['entities']}")
    print(f"Relation: {result['relation']} (confidence: {result['confidence']:.2f})")
    print("\nSPARQL Template:")
    print(result['query_template'])

In [38]:
import re
import pandas as pd
t = "Who directed and see the movie Apocalypse Now?"
# t = "Is mike an actor?"


desc_pattern = re.compile(r'\b(?:' + '|'.join(map(re.escape, ['Harry Potter and Chamber of secrets', 'movie'])) + r')\b', re.IGNORECASE)
t_rem = desc_pattern.sub("", t)
for old, new in [(r"What|Who|When|How many|can you|tell me|show me|give me|given that|'s|'re|does|do|did|let|us|me|is|are|I like|movie like\b|\bof\b|\?|\.","")]:
    t_rem = re.sub(old, new, t_rem, flags=re.IGNORECASE)
    t_rem = t_rem.strip().replace('"', '')
    print(t_rem)
# rel_dict = {t_rem: self.relation_recognizer.get_relation(t_rem)}

directed and see the  Apocalypse Now


In [63]:
df_rel_extended = pd.read_pickle("D:/Project/ATiAI-speakeasy/data/df_rel_extend.pkl")
df_rel_extended

Unnamed: 0,id,label
0,P57,director
1,P57,directed by
2,P57,directors
3,P57,directed
4,P57,director of
...,...,...
311,P1875,represented by
312,P3931,copyright holder
313,P466,occupant
314,P7327,OFLC classification


In [37]:
df_rel['author of']

NameError: name 'df_rel' is not defined

In [2]:
PAT_EXTRACT = r"\b(What|When|Where|Why|How|How many|Tell me|Tell me about|Show me|Show me about|show|give|Give me|Let me know|Do you know|Who|Which|is|are|was|were|will|would|could|can|should|does|do|did|the|a|an|in|on|at|for|with|by|to|'s|'re|be|being|been)\b|\.|\?"

In [3]:
for pat, rep in [(PAT_EXTRACT, "")]:
    cleaned_text = "Show Top 5 horror movies"
    cleaned_text = re.sub(pat, rep, cleaned_text, flags=re.IGNORECASE)
    relation = cleaned_text.strip().replace('"', '')

In [30]:
DESC = False
AESC = False
desc_pattern = r"(?:top|best|highest|most|good|lowest|worse|bad|bottom|least)?\s*(\d+|one|two|three|four|five)\s*(?:good|best|highest|most|top|lowest|worse|bad|bottom|least)?"
matches = re.findall(desc_pattern, q, re.IGNORECASE)
for match in matches:
    number = matches[0] if matches[0] else 1 # Default to "1" if no number is found
    if re.findall(r"top|best|highest|most|good", q, re.IGNORECASE):
        DESC = True
    elif re.findall(r"lowest|worse|bad|bottom|least", q, re.IGNORECASE):
        AESC = True


In [31]:
matches

['five']

In [13]:
DESC

True

In [35]:
!export REPLICATE_API_TOKEN=r8_OBGb1aZnZ8lG3Z6ZeLSTy1i3LEFdPzx0CwQd0

'export' is not recognized as an internal or external command,
operable program or batch file.


In [4]:
import csv, rdflib
with open('D:/Project/ATiAI-speakeasy/data/ddis-graph-embeddings/entity_ids.del', 'r') as ifile:
    ent2id = {rdflib.term.URIRef(ent): int(idx) for idx, ent in csv.reader(ifile, delimiter='\t')}
    sid2ent = {v: k for k, v in ent2id.items()}


In [6]:
sid2ent

{0: rdflib.term.URIRef('http://www.wikidata.org/entity/Q548978'),
 1: rdflib.term.URIRef('http://www.wikidata.org/entity/Q239453'),
 2: rdflib.term.URIRef('http://www.wikidata.org/entity/Q218960'),
 3: rdflib.term.URIRef('http://www.wikidata.org/entity/Q1164083'),
 4: rdflib.term.URIRef('http://www.wikidata.org/entity/Q1185321'),
 5: rdflib.term.URIRef('http://www.wikidata.org/entity/Q9610'),
 6: rdflib.term.URIRef('http://www.wikidata.org/entity/Q20666646'),
 7: rdflib.term.URIRef('http://www.wikidata.org/entity/Q168383'),
 8: rdflib.term.URIRef('http://www.wikidata.org/entity/Q65971152'),
 9: rdflib.term.URIRef('http://www.wikidata.org/entity/Q33999'),
 10: rdflib.term.URIRef('http://www.wikidata.org/entity/Q100595898'),
 11: rdflib.term.URIRef('http://www.wikidata.org/entity/Q104839741'),
 12: rdflib.term.URIRef('http://www.wikidata.org/entity/Q429700'),
 13: rdflib.term.URIRef('http://www.wikidata.org/entity/Q96474589'),
 14: rdflib.term.URIRef('http://www.wikidata.org/entity/Q1965

In [7]:
with open('D:/Project/ATiAI-speakeasy/data/ddis-graph-embeddings/relation_ids.del', 'r') as ifile:
    rel2id = {rdflib.term.URIRef(rel): int(idx) for idx, rel in csv.reader(ifile, delimiter='\t')}
    id2rel = {v: k for k, v in rel2id.items()}

In [9]:
id2rel

{0: rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P161'),
 1: rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P800'),
 2: rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P103'),
 3: rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P750'),
 4: rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P106'),
 5: rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P9086'),
 6: rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P279'),
 7: rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P19'),
 8: rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P136'),
 9: rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P344'),
 10: rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P495'),
 11: rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P1412'),
 12: rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P57'),
 13: rdflib.term.URIRef('http://www.wikidata.org/prop/direct/P31'),
 14: rdflib.term.URIRef('http://www.wikidata.

In [13]:
df_rel_extended

Unnamed: 0,id,label
0,P57,director
1,P57,directed by
2,P57,directors
3,P57,directed
4,P57,director of
...,...,...
311,P1875,represented by
312,P3931,copyright holder
313,P466,occupant
314,P7327,OFLC classification


In [14]:
df_rel_extended = pd.read_pickle("D:/Project/ATiAI-speakeasy/data/df_rel_extend.pkl")

lbl2rel = {row['label']: row['id'] for idx, row in df_rel_extended.iterrows()}
rel2lbl = {row['id']: row['label'] for idx, row in df_rel_extended.iterrows()}

In [11]:
import pandas as pd

In [15]:
lbl2rel

{'director': 'P57',
 'directed by': 'P57',
 'directors': 'P57',
 'directed': 'P57',
 'director of': 'P57',
 'the director': 'P57',
 'screenwriter': 'P58',
 'publication date': 'P577',
 'release date': 'P577',
 'was released': 'P577',
 'genre': 'P136',
 'type': 'P136',
 'kind': 'P136',
 'category': 'P136',
 'theme': 'P136',
 'MPAA film rating': 'P1657',
 'score': 'P1657',
 'stars': 'P1657',
 'rate': 'P1657',
 'the MPAA film rating': 'P1657',
 'cast': 'P161',
 'cast member': 'P161',
 'actor': 'P161',
 'actress': 'P161',
 'played': 'P161',
 'image': 'image',
 'picture': 'image',
 'photo': 'image',
 'poster': 'image',
 'look like': 'image',
 'portrait': 'image',
 'depiction': 'image',
 'illustration': 'image',
 'drawing': 'image',
 'sketch': 'image',
 'snapshot': 'image',
 'recommend': 'recommend',
 'suggest': 'recommend',
 'advice': 'recommend',
 'advise': 'recommend',
 'propose': 'recommend',
 'recommendation': 'recommend',
 'suggestion': 'recommend',
 'proposal': 'recommend',
 'recommen

In [156]:
import pandas as pd

df = pd.read_csv('D:/Project/ATiAI-speakeasy/dataset/14_graph.tsv', sep='\t', header=None)
df

Unnamed: 0,0,1,2
0,<http://www.wikidata.org/entity/Q20720659>,<http://www.wikidata.org/prop/direct/P5021>,<http://www.wikidata.org/entity/Q105729789>
1,<http://www.wikidata.org/entity/Q2358294>,<http://schema.org/description>,Canadian actor
2,<http://www.wikidata.org/entity/Q30139610>,<http://www.wikidata.org/prop/direct/P495>,<http://www.wikidata.org/entity/Q142>
3,<http://www.wikidata.org/entity/Q897357>,<http://www.wikidata.org/prop/direct/P345>,tt0385751
4,<http://www.wikidata.org/entity/Q3373174>,<http://www.wikidata.org/prop/direct/P31>,<http://www.wikidata.org/entity/Q5>
...,...,...,...
2056772,<http://www.wikidata.org/entity/Q18547944>,<http://www.wikidata.org/prop/direct/P161>,<http://www.wikidata.org/entity/Q16238721>
2056773,<http://www.wikidata.org/entity/Q21002577>,<http://www.wikidata.org/prop/direct/P106>,<http://www.wikidata.org/entity/Q33999>
2056774,<http://www.wikidata.org/entity/Q1551529>,<http://www.wikidata.org/prop/direct/P106>,<http://www.wikidata.org/entity/Q33999>
2056775,<http://www.wikidata.org/entity/Q16993300>,<http://www.wikidata.org/prop/direct/P161>,<http://www.wikidata.org/entity/Q20109586>


In [159]:
df[df[1].str.contains('P58')  & df[0].str.contains('Q7750525')]

Unnamed: 0,0,1,2


In [140]:
df = pd.read_pickle("D:/Project/ATiAI-speakeasy/data/df_ent.pkl")
lbl = df[df['uri'].str.contains('Q315441')].iloc[0]['label']

In [149]:
df = pd.read_pickle("D:/Project/ATiAI-speakeasy/data/df_ent.pkl")
lbl = df[df['uri'].str.contains('Q4165246')].iloc[0]['label']
lbl

'Bechdel test'

In [153]:
def translate_res(res):
    print(f"Translating: {res}")
    if "entity/Q" in res:
        ent_id = res.split('/')[-1].strip('>')
        df = pd.read_pickle("D:/Project/ATiAI-speakeasy/data/df_ent.pkl")
        lbl = df[df['uri'].str.contains(ent_id)].iloc[0]['label']
        print(f"Translated: {lbl}")
        return lbl
    return res

In [154]:
translate_res('<http://www.wikidata.org/entity/Q4165246>')

Translating: <http://www.wikidata.org/entity/Q4165246>
Translated: Bechdel test


'Bechdel test'

In [None]:
entity_emb[ent2id[lbl2ent['the twilight zone']]]

In [None]:
def get_embedding(self, entities, relation):
    for ent in entities:
        label = ent['mapping']['label']
        print(f"Entity label: {label}")
        try:
            # ent_embs = [self.entity_emb[self.ent2id[self.lbl2ent[label]]] for ent in entities if ent in self.lbl2ent.keys()]
            ent_embs = [entity_emb[self.ent2id[self.lbl2ent[label]]] for ent in entities if ent['mapping']['label'] in lbl2ent.keys()]
        except KeyError:
            ent_embs = np.zeros(256)
    ent_emb = np.mean(ent_embs, axis=0) if len(ent_embs) > 0 else np.zeros(256)
    print(f"len(ent_embs): {len(ent_embs)}")
    rel_emb = relation_emb[self.rel2id[WDT[lbl2rel[relation]]]] if relation in lbl2rel.keys() else np.zeros(256)
    print("Got embeddings.")
    return ent_emb, rel_emb