In [8]:
import nltk
import re
import hashlib
from random import randint
from random import seed
from nltk.corpus import framenet as fn
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem.wordnet import WordNetLemmatizer

FUNZIONI DI PREPROCESSING

In [9]:
#il pre-processing consiste nella tokenizzazione, lemmatizzazione,
#rimozione della punteggiatura e delle stopwords di una sentence
def pre_processing(sentence):
    return set(remove_stopwords(tokenize_sentence(remove_punctuation(sentence))))

#Effettua la lemmatizzazione e rimuove le stowords da una lista di parole
def remove_stopwords(words_list):
    stopwords_list = get_stopwords()
    return [value for value in words_list if value not in stopwords_list]

#Rimuove la punteggiatura da una sentence
#Restituisce la sentence senza punteggiature
def remove_punctuation(sentence):
    return re.sub(r'[^\w\s]','',sentence)

#Tokenizza la frase in input e ne affettua anche la lemmatizzazione della sue parole
def tokenize_sentence(sentence):
    words_list = []
    lmtzr = WordNetLemmatizer()
    for tag in nltk.pos_tag(word_tokenize(sentence)):
        if (tag[1][:2] == "NN"):
            words_list.append(lmtzr.lemmatize(tag[0], pos = wn.NOUN))
        elif (tag[1][:2] == "VB"):
             words_list.append(lmtzr.lemmatize(tag[0], pos = wn.VERB))
        elif (tag[1][:2] == "RB"):
             words_list.append(lmtzr.lemmatize(tag[0], pos = wn.ADV))
        elif (tag[1][:2] == "JJ"):
             words_list.append(lmtzr.lemmatize(tag[0], pos = wn.ADJ))
    return words_list

#Restituisce la l'insieme di stopwords dal file delle stopwords
def get_stopwords():
    stopwords = open("stop_words_FULL.txt", "r")
    stopwords_list = []
    for word in stopwords:
        stopwords_list.append(word.replace('\n', ''))
    stopwords.close()
    return stopwords_list

FUNZIONI UTILI PER IL RITROVAMENTO DEI FRAME

In [10]:
#Struttura dati che contiene i contesti associati al frame, ai suoi frame elements e alle sue lexical units
class ContextsFrame:
    
    def __init__(self, frame_id, frame_name, frame_context,frame_elements_contexts, lexical_units_contexts):
        self.frame_id = frame_id
        self.frame_name = frame_name
        self.frame_context = frame_context
        #dizionario: [frame_element_name] -> context
        self.frame_elements_contexts = frame_elements_contexts
        #dizionario: [lexical_unit_name] -> context
        self.lexical_units_contexts = lexical_units_contexts
        
    def get_frame_elements_contexts(self):
        return self.frame_elements_contexts
    def get_lexical_units_contexts(self):
        return self.lexical_units_contexts
    def get_frame_id(self):
        return self.frame_id
    def get_frame_name(self):
        return self.frame_name
    def get_frame_context(self):
        return self.frame_context
    #stampa fatta al solo scopo di conoscere le struttura interna della classe
    def printContextsFrame(self):
        print("FRAME ID: ",self.get_frame_id())
        print("FRAME NAME: ",self.get_frame_name())
        print("\nFRAME CONTEXT: ",self.get_frame_context())
        print("\nFRAME ELEMENTS CONTEXTS: ",self.get_frame_elements_contexts())
        print("\nLEXICAL UNITS CONTEXTS: ",self.get_lexical_units_contexts())
        print("------------------------------")
        
#Risultati di WordNet.
#Struttura che contiene i synset associati al frame in questione, ai suoi frame elements e alle sue lexical_units
#Questi risultati dovranno poi essere confrontati con annotazioni fatte degli umani
#quindi ogni annotazione per ogni frame dovrà essere di questo tipo e.s SynsetsFrameAnnotation
class SynsetsFrame:
    def __init__(self, frame_id, frame_name, frame_synset,frame_elements_synsets, lexical_units_synsets):
        self.frame_id = frame_id
        self.frame_name = frame_name
        self.frame_synset = frame_synset
        self.frame_elements_synsets = frame_elements_synsets
        self.lexical_units_synsets = lexical_units_synsets
    def get_frame_elements_synsets(self):
        return self.frame_elements_synsets
    def get_lexical_units_synsets(self):
        return self.lexical_units_synsets
    def get_frame_id(self):
        return self.frame_id
    def get_frame_name(self):
        return self.frame_name
    def get_frame_synset(self):
        return self.frame_synset
    #stampa fatta al solo scopo di conoscere le struttura interna della classe
    def printSynsetsFrame(self):
        print("FRAME ID: ",self.get_frame_id())
        print("FRAME NAME: ",self.get_frame_name())
        print("FRAME SYNSET: ",self.get_frame_synset())
        print("FRAME ELEMENTS SYNSETS: ",self.get_frame_elements_synsets())
        print("LEXICAL UNITS SYNSETS: ",self.get_lexical_units_synsets())
        print("------------------------------")

def print_frames_with_IDs():
    for x in fn.frames():
        print('{}\t{}'.format(x.ID, x.name))

def get_frams_IDs():
    return [f.ID for f in fn.frames()]   

#restituisce un insieme di 5 frame legati allo cognome dello studente
def getFrameSetForStudent(surname, list_len=5):
    nof_frames = len(fn.frames())
    base_idx = (abs(int(hashlib.sha512(surname.encode('utf-8')).hexdigest(), 16)) % nof_frames)
    print('COGNOME STUDENTE: ' + surname + "\n")
    framenet_IDs = get_frams_IDs()
    i = 0
    offset = 0 
    seed(1)
    frame_list = []
    while i < list_len:
        fID = framenet_IDs[(base_idx+offset)%nof_frames]
        f = fn.frame(fID)
        frame_list.append(fn.frame(fID))
        fNAME = f.name
        #print('\tID: {a:4d}\tframe: {framename}'.format(a=fID, framename=fNAME))
        offset = randint(0, nof_frames)
        i += 1
    return frame_list

#Prende in input un frame di FrameNet e restituisce il suo contesto formato 
#da tutte le definizioni dei suoi frame element, lexical unit e del frame stesso
def context_for_frame(frame):
    context_frame = set()
    context_frame.update(pre_processing(frame.definition))
    
    FEs = frame.FE.keys()
    for fe in FEs:
        fed = frame.FE[fe]
        context_frame.update(pre_processing(fed.definition))
    
    LUs = frame.lexUnit.keys()
    for lu in LUs:
        lud = frame.lexUnit[lu]
        context_frame.update(pre_processing(lud.definition))
    
    return context_frame

#Restituisce il contesto per un frame component (frame element o lexical unit)
#che è praticamente formato dalla sua definizione, lemmatizzata,
#da cui sono state rimosse le stop words e punteggiatura
def context_for_frame_component(frame_component):
    context_frame_component = set()
    context_frame_component.update(pre_processing(frame_component.definition))
    
    return context_frame_component

#Prende in input un senso di WordNet e restituisce il suo contesto
#formato da tutte le definizioni ed esempi dei suoi iperonimi, iponimi
#e definizione ed esempi del senso stesso
def context_for_sense(sense):
    context_sense = set()

    context_sense.update(pre_processing(sense.definition()))
    for example in sense.examples():
        context_sense.update(pre_processing(example))
    
    for hypernym in sense.hypernyms():
        context_sense.update(pre_processing(hypernym.definition()))
        for example in hypernym.examples():
            context_sense.update(pre_processing(example))
            
    for hyponym in sense.hyponyms():
        context_sense.update(pre_processing(hyponym.definition()))
        for example in hyponym.examples():
            context_sense.update(pre_processing(example))
    return context_sense

#Riceve in input una frase semplice(costituita da circa due parole separate da _)
#e restituisce in output il reggente della frase
def get_regent(sentence):
    if sentence == 'Sleep':
        return 'Sleep'
    elif sentence == 'Be_on_alert':
        return 'alert'
    elif sentence == 'Rising_to_a_challenge':
        return 'challenge'
    elif sentence == 'Use_vehicle':
        return 'vehicle'
    elif sentence == 'Deciding':
        return 'Deciding'

#Le unità lessicali ricavate da FrameNet per un determinato frame sono nella forma <ul>.PoS (esempio: before.prep)
#pertanto, per poter individuare il senso o i sensi corrispondenti all'unità lessicale, è importante rimuovere
#il punto e il PoS che segue
def remove_pos_lu(lexical_unit_name):
    new_lexical_unit_name = lexical_unit_name.split(".")[0]
    #Remove [...] from lexical_unit_name
    return new_lexical_unit_name.split(" [")[0]

ANNOTAZIONE DEI 5 FRAME ESTRATTI IN BASE AL COGNOME

In [11]:
SURNAME = 'Parisi'

"""
student: Parisi
	ID:  264	frame: Sleep
	ID: 2723	frame: Be_on_alert
	ID: 1441	frame: Rising_to_a_challenge
	ID: 1690	frame: Use_vehicle
	ID:  363	frame: Deciding
"""

def get_synsets_frames_list_annotations():
    
    synsets_frames_list_annotations = []
    
    #FRAME 1
    frame_id_1 = 264
    frame_name_1 = 'Sleep'
    
    #Frame_name
    frame_synset_1 = wn.synset('sleep.v.01')
    
    #FEs
    frame_elements_synsets_1 = dict()
    frame_elements_synsets_1['Sleeper'] = wn.synset('sleeper.n.01')
    frame_elements_synsets_1['Degree'] = wn.synset('degree.n.05')
    frame_elements_synsets_1['Duration'] = wn.synset('duration.n.03')
    frame_elements_synsets_1['Manner'] = wn.synset('manner.n.01')

    #LUs
    lexical_units_synsets_1 = dict()
    lexical_units_synsets_1['asleep.n'] = wn.synset('sleep.n.01')
    lexical_units_synsets_1['slumber.v'] = wn.synset('sleep.v.01')
    lexical_units_synsets_1['snooze.n'] = wn.synset('nap.n.04')
    lexical_units_synsets_1['snooze.v'] = wn.synset('catnap.v.01')
    lexical_units_synsets_1['catnap.n'] = wn.synset('catnap.n.01')
    lexical_units_synsets_1['doze.n'] = wn.synset('doze.n.01')

    synsets_frame_1 = SynsetsFrame(frame_id_1, frame_name_1, frame_synset_1, frame_elements_synsets_1, lexical_units_synsets_1)
    
    synsets_frames_list_annotations.append(synsets_frame_1)
    
    #FRAME 2
    frame_id_2 = 2723
    frame_name_2 = "Be_on_alert"
    
    #Frame_name
    frame_synset_2 = wn.synset('alert.n.01')
    
    #FEs
    frame_elements_synsets_2 = dict()
    frame_elements_synsets_2['Activity'] = wn.synset('activity.n.01')
    frame_elements_synsets_2['Danger'] = wn.synset('danger.n.01')
    frame_elements_synsets_2['Protagonist'] = wn.synset('protagonist.n.01')
    frame_elements_synsets_2['Time'] = wn.synset('time.n.06')
    frame_elements_synsets_2['Degree'] = wn.synset('degree.n.05')
    frame_elements_synsets_2['Duration'] = wn.synset('duration.n.03')
    
    #LUs
    lexical_units_synsets_2 = dict()
    lexical_units_synsets_2['alert.n'] = wn.synset('sleep.n.01')
    lexical_units_synsets_2['guard.n'] = wn.synset('sleep.n.01')
    
    synsets_frame_2 = SynsetsFrame(frame_id_2, frame_name_2, frame_synset_2, frame_elements_synsets_2, lexical_units_synsets_2)
    
    synsets_frames_list_annotations.append(synsets_frame_2)
    
    #FRAME 3
    frame_id_3 = 1441
    frame_name_3 = "Rising_to_a_challenge"
    
    #Frame_name
    frame_synset_3 = wn.synset('challenge.n.01')
    
    #FEs
    frame_elements_synsets_3 = dict()
    frame_elements_synsets_3['Activity'] = wn.synset('activity.n.01')
    frame_elements_synsets_3['Protagonist'] = wn.synset('protagonist.n.01')
    frame_elements_synsets_3['Degree'] = wn.synset('degree.n.02')
    frame_elements_synsets_3['Explanation'] = wn.synset('explanation.n.01')
    frame_elements_synsets_3['Place'] = wn.synset('place.n.06')
    frame_elements_synsets_3['Circumstances'] = wn.synset('circumstances.n.01')
    
    #LUs
    lexical_units_synsets_3 = dict()
    lexical_units_synsets_3['rise to the occasion.v'] = wn.synset('rise.v.06')
    lexical_units_synsets_3['rise.v'] = wn.synset('rise.v.06')

    
    synsets_frame_3 = SynsetsFrame(frame_id_3, frame_name_3, frame_synset_3, frame_elements_synsets_3, lexical_units_synsets_3)
    
    synsets_frames_list_annotations.append(synsets_frame_3)
    
    #FRAME 4
    frame_id_4 = 1690
    frame_name_4 = 'Use_vehicle'
    
    #Frame_name
    frame_synset_4 = wn.synset('vehicle.n.01')
    
    #FEs
    frame_elements_synsets_4 = dict()
    frame_elements_synsets_4['Area'] = wn.synset('area.n.01')
    frame_elements_synsets_4['Driver'] = wn.synset('driver.n.01')
    frame_elements_synsets_4['Goal'] = wn.synset('goal.n.01')
    frame_elements_synsets_4['Path'] = wn.synset('path.n.01')
    frame_elements_synsets_4['Source'] = wn.synset('source.n.01')
    frame_elements_synsets_4['Theme'] = wn.synset('theme.n.01')
    frame_elements_synsets_4['Vehicle'] = wn.synset('vehicle.n.01')
    frame_elements_synsets_4['Road'] = wn.synset('road.n.01')

    #LUs
    lexical_units_synsets_4 = dict()
    
    synsets_frame_4 = SynsetsFrame(frame_id_4, frame_name_4, frame_synset_4, frame_elements_synsets_4, lexical_units_synsets_4)
    
    synsets_frames_list_annotations.append(synsets_frame_4)
    
    #FRAME 5
    frame_id_5 = 363
    frame_name_5 = 'Deciding'
    
    #Frame_name
    frame_synset_5 = wn.synset('decide.v.01')
    
    #FEs
    frame_elements_synsets_5 = dict()
    frame_elements_synsets_5['Cognizer'] = wn.synset('decision.n.01')
    frame_elements_synsets_5['Decision'] = wn.synset('decision.n.01')
    frame_elements_synsets_5['Circumstance'] = wn.synset('circumstances.n.01')
    frame_elements_synsets_5['Explanation'] = wn.synset('explanation.n.01')
    frame_elements_synsets_5['Inherent_purpose'] = wn.synset('purpose.n.03')
    frame_elements_synsets_5['Manner'] = wn.synset('manner.n.02')
    frame_elements_synsets_5['Place'] = wn.synset('place.n.01')

    #LUs
    lexical_units_synsets_5 = dict()
    lexical_units_synsets_5['decide.v'] = wn.synset('decide.v.01')
    lexical_units_synsets_5['decision.n'] = wn.synset('decision.n.01')
    lexical_units_synsets_5['determine.v'] = wn.synset('determine.v.01')

    synsets_frame_5 = SynsetsFrame(frame_id_5, frame_name_5, frame_synset_5, frame_elements_synsets_5, lexical_units_synsets_5)
    
    synsets_frames_list_annotations.append(synsets_frame_5)
    
    return synsets_frames_list_annotations

In [12]:
#Restituisce un oggetto SynsetsFrame della lista synsets_frames_list_annotations
#con lo stesso frame_id dell'oggetto SynsetsFrame synsets_frame
def get_synsets_frame_annotations(synsets_frame,synsets_frames_list_annotations):
    for synsets_frame_annotations in synsets_frames_list_annotations:
        if synsets_frame_annotations.get_frame_id() == synsets_frame.get_frame_id():
            return synsets_frame_annotations

def total_accuracy(synsets_frames_list, synsets_frames_list_annotations):
    evaluated = 0
    checked = 0
    for synsets_frame in synsets_frames_list:
        #prendo l'oggetto SynsetsFrame corrispondente
        synsets_frame_annotations = get_synsets_frame_annotations(synsets_frame, synsets_frames_list_annotations)
        
        #check frame
        evaluated = evaluated + 1
        if synsets_frame_annotations.get_frame_synset() == synsets_frame.get_frame_synset():
            checked = checked + 1
        
        #check frame elements
        frame_elements_synsets_annotations = synsets_frame_annotations.get_frame_elements_synsets()
        frame_elements_synsets = synsets_frame.get_frame_elements_synsets()
        les_keys = frame_elements_synsets.keys()
        for key in les_keys:
            evaluated = evaluated + 1
            if frame_elements_synsets[key] == frame_elements_synsets_annotations[key]:
                checked = checked + 1
        
        #check lexical units
        lexical_units_synsets_annotations = synsets_frame_annotations.get_lexical_units_synsets()
        lexical_units_synsets = synsets_frame.get_lexical_units_synsets()
        lus_keys = lexical_units_synsets.keys()
        for key in lus_keys:
            evaluated = evaluated + 1
            if lexical_units_synsets[key] == lexical_units_synsets_annotations[key]:
                checked = checked + 1
                
    print("\nAccuratezza: ",format((checked/evaluated)*100,'.2f'),"%")

FUNZIONI PER LA VALUTAZIONE DEI RISULTATI

In [13]:
#Restituisce una lista di ContextsFrame
def get_contexts_frames_list(frames):
    contexts_frames_list = []
    for frame in frames:

        #dati riguardanti i frame elements e le lexical units del frame
        frame_elements_contexts = dict()
        lexical_units_contexts = dict()
        
        FEs = frame.FE.keys()
        for fe in FEs:
            fed = frame.FE[fe]
            frame_elements_contexts[fed.name] = context_for_frame_component(fed)
            
        LUs = frame.lexUnit.keys()
        for lu in LUs:
            lud = frame.lexUnit[lu]
            lexical_units_contexts[lud.name] = context_for_frame_component(lud)
        
        contextsFrame = ContextsFrame(frame.ID, frame.name, context_for_frame(frame), frame_elements_contexts, lexical_units_contexts)
        contexts_frames_list.append(contextsFrame) 
    return contexts_frames_list

#Restituisce un senso di WordNet per il wordnet_name(frame name, frame element name, lexical unit name) che massimizza lo score
def compute_score(wordnet_name, frameNet_context):
    synsets = wn.synsets(wordnet_name)
    if synsets == []: #se non ci sono synset disponibili
        return None
    #prende il synset con lo score più alto
    max_score = 0
    best_synset = synsets[0]
    for synset in synsets:
        synset_context = context_for_sense(synset)
        score = get_score(frameNet_context,synset_context)
        if score > max_score:
            max_score = score
            best_synset = synset
    return best_synset

def get_score(context1,context2):
    return len(context1.intersection(context2)) + 1
    
#Restituisce i risultati per quanto riguarda le associazioni di wordnet ai sensi di FrameNet
#quindi restituisce una lista di oggetti SynsetsFrame
#prende in input la lista di oggetti ContextsFrame che contiene i contesti dei
#frame elements, lexical untis, frame name associati ad un frame
def get_synsets_frames_list(contexts_frame_list):
    synsets_frames_list = []
    for contextsFrame in contexts_frame_list:
        frame_id = contextsFrame.get_frame_id()
        frame_name = contextsFrame.get_frame_name()
        frame_synset = compute_score(get_regent(frame_name),contextsFrame.get_frame_context())
        
        frame_elements_synsets = dict()
        lexical_units_synsets = dict()
        
        frame_elements_contexts = contextsFrame.get_frame_elements_contexts()
        for frame_element_name in frame_elements_contexts:
            #wordnet_name = get_regent(frame_element_name)
            wordnet_name = frame_element_name
            score = compute_score(wordnet_name, frame_elements_contexts[frame_element_name])
            if not score is None:
               frame_elements_synsets[frame_element_name] = score
        
        lexical_units_contexts = contextsFrame.get_lexical_units_contexts()
        for lexical_unit_name in lexical_units_contexts:
            wordnet_name = remove_pos_lu(lexical_unit_name)
            score = compute_score(wordnet_name, lexical_units_contexts[lexical_unit_name])
            if not score is None:
                lexical_units_synsets[lexical_unit_name] = score
    
        synsetsFrame = SynsetsFrame(frame_id, frame_name, frame_synset, frame_elements_synsets, lexical_units_synsets)
        synsets_frames_list.append(synsetsFrame)
    return synsets_frames_list

In [14]:
def main():
    
    #Lista di oggetti di tipo ContextsFrame(vedere ContextsFrame)
    contexts_frames_list = get_contexts_frames_list(getFrameSetForStudent(SURNAME))

    #Lista di oggetti di tipo SynsetsFrame (vedere Synsetsframe)
    synsets_frames_list = get_synsets_frames_list(contexts_frames_list)
    
    print("\nRISULTATI SISTEMA:\n ")
    for item in synsets_frames_list:
        item.printSynsetsFrame()
    
    #Lista di oggetti di tipo SynsetsFrame (ma riguarda le annotazioni umane)
    synsets_frames_list_annotations = get_synsets_frames_list_annotations()
    
    print("\n\nANNOTAZIONI UMANE:\n ")
    for item in synsets_frames_list_annotations:
        item.printSynsetsFrame()
    
    #testing
    total_accuracy(synsets_frames_list, synsets_frames_list_annotations)
main()

COGNOME STUDENTE: Parisi


RISULTATI SISTEMA:
 
FRAME ID:  264
FRAME NAME:  Sleep
FRAME SYNSET:  Synset('sleep.n.01')
FRAME ELEMENTS SYNSETS:  {'Sleeper': Synset('sleeper.n.01'), 'Duration': Synset('duration.n.01'), 'Time': Synset('time.n.01'), 'Place': Synset('topographic_point.n.01'), 'Degree': Synset('degree.n.01'), 'Manner': Synset('manner.n.01')}
LEXICAL UNITS SYNSETS:  {'nap.v': Synset('nap.v.01'), 'doze.v': Synset('snooze.v.01'), 'snooze.v': Synset('nap.n.04'), 'catnap.v': Synset('nap.n.04'), 'slumber.v': Synset('sleep.n.01'), 'hibernate.v': Synset('hibernate.v.01'), 'kip.v': Synset('kip.n.01'), 'drowse.v': Synset('drowse.v.02'), 'sleep.v': Synset('sleep.n.01'), 'doze.n': Synset('doze.n.01'), 'catnap.n': Synset('nap.n.04'), 'drowse.n': Synset('drowse.v.02'), 'hibernation.n': Synset('hibernation.n.01'), 'kip.n': Synset('kip.n.01'), 'nap.n': Synset('nap.n.04'), 'sleep [event].n': Synset('sleep.n.01'), 'slumber.n': Synset('sleep.n.01'), 'snooze.n': Synset('nap.n.04'), 'asleep.a': S

KeyError: 'Time'