#### Authors: 
- Vali Florinel Craciun
- Ilaria Salvatori

In [2]:
# how to run the code:
# we first defined the estimate...() functions 
# each estimate...() function will return the internal state representation, to be then used as inputs for non estimate functions




def import_corpus(path_to_file): 
    sentences = []
    sentence = []
    f = open(path_to_file)
    
    while True:
        line = f.readline()
        if not line: break  #Checks if the variable line is empty, which would indicate the end of the file. 
        #If the condition is true, it breaks out of the loop using the break statement.
            
        line = line.strip()
        line = line.lower() # i added this 
        if len(line) == 0:
            sentences.append(sentence)    
            sentence = []
            continue
                
        parts = line.split(' ')
        sentence.append((parts[0], parts[-1]))
        
    f.close()        
    return sentences 


In [14]:
#some useful functions

def get_one_words(corpus): # to get th list wof words with frequency=1
    freqs={}
    one_words=set()
    for sentence in corpus:
        for word in sentence:
            if word[0] in freqs:
                freqs[word[0]]+=1
            else:
                freqs[word[0]]=1

    for k,v in freqs.items():
        if v==1:
            one_words.add(k)


    return one_words

def get_states(corpus): # to get the list of distinct states
    states=[]
    
    for sentence in corpus:
        for word in sentence:
            if word[1] not in states:
                states.append(word[1])
    
    return states 


def get_tokens(corpus): # to get the list of distinct tokens (i.e. words)

    tokens=set()
    for sentence in corpus:
        for word in sentence:
            tokens.add(word[0])
            
    return tokens 

def change_corpus(corpus,one_words): # to add the unknown words
    for sentence in corpus:
        for i,word in enumerate(sentence):
            if word[0] in one_words :
                sentence[i] = ('unknown', word[1])

    return corpus




In [13]:
# Exercise 1 ###################################################################
'''
Implement the probability distribution of the initial states.
Parameters:	state: string
            internal_representation: data structure representing the parameterization of this probability distribuion;
                this data structure is returned by the function estimate_initial_state_probabilities
Returns: float; initial probability of the given state
'''
def initial_state_probabilities(state, internal_representation):
    return internal_representation[state]
    
    
    
    
'''
Implement the matrix of transition probabilities.
Parameters:	from_state: string;
            to_state: string;
            internal_representation: data structure representing the parameterization of the matrix of transition probabilities;
                this data structure is returned by the function estimate_transition_probabilities
Returns: float; probability of transition from_state -> to_state
'''
def transition_probabilities(from_state, to_state, internal_representation):
    return internal_representation[f'{from_state} , {to_state}']
    
    
    
    
'''
Implement the matrix of emmision probabilities.
Parameters:	state: string;
            emission_symbol: string;
            internal_representation: data structure representing the parameterization of the matrix of emission probabilities;
                this data structure is returned by the function estimate_emission_probabilities
Returns: float; emission probability of the symbol emission_symbol if the current state is state
'''
def emission_probabilities(state, emission_symbol, internal_representation):
    return internal_representation[f'{state} , {emission_symbol}']
    
    
    
    
'''
Implement a function for estimating the parameters of the probability distribution of the initial states.
Parameters: corpus: list returned by the function import_corpus
Returns: data structure containing the parameters of the probability distribution of the initial states;
            use this data structure for the argument internal_representation of the function initial_state_probabilities
'''
def estimate_initial_state_probabilities(states,corpus):

    e=1e-5
    probs= {}

    for state in states:
        prob = {state:e} 
        count=0 # counts every start 

        for sentence in corpus:
            count+=1 # number of sentences 
            if sentence[0][1]==state: # check for the first state in each sentence
                prob[state]+=1
    
        
        prob[state] /= (count + e*len(states)) # now becomes a prob. 

        probs.update(prob) 


    return probs

    
    
'''
Implement a function for estimating the parameters of the matrix of transition probabilities
Parameters: corpus: list returned by the function import_corpus
Returns: data structure containing the parameters of the matrix of transition probabilities;
            use this data structure for the argument internal_representation of the function transition_probabilities
'''
def estimate_transition_probabilities(states,corpus):
    e=1e-5
    probs={}
    for state1 in states:
        for state2 in states:
            from_freq=0 # counts how many times the from_state has a next state 
            joint_freq=0 # counts how many times the from_state has as to_state as a next state 

            for sentence in corpus:
                for i,word in enumerate(sentence[:-1]): # we ignore the last word/tag since doesn't have a next state 
                    if word[1]==state1:
                        from_freq+=1
                        if sentence[i+1][1]==state2:
                            joint_freq+=1

    
            prob={f'{state1} , {state2}':(joint_freq+e)/from_freq+e} # laplace smoothing

            probs.update(prob)

    return probs
 
    
    
    
    
'''
Implement a function for estimating the parameters of the matrix of emission probabilities
Parameters: corpus: list returned by the function import_corpus
Returns: data structure containing the parameters of the matrix of emission probabilities;
            use this data structure for the argument internal_representation of the function emission_probabilities
'''
def estimate_emission_probabilities(states,tokens,corpus):
    e = 1e-5
    
    probs={}

    for state in states:
        for token in tokens:
            state_count=0 # counts occurence of the state
            st_sy_count=0 # joint frequency of state and symbol 

            for sentence in corpus:
                for word in sentence:
                    if word[1]==state:
                        state_count+=1
                        if word[0]==token:
                            st_sy_count+=1


            prob={f'{state} , {token}':(st_sy_count+e)/(state_count+e)}

            probs.update(prob)
            
    
    return probs
    
    
    


In [5]:
corpus=import_corpus('corpus_ner.txt')

In [6]:
one_words=get_one_words(corpus)

In [7]:
corpus=change_corpus(corpus,one_words)

In [8]:
states=get_states(corpus)
tokens=get_tokens(corpus)

In [9]:
es_init_prob=estimate_initial_state_probabilities(states, corpus)

In [10]:
es_init_prob

{'o': 0.6713460772098555,
 'b-loc': 0.07984866587297254,
 'b-per': 0.11210673038621105,
 'b-misc': 0.051075268822583855,
 'i-misc': 9.956192750999544e-12,
 'i-per': 9.956192750999544e-12,
 'b-org': 0.08562325766855228,
 'i-loc': 9.956192750999544e-12,
 'i-org': 9.956192750999544e-12}

In [54]:
es_trans_prob=estimate_transition_probabilities(states, corpus)

In [55]:
es_trans_prob 

{'o , o': 0.9041882372661988,
 'o , b-loc': 0.02828099025380989,
 'o , b-per': 0.026011502259072467,
 'o , b-misc': 0.020518683489200745,
 'o , i-misc': 1.0000054818550597e-07,
 'o , i-per': 1.0000054818550597e-07,
 'o , b-org': 0.02100108673445894,
 'o , i-loc': 1.0000054818550597e-07,
 'o , i-org': 1.0000054818550597e-07,
 'b-loc , o': 0.8287958240855186,
 'b-loc , b-loc': 0.0006682142642391849,
 'b-loc , b-per': 1.000167028561884e-07,
 'b-loc , b-misc': 0.004342842625688993,
 'b-loc , i-misc': 1.000167028561884e-07,
 'b-loc , i-per': 1.000167028561884e-07,
 'b-loc , b-org': 0.0025055284449640885,
 'b-loc , i-loc': 0.16368809066310339,
 'b-loc , i-org': 1.000167028561884e-07,
 'b-per , o': 0.4495475354844116,
 'b-per , b-loc': 0.0006705659906134764,
 'b-per , b-per': 1.0001676164934629e-07,
 'b-per , b-misc': 1.0001676164934629e-07,
 'b-per , i-misc': 1.0001676164934629e-07,
 'b-per , i-per': 0.5497821985752597,
 'b-per , b-org': 1.0001676164934629e-07,
 'b-per , i-loc': 1.0001676164

In [None]:
es_emiss_prob=estimate_emission_probabilities(states,tokens, corpus)

In [None]:
es_emiss_prob

{'o , ltte': 5.196046844858488e-10,
 'o , raid': 2.0784706984118434e-05,
 'o , shaken': 3.117680067383541e-05,
 'o , kilos': 1.5588660139259947e-05,
 'o , maynard': 5.196046844858488e-10,
 'o , joins': 1.5588660139259947e-05,
 'o , ours': 1.0392613294401461e-05,
 'o , heads': 6.754912858784481e-05,
 'o , 15-30': 1.0392613294401461e-05,
 'o , joaquin': 5.196046844858488e-10,
 'b-loc , ltte': 1.6702855909422817e-08,
 'b-loc , raid': 1.6702855909422817e-08,
 'b-loc , shaken': 1.6702855909422817e-08,
 'b-loc , kilos': 1.6702855909422817e-08,
 'b-loc , maynard': 1.6702855909422817e-08,
 'b-loc , joins': 1.6702855909422817e-08,
 'b-loc , ours': 1.6702855909422817e-08,
 'b-loc , heads': 1.6702855909422817e-08,
 'b-loc , 15-30': 1.6702855909422817e-08,
 'b-loc , joaquin': 1.6702855909422817e-08,
 'b-per , ltte': 1.676164906534279e-08,
 'b-per , raid': 1.676164906534279e-08,
 'b-per , shaken': 1.676164906534279e-08,
 'b-per , kilos': 1.676164906534279e-08,
 'b-per , maynard': 1.676164906534279e

In [None]:
from math import log

In [None]:
  
# Exercise 2 ###################################################################
''''
Implement the Viterbi algorithm for computing the most likely state sequence given a sequence of observed symbols.
Parameters: observed_smbols: list of strings; the sequence of observed symbols
            initial_state_probabilities_parameters: data structure containing the parameters of the probability distribution of the initial states, returned by estimate_initial_state_probabilities
            transition_probabilities_parameters: data structure containing the parameters of the matrix of transition probabilities, returned by estimate_transition_probabilities
            emission_probabilities_parameters: data structure containing the parameters of the matrix of emission probabilities, returned by estimate_emission_probabilities
Returns: list of strings; the most likely state sequence
'''
def most_likely_state_sequence(states,tokens,observed_symbols, initial_state_probabilities_parameters, transition_probabilities_parameters, emission_probabilities_parameters):
    
    for i,word in enumerate(observed_symbols): # check which word of the test sentence is not in corpus  
        if word not in tokens:
            observed_symbols[i]='unknown' 

    
    
    column1={}
    max_states=[] 
    word=observed_symbols[0] # first word is treated separately since we don't have previous words and states to come from 

    for state in states:
        delta = (log(initial_state_probabilities(state, initial_state_probabilities_parameters)) 
        + log(emission_probabilities(state, word, emission_probabilities_parameters )))

        column1[state] = delta
    
    max_states.append(max(column1, key=column1.get)) # append the max state of the first column 

    for word in observed_symbols[1:]: 
        column2={}
        for state_i in states:
            prov_delta=[] # this is the current cell. This list will contain as many deltas as the number of distinct states
            for state_j in states:
                delta_ji = (column1[state_j]  +  
                        log(transition_probabilities(state_j,state_i, transition_probabilities_parameters)) + 
                        log(emission_probabilities(state_i, word, emission_probabilities_parameters)))
                
                prov_delta.append(delta_ji) 
            
            column2[state_i]= max(prov_delta) # max delta for the current cell
        
        max_states.append(max(column2, key=column2.get)) 

        column1=column2 # old column becomes the current column. The current column will become a new one in the next iteration

    
    return max_states

                






In [None]:
test=['hi', 'john', 'i', 'live', 'in', 'texas']

most_likely_state_sequence(states, tokens, test, es_init_prob, es_trans_prob, es_emiss_prob)
                           


['o', 'b-per', 'o', 'o', 'o', 'b-loc']