In [41]:
import time
import enum
import math
import sys
import numpy as np
import pandas as pd
from io import open
from collections import Counter
from conllu import parse_incr
from IPython.display import display

1. FASE DI MODELLING: Viene fornito un modello formale del problema affrontato (già noto)

2. FASE DI LEARNING: Si cerca di capire come, dato un corpus, sia possibile settare i parametri in grado di generare il modello in grado di apprendere da un corpus.

   DEFINIZIONE DELLA MATRICE DI TRANSIZIONE E DELLA MATRICE DI EMISSIONE

In [42]:
def compute_transition_matrix(possible_tags, train):
    transition_matrix = np.zeros((len(possible_tags), len(possible_tags)), dtype='float32')
    
    transition_counter_dict = dict()
    counter_dict = dict()
    count_initial_dict = dict()
    count_final_dict = dict()

    #FASE 1
    for tag1 in possible_tags:
        counter_dict[tag1] = 0
        count_initial_dict[tag1] = 0
        for tag2 in possible_tags:
            transition_counter_dict[(tag1, tag2)] = 0

    #FASE 2
    sentence_n = 0
    for sentence in parse_incr(train):
        sentence_n += 1
        for i in range(len(sentence)):
            word_before = sentence[i-1]
            word = sentence[i]
            if i == 0:
                if word["upos"] in count_initial_dict.keys():
                    count_initial_dict[word["upos"]] = count_initial_dict[word["upos"]] + 1
            if (word_before["upos"], word["upos"]) in transition_counter_dict.keys() and i != 0:
                transition_counter_dict[(word_before["upos"], word["upos"])] = transition_counter_dict[(word_before["upos"], word["upos"])] + 1
            if word["upos"] in counter_dict.keys():
                counter_dict[word["upos"]] = counter_dict[word["upos"]] + 1
            if i == len(sentence) - 1:
                if (word["upos"], 'END') in transition_counter_dict.keys():
                    transition_counter_dict[(word["upos"], 'END')] = transition_counter_dict[(word_before["upos"], word["upos"])] + 1
    
    #FASE 3: Calcolo delle probabilità di emissione delle parole iniziali, intermedie e finale per ottenere delle performance migliori
    #probabilità di transizione iniziali
    for i,t in enumerate(possible_tags):
        transition_matrix[0][i] = count_initial_dict[t]/sentence_n
    #probabilità di transizione intermedie
    for i,t1 in enumerate(possible_tags):
        for j,t2 in enumerate(possible_tags):
            if i >= 1 and j >= 1 and i < (len(possible_tags) - 1):
                transition_matrix[i][j] =  transition_counter_dict[(t1,t2)]/counter_dict[t1]
    #probabilità di transizione finali
    #for i,t in enumerate(possible_tags):
    #    transition_matrix[i][len(possible_tags) - 1] = count_final_dict[t]/sentence_n
    train.seek(0)
    return transition_matrix

#una_tantum -> serializzare
def compute_emission_probabilities(train):
    word_tag_set = []
    tags_set = []
    words_set = []
    for sentence in parse_incr(train):
        for token in sentence:
            word_tag_set.append((token["form"],token["upos"]))
            tags_set.append(token["upos"])
            words_set.append(token["form"])
            
    count_word_tag = dict(Counter(word_tag_set))
    count_tags = dict(Counter(tags_set))
    count_word = dict(Counter(words_set))
    
    emission_dict = dict()
    for key in count_word_tag:
        emission_dict[(key[0],key[1])] = count_word_tag[key]/count_tags[key[1]]
    return emission_dict,count_word,count_word_tag

#una_tantum -> serializzare
def compute_oneshot_words_distributions(possible_tags, dev):
    word_tag_set = []
    word_set = []
    for sentence in parse_incr(dev):
        for token in sentence:
            word_tag_set.append((token["form"],token["upos"]))
            word_set.append(token["form"])
    word_tag = dict(word_tag_set)
    count_word = dict(Counter(word_set))
    one_shot_words_tag = []
    for word in [k for k,v in count_word.items() if float(v) == 1]:
        one_shot_words_tag.append((word,word_tag[word]))
    
    tags = []
    total_tags = 0
    for word,tag in one_shot_words_tag:
        tags.append(tag)
        total_tags = total_tags + 1
    distributions = []
    for key,count in dict(Counter(tags)).items():
        distributions.append((key,count/total_tags))
    for tag in possible_tags:
        if tag not in tags:
            distributions.append((tag,0))
    return distributions

3. FASE DI DECODING: Trovare l'algoritmo che permette di sfruttare al meglio i parametri appresi durante la fase di learning, per poter recuperare la soluzione ottimale dato un certo input.
   
   ALGORITMO DI VITERBI

In [43]:
def viterbi_algorithm(sentence_tokens, possible_tags, transition_matrix, emission_probabilities, count_word, smoothing_strategy, oneshot_words_tag_distribution):
    
    viterbi_matrix = np.zeros((len(possible_tags), len(sentence_tokens))) #matrice di viterbi
    backpointer = dict() #dizionario di dizionari
    
    #FASE 1: inizializzazione della prima colonna
    for s,tag in enumerate(possible_tags):
        transition_p = transition_matrix.loc['START',tag]
        emission_p = get_emission_p(emission_probabilities, sentence_tokens[0], tag, count_word, smoothing_strategy, oneshot_words_tag_distribution, possible_tags)
        
        if transition_p == 0 : transition_p = np.finfo(float).tiny
        if emission_p == 0 : emission_p = np.finfo(float).tiny
        
        viterbi_matrix[s,0] = math.log(transition_p) +  math.log(emission_p) 
        
    #FASE 2: Inizializzazione delle colonne intermedie
    #Si cicla prima sulle colonne e poi sulle righe 
    for t in range(1,len(sentence_tokens)):
        backpointer_column = dict()
        for s, tag in enumerate(possible_tags):
            max_ , backpointer_column[s] = get_max_argmax_value(possible_tags, viterbi_matrix, transition_matrix, t, s)
            emission_p = get_emission_p(emission_probabilities, sentence_tokens[t], tag, count_word, smoothing_strategy, oneshot_words_tag_distribution, possible_tags)
            if emission_p == 0: emission_p = np.finfo(float).tiny
            viterbi_matrix[s,t] = max_ + math.log(emission_p) 
        backpointer[t] = backpointer_column   
    
    #FASE 2: step finale (argmax)
    max_ = -sys.maxsize
    best_path_pointer = None
    for s,tag in enumerate(possible_tags):
        end_transition = transition_matrix.loc[tag,'END']
        if end_transition == 0: end_transition = np.finfo(float).tiny
        val = viterbi_matrix[s,len(sentence_tokens) - 1] + math.log(end_transition)
        if val >= max_: max_ = val ; best_path_pointer = s
    
    #FASE 3: backtracking
    #Recupero tramite backtracking della sequenza di PoS
    states = []
    states.append(best_path_pointer)
    t = len(sentence_tokens) - 1
    s = best_path_pointer
    while t >= 1:
        states.append(backpointer[t].get(s))
        s = backpointer[t].get(s)
        t = t -1
    
    #FASE 4: reverse PoS_Tag sequence
    pos_tags_sequence = []
    for state in list(reversed(states)): pos_tags_sequence.append(possible_tags[state])
    return pos_tags_sequence

FUNZIONI DI SUPPORTO

In [44]:

def get_max_argmax_value(possible_tags, viterbi_matrix, transition_matrix, t, s):
    max_ = -sys.maxsize
    argmax = None
    for s1, tag in enumerate(possible_tags):
        transition_p = transition_matrix.loc[tag,possible_tags[s]]
        if transition_p == 0 : transition_p = np.finfo(float).tiny
        val = viterbi_matrix[s1, t-1] + math.log(transition_p)
        if val >= max_: max_ = val; argmax = s1
    return max_, argmax

def get_emission_p(emission_probabilities, word, tag, count_word, smoothing_strategy, oneshot_words_tag_distribution, possible_tags):
    emission_p = 0
    try:
        count_word[word]
    except KeyError: #unknown_word
        emission_p = unknown_word_emission_p(smoothing_strategy, tag, possible_tags, oneshot_words_tag_distribution)         
        return emission_p
    try:
        emission_p = emission_probabilities[(word,tag)]
    except KeyError: #tag never emitted word
        emission_p = 0
    return emission_p

FUNZIONI DI SMOOTHING

In [45]:

def unknown_word_emission_p(smoothing_strategy,tag,possible_tags,oneshot_words_tag_distribution):
    emission_p = 0
    if smoothing_strategy.name == 'UNKNOWN_NAME':
        if tag == 'NOUN': 
            emission_p = 1
    if smoothing_strategy.name == 'UNKNOWN_NAME_VERB':
        if tag == 'NOUN' or tag == 'VERB': 
            emission_p = 0.5
    if smoothing_strategy.name == 'UNKNOWN_TAG': 
        emission_p = 1/len(possible_tags)
    if smoothing_strategy.name == 'UNKNOWN_DEV': 
        emission_p = get_prob(tag, oneshot_words_tag_distribution)
    return emission_p

def get_prob(tag,oneshot_words_tag_distribution):
    for tag_p,prob in oneshot_words_tag_distribution:
        if tag == tag_p:
            return prob

ALGORITMO DI VITERBI SUL GRECO

In [46]:
class Smoothing(enum.Enum):
    UNKNOWN_NAME = 1
    UNKNOWN_NAME_VERB = 2
    UNKNOWN_TAG = 3
    UNKNOWN_DEV = 4
    
class Language(enum.Enum):
    GREEK = 1
    LATIN = 2

start = ['START']

#scelta dello smoothing
#smoothing_strategy = Smoothing.UNKNOWN_NAME
#smoothing_strategy = Smoothing.UNKNOWN_NAME_VERB
#smoothing_strategy = Smoothing.UNKNOWN_TAG
smoothing_strategy = Smoothing.UNKNOWN_DEV

#scelta della lingua
language = Language.GREEK
language.name == 'GREEK'
train = open("Dataset/grc_perseus-ud-train.conllu", "r", encoding="utf-8")
dev = open("Dataset/grc_perseus-ud-dev.conllu","r", encoding="utf-8")
test = open("Dataset/grc_perseus-ud-test.conllu", "r", encoding="utf-8")
possible_tags = ['START','ADJ','ADP', 'ADV', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON','SCONJ', 'VERB', 'X', 'PUNCT','END']

#learning
transition_matrix = pd.DataFrame(compute_transition_matrix(possible_tags, train), columns = list(possible_tags), index=list(possible_tags))
emission_probabilities, count_words, count_words_tag = compute_emission_probabilities(train)
train.close()
oneshot_words_tag_distribution = compute_oneshot_words_distributions(possible_tags, dev)
dev.close()

#rimuovo stato iniziale e finale perchè non servono più
possible_tags.remove('START')
possible_tags.remove('END')

#testing di tutte le sentence del test set
#Calcolo l'accuracy e i tempi di esecuzione dell'algoritmo di PoS tagging.
checked_words = 0
tested_words_n = 0
error_list = []
start = time.time()
for sentence in parse_incr(test):
    pos_token_list = [token["upos"] for token in sentence]            
    tested_words_n = tested_words_n + len(pos_token_list)
    sentence_tokens = [token["form"] for token in sentence]
    result_tags = viterbi_algorithm(sentence_tokens, possible_tags, transition_matrix, emission_probabilities, count_words, smoothing_strategy, oneshot_words_tag_distribution)
    for j in range(len(pos_token_list)):
        if pos_token_list[j] == result_tags[j]: checked_words = checked_words + 1
        else: error_list.append(pos_token_list[j])    
end = time.time()

#statistiche
print("Algoritmo: VITERBI")
print("Lingua: ", language.name)
print("Tipologia di smoothing: ",smoothing_strategy.name)
print("PoS Tag corretti: ", checked_words)
print("PoS Tag sbagliati: ", tested_words_n - checked_words)
print("Totale parole valutate: ",tested_words_n)
print("Errori per PoS Tag: ", dict(Counter(error_list)))
print("Accuratezza: ", format((checked_words/tested_words_n)*100,'.2f'),"%")
print("Tempo di esecuzione: ", format(end - start,'.2f')," sec")

Algoritmo: VITERBI
Lingua:  GREEK
Tipologia di smoothing:  UNKNOWN_DEV
PoS Tag corretti:  15982
PoS Tag sbagliati:  4977
Totale parole valutate:  20959
Errori per PoS Tag:  {'ADV': 1633, 'PRON': 494, 'VERB': 380, 'NOUN': 1172, 'ADJ': 968, 'CCONJ': 131, 'DET': 123, 'SCONJ': 49, 'ADP': 18, 'PUNCT': 4, 'NUM': 1, 'INTJ': 3, 'X': 1}
Accuratezza:  76.25 %
Tempo di esecuzione:  30.59  sec


ALGORITMO DI VITERBI SUL LATINO

In [47]:
class Smoothing(enum.Enum):
    UNKNOWN_NAME = 1
    UNKNOWN_NAME_VERB = 2
    UNKNOWN_TAG = 3
    UNKNOWN_DEV = 4
    
class Language(enum.Enum):
    GREEK = 1
    LATIN = 2

start = ['START']

#scelta dello smoothing
#smoothing_strategy = Smoothing.UNKNOWN_NAME
#smoothing_strategy = Smoothing.UNKNOWN_NAME_VERB
#smoothing_strategy = Smoothing.UNKNOWN_TAG
smoothing_strategy = Smoothing.UNKNOWN_DEV

#scelta della lingua
language = Language.LATIN
language.name == 'LATIN'
train = open("Dataset/la_llct-ud-train.conllu", "r", encoding="utf-8")
dev = open("Dataset/la_llct-ud-dev.conllu", "r", encoding="utf-8")
test = open("Dataset/la_llct-ud-test.conllu", "r", encoding="utf-8")
possible_tags = ['START','ADJ','ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'NOUN', 'NUM', 'PART', 'PRON','PROPN','PUNCT', 'SCONJ', 'VERB','X','END']

#learning
transition_matrix = pd.DataFrame(compute_transition_matrix(possible_tags, train), columns = list(possible_tags), index=list(possible_tags))
emission_probabilities, count_words, count_words_tag = compute_emission_probabilities(train)
train.close()
oneshot_words_tag_distribution = compute_oneshot_words_distributions(possible_tags, dev)
dev.close()

#rimuovo stato iniziale e finale perchè non servono più
possible_tags.remove('START')
possible_tags.remove('END')

#testing di tutte le sentence del test set
#Calcolo l'accuracy e i tempi di esecuzione dell'algoritmo di PoS tagging.
checked_words = 0
tested_words_n = 0
error_list = []
start = time.time()
for sentence in parse_incr(test):
    pos_token_list = [token["upos"] for token in sentence]            
    tested_words_n = tested_words_n + len(pos_token_list)
    sentence_tokens = [token["form"] for token in sentence]
    result_tags = viterbi_algorithm(sentence_tokens, possible_tags, transition_matrix, emission_probabilities, count_words, smoothing_strategy, oneshot_words_tag_distribution)
    for j in range(len(pos_token_list)):
        if pos_token_list[j] == result_tags[j]: checked_words = checked_words + 1
        else: error_list.append(pos_token_list[j])    
end = time.time()

#statistics
print("Algoritmo: VITERBI")
print("Lingua: ", language.name)
print("Tipologia di smoothing: ",smoothing_strategy.name)
print("PoS Tag corretti: ", checked_words)
print("PoS Tag sbagliati: ", tested_words_n - checked_words)
print("Totale parole valutate: ",tested_words_n)
print("Errori per PoS Tag: ", dict(Counter(error_list)))
print("Accuratezza: ", format((checked_words/tested_words_n)*100,'.2f'),"%")
print("Tempo di esecuzione: ", format(end - start,'.2f')," sec")

Algoritmo: VITERBI
Lingua:  LATIN
Tipologia di smoothing:  UNKNOWN_DEV
PoS Tag corretti:  23409
PoS Tag sbagliati:  670
Totale parole valutate:  24079
Errori per PoS Tag:  {'VERB': 159, 'PROPN': 184, 'ADV': 49, 'NUM': 15, 'PRON': 22, 'DET': 25, 'ADJ': 82, 'NOUN': 66, 'AUX': 29, 'CCONJ': 21, 'PUNCT': 4, 'SCONJ': 12, 'ADP': 2}
Accuratezza:  97.22 %
Tempo di esecuzione:  40.92  sec


ALGORITMO DI BASELINE SUL GRECO

In [48]:
class Language(enum.Enum):
    GREEK = 1
    LATIN = 2

#scelta della lingua
language = Language.GREEK
language.name == 'GREEK'
train = open("Dataset/grc_perseus-ud-train.conllu", "r", encoding="utf-8")
test = open("Dataset/grc_perseus-ud-test.conllu", "r", encoding="utf-8")
possible_tags = ['ADJ','ADP', 'ADV', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON','SCONJ', 'VERB', 'X', 'PUNCT']

count_words_tag = compute_emission_probabilities(train)[2]

def baseline_algorithm(sentence_tokens,count_words_tag,possible_tags):
    tags = []
    for word in sentence_tokens:
        tag_max = 'NOUN'
        count_max_tag = 0
        for tag in possible_tags:
            if count_words_tag.get((word,tag),0) > count_max_tag:
                count_max_tag = count_words_tag[word,tag]
                tag_max = tag    
        tags.append(tag_max)
    return tags

#testing di tutte le sentence del test set
#Calcolo l'accuracy e i tempi di esecuzione dell'algoritmo di PoS tagging.
checked_words = 0
tested_words_n = 0
error_list = []
start = time.time()
for sentence in parse_incr(test):
    pos_token_list = [token["upos"] for token in sentence]
    sentence_tokens = [token["form"] for token in sentence]
    tested_words_n = tested_words_n + len(pos_token_list)
    result_tags = baseline_algorithm(sentence_tokens, count_words_tag, possible_tags)
    for j in range(len(pos_token_list)):
        if pos_token_list[j] == result_tags[j]:
            checked_words = checked_words + 1     
        else:
            error_list.append(pos_token_list[j])
end = time.time()
test.close()

print("Algoritmo: BASELINE")
print("Lingua: ", language.name)
print("PoS Tag corretti: ", checked_words)
print("PoS Tag sbagliati: ", tested_words_n - checked_words)
print("Totale parole valutate: ",tested_words_n)
print("Errori per PoS Tag: ", dict(Counter(error_list)))
print("Accuratezza: ", format((checked_words/tested_words_n)*100,'.2f'),"%")
print("Tempo di esecuzione: ", format(end - start,'.2f')," sec")

Algoritmo: BASELINE
Lingua:  GREEK
PoS Tag corretti:  15411
PoS Tag sbagliati:  5548
Totale parole valutate:  20959
Errori per PoS Tag:  {'VERB': 1978, 'ADV': 1823, 'PRON': 462, 'ADJ': 965, 'CCONJ': 133, 'DET': 88, 'SCONJ': 25, 'NOUN': 50, 'ADP': 16, 'NUM': 1, 'PUNCT': 3, 'INTJ': 3, 'X': 1}
Accuratezza:  73.53 %
Tempo di esecuzione:  0.33  sec


ALGORITMO DI BASELINE SUL LATINO

In [49]:
class Language(enum.Enum):
    GREEK = 1
    LATIN = 2

#scelta della lingua
language = Language.LATIN
language.name == 'LATIN'
train = open("Dataset/la_llct-ud-train.conllu", "r", encoding="utf-8")
test = open("Dataset/la_llct-ud-test.conllu", "r", encoding="utf-8")
possible_tags = ['ADJ','ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'NOUN', 'NUM', 'PART', 'PRON','PROPN', 'PUNCT', 'SCONJ', 'VERB', 'X']

count_words_tag = compute_emission_probabilities(train)[2]

def baseline_algorithm(sentence_tokens,count_words_tag,possible_tags):
    tags = []
    for word in sentence_tokens:
        tag_max = 'NOUN'
        count_max_tag = 0
        for tag in possible_tags:
            if count_words_tag.get((word,tag),0) > count_max_tag:
                count_max_tag = count_words_tag[word,tag]
                tag_max = tag    
        tags.append(tag_max)
    return tags

#testing di tutte le sentence del test set
#Calcolo l'accuracy e i tempi di esecuzione dell'algoritmo di PoS tagging.
checked_words = 0
tested_words_n = 0
error_list = []
start = time.time()
for sentence in parse_incr(test):
    pos_token_list = [token["upos"] for token in sentence]
    sentence_tokens = [token["form"] for token in sentence]
    tested_words_n = tested_words_n + len(pos_token_list)
    result_tags = baseline_algorithm(sentence_tokens, count_words_tag, possible_tags)
    for j in range(len(pos_token_list)):
        if pos_token_list[j] == result_tags[j]:
            checked_words = checked_words + 1     
        else:
            error_list.append(pos_token_list[j])
end = time.time()
test.close()

print("Algoritmo: BASELINE")
print("Lingua: ", language.name)
print("PoS Tag corretti: ", checked_words)
print("PoS Tag sbagliati: ", tested_words_n - checked_words)
print("Totale parole valutate: ",tested_words_n)
print("Errori per PoS Tag: ", dict(Counter(error_list)))
print("Accuratezza: ", format((checked_words/tested_words_n)*100,'.2f'),"%")
print("Tempo di esecuzione: ", format(end - start,'.2f')," sec")

Algoritmo: BASELINE
Lingua:  LATIN
PoS Tag corretti:  22969
PoS Tag sbagliati:  1110
Totale parole valutate:  24079
Errori per PoS Tag:  {'VERB': 248, 'PROPN': 471, 'ADV': 56, 'DET': 142, 'NUM': 34, 'ADJ': 83, 'NOUN': 15, 'CCONJ': 35, 'ADP': 6, 'SCONJ': 15, 'AUX': 3, 'PRON': 2}
Accuratezza:  95.39 %
Tempo di esecuzione:  0.37  sec


In [50]:
class Smoothing(enum.Enum):
    UNKNOWN_NAME = 1
    UNKNOWN_NAME_VERB = 2
    UNKNOWN_TAG = 3
    UNKNOWN_DEV = 4
    
class Language(enum.Enum):
    GREEK = 1
    LATIN = 2
    
start = ['START']

pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

smoothing_strategy = Smoothing.UNKNOWN_DEV
language = Language.LATIN
#language = Language.GREEK

if language.name == 'GREEK':
    train = open("Dataset/grc_perseus-ud-train.conllu", "r", encoding="utf-8")
    dev = open("Dataset/grc_perseus-ud-dev.conllu","r", encoding="utf-8")
    test = open("Dataset/grc_perseus-ud-test.conllu", "r", encoding="utf-8")
    possible_tags = ['START','ADJ','ADP', 'ADV', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM', 'PART', 'PRON','SCONJ', 'VERB', 'X', 'PUNCT','END']
elif language.name == 'LATIN':
    train = open("Dataset/la_llct-ud-train.conllu", "r", encoding="utf-8")
    dev = open("Dataset/la_llct-ud-dev.conllu", "r", encoding="utf-8")
    test = open("Dataset/la_llct-ud-test.conllu", "r", encoding="utf-8")
    possible_tags = ['START','ADJ','ADP', 'ADV', 'AUX', 'CCONJ', 'DET', 'NOUN', 'NUM', 'PART', 'PRON','PROPN','PUNCT', 'SCONJ', 'VERB','X','END']

transition_matrix = pd.DataFrame(compute_transition_matrix(possible_tags, train), columns = list(possible_tags), index=list(possible_tags))
display(transition_matrix)

Unnamed: 0,START,ADJ,ADP,ADV,AUX,CCONJ,DET,NOUN,NUM,PART,PRON,PROPN,PUNCT,SCONJ,VERB,X,END
START,0.0,0.046508,0.022911,0.070243,0.000412,0.191384,0.034984,0.139388,0.001372,0.000549,0.025792,0.009878,0.378241,0.003841,0.074359,0.000137,0.0
ADJ,0.0,0.089186,0.022273,0.005023,0.034594,0.047578,0.006729,0.283196,0.000758,0.000284,0.008435,0.213819,0.252867,0.000569,0.034499,0.00019,0.0
ADP,0.0,0.090429,0.002025,0.008323,0.0,0.0,0.244686,0.353729,0.006355,0.0,0.105331,0.147677,0.000562,0.000169,0.040659,5.6e-05,0.0
ADV,0.0,0.011566,0.214399,0.048576,0.017782,0.013879,0.043516,0.080382,0.00506,0.029059,0.061009,0.129536,0.032529,0.010843,0.30172,0.000145,0.0
AUX,0.0,0.089726,0.109915,0.013459,0.0,0.026021,0.018394,0.061014,0.001795,0.000449,0.154329,0.014805,0.275011,0.005384,0.229699,0.0,0.0
CCONJ,0.0,0.05447,0.157828,0.050239,0.004051,0.007563,0.089943,0.263798,0.029621,0.010354,0.037364,0.05456,0.004952,0.059242,0.175925,9e-05,0.0
DET,0.0,0.061589,0.056543,0.076478,0.001415,0.024611,0.055128,0.447487,0.007629,0.000554,0.033225,0.076724,0.060604,0.014705,0.083308,0.0,0.0
NOUN,0.0,0.101177,0.092977,0.023564,0.007742,0.076263,0.151512,0.082509,0.018716,0.000892,0.027326,0.077879,0.180141,0.007091,0.151778,0.000434,0.0
NUM,0.0,0.015735,0.189908,0.034183,0.003256,0.051546,0.010309,0.421595,0.022789,0.0,0.003798,0.000543,0.185567,0.001085,0.059685,0.0,0.0
PART,0.0,0.0,0.015009,0.011257,0.223265,0.043152,0.001876,0.071295,0.0,0.0,0.009381,0.001876,0.0,0.0,0.622889,0.0,0.0
