3. Визначте рівень згоди між анотувальниками (або inter-annotator agreement) у корпусі NUCLE Error Corpus. Деталі задачі:

- Для цієї задачі буде достатньо використати тільки тестову вибірку. <br/>
- Виправлення двох анотувальників вважаються однаковими, якщо збігається і оригінальний текст, і запропонована заміна. Придумайте спосіб враховувати виправлення, що збігаються частово. <br/>
- Оскільки анотувальників у тестовій вибірці є аж п'ятеро, потрібно виміряти рівень згоди попарно і усереднити це значення. <br/>
- Зауважте, що не всі анотувальники перевіряли кожне речення. Зверніть увагу, що відсутність виправлень анотувальники позначали як noop. <br/>
- Потрібно виміряти і загальний рівень згоди, і рівень згоди для кожного типу помилки окремо (див. документацію по корпусу). <br/>
- Напишіть ваші спостереження і висновки щодо якості анотування в корпусі та рівня згоди анотувальників.


Корпус NUCLE використовували на змаганні з виправлення помилок у 2014-му році. Звіт зі змагання можна прочитати на https://www.aclweb.org/anthology/W14-1701.pdf.

In [20]:
import pandas as pd
import uuid
from typing import Dict, List
from enum import IntEnum

In [2]:
!ls data

official-2014.combined-withalt.m2


In [3]:
test_file = "data/official-2014.combined-withalt.m2"

In [4]:
class ErrorType(IntEnum):
    Vt = 0,
    Vm = 1,
    V0 = 2,
    Vform = 3,
    SVA = 4,
    ArtOrDet = 5,
    Nn = 6,
    Npos = 7,
    Pform = 8,
    Pref = 9,
    Prep = 10,
    Wci = 11,
    Wa = 12,
    Wform = 13,
    Wtone = 14,
    Srun = 15,
    Smod = 16,
    Spar = 17,
    Sfrag = 18,
    Ssub = 19,
    WOinc = 20,
    WOadv = 21,
    Trans = 22,
    Mec = 23,
    Rloc = 24,
    Cit = 25,
    Others = 26,
    Um = 27,
    noop = 28

In [5]:
ErrorTypeMap = {
    'Vt' : ErrorType.Vt,
    'Vm' : ErrorType.Vm,
    'V0' : ErrorType.V0,
    'Vform' : ErrorType.Vform,
    'SVA' : ErrorType.SVA,
    'ArtOrDet' : ErrorType.ArtOrDet,
    'Nn' : ErrorType.Nn,
    'Npos' : ErrorType.Npos,
    'Pform' : ErrorType.Pform,
    'Pref' : ErrorType.Pref,
    'Prep' : ErrorType.Prep,
    'Wci' : ErrorType.Wci,
    'Wa' : ErrorType.Wa,
    'Wform' : ErrorType.Wform,
    'Wtone' : ErrorType.Wtone,
    'Srun' : ErrorType.Srun,
    'Smod' : ErrorType.Smod,
    'Spar' : ErrorType.Spar,
    'Sfrag' : ErrorType.Sfrag,
    'Ssub' : ErrorType.Ssub,
    'WOinc' : ErrorType.WOinc,
    'WOadv' : ErrorType.WOadv,
    'Trans' : ErrorType.Trans,
    'Mec' : ErrorType.Mec,
    'Rloc-' : ErrorType.Rloc,
    'Cit' : ErrorType.Cit,
    'Others' : ErrorType.Others,
    'Um' : ErrorType.Um,
    'noop' : ErrorType.noop
}

In [6]:
len(ErrorType)

29

In [7]:
class Correction:
    def __init__(self, start_pos: int, end_pos: int, err_type: str, corrected: str):
        self.start_pos:int = start_pos
        self.end_pos:int = end_pos
        self.err_type: ErrorType = err_type
        self.corrected: str = corrected
            
    def genereate_id(self):
        if self.start_pos == -1 or self.end_pos == -1:
            return -1
        id: int = int(f"{self.start_pos}{self.end_pos}")
        return id
        

In [8]:
class SentenceCorrection:
    def __init__(self, sent_id: int, sentence: str):
        self.sent_id: int = sent_id
        self.sentence: str = sentence
        self.corrections: Dict[int, Correction] = dict()    

In [9]:
class Annotator:
    def __init__(self, id: int):
        self.id = id
        self.sent_dict: Dict[int, SentenceCorrection] = dict()
            
    def add_sentence_correction(self, sent_correction:SentenceCorrection ):
        self.sent_dict[sent_correction.sent_id] = sent_correction

In [10]:
class Parser:
    
    @staticmethod
    def parse_annotaion_result(corpus_file):
        corpus = Parser.read_file(corpus_file)
        
        cur_sentence, cur_sent_id = None, None
        annotator_dict: Dict[int, Annotator] = dict()
        for i in range(0, len(corpus)):
            if corpus[i].startswith('S'):
                cur_sentence = corpus[i][2:-1]
                cur_sent_id = uuid.uuid4().__str__()
                continue
                
            if corpus[i].startswith('A'):
                splitted = corpus[i][2:-1].split('|||')
                annot_id = int(splitted[-1])
                
                if annot_id not in annotator_dict:
                    annotator_dict[annot_id] = Annotator(annot_id)
                
                
                correction: Correction = Parser.__create_sent_correction(splitted)
                
                if cur_sent_id not in annotator_dict[annot_id].sent_dict:
                    annotator_dict[annot_id].sent_dict[cur_sent_id] = SentenceCorrection(sent_id=cur_sent_id,
                                                                                        sentence=cur_sentence)
                
                correction_id = correction.genereate_id()
                annotator_dict[annot_id].sent_dict[cur_sent_id].corrections[correction_id] = correction
                
        return annotator_dict 
    
    
    @staticmethod
    def __create_sent_correction(annotatd_res: List[str]) -> Correction:
        start_pos = int(annotatd_res[0].split(' ')[0])
        end_pos = int(annotatd_res[0].split(' ')[1])
        
        err_type = ErrorTypeMap[annotatd_res[1]]
        corrected = annotatd_res[2]
                                
        return Correction(start_pos=start_pos, end_pos=end_pos, err_type=err_type, corrected=corrected)

    @staticmethod
    def read_file(file: str):
        with open(file) as f:
            lines = f.readlines()
        return lines

In [27]:
list(annotator_dict.keys())

[0, 1, 2, 3, 4]

In [74]:
class KappaCalculator:
    
    @staticmethod
    def calculate_coefficient_for_err_type(annotator_row: Annotator, annotator_col: Annotator, 
                              error_type: ErrorType, strong_with_corrected_value=False):
        
        dict_as_matrix = KappaCalculator.calculate_general_matrix(annotator_row, 
                                                                  annotator_col, 
                                                                  strong_with_corrected_value)
        
        total = KappaCalculator.__calc_total(dict_as_matrix)
        
        Pr_a_nominator = total - dict_as_matrix[error_type][error_type.value]
        Pr_a = Pr_a_nominator / total
        
        
        err_type_row_sum = sum(dict_as_matrix[error_type])
        err_type_col_sum = 0
        for key in dict_as_matrix:
            err_type_col_sum += dict_as_matrix[key][error_type.value]
        
        Pr_e_err_type = (err_type_row_sum / total) * (err_type_col_sum / total)
        
        other_err_type_row_sum = total - err_type_row_sum
        other_err_type_col_sum = total - err_type_col_sum
        Pr_e_other_err_type = (other_err_type_row_sum / total) * (other_err_type_col_sum / total)
        
        Pr_e = Pr_e_err_type + Pr_e_other_err_type
        
        k = KappaCalculator.__calc_kappa(Pr_a, Pr_e)
        
        return k
    
    
    @staticmethod
    def calculate_between_each_pair(annotator_dict: Dict[int, Annotator], strong_with_corrected_value:bool=False):
        ids = list(annotator_dict.keys())
        result = dict()
        sum_kappa = 0
        for i in range(0, len(ids) - 1):
            for j in range(i+1, len(ids)):
                id1, id2 = ids[i], ids[j]
                kappa = KappaCalculator.calculate_general_coeficient(annotator_row=annotator_dict[id1], 
                                                                     annotator_col=annotator_dict[id2],
                                                                    strong_with_corrected_value=strong_with_corrected_value)
                result[(id1, id2)] = kappa
                sum_kappa += kappa
        
        
        return result, sum_kappa / len(result)
                
                
                
    
    
    @staticmethod
    def calculate_general_coeficient(annotator_row: Annotator, 
                                     annotator_col: Annotator, 
                                     strong_with_corrected_value = False):
        dict_as_matrix = KappaCalculator.calculate_general_matrix(annotator_row, 
                                                                  annotator_col, 
                                                                  strong_with_corrected_value)
        
        total = KappaCalculator.__calc_total(dict_as_matrix)
        
        Pr_a_nominator = 0        
        row_sums, col_sums = len(ErrorType) * [0], len(ErrorType) * [0]
        for key in dict_as_matrix:
            Pr_a_nominator += dict_as_matrix[key][key.value]
            row_sums[key.value] += sum(dict_as_matrix[key])
            
            for i in range(0, len(dict_as_matrix[key])):
                col_sums[i] += dict_as_matrix[key][i]
        
        
        Pr_e_err_type = len(ErrorType) * [0]
        for i in range(0, len(ErrorType)):
            Pr_e_err_type[i] += (row_sums[i] / total) * ( [i] / total)
        
        Pr_a = Pr_a_nominator / total
        Pr_e = sum(Pr_e_err_type)
        
        k = KappaCalculator.__calc(Pr_a, Pr_e)
        
        return k
    
    @staticmethod
    def __calc_kappa(pr_a, pr_e):
        if pr_e == 1:
            return None
        k = (pr_a - pr_e) / (1 - pr_e)
        return k

                
    @staticmethod
    def calculate_general_matrix(annotator_row: Annotator, 
                                 annotator_col: Annotator, 
                                 strong_with_corrected_value = False):
        dict_as_matrix = KappaCalculator.create_dict_as_matrix()
        for sent_id in annotator_row.sent_dict:
            if sent_id not in annotator_col.sent_dict:
                continue
            
            corrections_row: Dict[int, Correction] = annotator_row.sent_dict[sent_id].corrections
            corrections_col: Dict[int, Correction] = annotator_col.sent_dict[sent_id].corrections
            
            for id in corrections_row:
                error_type_row = corrections_row[id].err_type
                if id not in corrections_col:
                    if id == -1: # noop
                        dict_as_matrix[error_type_row][ErrorType.Others.value] += 1
                    else:
                        dict_as_matrix[error_type_row][ErrorType.noop.value] += 1
                        
                    continue
                    
                error_type_col = corrections_col[id].err_type
                
                if error_type_row == error_type_col:
                    if corrections_row[id].corrected == corrections_col[id].corrected:
                        dict_as_matrix[error_type_row][error_type_col.value] += 1
                    else:
                        if strong_with_corrected_value:
                            dict_as_matrix[error_type_row][ErrorType.noop.value] += 1
                        else:
                            dict_as_matrix[error_type_row][error_type_col.value] += 1
                else:
                    dict_as_matrix[error_type_row][error_type_col.value] += 1
        
        return dict_as_matrix
    
    
    @staticmethod
    def calculate_general_matrix_df(annotator_row: Annotator, 
                                    annotator_col: Annotator, 
                                    strong_with_corrected_value = False):
        dict_as_matrix = KappaCalculator.calculate_general_matrix(annotator_row, 
                                                                  annotator_col, 
                                                                  strong_with_corrected_value)
        df = pd.DataFrame(dict_as_matrix, index=list(ErrorTypeMap.keys()))
        df.columns = list(ErrorTypeMap.keys())
        
        return df
    
    @staticmethod
    def __calc_total(dict_as_matrix):
        total = 0
        for key in dict_as_matrix:
            total += sum(dict_as_matrix[key])
        return total

    
    @staticmethod
    def create_dict_as_matrix():
        result = dict()
        size = len(ErrorType)
        for key in ErrorType:
            result[key] = size * [0]
        return result
            

In [18]:
annotator_dict = Parser.parse_annotaion_result(corpus_file=test_file)

### general Cohen's Kappa coeficient

One of the method to calculate general inter-annotation agreement based on error types (Wci, Wa, Wtone, etc). First of all, we propose to build the matrix between two annotators which shows agreement of error types. In this case, we compare whether agrement for error type is coincided or not at the same position in the sentence. For instance let's consider the next sentence: <br/>

S On one hand , we do not want this potential danger causing firghtenning affects in our families ' later lives .


A 11 12|||Wci|||having|||REQUIRED|||-NONE-|||0 <br/>
A 11 12|||Wci|||having|||REQUIRED|||-NONE-|||2

In this case we notice the correction for error type Wci is coincided betwen annotator 0 and annotator 2 for the same postion 11 12. If position is differs, as an example: 


A 11 12|||Wci|||having|||REQUIRED|||-NONE-|||0 <br/>
A 12 13|||Wci|||having|||REQUIRED|||-NONE-|||2

we consider that annotator 0 at position 11 12 mention Wci error but annotator 2 mention noop error type and vise versa. 

Futhermore, we can ignore or consider the corrected word. In the case bellow we have the same corrected word *having*. Its is also possible to have such case as:

A 11 12|||Wci|||having|||REQUIRED|||-NONE-|||0 <br/>
A 11 12|||Wci|||being|||REQUIRED|||-NONE-|||2

In this case we can consider stron comparison optionaly using *strong_with_corrected_value = True* parameter. 

Let's consider example for estimating general Kappa Cohen's coefficient between two annotators

In [21]:
df_0_1 = KappaCalculator.calculate_general_matrix_df(annotator_row=annotator_dict[0], annotator_col=annotator_dict[1])

In the below we can see matrix shows agreement between two annotators (annotator 0  - row, annotator 1 - column) based on error types

In [23]:
df_0_1

Unnamed: 0,Vt,Vm,V0,Vform,SVA,ArtOrDet,Nn,Npos,Pform,Pref,...,Ssub,WOinc,WOadv,Trans,Mec,Rloc-,Cit,Others,Um,noop
Vt,57,6,0,15,2,0,0,0,0,0,...,1,1,0,0,1,1,0,0,0,0
Vm,6,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
V0,0,1,8,4,0,0,0,0,0,0,...,0,0,0,0,0,0,0,2,0,0
Vform,2,0,0,21,2,0,0,0,0,0,...,3,0,1,0,0,1,0,0,0,0
SVA,2,0,0,1,50,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
ArtOrDet,1,0,0,0,3,168,0,0,2,7,...,0,0,0,2,1,2,0,0,0,0
Nn,0,0,0,0,0,1,94,0,0,0,...,0,0,0,0,2,0,0,0,0,0
Npos,0,0,0,0,0,0,1,5,0,0,...,1,0,0,0,0,0,0,0,0,0
Pform,0,0,0,0,0,1,0,0,5,3,...,0,0,0,0,0,0,0,0,0,0
Pref,0,0,0,0,0,5,0,0,12,18,...,4,0,0,0,0,0,0,0,0,0


The Cohen's Kappa coeficient between these two annotators:

In [24]:
k_01 = KappaCalculator.calculate_general_coeficient(annotator_row=annotator_dict[0], annotator_col=annotator_dict[1])
k_01

0.249346058228901

Let's calculate Kappa coeficient between each pair of annotators:

In [50]:
kappa_coeficients, avg_kappa = KappaCalculator.calculate_between_each_pair(annotator_dict)

In [51]:
kappa_coeficients

{(0, 1): 0.249346058228901,
 (0, 2): 0.5146254089054915,
 (0, 3): 0.6232399697199091,
 (0, 4): 0.6830601092896175,
 (1, 2): 0.4462189376651559,
 (1, 3): 0.5352687140115163,
 (1, 4): 0.6717325227963525,
 (2, 3): 0.579882547809065,
 (2, 4): 0.67816091954023,
 (3, 4): 0.6306990881458967}

The average inter-annotator agreement value based on each pairs:

In [52]:
avg_kappa

0.5612234276112135

Let's calculate Kappa coeficient between each pair of annotators, using strong compariosn for corrected value.

In [53]:
kappa_coeficients_strong, avg_kappa_strong = KappaCalculator.calculate_between_each_pair(annotator_dict, 
                                                                           strong_with_corrected_value=True)

In [54]:
kappa_coeficients_strong

{(0, 1): 0.18780630206466314,
 (0, 2): 0.48188065124615814,
 (0, 3): 0.5668255732886134,
 (0, 4): 0.6081081081081081,
 (1, 2): 0.4075259495564871,
 (1, 3): 0.4946739909242895,
 (1, 4): 0.5580357142857143,
 (2, 3): 0.5503441870629371,
 (2, 4): 0.5480225988700566,
 (3, 4): 0.5896656534954409}

In [55]:
avg_kappa_strong

0.4992888728902468

In [56]:
(avg_kappa - avg_kappa_strong) / avg_kappa

0.11035632454722062

Looking at the kappa coeficients between each annotators, we can consider that agreement with 4 annotators more than agreement with others. Probably, it observes because 4 annotator annotated significatly less sentences than others (as an example 4 annotator annotated only 6 sentences, while 0 annotated 1195 sentences. It means only sentences, we know that in each sentence there can be more than one correction done by any annotator).

And in case where two annotators annoted maximum sentences compared to other pairs, we have the smallest kappa coeficient between (0, 1): 0.25

Besides, we can notice that while using strong comparison included corrected value, we get less kappa coeficients than previous. There are no big difference, approximately 11%. It loggicaly explains. 

Here we focus on error types and calculate general and average inter-annotator agreement based on error types coincidence.  Of course, there are  more methods how can we calculate inter annotator agreement in this case. As an example,  we can focus on Identification (agreement of tagged tokens regardless of error category or correction), classification (agreement of error category, given identification) and xxact (agreement of error category and correction, given identification) like in this paper https://www.aclweb.org/anthology/W13-1703.pdf . 

### Cohen's Kappa coeficients for each error type

In this case we consider only two classes: specific error type and others. So for calculating Cohen's Kappa we consider the sample example from https://en.wikipedia.org/wiki/Cohen%27s_kappa, where 'Yes' is specific error type and 'No' - others.

Let's calculate Cohen's Kappa coeficient for each type and for each pair

In [94]:
err_types = list(ErrorType)

In [104]:
result = dict()
indices = list()
ids = list(annotator_dict.keys())
for i in range(0, len(annotator_dict) - 1):
    for j in range(i+1, len(ids)):
        id1, id2 = ids[i], ids[j]
        indices.append(str((id1, id2)))
        for err_type in err_types:
            kappa = KappaCalculator.calculate_coefficient_for_err_type(annotator_row=annotator_dict[id1], 
                                               annotator_col=annotator_dict[id2],
                                                error_type=err_type)
            if err_type not in result:
                result[err_type] = []
                
            result[err_type].append(kappa)
             

kappa_err_types_df = pd.DataFrame(result, index=indices)
kappa_err_types_df.columns = list(ErrorTypeMap.keys())

In [105]:
kappa_err_types_df.columns

Index(['Vt', 'Vm', 'V0', 'Vform', 'SVA', 'ArtOrDet', 'Nn', 'Npos', 'Pform',
       'Pref', 'Prep', 'Wci', 'Wa', 'Wform', 'Wtone', 'Srun', 'Smod', 'Spar',
       'Sfrag', 'Ssub', 'WOinc', 'WOadv', 'Trans', 'Mec', 'Rloc-', 'Cit',
       'Others', 'Um', 'noop'],
      dtype='object')

In [106]:
kappa_err_types_df[['Vt', 'Vm', 'V0', 'Vform', 'SVA', 'ArtOrDet', 'Nn', 'Npos', 'Pform', 'Pref']]

Unnamed: 0,Vt,Vm,V0,Vform,SVA,ArtOrDet,Nn,Npos,Pform,Pref
"(0, 1)",0.726089,0.947966,0.820817,0.86358,0.669992,0.634672,0.671418,0.806918,0.908551,0.862649
"(0, 2)",0.609061,0.66149,0.644178,0.644746,0.544384,0.587497,0.607132,0.640285,0.651199,0.651593
"(0, 3)",0.586086,0.605132,0.595908,0.593621,0.57117,0.56094,0.616593,0.547062,0.572819,0.678164
"(0, 4)",1.0,,,0.56391,0.482143,0.406143,0.482143,,,
"(1, 2)",0.70371,0.812986,0.784888,0.725896,0.667045,0.635551,0.615558,0.747946,0.881558,0.812019
"(1, 3)",0.690014,0.907511,0.596752,0.619482,0.541025,0.629395,0.607275,0.620221,1.0,0.675263
"(1, 4)",0.649351,,,0.46,0.480769,0.509091,0.480769,,,
"(2, 3)",0.612891,0.60671,0.596529,0.574262,0.574118,0.578116,0.650911,0.708312,0.661818,0.634434
"(2, 4)",1.0,,,0.555556,0.478261,0.428571,0.478261,,,
"(3, 4)",1.0,,,0.4375,0.480769,0.564516,0.480769,,,


In [107]:
kappa_err_types_df[['Prep', 'Wci', 'Wa', 'Wform', 'Wtone', 'Srun', 'Smod', 'Spar', 'Sfrag', 'Ssub']]

Unnamed: 0,Prep,Wci,Wa,Wform,Wtone,Srun,Smod,Spar,Sfrag,Ssub
"(0, 1)",0.623043,0.798439,,0.748311,1.0,0.845763,1.0,1.0,1.0,0.929273
"(0, 2)",0.606456,0.629957,,0.613517,0.553486,1.0,,0.799498,0.665737,0.628475
"(0, 3)",0.645846,0.541817,,0.632216,,0.497881,,0.497881,1.0,0.49359
"(0, 4)",0.42,0.369565,,,,,,,,
"(1, 2)",0.609863,0.638168,0.499592,0.665531,0.874821,0.685739,0.666304,0.768359,1.0,0.908956
"(1, 3)",0.562715,0.568806,1.0,0.614821,,0.831828,,1.0,,1.0
"(1, 4)",0.357143,0.560976,,,,,,1.0,,
"(2, 3)",0.537313,0.649001,1.0,0.575342,1.0,1.0,,0.748649,1.0,0.494565
"(2, 4)",0.368421,0.454545,,,,,,1.0,,
"(3, 4)",0.386364,0.4375,,,,,,,,


In [108]:
kappa_err_types_df[['WOinc', 'WOadv', 'Trans', 'Mec', 'Rloc-', 'Cit', 'Others', 'Um', 'noop']]

Unnamed: 0,WOinc,WOadv,Trans,Mec,Rloc-,Cit,Others,Um,noop
"(0, 1)",0.86605,0.785317,0.855907,0.743193,0.928731,,0.983007,1.0,1.0
"(0, 2)",0.69032,0.665737,0.634618,0.626024,0.695227,,0.948979,0.713432,1.0
"(0, 3)",0.664781,,0.587467,0.531678,0.618818,,0.914748,1.0,1.0
"(0, 4)",,,1.0,,0.736364,,,,1.0
"(1, 2)",0.776074,0.784712,0.67835,0.629182,0.778047,,0.74629,0.76008,1.0
"(1, 3)",0.71152,1.0,0.89857,0.574426,0.658684,,0.66162,0.72322,1.0
"(1, 4)",1.0,,0.480769,0.630137,,,,1.0,1.0
"(2, 3)",0.665066,,0.815293,0.533694,0.663043,,0.795704,0.66345,1.0
"(2, 4)",,,0.478261,0.647059,0.733333,,,,1.0
"(3, 4)",1.0,,1.0,0.480769,0.735294,,1.0,,1.0


From the tables above we can notice inter-annotation agreement for each error types.

We can observe any intereting behaviours, as an example there were not mention error type Cit in available corpus or annotator always agreed while about *noop* (no errors in sentence). Also, is is possible to notice that some annotators (0, 1, 2, 3)  mentioned error *Wtone* and some (4) did not. 

Let's calculate average Kappa for each error types

In [116]:
def avg(arr):
    sum, size = 0, 0 
    for i in range(0, len(arr)):
        if arr[i] is None:
            continue
        sum += arr[i]
        size += 1
    if size == 0:
        return None
    return sum / size

In [130]:
avg_dict = dict()
min_key, min_val = None, 1
for key in result:
    avg_dict[key] = avg(result[key])
    if avg_dict[key] is not None:
        if min_val >= avg_dict[key]:
            min_val = avg_dict[key]
            min_key = key

In [121]:
avg_dict

{<ErrorType.Vt: 0>: 0.7577201648875695,
 <ErrorType.Vm: 1>: 0.7569655581461495,
 <ErrorType.V0: 2>: 0.6731786820310733,
 <ErrorType.Vform: 3>: 0.603855342316593,
 <ErrorType.SVA: 4>: 0.5489675872391943,
 <ErrorType.ArtOrDet: 5>: 0.5534493618306153,
 <ErrorType.Nn: 6>: 0.5690828393518952,
 <ErrorType.Npos: 7>: 0.6784570958637944,
 <ErrorType.Pform: 8>: 0.7793241604757709,
 <ErrorType.Pref: 9>: 0.7190203106162646,
 <ErrorType.Prep: 10>: 0.5117164465244398,
 <ErrorType.Wci: 11>: 0.5648774938636414,
 <ErrorType.Wa: 12>: 0.8331973898858157,
 <ErrorType.Wform: 13>: 0.6416230663666137,
 <ErrorType.Wtone: 14>: 0.8570769178755788,
 <ErrorType.Srun: 15>: 0.8102019973091492,
 <ErrorType.Smod: 16>: 0.8331520261082437,
 <ErrorType.Spar: 17>: 0.8517984595247624,
 <ErrorType.Sfrag: 18>: 0.9331474381317527,
 <ErrorType.Ssub: 19>: 0.742476382315367,
 <ErrorType.WOinc: 20>: 0.7967264893764613,
 <ErrorType.WOadv: 21>: 0.8089415615855151,
 <ErrorType.Trans: 22>: 0.7429236460976408,
 <ErrorType.Mec: 23>: 0

The error type with minimum agreement: 

In [133]:
min_key

<ErrorType.Prep: 10>

In [134]:
avg_dict[min_key]

0.5117164465244398