In [784]:
import pandas as pd
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))
from difflib import SequenceMatcher 

In [785]:
filepath = "/Users/liudmylaslava/Downloads/test.txt"
with open(filepath) as fp:
    content = fp.read()
content[:1000]

'S Keeping the Secret of Genetic Testing\n\nS What is genetic risk ?\n\nS Genetic risk refers more to your chance of inheriting a disorder or disease .\nA 3 4|||ArtOrDet||||||REQUIRED|||-NONE-|||0\n\nS People get certain disease because of genetic changes .\nA 3 4|||Nn|||diseases|||REQUIRED|||-NONE-|||0\n\nS How much a genetic change tells us about your chance of developing a disorder is not always clear .\n\nS If your genetic results indicate that you have gene changes associated with an increased risk of heart disease , it does not mean that you definitely will develop heart disease .\nA -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\nA 25 27|||WOadv|||will definitely|||REQUIRED|||-NONE-|||1\n\nS The opposite is also true .\n\nS If your genetic results show that you do not have changes associated with an increased risk of heart disease , it is still possible that you develop heart disease .\nA -1 -1|||noop|||-NONE-|||REQUIRED|||-NONE-|||0\nA 26 26|||Vt|||will|||REQUIRED|||-NONE-|||1\n

In [786]:
text = content.split('\n\n')
data = []
for t in text:
    if '|||' in t:
        chunk = t.split('\n')
        for c in chunk[1:]:
            tr = c+'|||'+chunk[0]
            res = tr.split('|||')
            data.append(res)
df = pd.DataFrame(data = data, columns=['position',  'error_type', 'edit', 'req', 'no', 'ann_id', 'sent'])
df = df.loc[~df['error_type'].str.startswith('S Social media')]
df.head()

Unnamed: 0,position,error_type,edit,req,no,ann_id,sent
0,A 3 4,ArtOrDet,,REQUIRED,-NONE-,0,S Genetic risk refers more to your chance of i...
1,A 3 4,Nn,diseases,REQUIRED,-NONE-,0,S People get certain disease because of geneti...
2,A -1 -1,noop,-NONE-,REQUIRED,-NONE-,0,S If your genetic results indicate that you ha...
3,A 25 27,WOadv,will definitely,REQUIRED,-NONE-,1,S If your genetic results indicate that you ha...
4,A -1 -1,noop,-NONE-,REQUIRED,-NONE-,0,S If your genetic results show that you do not...


In [787]:
# Start with 0th and 1th annotators
df_chunk = df[df['ann_id'].isin(['1','0'])]
# how many annotations we have for each sentence 
max_ann = df_chunk.groupby(['sent', 'ann_id']).size().reset_index(name='counts').groupby(['sent'])['counts'].max().reset_index(name='max_ann')
df_chunk = pd.merge(df_chunk, max_ann, on='sent', how='left')
# Calculate total number of questions with 0th and 1th annotators
total = df_chunk.groupby(['sent'], as_index=False)['ann_id'].nunique().value_counts()[2]
print(f'Total number of qustions for 1th and 2th ann: {total}')
# cases where annotators agreed
full = df_chunk[df_chunk[['position', 'edit', 'sent']].duplicated(keep='first')]
result = sum(1/full['max_ann'])
print(f'Cases where they agreed: {result}')

Total number of qustions for 1th and 2th ann: 1128
Cases where they agreed: 213.6834596695644


In [788]:
# Deal with cases where position the same but edition is not
rest = df_chunk[~df_chunk[['position', 'edit', 'sent']].duplicated(keep=False)]
part = rest[rest[['position', 'sent']].duplicated(keep=False)]
p = part.groupby(['sent','position', 'max_ann'])['edit'].apply(lambda x: ','.join(x)).reset_index()
# split edit column into two columns 
p[['first', 'second']] = p['edit'].str.split(',', n=1, expand=True)
p.head()

Unnamed: 0,sent,position,max_ann,edit,first,second
0,S A research shows that people will unconsciou...,A 8 11,5,",their",,their
1,S A study of New York University in 2010 shown...,A 8 9,4,"shows,showed",shows,showed
2,"S Above all , life is more important than secr...",A 8 9,1,"secrets,secrecy",secrets,secrecy
3,S Advantages : Social media sites have greatly...,A 7 8,2,"eased,facilitated",eased,facilitated
4,S After realizing the he or she is tshe carrie...,A 1 2,3,"discovering,he or she realizes",discovering,he or she realizes


In [789]:
def longestSubstring(str1,str2):
    
    seqMatch = SequenceMatcher(None,str1,str2) 
  
    # find match of longest sub-string 
    match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
    # substring normalazing by dividing by longer string
    if len(str1)>=len(str2):
        res = match.size/len(str1)
    else:
        res = match.size/len(str2)
  
    return res

In [790]:
p['sim'] = p.loc[:, ["first",'second']].apply(lambda x: longestSubstring(*x), axis=1)
p.head()

Unnamed: 0,sent,position,max_ann,edit,first,second,sim
0,S A research shows that people will unconsciou...,A 8 11,5,",their",,their,0.0
1,S A study of New York University in 2010 shown...,A 8 9,4,"shows,showed",shows,showed,0.666667
2,"S Above all , life is more important than secr...",A 8 9,1,"secrets,secrecy",secrets,secrecy,0.714286
3,S Advantages : Social media sites have greatly...,A 7 8,2,"eased,facilitated",eased,facilitated,0.181818
4,S After realizing the he or she is tshe carrie...,A 1 2,3,"discovering,he or she realizes",discovering,he or she realizes,0.055556


In [791]:
part_res = sum(p['sim']/p['max_ann'])
final = (result+part_res)/total
print(f"Annotator agreement between 0th and 1th = {final}")

Annotator agreement between 0th and 1th = 0.20664969207929745


In [792]:
def inner_agreement(str1, str2, df):
    df_chunk = df[df['ann_id'].isin([str1, str2])]
    max_ann = df_chunk.groupby(['sent', 'ann_id']).size().reset_index(name='counts').groupby(['sent'])['counts'].max().reset_index(name='max_ann')
    df_chunk = pd.merge(df_chunk, max_ann, on='sent', how='left')
    total = df_chunk.groupby(['sent'], as_index=False)['ann_id'].nunique().value_counts()[2]
    full = df_chunk[df_chunk[['position', 'edit', 'sent']].duplicated(keep='first')]
    result = sum(1/full['max_ann'])
    rest = df_chunk[~df_chunk[['position', 'edit', 'sent']].duplicated(keep=False)]
    part = rest[rest[['position', 'sent']].duplicated(keep=False)]
    if len(part)!=0:
        p = part.groupby(['sent','position', 'max_ann'])['edit'].apply(lambda x: ','.join(x)).reset_index()
        p[['first', 'second']] = p['edit'].str.split(',', n=1, expand=True)
        p['sim'] = p.loc[:, ["first",'second']].apply(lambda x: longestSubstring(*x), axis=1)
        part_res = sum(p['sim']/p['max_ann'])
    else:
        part_res = 0
    final = (result+part_res)/total
    return final

In [793]:
# В цілому досить низька згода анотувальників
# Багато виправлень не збігається, бо у кожного анотувальника було своє уявлення як позначати виправлення 

# do the same for all pairs 
annot = [str(i) for i in list(range(0, 5))]
pairs = [[annot[a1], annot[a2]] for a1 in range(len(annot)) for a2 in range(a1+1,len(annot))]
res = 0
for pair in pairs:
    print(f'For pair: {pair} argeement = {inner_agreement(pair[0], pair[1], df)}')
    res += inner_agreement(pair[0], pair[1], df)
print(f'Total inter annotator agreement = {res/10}')

For pair: ['0', '1'] argeement = 0.20664969207929745
For pair: ['0', '2'] argeement = 0.2976265796241484
For pair: ['0', '3'] argeement = 0.37698635083667476
For pair: ['0', '4'] argeement = 0.5554843304843307
For pair: ['1', '2'] argeement = 0.36711648458266993
For pair: ['1', '3'] argeement = 0.4476024335695322
For pair: ['1', '4'] argeement = 0.4597222222222223
For pair: ['2', '3'] argeement = 0.4142490199693589
For pair: ['2', '4'] argeement = 0.39014041514041514
For pair: ['3', '4'] argeement = 0.42685185185185204
Total inter annotator agreement = 0.3942429380360502


In [794]:
############################################

In [795]:
def inter_error_agreement(str1, str2, df):
    df_chunk = df[df['ann_id'].isin([str1, str2])]
    match = df_chunk[df_chunk[['position', 'error_type','edit', 'sent']].duplicated(keep=False)]
    g_match = match.groupby(['sent', 'error_type'])['position'].nunique().reset_index(name='match')
    rest = df_chunk[~df_chunk[['position', 'error_type','edit', 'sent']].duplicated(keep=False)]
    part = rest[rest[['position', 'error_type', 'sent']].duplicated(keep=False)]
    p = part.groupby(['sent','position', 'error_type'])['edit'].apply(lambda x: ','.join(x)).reset_index()
    p[['first', 'second']] = p['edit'].str.split(',', n=1, expand=True)
    p['sim'] = p.loc[:, ["first",'second']].apply(lambda x: longestSubstring(*x), axis=1)
    p = p[['sent','error_type','sim']]
    not_match = rest[~rest.isin(part)].dropna()
    g_not_match=not_match.groupby(['sent', 'error_type']).size().reset_index(name='not_match')
    r = pd.merge(g_match, g_not_match, how='outer', on=['sent','error_type']).head(60)
    result = pd.merge(r, p, how='outer', on=['sent','error_type'])
    result.fillna(0, inplace=True)
    result['sim_flag']=result.sim.apply(lambda x: 1 if x > 0 else 0)
    result['res'] = (result['match']+result['sim'])/(result['match']+result['not_match']+result['sim_flag'])
    result.fillna(0, inplace=True)
    total = []
    for error in error_types:
        df_error = result[result['error_type']==error]
        if len(df_error)!=0:
            res = [error, sum(df_error['res'])/len(df_error)]
        else:
            res = [error, 'no']
        total.append(res)
    df_total = pd.DataFrame(data=total, columns = ['error_type', 'agr'])
    return df_total

In [796]:
data = []
for pair in pairs:
    result = inter_error_agreement(pair[0], pair[1], df)
    data.append(result)
data = pd.concat(data)
data = data[data['agr']!='no']
agr = data.groupby('error_type', as_index=False).agg({'agr':'sum'})
num = data.groupby('error_type', as_index=False).agg({'agr':'size'})
final = pd.merge(agr, num, on='error_type')
final['agreement'] = final['agr_x']/final['agr_y']
final = final[['error_type', 'agreement']]
final

Unnamed: 0,error_type,agreement
0,ArtOrDet,0.431085
1,Mec,0.507845
2,Nn,0.56956
3,Npos,0.714286
4,Others,0.177493
5,Pform,0.65
6,Pref,0.497608
7,Prep,0.506874
8,Rloc-,0.613889
9,SVA,0.681429
