# Visualization and annotation

This notebook shows all steps for helping the annotator in the labelling task.

In [1]:
import numpy as np
import pandas as pd
import sys
import re
import xml.sax
import time
import datetime
from colorama import init, Fore, Back, Style
from time import time, gmtime, strftime
from datetime import datetime
from tqdm import tqdm
from sklearn.metrics import cohen_kappa_score

# can display several outputs from a single code cell
# (with "all", every bare expression in a cell is rendered, not only the last one;
# later cells rely on this to show .shape, .columns and .head() together)
from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"

# change display of pandas df
# (up to 100 rows and 500 columns are shown before pandas truncates the output)
#pd.set_option('display.height', 1000)
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 500)
#pd.set_option('display.width', 1000)

(OPTIONAL)<br>
Disable the scrolling output window mode (useful for displaying long lists of results)

In [2]:
%%javascript
// Replace the output area's scroll check with a function that always answers
// "no", so long outputs are shown in full instead of inside a scroll box.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

(OPTIONAL)<br>
Resume scrolling output window mode

In [3]:
%%javascript
// Replace the output area's scroll check with a function that always answers "yes".
// NOTE(review): this forces scrolling for every output regardless of `lines`;
// it does not restore the notebook's original length-based check — confirm intended.
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return true;
}

<IPython.core.display.Javascript object>

## Dataframe visualizer

Load datasets (one .tsv file usually contains 100 samples or version pairs).

In [4]:
# Name of the sample file to visualize (tab-separated values).
sample_tsv_file = '00_09_73_sample.tsv'

# Read the sample and replace missing values with empty strings in one chain
# (presumably so that the string concatenations of the display cell never meet NaN).
df_sample = pd.read_csv(
    '/media/hdd/salaun/wikiedit/annotations/data_v1/' + sample_tsv_file,
    sep='\t',
).fillna('')
#df_sample = df_sample.drop('Unnamed: 0', 1)

# Quick overview: dimensions, column names and first rows.
df_sample.shape
df_sample.columns
df_sample.head()

(100, 35)

Index(['Unnamed: 0', 'sample_id', 'id_file', 'id_modif', 'label_incoherence',
       'username', 'user_id', 'registered', 'format', 'model', 'id',
       'parentid', 'title', 'minor', 'comment', 'modif', 'modif_remove',
       'modif_add', 'timestamp', 'filt_bot', 'filt_coher', 'filt_confli',
       'filt_contradic', 'filt_erreur', 'filt_erron', 'filt_faux',
       'filt_frappe', 'filt_gramma', 'filt_ortho', 'filt_revert', 'filt_sens',
       'filt_tromp', 'filt_typo', 'filt_vandalisme', 'filt_vraise'],
      dtype='object')

Unnamed: 0.1,Unnamed: 0,sample_id,id_file,id_modif,label_incoherence,username,user_id,registered,format,model,id,parentid,title,minor,comment,modif,modif_remove,modif_add,timestamp,filt_bot,filt_coher,filt_confli,filt_contradic,filt_erreur,filt_erron,filt_faux,filt_frappe,filt_gramma,filt_ortho,filt_revert,filt_sens,filt_tromp,filt_typo,filt_vandalisme,filt_vraise
0,0,p3p3661_181737,p3p3661,181737,False,196.28.255.210,,False,text/x-wiki,wikitext,23653486,23640882,Burkina Faso,False,/* Administration */,- Le territoire du Burkina Faso est réparti en...,Le territoire du Burkina Faso est réparti en 4...,Le territoire du Burkina Faso est réparti en 4...,2007-12-04T10:39:19Z,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
1,1,p3p3661_1115892,p3p3661,1115892,False,Skiff,242536.0,True,text/x-wiki,wikitext,124685050,124685017,Soudan,False,/* {{s-|XX}} */,- En septembre [[1983]] le président Nimeiri a...,En septembre [[1983]] le président Nimeiri ann...,"En septembre [[1983]], le président Nimeiri an...",2016-03-25T04:39:21Z,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
2,2,p3p3661_248902,p3p3661,248902,False,178.194.81.42,,False,text/x-wiki,wikitext,106009920,105780947,Château de Pierrefonds,False,/* Un monument en constante transformation */,- Ouverts au public sous le Second Empire comm...,Ouverts au public sous le Second Empire comme ...,Ouverts au public sous le Second Empire comme ...,2014-08-05T11:28:00Z,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
3,3,p3p3661_931556,p3p3661,931556,False,Zedh,132668.0,True,text/x-wiki,wikitext,17836113,17807973,Paire torsadée,False,/* Catégorie 5e */ corr baclée (je repasserai),+ La catégorie 5e (''enhanced'') est un type d...,La catégorie 5e (''enhanced'') est un type de ...,La catégorie 5e (''enhanced'') est un type de ...,2007-06-12T16:03:04Z,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
4,4,p3p3661_494479,p3p3661,494479,False,Authueil,142917.0,True,text/x-wiki,wikitext,135511369,135511330,Guinée,False,Annulation de la [[Spécial:Diff/135511330|modi...,- Le mil et le fonio sont les principales cult...,Le mil et le fonio sont les principales cultur...,Le mil et le fonio sont les principales cultur...,2017-03-17T18:57:49Z,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


Display all relevant information from <code>df_sample</code> entries with a reader-friendly interface. One can comment/uncomment elements depending on what features she needs. Usually one displays urls (<code>link</code>) for a better visualization of Wikipedia edits.
For the annotation task, process the .csv files in **InterfaceAnnotation.exe**, which will eventually return a .csv file with all labels. The name of that .csv file shall contain the substring *post_annot*.

In [5]:
# Colour prefixes and separators shared by every entry.
bold_white = Style.BRIGHT + Fore.WHITE
bold_red = Style.BRIGHT + Fore.RED
bold_blue = Style.BRIGHT + Fore.BLUE
reset = Style.RESET_ALL
rule = '_'*83
diff_url = 'https://fr.wikipedia.org/w/index.php?diff='
n_samples = len(df_sample)

# Rows are addressed by label 0..n-1, i.e. the default index produced by read_csv.
for row in range(n_samples):

    # Identification of the edit.
    print('sample_id', '\t', df_sample.loc[row, 'sample_id'])
    print(bold_white + 'timestamp', '\t', df_sample.loc[row, 'timestamp'] + reset)
    print('\n')

    # Author of the edit and registration status.
    #print('minor_modif', '\t', df_sample.loc[row, 'minor'])
    print(bold_red + 'username', '\t', df_sample.loc[row, 'username'] + reset)

    if df_sample.loc[row, 'registered']:
        print('\t', '\t', bold_white + '>>> registered user' + reset)
    else:
        print('\t', '\t', bold_white + '>>> NON registered user' + reset, '\n')

    # Only minor edits get an extra banner; regular edits print nothing here.
    if df_sample.loc[row, 'minor']:
        print('\n', '\t', '\t', bold_white + '# MINOR MODIF' + reset)

    print(bold_red + 'comment', '\t', df_sample.loc[row, 'comment'] + reset, '\n')

    # Article title and link to the Wikipedia diff between parent and new revision.
    print('title   ', '\t', bold_blue + df_sample.loc[row, 'title'] + reset)
    revision_ids = str(df_sample.loc[row, 'id']) + '&oldid=' + str(df_sample.loc[row, 'parentid'])
    print('link     ', '\t', diff_url + revision_ids, '\n')

    # display text versions
    #print(df_sample.loc[row, 'modif'] + '\n' + '_'*80)
    #print(df_sample.loc[row, 'modif_remove'] + '\n' + '_'*80)
    #print(df_sample.loc[row, 'modif_add'] + '\n' + '_'*80)

    # Footer with the number of entries still to be displayed.
    print('\n'*2)
    print(rule)
    print('_'*35 + 'Remaining items: ' + str(n_samples - 1 - row) + '_'*35)
    print(rule)
    print('\n'*2)

sample_id 	 p3p3661_181737
[1m[37mtimestamp 	 2007-12-04T10:39:19Z[0m


[1m[31musername 	 196.28.255.210[0m
	 	 [1m[37m>>> NON registered user[0m 

[1m[31mcomment 	 /* Administration */[0m 

title    	 [1m[34mBurkina Faso[0m
link      	 https://fr.wikipedia.org/w/index.php?diff=23653486&oldid=23640882 




___________________________________________________________________________________
___________________________________Remaining items: 99___________________________________
___________________________________________________________________________________



sample_id 	 p3p3661_1115892
[1m[37mtimestamp 	 2016-03-25T04:39:21Z[0m


[1m[31musername 	 Skiff[0m
	 	 [1m[37m>>> registered user[0m
[1m[31mcomment 	 /* {{s-|XX}} */[0m 

title    	 [1m[34mSoudan[0m
link      	 https://fr.wikipedia.org/w/index.php?diff=124685050&oldid=124685017 




___________________________________________________________________________________
________________________________

## Annotations retrieval

Once the *post_annot* output file is generated by **InterfaceAnnotation.exe**, it is loaded as a pandas dataframe in which the last column contains all labels applied to the sample.

In [6]:
def csv_extractor(path, filename):
    """Load a header-less, semicolon-separated annotation file.

    Parameters
    ----------
    path : str
        Directory prefix; it is concatenated directly with ``filename``,
        so it must already end with a path separator.
    filename : str
        Name (or relative path) of the file to read.

    Returns
    -------
    pandas.DataFrame
        The file content with its three columns named ``index``,
        ``sample_id`` and ``raw_annot``. The shape is printed as a side effect.
    """
    annotations = pd.read_csv(path + filename, header=None, sep=';')
    # The file carries no header row, so the column names are assigned here.
    annotations.columns = ['index', 'sample_id', 'raw_annot']
    print('Shape:', annotations.shape)
    return annotations

In [7]:
# Annotation file to load, given relative to the annotations root directory.
raw_annot = 'data_v1/old_10_19/10_19_post_annot_O_old.csv'
# Root directory; csv_extractor concatenates it with the file name, hence the trailing slash.
path = '/media/hdd/salaun/wikiedit/annotations/'
df_post_annot = csv_extractor(path, raw_annot)
# Preview the last rows of the loaded annotations.
df_post_annot.tail()

Shape: (100, 3)


Unnamed: 0,index,sample_id,raw_annot
95,95,p356266p389955_596936,ortho_gram_typo
96,96,p356266p389955_811883,new_content
97,97,p356266p389955_529934,ortho_gram_typo
98,98,p356266p389955_1120258,update_content
99,99,p356266p389955_386533,wiki_formatting


A distinct boolean column is created for each of the labels contained in the **raw_annot** column.

In [8]:
labels_list = [
    'ortho_gram_typo', 'wiki_formatting', 'vandal', 'reordering',
    'revert_vandal', 'revert_other', 'content_remove', 'content_add',
    'nbr_wr', 'nbr_rw', 'nbr_x', 'semant_simil', 'semant_diff', 'other',
]

def label_extractor(df):
    """Add one boolean column per annotation label to ``df``.

    For every entry of ``labels_list``, plus the two prefixes ``'nbr'`` and
    ``'semant'``, a column is added telling whether that text occurs in the
    row's ``raw_annot`` value. A final ``nbr_or_semant`` column is the
    logical OR of the ``nbr`` and ``semant`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``raw_annot`` column. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the label columns added.

    Notes
    -----
    Rows whose ``raw_annot`` is missing are treated as carrying no label:
    every label column is ``False`` for them.
    """
    print('Shape of df_post_annot:', df.shape)

    # Missing annotations become empty strings so that every label column is a
    # clean boolean (no NaN, which would be truthy and break integer casts).
    annotations = df['raw_annot'].fillna('').astype(str)

    # Literal substring test, in the same column order as before:
    # the full labels first, then the 'nbr' and 'semant' prefixes.
    # NOTE(review): substring matching means 'vandal' is also True for
    # 'revert_vandal' and 'other' for 'revert_other' — confirm this is intended.
    for label in labels_list + ['nbr', 'semant']:
        df[label] = annotations.str.contains(label, regex=False)

    # Vectorized OR: independent of the index labels and defined for empty frames.
    df['nbr_or_semant'] = df['nbr'] | df['semant']

    return df

In [9]:
# Expand the raw annotation strings into one boolean column per label
# (the columns are added to df_post_annot itself), then preview the last rows.
df_post_annot = label_extractor(df_post_annot)
df_post_annot.tail()

Shape of df_post_annot: (100, 3)


Unnamed: 0,index,sample_id,raw_annot,ortho_gram_typo,wiki_formatting,vandal,reordering,revert_vandal,revert_other,content_remove,content_add,nbr_wr,nbr_rw,nbr_x,semant_simil,semant_diff,other,nbr,semant,nbr_or_semant
95,95,p356266p389955_596936,ortho_gram_typo,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
96,96,p356266p389955_811883,new_content,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
97,97,p356266p389955_529934,ortho_gram_typo,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
98,98,p356266p389955_1120258,update_content,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False
99,99,p356266p389955_386533,wiki_formatting,False,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False


### Summary statistics for all labels

In [10]:
labels_list_expand = labels_list + ['nbr', 'semant', 'nbr_or_semant']

def labels_sum_stats(df):
    """Count, for each label column, how many rows carry the label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding one column per entry of ``labels_list_expand``.

    Returns
    -------
    pandas.DataFrame
        One row per label with the sum of the label column
        ('Absolute number') and that sum divided by the number of rows
        of ``df`` ('Share of all observations').
    """
    count_col = 'Absolute number'
    share_col = 'Share of all observations'
    stats = pd.DataFrame(index=labels_list_expand, columns=[count_col, share_col])

    n_obs = len(df)
    for label in labels_list_expand:
        n_flagged = df[label].sum()
        stats.loc[label, count_col] = n_flagged
        stats.loc[label, share_col] = n_flagged / n_obs

    return stats

In [11]:
# Per-label counts and shares for the annotations loaded above.
labels_sum_stats(df_post_annot)

Unnamed: 0,Absolute number,Share of all observations
ortho_gram_typo,24,0.24
wiki_formatting,27,0.27
vandal,10,0.1
reordering,7,0.07
revert_vandal,6,0.06
revert_other,0,0.0
content_remove,0,0.0
content_add,0,0.0
nbr_wr,2,0.02
nbr_rw,1,0.01


### Inter-annotators agreement

In [12]:
# One annotation file per annotator (presumably annotators "O" and "S").
raw_annotO, raw_annotS = (
    'data_v1/old_10_19/10_19_post_annot_O_old.csv',
    'data_v1/old_10_19/10_19_post_annot_S_old.csv',
)

# Both files are loaded first and only then expanded into label columns,
# which keeps the printed shapes in the same order as a step-by-step run.
df_annot_O, df_annot_S = [csv_extractor(path, name) for name in (raw_annotO, raw_annotS)]
df_annot_O, df_annot_S = [label_extractor(frame) for frame in (df_annot_O, df_annot_S)]

Shape: (100, 3)
Shape: (100, 3)
Shape of df_post_annot: (100, 3)
Shape of df_post_annot: (100, 3)


In [14]:
def cohen_kappa_score_calculator(df1, df2):
    """Compute Cohen's kappa between two annotators for every label.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Annotation frames holding one boolean column per entry of
        ``labels_list_expand``. Rows are compared by position; ``sample_id``
        is not checked, so both frames are assumed to list the same samples
        in the same order — TODO confirm at the call site.

    Returns
    -------
    pandas.DataFrame
        One row per label with the kappa score formatted to two decimals
        (as a string; undefined scores appear as ``'nan'``).

    Side effects
    ------------
    Prints the mean and standard deviation of the per-label scores,
    ignoring the NaN ones.
    """
    # Select the label columns by name instead of a first:last column slice, so the
    # matrix columns always line up with labels_list_expand. `.values` and the builtin
    # `int` replace `as_matrix()` and `np.int`, which recent pandas/NumPy removed.
    mat1 = df1.loc[:, labels_list_expand].values.astype(int)
    mat2 = df2.loc[:, labels_list_expand].values.astype(int)

    # One kappa per label, computed once and reused for the table and the average.
    kappa_scores = np.array(
        [cohen_kappa_score(mat1[:, i], mat2[:, i]) for i in range(len(labels_list_expand))],
        dtype=float,
    )

    df_kappa_scores = pd.DataFrame(
        {'kappa score': ['{:.2f}'.format(score) for score in kappa_scores]},
        index=labels_list_expand,
    )

    # NaN scores are left out of the average.
    valid_scores = kappa_scores[~np.isnan(kappa_scores)]
    print("Gross average: %0.2f (+/- %0.2f)" % (valid_scores.mean(), valid_scores.std()))

    return df_kappa_scores

In [15]:
# Per-label agreement between the two annotators loaded above.
cohen_kappa_score_calculator(df_annot_O, df_annot_S)

Gross average: 0.32 (+/- 0.30)


  k = np.sum(w_mat * confusion) / np.sum(w_mat * expected)


Unnamed: 0,kappa score
ortho_gram_typo,0.4
wiki_formatting,0.41
vandal,0.39
reordering,0.65
revert_vandal,0.0
revert_other,0.0
content_remove,
content_add,
nbr_wr,0.8
nbr_rw,0.0
