In [1]:
import numpy as np
import pandas as pd
import difflib
import sys
import re
import xml.sax
from time import time, gmtime, strftime
from datetime import datetime
from xml.sax.handler import ContentHandler
from hyperdash import monitor_cell
from tqdm import tqdm
from IPython.core.interactiveshell import InteractiveShell # no need of print for several objects!!!
InteractiveShell.ast_node_interactivity = "all"

In [2]:
# Raise the recursion ceiling so that extensive pairwise comparisons are allowed.
sys.setrecursionlimit(10000)

# One Differ instance shared by every call to comparator().
# https://svn.python.org/projects/python/branches/release27-maint/Lib/difflib.py
d = difflib.Differ()


def comparator(text, prev_text):
    """Return the line-by-line ``difflib.Differ`` delta from ``text`` to ``prev_text``.

    Both arguments are split into lines; the delta lines are joined with
    newline characters into a single string.  Lines only present in
    ``text`` carry a "- " prefix, lines only present in ``prev_text`` a
    "+ " prefix, common lines a two-space prefix.
    """
    left_lines = text.splitlines()
    right_lines = prev_text.splitlines()
    delta = d.compare(left_lines, right_lines)
    return '\n'.join(delta)

In [3]:
#### BUILDS ONE PAIR OF TEXT VERSIONS FOR EACH ROW OF THE DATAFRAME

# http://www.heatonresearch.com/2017/03/03/python-basic-wikipedia-parsing.html
# https://www.tutorialspoint.com/python/python_xml_processing.htm
# example from movie handler

# Per-field accumulators (not referenced by the handler code visible in this notebook).
list_title, list_timestamp, list_username = [], [], []
list_comment, list_page = [], []

# State shared with WikiEditHandler through `global` statements.
i, j = 0, 0  # i: pair id, j: number of revisions seen
iteration = 20000  # a progress message is printed every `iteration` revisions (integer)
init = True  # the first entry is the first element of the first pair
list_dic = []

# Key terms for detecting potential semantic-related edits in the comments.
list_seman = [
    'cohér', 'contradic', 'faux', 'vraise', 'erron',
    'erreur', 'trompe', 'confli', 'sens',
]
# Key terms for detecting potential spelling-related edits in the comments.
list_synta = ['ortho', 'gramma', 'typo', 'frappe']
class WikiEditHandler(xml.sax.ContentHandler):
    """SAX content handler that emits one diff row per ``<revision>`` element.

    The handler talks to the calling cell through module-level globals:
    it reads ``iteration``, ``file`` and ``comparator``, appends row
    dictionaries to ``list_dic`` and updates the counters ``i`` (rows
    emitted), ``j`` (revisions seen) and the ``init`` flag.  ``init`` must
    be True before parsing starts: the first revision met while it is True
    only seeds ``prev_text`` and produces no row.

    Every later revision is compared with the previous revision text via
    ``comparator(prev_text, text)``.  The row stores the revision metadata,
    the changed diff lines and one boolean ``filt_*`` flag per keyword
    family found (case-insensitively) in the edit comment.

    Parsing notes:
      * SAX may deliver the content of one element in several
        ``characters()`` calls, so chunks are buffered and joined when the
        element closes.
      * Per-revision fields are cleared after each revision so that a
        missing element (no ``<parentid>``, no ``<comment>``, hidden
        contributor...) never inherits the value of the previous revision.
      * ``prev_text`` is cleared at ``</page>``; the first revision of a
        following page is therefore diffed against an empty text instead of
        the last revision of another article.
    """

    # (flag attribute / row key, substrings looked up in the lower-cased comment)
    _COMMENT_KEYWORDS = (
        ('filt_coher', ('cohér', 'coher')),
        ('filt_contradic', ('contradic',)),
        ('filt_faux', ('faux',)),
        ('filt_vraise', ('vraise',)),
        ('filt_erron', ('erron',)),
        ('filt_erreur', ('erreur',)),
        ('filt_tromp', ('trompe',)),
        ('filt_confli', ('confli',)),
        ('filt_sens', ('sens',)),
        ('filt_ortho', ('ortho',)),
        ('filt_gramma', ('gramma',)),
        ('filt_typo', ('typo',)),
        ('filt_frappe', ('frappe',)),
        ('filt_revert', ('revert', 'revoc', 'révoc')),
        ('filt_vandalisme', ('vandalisme',)),
        ('filt_bot', ('bot ',)),
    )

    # Elements whose character data is kept; anything else is ignored.
    _CAPTURED_TAGS = frozenset((
        "title", "timestamp", "username", "ip", "comment", "text",
        "id", "parentid", "format", "model",
    ))

    def __init__(self):
        super().__init__()
        print('Start time:', str(datetime.now()))
        self.CurrentData = ""  # name of the element currently being read
        self._chunks = []      # character chunks of the current element
        self.inside_contributor = False

        self.title = ""
        self.modif = ""
        self._reset_revision_state()

        self.prev_title = ""
        self.prev_timestamp = ""
        self.prev_username = ""
        self.prev_comment = ""
        self.prev_text = ""  # needed for texts comparison
        self.prev_modif = ""

        for flag, _ in self._COMMENT_KEYWORDS:
            setattr(self, flag, False)

        self.label_incoherence = False  # by default 0

    def _reset_revision_state(self):
        """Clear every field that belongs to a single revision."""
        self.timestamp = ""
        self.username = ""
        self.registered = True
        self.user_id = ""
        self.comment = ""
        self.text = ""  # needed for texts comparison
        self.id = ""
        self.parentid = ""
        self.format = ""
        self.model = ""
        self.minor = False
        self.modif_str = ""
        self.modif_list1 = []
        self.modif_list2 = []
        self.modif_remove = []
        self.modif_add = []

    # Call when an element starts
    def startElement(self, tag, attributes):
        """Remember the element being read and open a fresh chunk buffer."""
        self.CurrentData = tag
        self._chunks = []
        if tag == "contributor":  # avoid taking id from user, take revision id
            self.inside_contributor = True
        elif tag == "minor":
            # <minor/> is an empty element: its presence alone is the signal.
            self.minor = True

    # Call when a character is read
    def characters(self, content):
        """Buffer one chunk of character data for the current element."""
        if self.CurrentData in self._CAPTURED_TAGS:
            self._chunks.append(content)

    # Call when an elements ends
    def endElement(self, tag):
        """Commit the buffered value, then handle contributor/page/revision ends."""
        if tag == self.CurrentData:
            self._store_field(tag, ''.join(self._chunks))
        # Whitespace found between elements must not be attached to `tag`.
        self.CurrentData = ""
        self._chunks = []

        if tag == "contributor":
            self.inside_contributor = False
        elif tag == "page":
            # Never diff the first revision of a page against another page.
            self.prev_text = ""
        elif tag == "revision":
            self._close_revision()

    def _store_field(self, tag, value):
        """Assign the complete content of a closed element to its attribute."""
        if tag == "text":
            self.text = value  # whitespace is significant in the article text
            return
        # Other fields ignore empty / whitespace-only content.
        if tag not in self._CAPTURED_TAGS or not value or value.isspace():
            return
        if tag == "username":
            self.username = value
            self.registered = True
        elif tag == "ip":
            self.username = value
            self.registered = False
        elif tag == "id":
            if self.inside_contributor:
                self.user_id = value
            else:
                self.id = value
        else:  # title, timestamp, comment, parentid, format, model
            setattr(self, tag, value)

    def _close_revision(self):
        """Update counters and flags, emit the row, roll state to the next pair."""
        global i  # http://eli.thegreenplace.net/2011/05/15/understanding-unboundlocalerror-in-python
        global j
        global init

        if j % iteration == 0:
            print('Number of revisions checked: {:,d}'.format(j))
        j += 1

        # Recompute every flag from the current comment (this also resets it).
        comment_lc = self.comment.lower()
        for flag, needles in self._COMMENT_KEYWORDS:
            setattr(self, flag, any(needle in comment_lc for needle in needles))

        if not init:
            self.modif_list1 = comparator(self.prev_text, self.text).split('\n')

            # split strings from modif between modif_remove and modif_add
            for line in self.modif_list1:
                if line.startswith(("+ ", "- ", "? ")):
                    self.modif_list2.append(line + '\n')
                if line.startswith("- "):
                    self.modif_remove.append(line[2:] + '\n')
                elif line.startswith("+ "):
                    self.modif_add.append(line[2:] + '\n')

            row = {
                'id_file': file,
                'id_modif': i,
                'title': self.title,
                'timestamp': self.timestamp,
                'username': self.username,
                'registered': self.registered,
                'user_id': self.user_id,
                'comment': self.comment,
                'minor': self.minor,
                'modif': ''.join(self.modif_list2),
                'modif_remove': ''.join(self.modif_remove),
                'modif_add': ''.join(self.modif_add),
                'id': self.id,
                'parentid': self.parentid,
                'format': self.format,
                'model': self.model,
            }
            for flag, _ in self._COMMENT_KEYWORDS:
                row[flag] = getattr(self, flag)
            row['label_incoherence'] = self.label_incoherence  # by default 0

            list_dic.append(row)  # in-place: the caller's list object is kept
            i += 1  # increment id for next pair

        init = False  # initialization already done

        # The revision just read becomes the reference of the next pair.
        self.prev_text = self.text
        self.prev_title = self.title
        self.prev_timestamp = self.timestamp
        self.prev_username = self.username
        self.prev_comment = self.comment
        self._reset_revision_state()

#'text xml:space="preserve"'
# https://stackoverflow.com/questions/2405292/how-to-check-if-text-is-empty-spaces-tabs-newlines-in-python

#'text xml:space="preserve"'
# https://stackoverflow.com/questions/2405292/how-to-check-if-text-is-empty-spaces-tabs-newlines-in-python            
            

def main(filename):
    """Parse one XML dump file with a fresh ``WikiEditHandler``.

    Parameters
    ----------
    filename : str
        Path of the XML file to parse.

    The file is opened in binary mode so that the XML parser decodes it
    according to the document itself rather than the locale's default
    encoding, and it is closed as soon as parsing ends or fails.
    """
    with open(filename, 'rb') as source:
        xml.sax.parse(source, WikiEditHandler())

#### BUILDS ONE PAIR OF TEXT VERSIONS FOR EACH ROW OF THE DATAFRAME

CPU times: user 0 ns, sys: 0 ns, total: 0 ns
Wall time: 3.1 µs


In [None]:
# Directory holding the dump chunk files named in list_files; the export loop
# builds each input path as in_path + file name.
in_path = '/media/hdd/salaun/wikiedit/raw_input/'
list_files = ["frwiki-20170801-pages-meta-history1.xml-p3p3661","frwiki-20170801-pages-meta-history1.xml-p3662p10075","frwiki-20170801-pages-meta-history1.xml-p10076p18526","frwiki-20170801-pages-meta-history1.xml-p18527p30361","frwiki-20170801-pages-meta-history1.xml-p30362p43904","frwiki-20170801-pages-meta-history1.xml-p43905p56252","frwiki-20170801-pages-meta-history1.xml-p56253p71552","frwiki-20170801-pages-meta-history1.xml-p71553p89754","frwiki-20170801-pages-meta-history1.xml-p89755p107570","frwiki-20170801-pages-meta-history1.xml-p107571p128187","frwiki-20170801-pages-meta-history1.xml-p128188p150477","frwiki-20170801-pages-meta-history1.xml-p150478p174227","frwiki-20170801-pages-meta-history1.xml-p174228p196344","frwiki-20170801-pages-meta-history1.xml-p196345p222500","frwiki-20170801-pages-meta-history1.xml-p222501p250312","frwiki-20170801-pages-meta-history1.xml-p250313p280868","frwiki-20170801-pages-meta-history1.xml-p280869p307896","frwiki-20170801-pages-meta-history1.xml-p307897p329059","frwiki-20170801-pages-meta-history1.xml-p329060p356265","frwiki-20170801-pages-meta-history1.xml-p356266p389955","frwiki-20170801-pages-meta-history1.xml-p389956p412301","frwiki-20170801-pages-meta-history2.xml-p412304p449997","frwiki-20170801-pages-meta-history2.xml-p449998p485915","frwiki-20170801-pages-meta-history2.xml-p485916p527163","frwiki-20170801-pages-meta-history2.xml-p527164p564312","frwiki-20170801-pages-meta-history2.xml-p564313p612957","frwiki-20170801-pages-meta-history2.xml-p612958p662695","frwiki-20170801-pages-meta-history2.xml-p662696p715915","frwiki-20170801-pages-meta-history2.xml-p715916p770350","frwiki-20170801-pages-meta-history2.xml-p770351p824123","frwiki-20170801-pages-meta-history2.xml-p824124p876600","frwiki-20170801-pages-meta-history2.xml-p876601p926793","frwiki-20170801-pages-meta-history2.xml-p926794p985318","frwiki-20170801-pages-meta-history2.xml-p985319p1045696","frwiki-20170801-pages-meta-history2.xml-p1045697p1113860","frwiki-201
70801-pages-meta-history2.xml-p1113861p1186656","frwiki-20170801-pages-meta-history2.xml-p1186657p1252585","frwiki-20170801-pages-meta-history2.xml-p1252586p1312869","frwiki-20170801-pages-meta-history2.xml-p1312870p1382438","frwiki-20170801-pages-meta-history2.xml-p1382439p1454699","frwiki-20170801-pages-meta-history2.xml-p1454700p1557686","frwiki-20170801-pages-meta-history2.xml-p1557687p1647888","frwiki-20170801-pages-meta-history3.xml-p1647895p1740137","frwiki-20170801-pages-meta-history3.xml-p1740138p1843398","frwiki-20170801-pages-meta-history3.xml-p1843399p1939691","frwiki-20170801-pages-meta-history3.xml-p1939692p2031436","frwiki-20170801-pages-meta-history3.xml-p2031437p2286439","frwiki-20170801-pages-meta-history3.xml-p2286440p2501692","frwiki-20170801-pages-meta-history3.xml-p2501693p2730817","frwiki-20170801-pages-meta-history3.xml-p2730818p2832611","frwiki-20170801-pages-meta-history3.xml-p2832612p2941309","frwiki-20170801-pages-meta-history3.xml-p2941310p3055771","frwiki-20170801-pages-meta-history3.xml-p3055772p3166492","frwiki-20170801-pages-meta-history3.xml-p3166493p3305127","frwiki-20170801-pages-meta-history3.xml-p3305128p3438516","frwiki-20170801-pages-meta-history3.xml-p3438517p3561507","frwiki-20170801-pages-meta-history3.xml-p3561508p3682191","frwiki-20170801-pages-meta-history3.xml-p3682192p3796516","frwiki-20170801-pages-meta-history3.xml-p3796517p3922524","frwiki-20170801-pages-meta-history3.xml-p3922525p4040465","frwiki-20170801-pages-meta-history3.xml-p4040466p4166564","frwiki-20170801-pages-meta-history3.xml-p4166565p4310744","frwiki-20170801-pages-meta-history3.xml-p4310745p4419858","frwiki-20170801-pages-meta-history4.xml-p4419861p4565226","frwiki-20170801-pages-meta-history4.xml-p4565227p4699279","frwiki-20170801-pages-meta-history4.xml-p4699280p4830738","frwiki-20170801-pages-meta-history4.xml-p4830739p4970702","frwiki-20170801-pages-meta-history4.xml-p4970703p5140620","frwiki-20170801-pages-meta-history4.xml-p5140621p5314075","frwi
ki-20170801-pages-meta-history4.xml-p5314076p5483062","frwiki-20170801-pages-meta-history4.xml-p5483063p5659533","frwiki-20170801-pages-meta-history4.xml-p5659534p5847437","frwiki-20170801-pages-meta-history4.xml-p5847438p6039053","frwiki-20170801-pages-meta-history4.xml-p6039054p6222597","frwiki-20170801-pages-meta-history4.xml-p6222598p6400400","frwiki-20170801-pages-meta-history4.xml-p6400401p6590856","frwiki-20170801-pages-meta-history4.xml-p6590857p6819443","frwiki-20170801-pages-meta-history4.xml-p6819444p7038380","frwiki-20170801-pages-meta-history4.xml-p7038381p7289658","frwiki-20170801-pages-meta-history4.xml-p7289659p7574884","frwiki-20170801-pages-meta-history4.xml-p7574885p7876321","frwiki-20170801-pages-meta-history4.xml-p7876322p8213040","frwiki-20170801-pages-meta-history4.xml-p8213041p8618680","frwiki-20170801-pages-meta-history4.xml-p8618681p9029448","frwiki-20170801-pages-meta-history4.xml-p9029449p9413397","frwiki-20170801-pages-meta-history4.xml-p9413398p9773234","frwiki-20170801-pages-meta-history4.xml-p9773235p10164904","frwiki-20170801-pages-meta-history4.xml-p10164905p10637745","frwiki-20170801-pages-meta-history4.xml-p10637746p11019513"]
# make sure all files in list_files are present in the path specified by in_path

# Directory receiving the exported tables: the export loop writes one
# tab-separated file per processed chunk, named '<k><dump file name>.tsv'.
out_path = '/media/hdd/salaun/wikiedit/tsv_output/'

In [None]:
%%time
#%%monitor_cell wiki_xml_auto # https://hyperdash.io/

########################################################################
# AUTOMATIC ITERATOR
# https://dumps.wikimedia.org/frwiki/20170801/

# Columns of the exported table, in output order.  Passing them to the
# DataFrame constructor also gives an empty frame with the right header
# when a chunk produced no row (column selection on an empty frame raises).
output_columns = [
    'id_file', 'id_modif', 'label_incoherence',
    'username', 'user_id', 'registered',
    'format', 'model', 'id', 'parentid',
    'title', 'minor', 'comment', 'modif', 'modif_remove', 'modif_add', 'timestamp',  # 17
    'filt_bot', 'filt_coher', 'filt_confli', 'filt_contradic', 'filt_erreur',  # 16
    'filt_erron', 'filt_faux', 'filt_frappe', 'filt_gramma', 'filt_ortho',
    'filt_revert', 'filt_sens', 'filt_tromp', 'filt_typo', 'filt_vandalisme',
    'filt_vraise',
]

for k in range(66, 69):  # cardinal span 1>89 || ordinal span 0>88
    # it is preferable to process a small amount of files when setting the range
    t0 = time()

    # Reset the module-level state that WikiEditHandler reads and updates.
    i = 0  # nbr id of pair
    j = 0  # check nbr revision
    iteration = 20000
    init = True
    list_dic = []

    file = list_files[k]  # also read by the handler to fill 'id_file'
    filename = in_path + file

    print('Input file {}: '.format(k) + file)
    print('Processing:', filename)
    main(filename)
    print('Total length of list_dic: {:,d}'.format(len(list_dic)))

    # convert output to pandas df with the fixed column order
    df = pd.DataFrame(list_dic, columns=output_columns)

    print('Saving output file {} as: '.format(k) + file)
    df.to_csv(out_path + '{}'.format(k) + file + '.tsv', sep='\t')

    # release the rows of this chunk before the next file is parsed
    del df
    del list_dic

    print('End time:', str(datetime.now()))
    duration = time() - t0
    m, s = divmod(duration, 60)
    h, m = divmod(m, 60)
    print("Running time: %d:%02d:%02d" % (h, m, s), '\n')

66-meta-history4.xml-p4830739p4970702
Processing: /media/hdd/salaun/wikiedit/raw_input/frwiki-20170801-pages-meta-history4.xml-p4830739p4970702
Start time: 2017-10-09 18:01:49.632272
Number of revisions checked: 0
Number of revisions checked: 20,000
Number of revisions checked: 40,000
Number of revisions checked: 60,000
Number of revisions checked: 80,000
Number of revisions checked: 100,000
Number of revisions checked: 120,000
Number of revisions checked: 140,000
Number of revisions checked: 160,000
Number of revisions checked: 180,000
Number of revisions checked: 200,000
Number of revisions checked: 220,000
Number of revisions checked: 240,000
Number of revisions checked: 260,000
Number of revisions checked: 280,000
Number of revisions checked: 300,000
Number of revisions checked: 320,000
Number of revisions checked: 340,000
Number of revisions checked: 360,000
Number of revisions checked: 380,000
Number of revisions checked: 400,000
Number of revisions checked: 420,000
Number of rev

In [40]:
# run this cell to list the notebook's global names with their sizes, largest first

# names excluded from the report (usual ipython objects and this list itself)
ipython_vars = ['In', 'Out', 'exit', 'quit', 'get_ipython', 'ipython_vars']

# (name, sys.getsizeof(object)) pairs for every public global that is neither
# an imported module name nor one of the excluded names above
sorted(
    (
        (name, sys.getsizeof(globals().get(name)))
        for name in dir()
        if not (name.startswith('_') or name in sys.modules or name in ipython_vars)
    ),
    key=lambda pair: pair[1],
    reverse=True,
)

[('list_dic', 23728),
 ('WikiEditHandler', 2000),
 ('ContentHandler', 1056),
 ('InteractiveShell', 1056),
 ('list_files', 776),
 ('comparator', 136),
 ('list_seman', 136),
 ('main', 136),
 ('monitor_cell', 136),
 ('filename', 133),
 ('in_path', 126),
 ('list_synta', 96),
 ('np', 80),
 ('pd', 80),
 ('gmtime', 72),
 ('strftime', 72),
 ('list_comment', 64),
 ('list_page', 64),
 ('list_timestamp', 64),
 ('list_title', 64),
 ('list_username', 64),
 ('d', 56),
 ('file', 56),
 ('file_id', 56),
 ('i', 28),
 ('iteration', 28),
 ('j', 28),
 ('init', 24),
 ('k', 24)]