## Reference 
   - https://github.com/rpryzant/neutralizing-bias/blob/master/harvest/crawl_revision_text.py

In [1]:
"""
Take the output of get_revision_ids.py and download
revisions from wikipedia. Store the outputs as a tsv with
columns
id       prev      next        prev    next
       (modified chunks)    (singleton chunks)
where 
    "modified chunks" = chunks on the diff page where the enclosed text was changed
    "singleton chunks" = chunks on the diff page where the enclosed text only 
                        occurs on the right or left side (implying that the editor
                        simply deleted or added that chunk of text)
"""


import re
import sys
import csv
import operator
import numpy as np
import pythainlp
import string, pickle, os
from nltk.tokenize import regexp_tokenize, wordpunct_tokenize, blankline_tokenize
from tqdm import tqdm
import mwparserfromhell
from bs4 import BeautifulSoup
import urllib
from urllib.request import urlopen



# Path of the input file whose first tab-separated column holds revision ids.
in_file = '../dataset/LAW/revision1.ids'

# Module-level accumulator: one tab-joined output row per crawled revision.
revisions = []

# special characters
# NOTE(review): these five names are not referenced anywhere in the visible
# code; presumably reserved ids for a downstream encoding step -- TODO confirm.
separator = 0
mask_char = 1 
unknown   = 2
to_TBD    = 3
offset    = 4

# colors
class bcolors:
    """Namespace of ANSI escape sequences used to style terminal output."""

    # color sequences
    HEADER = "\033[95m"
    OKBLUE = "\033[94m"
    OKGREEN = "\033[92m"
    WARNING = "\033[93m"
    FAIL = "\033[91m"

    # text attribute sequences
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"

    # reset sequence, emitted after a styled span
    ENDC = "\033[0m"


def print_withcolor(idx, l):
    """Print one diff line with a ``line N:`` header and highlighted edits.

    The text captured inside ``<ins ...>...</ins>`` is wrapped in the green
    escape sequence and the text inside ``<del ...>...</del>`` in the blue
    one; the tags themselves stay in the output. Newlines are removed from
    the line before matching.

    Args:
        idx: Zero-based index of the line; it is printed one-based.
        l: The diff markup to print.
    """
    text = l.replace('\n', '')

    # (pattern, color) pairs, applied in order: insertions first, then deletions.
    highlights = (
        (re.compile(r'<ins.*?>(.*?)</ins>', re.DOTALL), bcolors.OKGREEN),
        (re.compile(r'<del.*?>(.*?)</del>', re.DOTALL), bcolors.OKBLUE),
    )

    for pattern, color in highlights:
        pieces = []
        cursor = 0
        for found in pattern.finditer(text):
            # Copy everything up to the captured text unchanged, then emit the
            # capture wrapped in the color and the reset sequence.
            pieces.append(text[cursor:found.start(1)])
            pieces.append(color + found.group(1) + bcolors.ENDC)
            cursor = found.end(1)
        pieces.append(text[cursor:])
        text = ''.join(pieces)

    header = bcolors.HEADER + 'line ' + str(idx + 1) + ':' + bcolors.ENDC
    print(header + text)





def html2diff(html):
    """Extract changed, deleted and added text chunks from a diff page.

    All elements whose class matches ``diff-deletedline``, ``diff-addedline``
    or ``diff-empty`` are collected and consumed in consecutive pairs
    (first = previous side, second = next side). A trailing unpaired element
    is ignored.

    Args:
        html: Markup of the diff page, as accepted by ``BeautifulSoup``.

    Returns:
        A 4-tuple of lists of strings
        ``(prev_changed, next_changed, prev_deleted, next_added)``:
        pairs where both sides carry a ``div`` feed the two "changed" lists,
        pairs where only one side carries a ``div`` feed "deleted"/"added".
    """
    soup = BeautifulSoup(html, 'html.parser')
    cells = soup.find_all(class_=re.compile(r'(diff-deletedline)|(diff-addedline)|(diff-empty)'))
    div_p = re.compile(r'<div.*?>(.*)</div>', re.DOTALL)

    def inner_markup(cell):
        # Inner markup of the cell's <div>, stripped; None if the regex misses.
        found = div_p.match(cell.div.prettify(formatter=None))
        return found.group(1).strip() if found else None

    prev_changed, next_changed = [], []
    prev_deleted, next_added = [], []

    # Walk the cells two at a time; zip drops a straddling last element.
    for node_prev, node_next in zip(cells[0::2], cells[1::2]):
        prev_missing = not node_prev.div
        next_missing = not node_next.div

        # Separate revisions into chunks that were modified, chunks that
        # were purely deleted and chunks that were purely added.
        if prev_missing and next_missing:
            continue

        if prev_missing:
            added = inner_markup(node_next)
            if added is not None:
                next_added.append(added)
            continue

        if next_missing:
            removed = inner_markup(node_prev)
            if removed is not None:
                prev_deleted.append(removed)
            continue

        before = inner_markup(node_prev)
        after = inner_markup(node_next)
        if before is not None and after is not None:
            prev_changed.append(before)
            next_changed.append(after)

    return prev_changed, next_changed, prev_deleted, next_added


def url2diff(url):
    """Download a diff page and split it into diff chunks.

    Args:
        url: Address of the diff page to fetch.

    Returns:
        The 4-tuple produced by ``html2diff`` for the downloaded page. If
        anything goes wrong while fetching or parsing, the exception is
        printed to stderr and four empty lists are returned instead
        (best-effort: the caller keeps going with the next revision).
    """
    try:
        # Use the response as a context manager so the connection is closed
        # as soon as the body has been read, even if read() raises.
        with urlopen(url) as response:
            html = response.read()
        return html2diff(html)
    except Exception as e:
        print(e, file=sys.stderr)
        return [], [], [], []


def wiki_text_clean(text):
    """Drop unsupported characters and flatten whitespace in a text chunk.

    Keeps only characters that are in ``string.printable`` or in
    ``pythainlp.thai_characters``, then replaces every newline and tab with
    a single space so the result is safe to store in one TSV field.

    Args:
        text: The raw chunk of text.

    Returns:
        The filtered, single-line string.
    """
    # The membership test must be repeated on the right-hand side of `or`:
    # a bare `pythainlp.thai_characters` is a non-empty string and therefore
    # always truthy, which would make the filter keep every character.
    kept = ''.join(
        ch for ch in text
        if ch in string.printable or ch in pythainlp.thai_characters
    )
    return kept.replace('\n', ' ').replace('\t', ' ')

def gen_revisions(rev_ids):
    """Yield cleaned diff chunks for each revision id that has changed text.

    For every id the Thai Wikipedia diff page is fetched through
    ``url2diff`` and each chunk is passed through ``wiki_text_clean``.
    Progress and a final failure count are written to stderr.

    Args:
        rev_ids: Sized iterable of revision ids.

    Yields:
        ``(rev_id, prevs, nexts, prevs_deleted, nexts_added)`` for each
        revision with at least one modified chunk. The deleted/added lists
        hold the placeholder ``'no_deleted_chunks'`` / ``'no_added_chunks'``
        when the page had none.
    """
    total = len(rev_ids)
    succeeded = 0
    out = {}

    for rev_id in tqdm(rev_ids):
        print('processing revision id = ' + str(rev_id), file=sys.stderr)

        diff_url = 'https://th.wikipedia.org/wiki/?diff=' + str(rev_id)
        raw_prevs, raw_nexts, raw_deleted, raw_added = url2diff(diff_url)

        if len(raw_prevs) != len(raw_nexts):
            print('ERROR: corpus sizes not equal!', file=sys.stderr)
            continue

        prevs = [wiki_text_clean(chunk) for chunk in raw_prevs]
        nexts = [wiki_text_clean(chunk) for chunk in raw_nexts]

        # Fall back to a placeholder so the output column is never empty.
        deleted_source = raw_deleted if raw_deleted else ['no_deleted_chunks']
        added_source = raw_added if raw_added else ['no_added_chunks']
        prevs_deleted = [wiki_text_clean(chunk) for chunk in deleted_source]
        nexts_added = [wiki_text_clean(chunk) for chunk in added_source]

        # Revisions without any modified chunk are counted as failures.
        if not prevs or not nexts:
            continue

        print('...success!', file=sys.stderr)
        succeeded += 1
        yield rev_id, prevs, nexts, prevs_deleted, nexts_added

    print('failures: ', total - succeeded, file=sys.stderr)

    # In a generator this value only surfaces as StopIteration.value.
    return out


def go(filename):
    """Crawl every revision id listed in *filename* and collect output rows.

    The first tab-separated field of each line is taken as a revision id.
    For every revision yielded by ``gen_revisions`` one tab-joined row is
    appended to the module-level ``revisions`` list, with the chunks of each
    column joined by ``<EDIT-DELIM>``.

    Args:
        filename: Path of the id file (read as UTF-8, a BOM is tolerated).
    """
    with open(filename, 'r', encoding='utf-8-sig') as f:
        # Strip each id so a line without a tab does not carry its trailing
        # newline into the URL and the output row, and skip blank lines so
        # no request is made for an empty id.
        candidates = (line.split('\t')[0].strip() for line in f)
        rev_ids = [rev_id for rev_id in candidates if rev_id]

    for rev_id, prevs, nexts, prev_deleted, next_added in gen_revisions(rev_ids):
        revisions.append('\t'.join([
            rev_id,
            '<EDIT-DELIM>'.join(prevs),
            '<EDIT-DELIM>'.join(nexts),
            '<EDIT-DELIM>'.join(prev_deleted),
            '<EDIT-DELIM>'.join(next_added),
        ]))


# Entry point: crawl the revisions listed in `in_file` when run as a script.
if __name__ == '__main__':
    go(in_file)

  0%|                                                                                          | 0/583 [00:00<?, ?it/s]processing revision id = 536714
  0%|▏                                                                                 | 1/583 [00:01<13:23,  1.38s/it]processing revision id = 1971517
  0%|▎                                                                                 | 2/583 [00:04<24:42,  2.55s/it]processing revision id = 99335
  1%|▍                                                                                 | 3/583 [00:06<18:58,  1.96s/it]processing revision id = 6708795
...success!
  1%|▌                                                                                 | 4/583 [00:11<31:03,  3.22s/it]processing revision id = 90748
  1%|▋                                                                                 | 5/583 [00:12<24:10,  2.51s/it]processing revision id = 566627
...success!
  1%|▊                                                                

 18%|██████████████▎                                                                 | 104/583 [05:07<16:08,  2.02s/it]processing revision id = 87846
 18%|██████████████▍                                                                 | 105/583 [05:08<13:31,  1.70s/it]processing revision id = 102015
 18%|██████████████▌                                                                 | 106/583 [05:09<12:19,  1.55s/it]processing revision id = 102016
 18%|██████████████▋                                                                 | 107/583 [05:10<10:52,  1.37s/it]processing revision id = 102018
...success!
 19%|██████████████▊                                                                 | 108/583 [05:11<09:48,  1.24s/it]processing revision id = 136030
 19%|██████████████▉                                                                 | 109/583 [05:12<09:06,  1.15s/it]processing revision id = 345652
 19%|███████████████                                                               

...success!
 27%|█████████████████████▌                                                          | 157/583 [07:06<09:01,  1.27s/it]processing revision id = 240002
...success!
 27%|█████████████████████▋                                                          | 158/583 [07:08<09:09,  1.29s/it]processing revision id = 5163892
 27%|█████████████████████▊                                                          | 159/583 [07:25<42:44,  6.05s/it]processing revision id = 9688407
...success!
 27%|█████████████████████▉                                                          | 160/583 [07:29<38:26,  5.45s/it]processing revision id = 150170
...success!
 28%|██████████████████████                                                          | 161/583 [07:30<29:54,  4.25s/it]processing revision id = 300943
 28%|██████████████████████▏                                                         | 162/583 [07:32<23:39,  3.37s/it]processing revision id = 838926
 28%|██████████████████████▎                

 45%|███████████████████████████████████▊                                            | 261/583 [12:10<07:56,  1.48s/it]processing revision id = 2135590
...success!
 45%|███████████████████████████████████▉                                            | 262/583 [12:12<08:46,  1.64s/it]processing revision id = 2142402
 45%|████████████████████████████████████                                            | 263/583 [12:13<08:19,  1.56s/it]processing revision id = 2142436
 45%|████████████████████████████████████▏                                           | 264/583 [12:15<09:06,  1.71s/it]processing revision id = 870568
 45%|████████████████████████████████████▎                                           | 265/583 [12:17<08:21,  1.58s/it]processing revision id = 4382683
...success!
 46%|████████████████████████████████████▌                                           | 266/583 [12:18<08:44,  1.65s/it]processing revision id = 4359314
...success!
 46%|████████████████████████████████████▋           

 63%|██████████████████████████████████████████████████▏                             | 366/583 [15:51<04:36,  1.27s/it]processing revision id = 3342317
...success!
 63%|██████████████████████████████████████████████████▎                             | 367/583 [15:54<06:38,  1.84s/it]processing revision id = 3124152
...success!
 63%|██████████████████████████████████████████████████▍                             | 368/583 [15:56<06:52,  1.92s/it]processing revision id = 2573984
...success!
 63%|██████████████████████████████████████████████████▋                             | 369/583 [15:58<06:16,  1.76s/it]processing revision id = 2216863
 63%|██████████████████████████████████████████████████▊                             | 370/583 [16:00<06:43,  1.90s/it]processing revision id = 1280807
...success!
 64%|██████████████████████████████████████████████████▉                             | 371/583 [16:01<05:46,  1.63s/it]processing revision id = 185072
 64%|████████████████████████████████████

 81%|████████████████████████████████████████████████████████████████▍               | 470/583 [21:56<06:25,  3.41s/it]processing revision id = 3390255
 81%|████████████████████████████████████████████████████████████████▋               | 471/583 [21:58<05:28,  2.93s/it]processing revision id = 3448236
 81%|████████████████████████████████████████████████████████████████▊               | 472/583 [21:59<04:16,  2.31s/it]processing revision id = 3450346
 81%|████████████████████████████████████████████████████████████████▉               | 473/583 [22:00<03:35,  1.96s/it]processing revision id = 393276
 81%|█████████████████████████████████████████████████████████████████               | 474/583 [22:02<03:46,  2.08s/it]processing revision id = 4606103
...success!
 81%|█████████████████████████████████████████████████████████████████▏              | 475/583 [22:04<03:29,  1.94s/it]processing revision id = 3485566
 82%|█████████████████████████████████████████████████████████████████▎      

...success!
 90%|███████████████████████████████████████████████████████████████████████▊        | 523/583 [23:37<01:58,  1.97s/it]processing revision id = 710592
...success!
 90%|███████████████████████████████████████████████████████████████████████▉        | 524/583 [23:39<01:39,  1.69s/it]processing revision id = 710658
...success!
 90%|████████████████████████████████████████████████████████████████████████        | 525/583 [23:40<01:38,  1.70s/it]processing revision id = 715327
...success!
 90%|████████████████████████████████████████████████████████████████████████▏       | 526/583 [23:42<01:31,  1.61s/it]processing revision id = 722713
...success!
 90%|████████████████████████████████████████████████████████████████████████▎       | 527/583 [23:43<01:25,  1.54s/it]processing revision id = 722715
...success!
 91%|████████████████████████████████████████████████████████████████████████▍       | 528/583 [23:44<01:18,  1.44s/it]processing revision id = 722716
...success!
 91%|█████

In [2]:
# Persist the collected rows, one per line, with no trailing newline.
with open('../dataset/LAW/revision1.text', "w", encoding="utf-8-sig") as f:
    # Join once and write once instead of writing each row character by
    # character. Skip the write when there are no rows, as the original loop
    # issued no write calls at all in that case.
    if revisions:
        f.write("\n".join(revisions))