# Prepare a dataset of IT-related Wikipedia articles with Stanford CoreNLP annotations.


<div class="alert alert-block alert-info">

<ol>
  <li>Load IT-related articles from the wikipedia dump;</li>
    <ol>
      <li>Make the list of IT categories;</li>
      <li>Collect the corresponding pages.</li>
    </ol>
  <li>Process them with corenlp;</li>
    <ol>
      <li>Run the container: <code>docker run --rm -ti -p 9000:9000 -d tchewik/corenlp</code>;</li>
      <li>Process the collected file;</li>
      <li>Save the result into chunks;</li>
      <li>(Optional) filter the triplets by named entities occurrence.</li>
    </ol>
</ol>

</div>


## Load IT-related articles from the wikipedia dump
### Make the list of useful categories

In [None]:
import requests

# Root categories whose subcategories are requested from PetScan.
base_categories = ['Programming_languages', 'Computer_science', 'Information_technology',
                   'Algorithms', 'Formal_systems', 'Areas_of_computer_science', 'Software_development']

# Query-string parameters of the PetScan request.
params = {
    'categories': base_categories,  # process the categories separately if it doesn't respond!
    'depth': 2,
    'negcats': ['Information_technology_by_country'],
    'ns[14]': 1,  # namespace=14 is for categories, 0 for pages
    'language': 'en',
    'project': 'wikipedia',
    'format': 'json',
    'doit': 'Do it!'}

r = requests.get('https://petscan.wmflabs.org/', params=params)
# Fail with the HTTP status instead of an opaque JSON decoding error when
# the service rejects the query or is unavailable.
r.raise_for_status()
data = r.json()

In [None]:
# Substrings that should not appear in the names. They are compared against
# lower-cased titles, so the entries in the file should be lower-case.
with open('stopwords.filter', 'r') as f:
    # Skip blank lines: an empty string is a substring of every title and
    # would make the filters below reject everything.
    stopwords = [word.strip() for word in f if word.strip()]

In [None]:
categories = []


def process_title(title):
    """Return ``title`` if it contains no stopword, otherwise ``None``.

    The comparison is done on the lower-cased title against every entry of
    the module-level ``stopwords`` list (substring match).

    Args:
        title: Category title, or ``None`` when the item carries no title.

    Returns:
        The unchanged ``title`` when it passes the filter; ``None`` when it
        contains a stopword or when ``title`` itself is ``None``.
    """
    if title is None:
        return None
    lowered = title.lower()  # lower-case once instead of once per stopword
    if any(word in lowered for word in stopwords):
        return None
    return title

# Run every title of the PetScan result through the stopword filter and keep
# the ones that survive, in alphabetical order.
checked_titles = (process_title(entry.get('title')) for entry in data['*'][0]['a']['*'])
categories.extend(name for name in checked_titles if name)

categories.sort()

In [None]:
import os

# Write the category names to disk, one per line, in sorted order.
os.makedirs('data', exist_ok=True)  # the target directory may not exist yet
with open('data/categories.filter', 'w') as f:
    categories.sort()
    f.writelines(category + '\n' for category in categories)

(Sigh... if unwanted categories slipped through, update the stopword list and repeat the steps above until the list looks clean.)

In [None]:
# Drop the PetScan response and the in-memory category list; the categories
# have already been written to disk.
del data, categories

### Collect the corresponding pages into ``it_wiki_articles.json``

In [None]:
%%bash -s 'enwiki-latest-pages-articles.xml.bz2'

# $1 is the dump file name given on the cell magic line above.
# Download the dump and fetch the extractor.
wget "http://download.wikimedia.org/enwiki/latest/$1"
git clone https://github.com/attardi/wikiextractor.git

# Extract the pages as compressed JSON chunks into extracted_2782.
# The category filter is read from data/categories.filter, the file written
# by the category-collection step of this notebook.
python wikiextractor/WikiExtractor.py "$1" \
       --json \
       --processes 2 \
       --output extracted_2782 \
       --bytes 1M \
       --compress \
       --filter_category data/categories.filter \
       --links \
       --sections \
       --lists \
       --keep_tables \
       --min_text_length 0 \
       --filter_disambig_pages

# Decompress every chunk to stdout (keeping the .bz2 files) and concatenate
# the output into one file.
find extracted_2782 -name '*bz2' -exec bzip2 -dkc {} \; > it_wiki_articles

In [None]:
import json

titles = []

# Turn the extractor output (one JSON object per line) into a single JSON
# array. Each line is parsed on its own, so blank lines, a missing final
# newline or an empty input file cannot produce invalid JSON.
file = []
with open('it_wiki_articles', 'r') as f:
    for line in f:
        line = line.strip()
        if line:
            file.append(json.loads(line))

with open('it_wiki_articles.json', 'w') as fo:
    json.dump(file, fo)

#### Filter titles

In [None]:
# Keep only the articles whose lower-cased title contains no stopword.
result = []

for line in file:
    title = line['title'].lower()  # lower-case once per article
    # any() stops at the first matching stopword.
    if not any(word in title for word in stopwords):
        result.append(line)

print(len(result), 'articles were loaded.')
# Close the file explicitly so the data is flushed before later cells read it.
with open('it_wiki_articles.json', 'w') as f:
    json.dump(result, f)

In [None]:
# Drop the in-memory article lists (the filtered list has been saved to
# it_wiki_articles.json) and delete the cloned extractor repository.
del file, result
! rm -r wikiextractor

## Process the articles using corenlp 

In [None]:
%%bash

# Upgrade pip itself, then install the pycorenlp client imported in the next cell.
pip install -U pip
pip install pycorenlp

In [None]:
from pycorenlp import StanfordCoreNLP

# URL of the running CoreNLP server. The container described at the top of the
# notebook publishes port 9000; change host/port here if yours differs.
# NOTE(review): the scheme is included on the assumption that pycorenlp passes
# this string straight to the HTTP client — confirm against the pycorenlp docs.
container = 'http://localhost:9000'
nlp = StanfordCoreNLP(container)
# Properties sent with every annotation request; each annotator is listed once.
nlp_properties = {
  'annotators': 'tokenize,ssplit,pos,depparse,natlog,openie,ner',
  'outputFormat': 'json'
}

In [None]:
import pandas as pd

# Load the collected articles into a DataFrame and order the rows by page id.
file = pd.read_json('it_wiki_articles.json')
file = file.sort_values('id')

In [None]:
from html.parser import HTMLParser
import re

class MLStripper(HTMLParser):
    """HTML parser that discards the markup and collects only the text."""

    def __init__(self):
        # Let the base class set up its own state instead of calling only
        # reset(), which relies on HTMLParser internals staying unchanged.
        super().__init__(convert_charrefs=True)
        self.strict = False
        self.fed = []  # text fragments in the order they were parsed

    def handle_data(self, d):
        """Store one chunk of text found outside of tags."""
        self.fed.append(d)

    def get_data(self):
        """Return all collected text joined into one string."""
        return ''.join(self.fed)

def strip_tags(html):
    """Return the text of ``html`` without tags and extractor markers.

    After the tags are removed, double newlines are collapsed once, the
    ``Section::::`` / ``BULLET::::`` / ``(Archive)`` markers are deleted and
    leftover ``style="..."`` fragments are removed.

    Args:
        html: Article text, possibly containing HTML markup.

    Returns:
        The cleaned text as a string.
    """
    s = MLStripper()
    s.feed(html)
    # Flush the parser: text after the last tag that ends with an unterminated
    # '&' (e.g. "R&D") stays buffered until close() is called.
    s.close()
    data = s.get_data().replace('\n\n', '\n')\
                        .replace('Section::::', '')\
                        .replace('BULLET::::', '')\
                        .replace('(Archive)', '')
    # Match a single attribute value only; a greedy ".*" would delete
    # everything up to the last quote on the line.
    data = re.sub(r'style="[^"\n]*"', '', data)
    return data

# Clean the text of every article.
file['text'] = file['text'].map(strip_tags)
# Keep only the articles whose cleaned text is shorter than 60000 characters.
is_short = file['text'].map(len) < 60000
file = file[is_short]
# Overwrite the articles file with one list of values per row.
file.to_json('it_wiki_articles.json', orient='values')
del file

In [None]:
import json
from tqdm import tqdm_notebook as tqdm

import os

with open('it_wiki_articles.json', 'r') as f:
    file = json.load(f)
broken_ids = []
# Unlike `mkdir`, this does not fail when the cell is run a second time.
os.makedirs('corenlp_annotations', exist_ok=True)

# Ids of the pages to skip. Keep a list defined in an earlier cell; otherwise
# skip nothing (the name was previously undefined and raised a NameError).
try:
    remove_ids
except NameError:
    remove_ids = []

new_file = []  # annotations collected for the current chunk
filename_base = 'corenlp_annotations/it_wiki'
counter_0, counter_1 = 1, 0  # annotated pages so far (1-based) / index of the next chunk file
save_every = 300  # number of annotated pages per chunk file

for line in tqdm(file):
    page_id, text, name, url = line
    if page_id in remove_ids:
        continue

    result = nlp.annotate(strip_tags(text), properties=nlp_properties)
    if isinstance(result, str):
        broken_ids.append(page_id)  # in case of an error, corenlp returns a string
        continue

    result['id'] = page_id
    result['text'] = text
    new_file.append(result)

    if not counter_0 % save_every:
        with open(f'{filename_base}_part_{counter_1}.json', 'w') as f:
            json.dump(new_file, f)
        counter_1 += 1
        new_file = []

    counter_0 += 1

# Save the remaining pages. An empty chunk is not written: a file holding
# "[]" has no 'sentences' column when it is loaded for filtering later.
if new_file:
    with open(filename_base + f'_part_{counter_1}.json', 'w') as f:
        json.dump(new_file, f)

#### Analysis of the pages whose annotation caused errors:

In [None]:
# Ids of the pages for which corenlp returned an error string instead of an annotation.
broken_ids

In [None]:
# Re-annotate the first page whose annotation failed. `file` is the list of
# [id, text, name, url] rows loaded from it_wiki_articles.json, so the text is
# looked up by scanning the rows rather than by DataFrame indexing.
broken_id = broken_ids[0]
txt = next(row[1] for row in file if row[0] == broken_id)
# txt = ...
result = nlp.annotate(txt, properties=nlp_properties)
if isinstance(result, str):
    # corenlp returns a plain string on failure; fix `txt` by hand and rerun.
    raise RuntimeError(f'corenlp failed again for page {broken_id}: {result}')
result['id'] = broken_id
result['text'] = txt
new_file.append(result)
# then save new_file somewhere

In [None]:
# Release the loaded pages and the last in-memory chunk of annotations.
del file, new_file

### Filter the triplets by named entities occurrence

In [None]:
counter = 0


def filter_ner(sentence):
    """Keep only the entities and triplets of a sentence that refer to each other.

    A triplet is kept when its subject or object equals the text of an entity
    mention; an entity mention is kept when its text is the subject or object
    of at least one triplet. Every kept item appears exactly once, in its
    original order, even if it takes part in several entity/triplet matches.

    Side effect: the module-level ``counter`` is increased by the number of
    kept triplets.

    Args:
        sentence: Mapping with the keys ``'entitymentions'`` (items having a
            ``'text'`` key) and ``'openie'`` (items having ``'subject'`` and
            ``'object'`` keys).

    Returns:
        A tuple ``(entitymentions, openie)`` with the kept items.
    """
    global counter

    triplets = sentence['openie']
    entitymentions = [
        entity for entity in sentence['entitymentions']
        if any(entity['text'] in (triplet['subject'], triplet['object'])
               for triplet in triplets)
    ]
    # Every triplet that matches some entity matches a kept one, so the
    # texts of the kept entities are sufficient for the lookup.
    entity_texts = {entity['text'] for entity in entitymentions}
    openie = [
        triplet for triplet in triplets
        if triplet['subject'] in entity_texts or triplet['object'] in entity_texts
    ]

    counter += len(openie)
    return entitymentions, openie

def process_page(page):
    """Filter every sentence of a page and drop the ones left without entities.

    Each sentence is updated in place: its ``'entitymentions'`` and
    ``'openie'`` entries are replaced by the output of ``filter_ner``.

    Args:
        page: Iterable of sentence mappings.

    Returns:
        List of the (mutated) sentences that still have entity mentions.
    """
    kept = []
    for sent in page:
        mentions, triplets = filter_ner(sent)
        sent['entitymentions'] = mentions
        sent['openie'] = triplets
        if mentions:
            kept.append(sent)
    return kept

In [None]:
from glob import glob
from tqdm.autonotebook import tqdm
import pandas as pd

# Load every annotation chunk, filter the sentences of each page and gather
# the chunks into one DataFrame ordered by page id.
annotation_paths = glob('corenlp_annotations/*.json')
result = []

for file in tqdm(annotation_paths):
    tmp = pd.read_json(file)
    tmp['sentences'] = tmp['sentences'].map(process_page)
    result.append(tmp)

print(counter, 'triplets were extracted applying NER filtering')
result = pd.concat(result)
result = result.sort_values('id')

In [None]:
import numpy as np

import os

directory = 'filtered_annotations'
# Unlike `mkdir`, this does not fail when the cell is run a second time.
os.makedirs(directory, exist_ok=True)

# Split the full filtered result into 4 parts. The split used to be taken
# from `tmp`, which holds only the last chunk that was loaded.
# Row positions are split instead of the DataFrame itself, so the pieces are
# plain .iloc selections.
chunks = [result.iloc[rows] for rows in np.array_split(np.arange(len(result)), 4)]
for i, chunk in enumerate(chunks):
    chunk.to_json(f'{directory}/it_wiki_annots_filtered_part_{i}.json', orient='records')

# Short note describing the contents of the directory.
with open(os.path.join(directory, 'readme.txt'), 'w') as readme:
    readme.write(f'contains only triplets with named entities ({counter} triplets)\n')