# Prepare dataset of wiki IT-related articles with stanford corenlp annotation.


<div class="alert alert-block alert-info">

<ol>
  <li>Load IT-related articles from the wikipedia dump;</li>
    <ol>
      <li>Make the list of IT categories;</li>
      <li>Collect the corresponding pages.</li>
    </ol>
  <li>Process them with corenlp;</li>
    <ol>
      <li>Run the container: <code>docker run --restart=unless-stopped -ti -p 9000:9000 -d tchewik/corenlp</code>;</li>
      <li>Process the collected file;</li>
      <li>Save the result into chunks;</li>
      <li>(Optional) filter the triplets by named entities occurrence.</li>
    </ol>
</ol>


## Load IT-related articles from the wikipedia dump
### Make the list of useful categories

#### Make separate lists at first 

In [None]:
import requests

base_categories = ['Programming_languages', 'Computer_science', 'Information_technology',
                   'Algorithms', 'Formal_systems', 'Areas_of_computer_science', 
                   'Software_development', 'Software_engineering', 'Windows_software', 
                   'Linux', 'MacOS', 'Data_structures', 'Data_analysis', 'Big_data', 'Machine_learning', 
                   'Google', 'Microsoft', 'IBM', 'Silicon_Valley',
                   'Natural_language_processing', 'Computational_linguistics', 'Parsing']

params = {
    'categories': base_categories[-1],  # process the categories separately
    'depth': 3,
    'ns[14]': 1,  # namespace=14 is for categories, 0 for pages
    'language': 'en',
    'project': 'wikipedia',
    'format': 'json',
    'doit': 'Do it!'}

r = requests.get('https://petscan.wmflabs.org/', params=params)
data = r.json()

Number of subcategories 

In [None]:
len(data['*'][0]['a']['*'])

In [None]:
stopwords = [word.strip() for word in open('stopwords.filter', 'r').readlines()] # should not appear in the names

In [None]:
categories = []

def process_title(title):
    for word in stopwords:
        if word in title.lower():
            return None
    return title

for item in data['*'][0]['a']['*']:
    title = process_title(item.get('title'))
    if title:
        categories.append(title)
        
categories.sort()

Number of subcategories after filtering

In [None]:
len(categories)

In [None]:
categories

In [None]:
with open('parsing.categories', 'w') as f:
    categories.sort()
    for category in categories:
        f.write(category + '\n')

#### Explore the contribution of each base category in the final list 

In [None]:
with open('areas_cs.categories', 'r') as f:
    all_the_categories = [line.strip() for line in f.readlines()]
    
print(len(all_the_categories))

In [None]:
with open('comp_science.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('it.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('engineering.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('dev.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('algorithms.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('comptat_science.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('languages.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('formal_systems.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('winsoft.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('linux.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('macos.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('ai.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('da.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('ml.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('google.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('microsoft.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('ibm.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('silicon_valley.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('nlp.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('cl.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
with open('parsing.categories', 'r') as f:
    temp = [line.strip() for line in f.readlines()]

print('contribution:', len(list(set(all_the_categories + temp))) - len(all_the_categories))
all_the_categories = list(set(all_the_categories + temp))

In [None]:
len(all_the_categories)

In [None]:
with open('categories.filter', 'w') as f:
    all_the_categories.sort()
    for category in all_the_categories:
        f.write(category + '\n')

(Sigh, update the stopword list and repeat infinitely)

In [None]:
del data, categories

### Collect the corresponding pages into ``it_wiki_articles.json``

In [None]:
%%bash -s 'enwiki-latest-pages-articles.xml.bz2'

#wget http://download.wikimedia.org/enwiki/latest/$1
git clone https://github.com/attardi/wikiextractor.git

python wikiextractor/WikiExtractor.py $1 \
       --json \
       --processes 4 \
       --output extracted \
       --bytes 4M \
       --compress \
       --filter_category categories.filter \
       --min_text_length 0
       
find extracted -name '*bz2' -exec bzip2 -dkc {} \; > it_wiki_articles

In [None]:
import json

titles = []

with open('it_wiki_articles.json', 'w') as fo:
    fo.write('[')
    with open('it_wiki_articles', 'r') as f:
        file = f.readlines()
        for i, line in enumerate(file):
            if i < len(file) - 1:
                fo.write(line[:-1] + ',')
            else:
                fo.write(line + ']')

In [None]:
import pandas as pd

file = pd.read_json('it_wiki_articles.json').sort_values('id')

In [None]:
from html.parser import HTMLParser
import re

class MLStripper(HTMLParser):
    def __init__(self):
        self.reset()
        self.strict = False
        self.convert_charrefs= True
        self.fed = []
    def handle_data(self, d):
        self.fed.append(d)
    def get_data(self):
        return ''.join(self.fed)

def strip_tags(html):
    s = MLStripper()
    s.feed(html)
    data = s.get_data().replace('\n\n', '\n')\
                        .replace('Section::::', '')\
                        .replace('BULLET::::', '')\
                        .replace('(Archive)', '')\
                        .replace('( )', '')
    data = re.sub(r'style=".*"', '', data)
    return data.strip()

file.text = file.text.map(strip_tags)
#file = file[file.text.map(len) < 60000]
file.to_json('it_wiki_articles.json', orient='values')

In [None]:
file = file[file.text.map(len) < 84000]

In [None]:
len(file)

#### Filter titles

In [None]:
file

In [None]:
for index, row in file.head(n=2).iterrows():
     print(index, row)

In [None]:
result = []

for index, line in file.iterrows():
    flag = False
    for word in stopwords:
        if word in line['title'].lower():
            flag = True
            continue
    if not flag:
        result.append(line.values.tolist())
        
print(len(result), 'articles were loaded.')
#

In [None]:
json.dump(result, open('it_wiki_articles.json', 'w'))

In [None]:
del file, result
! rm -r wikiextractor

## Process the articles using corenlp 

In [None]:
%%bash

pip install -U pip pycorenlp

In [None]:
from pycorenlp import StanfordCoreNLP

container = 'http://' + hostname + ':9000'
nlp = StanfordCoreNLP(container)
nlp_properties = {
  'annotators': 'tokenize,ssplit,tokenize,ssplit,pos,depparse,natlog,openie,ner',
  'outputFormat': 'json'
}

In [None]:
broken_ids = []

In [None]:
import json
from tqdm import tqdm_notebook as tqdm

file = json.load(open('it_wiki_articles.json', 'r'))

! mkdir corenlp_annotations

new_file = []
filename_base = 'corenlp_annotations/it_wiki'
counter_0, counter_1 = 0, 0
save_every = 100

def remove_title(text, title):
    return text[len(title)+1:]

for line in tqdm(file):
    id, text, name, url = line
    if id not in broken_ids:
        try:
            result = nlp.annotate(remove_title(text, name), properties=nlp_properties)
        except Exception:
            import time
            time.sleep(10)  # wait until container surely restarts after OOM
            result = nlp.annotate(remove_title(text, name), properties=nlp_properties)
        
        if type(result) == str:
            broken_ids.append(id)  # in case of an internal error, corenlp returns a string
            continue
            
        result['id'] = id
        new_file.append(result)

        if not counter_0 % save_every:
            json.dump(new_file, open(f'{filename_base}_part_{counter_1}.json', 'w'))
            counter_1 += 1
            new_file = []

        counter_0 += 1

json.dump(new_file, open(filename_base + f'_part_{counter_1}.json', 'w'))

mkdir: cannot create directory ‘corenlp_annotations’: File exists


HBox(children=(IntProgress(value=0, max=9033), HTML(value='')))

#### Analysis of pages, annotation of which caused errors:

In [649]:
broken_ids

[]

In [None]:
txt = file[file[0] == remove_ids[0]][1].values[0]
# txt = ...
result = nlp.annotate(txt, properties=nlp_properties)
result['id'] = remove_ids[0]
result['text'] = txt
new_file.append(result)
# then save new_file somewhere

In [None]:
del file, new_file

### Filter the triplets by named entities occurrence & length

In [672]:
FILTER_BY_LENGTH = 3

In [677]:
counter = 0

def filter_ner(sentence):
    openie = []
    global counter
    
    for triplet in sentence['openie']:
        for entity in sentence['entitymentions']:
            if entity['text'] in [triplet['subject'], triplet['object']]:
                openie.append(triplet)
                counter += 1
                continue
                
    return openie

def filter_ner_both(sentence):
    openie = []
    global counter
    
    for triplet in sentence['openie']:
        if triplet['subjectSpan'][1] - triplet['subjectSpan'][0] <= FILTER_BY_LENGTH \
            and triplet['relationSpan'][1] - triplet['relationSpan'][0] <= FILTER_BY_LENGTH \
            and triplet['objectSpan'][1] - triplet['objectSpan'][0] <= FILTER_BY_LENGTH:
            for entity1 in sentence['entitymentions']:
                if entity1['text'] in triplet['subject']:
                    for entity2 in sentence['entitymentions']:
                        if entity2['text'] in triplet['object']:
                            if not triplet in openie:
                                openie.append(triplet)
                                counter += 1
                elif entity1['text'] in triplet['object']:
                    for entity2 in sentence['entitymentions']:
                        if entity2['text'] in triplet['subject']:
                            if not triplet in openie:
                                openie.append(triplet)
                                counter += 1

    return openie

def process_page(page):
    sentences = []
    for sentence in page:
        new_sentence = sentence
        new_sentence['openie'] = filter_ner_both(sentence)
        if new_sentence['openie']:
            sentences.append(new_sentence)
    return sentences

In [678]:
from glob import glob
from tqdm.autonotebook import tqdm
import pandas as pd

data_path = 'corenlp_annotations'
result_path = 'corenlp_annotations_ner_pairs'
! mkdir $result_path

for file in tqdm(glob(data_path + '/*.json')):
    tmp = pd.read_json(file)
    tmp.sentences = tmp.sentences.map(process_page)
    tmp.to_json(file.replace(data_path, result_path), orient='values')
    
print(counter, 'triplets were extracted applying NER filtering')
! echo contains only triplets with named entities in object and subject \($counter triplets\) > $result_path/readme.txt

mkdir: cannot create directory ‘corenlp_annotations_ner_pairs’: File exists


HBox(children=(IntProgress(value=0, max=92), HTML(value='')))

75335 triplets were extracted applying NER filtering


### Leave only named entities on both ends

In [693]:
FILTER_BY_LENGTH = 3

In [697]:
counter = 0

def filter_ner(sentence):
    openie = []
    global counter
    
    for triplet in sentence['openie']:
        for entity in sentence['entitymentions']:
            if entity['text'] in [triplet['subject'], triplet['object']]:
                openie.append(triplet)
                counter += 1
                continue
                
    return openie

def filter_ner_both(sentence):
    openie = []
    global counter
    
    for triplet in sentence['openie']:
        if triplet['subjectSpan'][1] - triplet['subjectSpan'][0] <= FILTER_BY_LENGTH \
            and triplet['relationSpan'][1] - triplet['relationSpan'][0] <= FILTER_BY_LENGTH + 10 \
            and triplet['objectSpan'][1] - triplet['objectSpan'][0] <= FILTER_BY_LENGTH:
            
            entitymentions = ' '.join([entity['text'] for entity in sentence['entitymentions']])
            if triplet['subject'] in ' '.join(entitymentions) and triplet['object'] in entitymentions:
                openie.append(triplet)
                counter += 1

    return openie

def process_page(page):
    sentences = []
    for sentence in page:
        new_sentence = sentence
        new_sentence['openie'] = filter_ner_both(sentence)
        if new_sentence['openie']:
            sentences.append(new_sentence)
    return sentences

In [698]:
from glob import glob
from tqdm.autonotebook import tqdm
import pandas as pd

data_path = 'corenlp_annotations'
result_path = 'corenlp_annotations_only_ner'
! mkdir $result_path

for file in tqdm(glob(data_path + '/*.json')):
    tmp = pd.read_json(file)
    tmp.sentences = tmp.sentences.map(process_page)
    tmp.to_json(file.replace(data_path, result_path), orient='values')
    
print(counter, 'triplets were extracted applying NER filtering')
! echo contains only triplets with named entities in object and subject \($counter triplets\) > $result_path/readme.txt

mkdir: cannot create directory ‘corenlp_annotations_only_ner’: File exists


HBox(children=(IntProgress(value=0, max=92), HTML(value='')))

45949 triplets were extracted applying NER filtering


In [707]:
tmp["sentences"].iloc[4][1]["openie"]

[{'subject': 'He',
  'subjectSpan': [0, 1],
  'relation': 'is professor at',
  'relationSpan': [1, 4],
  'object': 'University of California',
  'objectSpan': [11, 14]},
 {'subject': 'He',
  'subjectSpan': [0, 1],
  'relation': 'is professor at',
  'relationSpan': [1, 4],
  'object': 'University',
  'objectSpan': [11, 12]},
 {'subject': 'He',
  'subjectSpan': [0, 1],
  'relation': 'is professor at',
  'relationSpan': [1, 4],
  'object': 'Berkeley',
  'objectSpan': [15, 16]},
 {'subject': 'He',
  'subjectSpan': [0, 1],
  'relation': 'is',
  'relationSpan': [1, 2],
  'object': 'professor',
  'objectSpan': [3, 4]}]

In [709]:
tmp["sentences"].iloc[4][1]["entitymentions"]

[{'docTokenBegin': 19,
  'docTokenEnd': 20,
  'tokenBegin': 3,
  'tokenEnd': 4,
  'text': 'professor',
  'characterOffsetBegin': 90,
  'characterOffsetEnd': 99,
  'ner': 'TITLE'},
 {'docTokenBegin': 27,
  'docTokenEnd': 32,
  'tokenBegin': 11,
  'tokenEnd': 16,
  'text': 'University of California, Berkeley',
  'characterOffsetBegin': 141,
  'characterOffsetEnd': 175,
  'ner': 'ORGANIZATION'},
 {'docTokenBegin': 16,
  'docTokenEnd': 17,
  'tokenBegin': 0,
  'tokenEnd': 1,
  'text': 'He',
  'characterOffsetBegin': 82,
  'characterOffsetEnd': 84,
  'ner': 'PERSON'}]