In [None]:
data/prepare_wiki_dataset.ipynb
data/request_categories_list.py
data/base_categories.txt
data/collect_pages.sh


# Prepare a dataset of IT-related Wikipedia articles with Stanford CoreNLP annotations


<div class="alert alert-block alert-info">

<ol>
  <li>Load IT-related articles from the Wikipedia dump:
    <ol>
      <li>Make the list of IT categories;</li>
      <li>Collect the corresponding pages.</li>
    </ol>
  </li>
  <li>Process them with CoreNLP:
    <ol>
      <li>Run the container: <code>docker run --restart=unless-stopped -ti -p 9000:9000 -d tchewik/corenlp</code>;</li>
      <li>Process the collected file;</li>
      <li>Save the result into chunks;</li>
      <li>(Optional) Filter the triplets by named-entity occurrence.</li>
    </ol>
  </li>
</ol>

</div>


## Load IT-related articles from the wikipedia dump
### 1. Make the list of useful categories

In [None]:
# Runs request_categories_list.py with base_categories.txt passed via -i and
# categories.txt via -o (presumably input and output files — confirm in the script).
! python request_categories_list.py -i base_categories.txt -o categories.txt

### 2. Collect the corresponding pages into ``it_wiki_articles.json``


```sh
sh collect_pages.sh "wiki dump name" "extended list of categories" "output directory"
```


In [None]:
# Positional arguments: wiki dump name, list of categories, output directory.
! sh collect_pages.sh enwiki-latest-pages-articles.xml.bz2 categories.txt it_wiki_articles

```sh
python path2json.py -i "directory with collected wiki pages"
```

In [None]:
# Passes the directory produced by collect_pages.sh to path2json.py
# (presumably yielding it_wiki_articles.json, which the next step reads — confirm in the script).
! python path2json.py -i it_wiki_articles

### 3. Process the articles using corenlp 

```sh
python corenlp_parsing.py -i "collected articles json" -n "hostname" -p "port" -o "output path"
```

In [None]:
# Host of the running CoreNLP server; empty placeholder — fill in before running the cells below.
HOSTNAME = ''
# Port of the CoreNLP server.
# NOTE(review): the docker command in the outline above publishes port 9000 — confirm 9001 is intended.
PORT = 9001

In [None]:
# Runs corenlp_parsing.py on it_wiki_articles.json against the server at HOSTNAME:PORT;
# -o is presumably the output path/prefix for the annotation chunks — confirm in the script.
! python corenlp_parsing.py -i "it_wiki_articles.json" -n $HOSTNAME -p $PORT -o "corenlp_annotations/it_wiki"

In [None]:
from pycorenlp import StanfordCoreNLP

# Client for the CoreNLP server. The port comes from the PORT config value so this
# client and the corenlp_parsing.py call above always target the same server.
nlp = StanfordCoreNLP(f'https://{HOSTNAME}:{PORT}')

# Annotation pipeline requested from the server; each annotator is listed once
# (the previous string repeated "tokenize,ssplit").
nlp_properties = {
    'annotators': 'tokenize,ssplit,pos,depparse,natlog,openie,ner',
    'outputFormat': 'json'
}

In [None]:
# Smoke test: annotate a single sentence to check the server connection and properties.
sample_text = "I walk through the valley of the shadows of death."
result = nlp.annotate(sample_text, properties=nlp_properties)

#### Analysis of pages whose annotation caused errors:

In [None]:
# NOTE(review): broken_ids is not assigned in any cell visible in this notebook;
# this cell raises NameError on a fresh kernel unless it is defined elsewhere.
broken_ids

In [None]:
# Re-annotate a single page by hand and append the result to new_file.
# NOTE(review): file, remove_ids and new_file are not defined in any visible cell,
# and the previous cell displays broken_ids rather than remove_ids — confirm which
# name is intended and where these objects are created.
# Select the row of `file` whose column 0 equals the first id, and take column 1 as the text.
txt = file[file[0] == remove_ids[0]][1].values[0]
# txt = ...
result = nlp.annotate(txt, properties=nlp_properties)
# Attach the page id and the raw text to the annotation result.
result['id'] = remove_ids[0]
result['text'] = txt
new_file.append(result)
# then save new_file somewhere

In [None]:
# Remove the names used by the previous cell from the kernel namespace.
del file, new_file

### Filter the triplets by named-entity occurrence and length

In [None]:
# Upper bound on (span end - span start) that filter_ner_both allows for the
# subject, relation and object spans of a triplet.
FILTER_BY_LENGTH = 3

In [None]:
counter = 0  # running total of triplets kept by the filter functions below

def filter_ner(sentence):
    """Keep the triplets whose subject or object is exactly a named entity.

    Args:
        sentence: dict with an 'openie' list of triplets (each having 'subject'
            and 'object' strings) and an 'entitymentions' list of dicts with
            a 'text' key.

    Returns:
        List of the matching triplets, each included at most once per
        occurrence in sentence['openie'].

    Side effects:
        Increments the module-level ``counter`` once per kept triplet.
    """
    global counter
    openie = []

    for triplet in sentence['openie']:
        for entity in sentence['entitymentions']:
            if entity['text'] in [triplet['subject'], triplet['object']]:
                openie.append(triplet)
                counter += 1
                # Stop at the first matching entity: the former `continue` was a
                # no-op, so a triplet matched by several entities was appended
                # (and counted) several times.
                break

    return openie

def filter_ner_both(sentence):
    """Keep short triplets that have an entity mention on both the subject and object side.

    A triplet is kept when its subject, relation and object spans are each at
    most FILTER_BY_LENGTH wide, some entity-mention text is a substring of the
    subject, and some entity-mention text is a substring of the object.
    A triplet equal to one already kept is not added again.

    Args:
        sentence: dict with 'openie' (triplets carrying 'subject', 'object' and
            the '*Span' index pairs) and 'entitymentions' (dicts with 'text').

    Returns:
        List of kept triplets, in their original order.

    Side effects:
        Increments the module-level ``counter`` once per kept triplet.
    """
    global counter
    kept = []

    for triplet in sentence['openie']:
        too_long = (
            triplet['subjectSpan'][1] - triplet['subjectSpan'][0] > FILTER_BY_LENGTH
            or triplet['relationSpan'][1] - triplet['relationSpan'][0] > FILTER_BY_LENGTH
            or triplet['objectSpan'][1] - triplet['objectSpan'][0] > FILTER_BY_LENGTH
        )
        if too_long:
            continue

        mention_texts = [mention['text'] for mention in sentence['entitymentions']]
        subject_hit = any(text in triplet['subject'] for text in mention_texts)
        object_hit = any(text in triplet['object'] for text in mention_texts)

        if subject_hit and object_hit and triplet not in kept:
            kept.append(triplet)
            counter += 1

    return kept

def process_page(page):
    """Filter the triplets of every sentence on a page and drop emptied sentences.

    Each sentence's 'openie' entry is overwritten in place with the output of
    filter_ner_both; only sentences that still hold at least one triplet are
    returned.

    Args:
        page: iterable of sentence dicts.

    Returns:
        List of the (mutated) sentence dicts with a non-empty 'openie' list.
    """
    kept = []
    for sentence in page:
        filtered = filter_ner_both(sentence)
        sentence['openie'] = filtered
        if filtered:
            kept.append(sentence)
    return kept

In [None]:
from glob import glob
from tqdm.autonotebook import tqdm
import pandas as pd

data_path = 'corenlp_annotations'
result_path = 'corenlp_annotations_ner_pairs'
! mkdir $result_path

for file in tqdm(glob(data_path + '/*.json')):
    tmp = pd.read_json(file)
    tmp.sentences = tmp.sentences.map(process_page)
    tmp.to_json(file.replace(data_path, result_path), orient='values')
    
print(counter, 'triplets were extracted applying NER filtering')
! echo contains only triplets with named entities in object and subject \($counter triplets\) > $result_path/readme.txt

### Keep only triplets with named entities on both ends

In [None]:
# Upper bound on (span end - span start) that filter_ner_both allows for the
# subject and object spans; the relation span is allowed FILTER_BY_LENGTH + 10.
FILTER_BY_LENGTH = 3

In [None]:
counter = 0  # reset the running total of kept triplets for this filtering pass

def filter_ner(sentence):
    """Keep the triplets whose subject or object is exactly a named entity.

    Args:
        sentence: dict with an 'openie' list of triplets (each having 'subject'
            and 'object' strings) and an 'entitymentions' list of dicts with
            a 'text' key.

    Returns:
        List of the matching triplets, each included at most once per
        occurrence in sentence['openie'].

    Side effects:
        Increments the module-level ``counter`` once per kept triplet.
    """
    global counter
    openie = []

    for triplet in sentence['openie']:
        for entity in sentence['entitymentions']:
            if entity['text'] in [triplet['subject'], triplet['object']]:
                openie.append(triplet)
                counter += 1
                # Stop at the first matching entity: the former `continue` was a
                # no-op, so a triplet matched by several entities was appended
                # (and counted) several times.
                break

    return openie

def filter_ner_both(sentence):
    """Keep short triplets whose subject and object both occur in the sentence's entity mentions.

    A triplet is kept when its subject and object spans are at most
    FILTER_BY_LENGTH wide, its relation span is at most FILTER_BY_LENGTH + 10
    wide, and both the subject text and the object text are substrings of the
    space-joined entity-mention texts of the sentence.

    Args:
        sentence: dict with 'openie' (triplets carrying 'subject', 'object' and
            the '*Span' index pairs) and 'entitymentions' (dicts with 'text').

    Returns:
        List of kept triplets, in their original order.

    Side effects:
        Increments the module-level ``counter`` once per kept triplet.
    """
    global counter
    openie = []

    for triplet in sentence['openie']:
        if triplet['subjectSpan'][1] - triplet['subjectSpan'][0] <= FILTER_BY_LENGTH \
            and triplet['relationSpan'][1] - triplet['relationSpan'][0] <= FILTER_BY_LENGTH + 10 \
            and triplet['objectSpan'][1] - triplet['objectSpan'][0] <= FILTER_BY_LENGTH:

            # All entity-mention texts of the sentence as one space-separated string.
            entitymentions = ' '.join(entity['text'] for entity in sentence['entitymentions'])
            # Fix: the subject used to be tested against ' '.join(entitymentions),
            # i.e. the already-joined string with a space inserted between every
            # character, so multi-character subjects practically never matched.
            if triplet['subject'] in entitymentions and triplet['object'] in entitymentions:
                openie.append(triplet)
                counter += 1

    return openie

def process_page(page):
    """Apply filter_ner_both to each sentence of a page and keep the non-empty ones.

    The 'openie' entry of every sentence is replaced in place by the filtered
    triplet list; sentences left without triplets are omitted from the result.

    Args:
        page: iterable of sentence dicts.

    Returns:
        List of the (mutated) sentence dicts whose 'openie' list is non-empty.
    """
    def _with_filtered_triplets(sentence):
        # Overwrite the triplets in place, mirroring the per-sentence mutation.
        sentence['openie'] = filter_ner_both(sentence)
        return sentence

    return [
        sentence
        for sentence in map(_with_filtered_triplets, page)
        if sentence['openie']
    ]

In [None]:
from glob import glob
from tqdm.autonotebook import tqdm
import pandas as pd

data_path = 'corenlp_annotations'
result_path = 'corenlp_annotations_only_ner'
! mkdir $result_path

for file in tqdm(glob(data_path + '/*.json')):
    tmp = pd.read_json(file)
    tmp.sentences = tmp.sentences.map(process_page)
    tmp.to_json(file.replace(data_path, result_path), orient='values')
    
print(counter, 'triplets were extracted applying NER filtering')
! echo contains only triplets with named entities in object and subject \($counter triplets\) > $result_path/readme.txt

In [None]:
# Spot check on tmp, the frame left over from the last file processed by the loop above:
# filtered triplets of the second kept sentence in the fifth row.
tmp["sentences"].iloc[4][1]["openie"]

In [None]:
# Spot check on tmp, the frame left over from the last file processed by the loop above:
# entity mentions of the second kept sentence in the fifth row.
tmp["sentences"].iloc[4][1]["entitymentions"]