# Dataset from NER dataset

For our task we need to collect a dataset with following properties:
1. Sentence contains just one named entity
    + of type ORG for simplicity

In [1]:
from pathlib import Path
import re
import pymongo
from tqdm import tqdm
import json
from datetime import datetime

In [2]:
client = pymongo.MongoClient()
database = client["texts"]
collection = database["news"]
projection = {"text": 1}
texts = collection.find({}, projection)
total_texts = collection.estimated_document_count()
total_texts

538362

In [3]:
news_path = Path("./data/texts.json/")

In [4]:
import spacy

In [5]:
nlp = spacy.load("ru_core_news_lg")

In [6]:
from natasha import (Segmenter, Doc)
segmenter = Segmenter()

In [7]:
data = []
for _doc in tqdm(texts, total=total_texts):
    text = _doc["text"]
    natashadoc = Doc(text)
    natashadoc.segment(segmenter)
    for idx, sentence in enumerate(natashadoc.sents):
        doc = nlp(sentence.text)
        # Count ORG entities
        org_count = 0
        for span in doc.ents:
            if span.label_ == "ORG":
                org_count += 1
        if org_count != 1:
            continue
        # We for sure have 1 ORG, need to take it
        ent = None    
        for span in doc.ents:
            if span.label_ == "ORG":
                ent = span
                break
                
        data.append({
            "text": sentence.text,
            "ent": ent.text,
            "span": [ent.start_char, ent.end_char]
        })
    
    if len(data) > 2000:
        new_path = Path(datetime.now().isoformat() + ".json")
        with new_path.open("w") as fout:
            json.dump(data, fout, ensure_ascii=False)
        data = []
    
        
        
print("Suffice: ", len(data))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 538362/538362 [12:57:08<00:00, 11.55it/s]

Suffice:  1185





In [8]:
datetime.now().isoformat()

'2022-08-15T19:50:17.603305'

In [9]:
data[-1]

{'text': "Очередь в музей - это лучше, чем\nочередь за алкоголем или в магазин за продовольствием', - отметил\nС.Капков.",
 'ent': "'",
 'span': [86, 87]}

In [11]:
for ent in doc.ents:
    print(ent, ent.label_)

' ORG
С.Капков PER
