# Dataset

Let's try to generate our own dataset in the form of a knowledge base.

1. Retrieve a bunch of texts
2. Get tickers
3. Lookup tickers in Wikidata
4. Map them to named entities (NER)

In [1]:
import sys
# Sanity check: confirm that sys.path is a plain list before appending to it.
type(sys.path)

list

In [4]:
# Make the project's local packages (e.g. data_processing) importable from
# this notebook.
# NOTE: the default below is an absolute, machine-specific path; set the
# NOISEMON_ROOT environment variable to point at your own checkout instead.
import os

NOISEMON_ROOT = os.environ.get(
    "NOISEMON_ROOT",
    "/mnt/codeholder/code/python-playground/app_noisemon/noisemon",
)
# Guard the append so that re-running this cell does not pile up duplicates.
if NOISEMON_ROOT not in sys.path:
    sys.path.append(NOISEMON_ROOT)

## 1. Data import

In [11]:
import json
from pathlib import Path

In [12]:
# Load the texts from the filtered dataset. The file is decoded explicitly as
# UTF-8 so that the Cyrillic content is read correctly regardless of the
# platform's default locale encoding.
input_path = Path("../data/01-filtered/market_twits_with_tickers.json")
data = json.loads(input_path.read_text(encoding="utf-8"))

In [13]:
# Number of texts loaded from the input file.
len(data)

88357

In [14]:
# Peek at a single raw text; in this sample the first line is a ticker symbol.
data[38]

'AFLT\nРостех» планирует продать свою долю в «Аэрофлоте» (3,5%) целиком, а не частями.\nПакет будет выставлен на продажу, когда цена акций вырастет хотя бы до 182 руб. за бумагу, передает ТАСС.'

## 2. Extract tickers

In [16]:
import reticker
# Ticker extractor built with its default settings; it is applied to every
# text when the dataset is assembled.
extractor = reticker.TickerExtractor()

In [17]:
# Pair every raw text with the ticker symbols extracted from it.
dataset = [
    {"text": text, "tickers": extractor.extract(text)}
    for text in data
]

## 3. Look up tickers in Wikidata

In [18]:
# Inspect one record: the original text plus the tickers extracted from it.
dataset[56]

{'text': 'ETH\nМартин Свенде, разработчик службы безопасности Эфириума - "Чтобы разморозить счета Parity, понадобится новый хардфорк Эфириума"',
 'tickers': ['ETH']}

In [184]:
from importlib import reload

In [185]:
# Development aid: reload() re-executes the module, presumably so that recent
# edits to wikidata.py take effect without restarting the kernel.
import data_processing.wikidata as ddd
reload(ddd)

<module 'data_processing.wikidata' from '/mnt/codeholder/code/python-playground/app_noisemon/noisemon/data_processing/wikidata.py'>

In [186]:
from data_processing.wikidata import Wikidata
from functools import lru_cache
from tqdm import tqdm
import time

In [187]:
# Wikidata helper instance; get_company() uses it to look up companies by ticker.
wd = Wikidata()

In [188]:
def throttle(func=None, *, timeout=2.0):
    """Rate-limit ``func`` so consecutive calls start at least ``timeout`` seconds apart.

    Usable both as a bare decorator (``@throttle``) and with an explicit
    interval (``@throttle(timeout=0.5)``).

    Args:
        func: The callable to wrap. Leave unset when only passing ``timeout``.
        timeout: Minimum number of seconds between the starts of two
            consecutive calls to ``func``. Defaults to 2 seconds.

    Returns:
        A wrapper that sleeps for as long as necessary, then delegates to
        ``func`` (positional and keyword arguments are forwarded) and returns
        its result. When ``func`` is omitted, a decorator is returned instead.
    """
    from functools import wraps

    if func is None:
        # Used as ``@throttle(timeout=...)``: hand back the real decorator.
        return lambda wrapped: throttle(wrapped, timeout=timeout)

    # Start time of the previous call. None until the first call happens, so
    # the very first call is never delayed.
    last_call_start = None

    @wraps(func)
    def inner(*args, **kwargs):
        nonlocal last_call_start
        if last_call_start is not None:
            # A monotonic clock is immune to wall-clock adjustments.
            remaining = last_call_start + timeout - time.monotonic()
            if remaining > 0:
                time.sleep(remaining)
        # Record the start AFTER any sleep. Recording it before the sleep
        # would leave a stale timestamp and let the next call through with
        # almost no delay.
        last_call_start = time.monotonic()
        return func(*args, **kwargs)

    return inner

In [189]:
@throttle
def r():
    """Trivial function used to smoke-test the ``throttle`` decorator."""
    marker = "ddd"
    return marker

In [190]:
# Smoke test: the throttled function still returns its value.
r()

'ddd'

In [191]:
@lru_cache(maxsize=None)
@throttle
def get_company(ticker: str):
    """Return the Wikidata company lookup result for ``ticker``.

    Results are memoised per ticker by ``lru_cache``, so the throttled lookup
    only runs the first time a given ticker is requested.
    """
    candidates = wd.lookup_companies_by_ticker(ticker)
    return candidates

In [199]:
# Attach company candidates to every record, then split the records into
# those with at least one candidate and those with none.
# Note: despite its name, ticker_not_found holds whole records, not tickers.
ticker_not_found = []
dataset_with_companies = []
for chunk in tqdm(dataset):
    companies = []
    # De-duplicate with dict.fromkeys() rather than set(): a set of strings
    # iterates in an order that changes between interpreter runs (hash
    # randomisation), which made the order of "companies" irreproducible.
    for ticker in dict.fromkeys(chunk["tickers"]):
        company_candidates = get_company(ticker)
        # Skip tickers whose lookup result is empty/falsy.
        if company_candidates:
            companies += company_candidates
    result = {**chunk, "companies": companies}
    if companies:
        dataset_with_companies.append(result)
    else:
        ticker_not_found.append(result)

print(f"{len(dataset_with_companies)} texts with companies, {len(ticker_not_found)} without")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 88357/88357 [00:00<00:00, 302476.33it/s]

55758 texts with companies, 32599 without





In [203]:
# Example record for which none of the extracted tickers yielded a company.
ticker_not_found[900]

{'text': 'НЕФТЬ - DUMB MONEY - лонги в фонде USO - 94% всех фондов торгующими USO в лонгах!!! Рекорд с 2007. Критическая зона. Статистика против лонгов.',
 'tickers': ['DUMB', 'USO'],
 'companies': []}

In [201]:
# Example enriched record; here two tickers produced four company candidates.
dataset_with_companies[1005]

{'text': 'BTC vs GOLD',
 'tickers': ['BTC', 'GOLD'],
 'companies': [{'id': {'type': 'uri',
    'value': 'http://www.wikidata.org/entity/Q131723'},
   'idLabel': {'xml:lang': 'ru', 'type': 'literal', 'value': 'биткойн'}},
  {'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q1961738'},
   'idLabel': {'xml:lang': 'ru',
    'type': 'literal',
    'value': 'Amex Gold BUGS Index'}},
  {'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q1145004'},
   'idLabel': {'xml:lang': 'en',
    'type': 'literal',
    'value': 'Randgold Resources'}},
  {'id': {'type': 'uri', 'value': 'http://www.wikidata.org/entity/Q96100267'},
   'idLabel': {'xml:lang': 'en',
    'type': 'literal',
    'value': 'Visi Telekomunikasi Infrastruktur'}}]}

In [206]:
# Directory for the enriched outputs. Create it up front so that writing the
# result files does not fail with FileNotFoundError on a fresh checkout.
output_path = Path("../data/03-populated")
output_path.mkdir(parents=True, exist_ok=True)

In [207]:
# Persist the records that have at least one company candidate.
companies_file = output_path / "market_twits_with_companies_by_ticker.json"
with companies_file.open(mode="w") as sink:
    json.dump(dataset_with_companies, sink)

In [208]:
# Persist the records for which no company candidate was found.
unknown_file = output_path / "market_twits_with_unknown_tickers.json"
with unknown_file.open(mode="w") as sink:
    json.dump(ticker_not_found, sink)

## 4. Extract NERS

In [None]:
import spacy
# Load the spaCy pipeline named "ru_core_news_lg"; the corresponding model
# package must be installed in the kernel's environment.
nlp = spacy.load("ru_core_news_lg")

## 5. Map NERS and Organizations

## 6. Form a KnowledgeBase

In [None]:
from knowledge_base.storage import MyKnowledgeBase
# Instantiate the project's knowledge-base storage (constructed with no arguments).
kb = MyKnowledgeBase()