# Data wrangling

## Telegram export format

In [1]:
from typing import List

In [2]:
import regex
import json
from pathlib import Path

In [3]:
# Location of the raw Telegram channel export, relative to the notebook.
input_path = Path("..") / "data" / "00-raw" / "market_twits_export.json"
# Sanity check: the cell displays whether the export file is present.
input_path.exists()

True

In [5]:
# Load the export and keep only its "messages" list.
# The encoding is given explicitly because the messages contain non-ASCII
# (Cyrillic) text and the platform default encoding is not always UTF-8.
data = json.loads(input_path.read_text(encoding="utf-8"))["messages"]

In [7]:
len(data)

150540

In [6]:
data[5]

{'id': 6,
 'type': 'message',
 'date': '2017-11-09T12:55:55',
 'from': 'MarketTwits',
 'from_id': 'channel1203560567',
 'text': '«Россети» выступают за переход на долгосрочный тариф для возможности зарабатывать на дивиденды'}

## Filter out non-textual messages

In [10]:
# Keep only the messages whose "text" field is truthy (drops e.g. empty text).
data = [message for message in data if message["text"]]

In [11]:
len(data)

148498

## Glue chunked messages

In [13]:
def glue_chunks(chunks: List) -> str:
    """Join the chunks of a chunked message text into a single string.

    Args:
        chunks: Sequence whose items are either plain strings or mappings
            that carry their string content under the ``"text"`` key.

    Returns:
        The chunk texts joined with single spaces, with every run of two
        or more whitespace characters collapsed into one space.

    Raises:
        KeyError: If a non-string chunk has no ``"text"`` key.
    """
    # Standard-library regex engine is enough here; imported locally so the
    # function does not depend on names defined in other cells.
    import re

    parts = [
        chunk if isinstance(chunk, str) else chunk["text"]
        for chunk in chunks
    ]
    # The quantifier has to be written `{2,}`: with a space inside the
    # braces it is not parsed as a repetition, so nothing gets collapsed.
    return re.sub(r"\s{2,}", " ", " ".join(parts))

In [15]:
# Build a flat list with one plain string per message.
texts = []
for message in data:
    try:
        text = message["text"]
        # Text stored as a list of chunks is flattened into one string;
        # text that is already a string is kept unchanged.
        if isinstance(text, list):
            text = glue_chunks(text)
        texts.append(text)
    except Exception:
        # Catch Exception rather than everything so that Ctrl-C and
        # SystemExit still interrupt the cell. On a malformed message,
        # show it and stop for inspection.
        # NOTE: stopping here leaves `texts` incomplete.
        print(message)
        break

## Filter out messages not containing a ticker

In [16]:
import reticker

In [17]:
# Ticker extractor created with reticker's default arguments.
extractor = reticker.TickerExtractor()

In [18]:
# Keep only the texts for which the extractor returns a truthy result
# (presumably a non-empty collection of tickers; verify against reticker).
with_ticker = [text for text in texts if extractor.extract(text)]

In [19]:
len(with_ticker)

88357

In [20]:
with_ticker[0]

'Gold demand slides to eight-year low in third quarter of 2017: WGC'

In [32]:
# Save the ticker-bearing texts as a single JSON array; non-ASCII
# characters are written as \uXXXX escapes (ensure_ascii=True).
output_path = Path("../data/01-filtered/market_twits_with_tickers.json")
serialized = json.dumps(with_ticker, ensure_ascii=True)
with output_path.open("w") as sink:
    sink.write(serialized)

In [22]:
# Save every message text (with or without a ticker) as a single JSON
# array; non-ASCII characters are written as \uXXXX escapes.
output_path = Path("../data/01-filtered/market_twits.json")
serialized = json.dumps(texts, ensure_ascii=True)
with output_path.open("w") as sink:
    sink.write(serialized)