In [1]:
import csv
import datetime
import json
import os
opj = os.path.join

import ccxt
import numpy
import pandas as pd
import pandas_ta as ta # needed for processing chart data 
import torch
import torch.nn as nn
from tqdm import tqdm
from transformers import AlbertTokenizer, AutoTokenizer, AutoModelForSequenceClassification, pipeline

from tokenization_roberta_spm import FairSeqRobertaSentencePieceTokenizer

## Load model

In [2]:
sentiment_name = "8hours"
_hours = 8

device = torch.device('cuda')

configs = {}
def classifier_generator(config):
    tokenizer = config.tokenizer.from_pretrained(config.tokenizer_name)
    model = config.model.from_pretrained(config.model_name)
    model.to(device)
    
    def classifier(title, content):
        encoded_inputs = tokenizer(title, content, return_tensors="pt", max_length=512, truncation=True).to(device)
        with torch.no_grad():
            output = model(**encoded_inputs)
            logits = output['logits']
        res = nn.Softmax(dim=1)(logits)[0]
        return res
    
    def free_model():
        model.to('cpu')
    
    return classifier, free_model

In [3]:
class Config:
    def __init__(self, tokenizer, tokenizer_name, model, model_name):
        self.tokenizer = tokenizer
        self.tokenizer_name = tokenizer_name
        self.model = model
        self.model_name = model_name

In [4]:
### deberta
tokenizer = AutoTokenizer
tokenizer_name = "totoro4007/cryptodeberta-base-all-finetuned"
model = AutoModelForSequenceClassification
model_name = "totoro4007/cryptodeberta-base-all-finetuned"
deberta_config = Config(
    tokenizer = tokenizer,
    tokenizer_name = tokenizer_name,
    model = model,
    model_name = model_name
)
configs['deberta'] = deberta_config

In [5]:
### roberta
tokenizer = FairSeqRobertaSentencePieceTokenizer
tokenizer_name = "fairseq-roberta-all-model"
model = AutoModelForSequenceClassification
model_name = "totoro4007/cryptoroberta-base-all-finetuned"
roberta_config = Config(
    tokenizer = tokenizer,
    tokenizer_name = tokenizer_name,
    model = model,
    model_name = model_name
)
configs['roberta'] = roberta_config

In [6]:
### bert
tokenizer = AlbertTokenizer
tokenizer_name = "totoro4007/cryptobert-base-all-finetuned"
model = AutoModelForSequenceClassification
model_name = "totoro4007/cryptobert-base-all-finetuned"
bert_config = Config(
    tokenizer = tokenizer,
    tokenizer_name = tokenizer_name,
    model = model,
    model_name = model_name
)
configs['bert'] = bert_config

## Generate df

In [7]:
def res2vec(res):
    vec = [0, 0, 0]
    for r in res:
        label = int(r['label'].split('_')[-1])
        score = round(r['score'], 4)
        vec[label] = score
    return vec

def vec_sum(vec1, vec2):
    return [vec1[0]+vec2[0], vec1[1]+vec2[1], vec1[2]+vec2[2]]

def process(df):
    binance = ccxt.binance() 
    dates = df['timestamp'].values 
    timestamp = [] 
    for i in range(len(dates)):
        date_string = binance.iso8601(int(dates[i])) 
        date_string = date_string[:10] + " " + date_string[11:-5] 
        timestamp.append(date_string) 
    df['datetime'] = timestamp 
    df = df.drop(columns={'timestamp'}) 
    return df

def dates2datetimes(dates):
    res = []
    for date in dates:
        date = list(map(int, date.split('-')))
        neo = datetime.datetime(date[0], date[1], date[2], date[3])
        res.append(neo)
    return res

def closest(query, dates):
    dates = dates2datetimes(dates)
    st = 0
    en = len(dates)-1
    while True:
        mid = (st + en) // 2
        if dates[mid] < query:
            st = mid
        else:
            en = mid
        if mid == (st + en) // 2:
            break            
    return mid

def isin(query, date):
    date = list(map(int, date.split('-')))
    date = datetime.datetime(date[0], date[1], date[2], date[3])
    hour = datetime.timedelta(hours=1)
    delta = query - date
    delta = delta / hour
    if 0 < delta and delta <= _hours:
        return True
    return False

def generate(classifier_generator, config, saveas):
    classifier, free_model = classifier_generator(config)
    with open("BTC_USDT-4h_interval.json") as f: 
        d = json.load(f) 

    chart_df = pd.DataFrame(d) 
    chart_df = chart_df.rename(columns={0:"timestamp",
                                        1:"open",
                                        2:"high",
                                        3:"low", 
                                        4:"close",
                                        5:"volume"}) 

    chart_df = process(chart_df) 

    hours, days, months, years = [],[],[],[] 

    for dt in tqdm(chart_df['datetime']):
        dtobj = pd.to_datetime(dt) 
        hour = dtobj.hour 
        day = dtobj.day 
        month = dtobj.month 
        year = dtobj.year 
        hours.append(hour)
        days.append(day) 
        months.append(month)
        years.append(year) 
    
    chart_df['years'], chart_df['months'], chart_df['days'], chart_df['hours'] = years, months, days, hours
    
    high_change, low_change = [], [] 
    close = chart_df['close'].values 
    high = chart_df['high'].values 
    low = chart_df['low'].values 
    for i in range(close.shape[0]-1):
        high_delta = (high[i+1] - close[i]) / close[i] 
        low_delta = (low[i+1] - close[i]) / close[i]
        high_change.append(high_delta) 
        low_change.append(low_delta)
    high_change.append(None) 
    low_change.append(None)

    chart_df['high_delta'] = high_change
    chart_df['low_delta'] = low_change 

    chart_df['sent_0'] = 0.0
    chart_df['sent_1'] = 0.0
    chart_df['sent_2'] = 0.0

    chart_df.dropna(inplace=True) 
    
    news = {}
    news_file="full_news_labeled.csv"
    with open(news_file) as csvfile:
        reader = csv.reader(csvfile)
        for i, row in tqdm(enumerate(reader)):
            if i == 0:
                columns = row
                print(columns)
                continue
            date = f"{row[3]}-{row[4]}-{row[5]}-{row[6]}"
            text = f"{row[0]}. {row[1]}"

            res_vec = classifier(row[0], row[1])
            #res_vec = res2vec(res)

            mask = [0, 0, 0]
            mask[int(row[2])] = 1

            if date in news:
                news[date][0] += 1
                news[date][1] = vec_sum(news[date][1], res_vec)
                news[date][2] = vec_sum(news[date][2], mask)
            else:
                news[date] = [1, res_vec, mask]

            #print(date, news[date])
            #break
    
    for index, row in tqdm(chart_df.iterrows()):
        query = datetime.datetime(row['years'], row['months'], row['days'], row['hours'])
        lk = list(news.keys())
        idx = closest(query, lk)
        ks = []
        n = 0
        for k in lk[max(0, idx-5):idx]:
            if isin(query, k):
                ks.append(k)
                n += news[k][0]
        for k in ks:
            try:
                v = news[k][1].cpu().detach().numpy()
            except:
                v = news[k][1]
                for i in range(3):
                    if not isinstance(v[i], numpy.ndarray):
                        v[i] = v[i].cpu().detach().numpy()
            chart_df.at[index, 'sent_0'] += v[0] / n
            chart_df.at[index, 'sent_1'] += v[1] / n
            chart_df.at[index, 'sent_2'] += v[2] / n
            
    chart_df.set_index(pd.DatetimeIndex(chart_df['datetime']), inplace=True) 

    chart_df['bop'] = chart_df.ta.bop(lookahead=False)
    chart_df['ebsw'] = chart_df.ta.ebsw(lookahead=False) 
    chart_df['cmf'] = chart_df.ta.cmf(lookahead=False) 
    chart_df['rsi/100'] = chart_df.ta.rsi(lookahead=False) / 100 
    chart_df['vwap'] = chart_df.ta.vwap(lookahead=False) 
    chart_df['high/low'] = chart_df['high'] / chart_df['low'] 
    chart_df['close/open'] = chart_df['close'] / chart_df['open'] 
    chart_df['high/open'] = chart_df['high'] / chart_df['open'] 
    chart_df['low/open'] = chart_df['low'] / chart_df['open'] 

    chart_df['hwma'] = chart_df.ta.hwma(lookahead=False)
    chart_df['linreg'] = chart_df.ta.linreg(lookahead=False)
    chart_df['hwma/close'] = chart_df['hwma'] / chart_df['close'] 
    chart_df['linreg/close'] = chart_df['linreg'] / chart_df['close']

    for i in tqdm(range(1, 4)): 
        for col in ['open', 'high', 'low', 'close', 'volume', 'vwap']:
            val = chart_df[col].values 
            val_ret = [None for _ in range(i)] 
            for j in range(i, len(val)): 
                if val[j-i] == 0:
                    ret = 1 
                else:
                    ret = val[j] / val[j-i] 
                val_ret.append(ret) 
            chart_df['{}_change_{}'.format(col, i)] = val_ret 



    chart_df.dropna(inplace=True) 
    chart_df.drop(columns={'datetime', 'open', 'high', 'low', 'close', 'volume', 'vwap', 'hwma', 'linreg', 'years'}, inplace=True) 
    
    chart_df.to_csv(saveas)
    free_model()

## Runs

In [None]:
for name, config in configs.items():
    savedir = opj("data", f"{sentiment_name}")
    if not os.path.exists(savedir):
        os.makedirs(savedir, exist_ok=True)
    saveas = opj("data", f"{sentiment_name}", f"{name}.csv")
    generate(classifier_generator, config, saveas)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
100%|█████████████████████████████████████████████████████████████| 10560/10560 [00:01<00:00, 8064.25it/s]
3it [00:00, 28.63it/s]

['title', 'content', 'labels', 'year', 'month', 'day', 'hour']


29838it [18:25, 26.99it/s]
10559it [06:40, 26.37it/s]
100%|███████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  7.94it/s]
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RobertaTokenizer'. 
The class this function is called from is 'FairSeqRobertaSentencePieceTokenizer'.


[PASS] spm_id: madeupword0000 | fairseq_id: 51998
[PASS] spm_id: madeupword0001 | fairseq_id: 51999


100%|█████████████████████████████████████████████████████████████| 10560/10560 [00:01<00:00, 7567.17it/s]
0it [00:00, ?it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
6it [00:00, 55.41it/s]

['title', 'content', 'labels', 'year', 'month', 'day', 'hour']


26it [00:00, 58.58it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
85it [00:01, 52.76it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
91it [00:01, 52.41it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
114it [00:02, 36.76it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
456

4472it [01:27, 51.88it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
4576it [01:29, 40.63it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
4597it [01:29, 44.59it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
4640it [01:30, 46.81it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been remov

10557it [03:28, 45.78it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
10613it [03:29, 51.52it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
10643it [03:30, 49.43it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
10655it [03:30, 51.49it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been r

12832it [04:12, 51.70it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
13012it [04:16, 46.50it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
13138it [04:19, 55.64it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
13144it [04:19, 54.62it/s]Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been r