In [1]:
from datetime import datetime as dtt

import all2graph as ag
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

Using backend: pytorch


In [2]:
asset_detail_df = pd.read_csv('asset_details.csv')
asset_detail_df

Unnamed: 0,Asset_ID,Weight,Asset_Name
0,2,2.397895,Bitcoin Cash
1,0,4.304065,Binance Coin
2,1,6.779922,Bitcoin
3,5,1.386294,EOS.IO
4,7,2.079442,Ethereum Classic
5,6,5.894403,Ethereum
6,9,2.397895,Litecoin
7,11,1.609438,Monero
8,13,1.791759,TRON
9,12,2.079442,Stellar


In [3]:
train_df = pd.read_csv('train.csv', nrows=10000)
train_df = train_df.merge(asset_detail_df, on='Asset_ID')
train_df

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,Weight,Asset_Name
0,1514764860,2,40.0,2376.580000,2399.50,2357.14,2374.590000,19.233005,2373.116392,-0.004218,2.397895,Bitcoin Cash
1,1514764920,2,53.0,2374.553333,2400.90,2354.20,2372.286667,24.050259,2371.434498,-0.004079,2.397895,Bitcoin Cash
2,1514764980,2,61.0,2371.633333,2401.90,2353.70,2372.063333,42.676438,2375.442755,-0.002892,2.397895,Bitcoin Cash
3,1514765040,2,95.0,2376.060000,2406.40,2344.00,2370.566667,37.820918,2371.096152,-0.003718,2.397895,Bitcoin Cash
4,1514765100,2,33.0,2372.656667,2404.60,2343.40,2370.173333,8.519679,2370.345730,-0.002171,2.397895,Bitcoin Cash
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1514842500,11,6.0,340.320000,340.50,339.51,340.180000,8.915547,339.896497,-0.010872,1.609438,Monero
9996,1514842560,11,6.0,342.655000,345.85,339.38,342.615000,6.252500,342.649362,0.005154,1.609438,Monero
9997,1514842620,11,5.0,339.380000,339.38,338.51,338.550000,2.960035,338.659290,-0.007088,1.609438,Monero
9998,1514842680,11,8.0,342.085000,344.76,338.40,341.410000,7.420612,341.437057,0.003703,1.609438,Monero


In [4]:
def foo(df, n):
    assert df.timestamp.unique().shape[0] == df.shape[0]
    assert (df.timestamp.diff().dropna() > 0).all()
    
    output = []
    for i, row in tqdm(df.iterrows(), total=df.shape[0], ascii=True):
        sample = dict(row[['timestamp', 'Asset_ID', 'Target']])
        sub_df = df.iloc[max(i-n, 0):(i+1)].copy()
        sub_df['time_diff'] = row['timestamp'] - sub_df['timestamp']
        for col in [ 'Open', 'High', 'Low', 'Close', 'VWAP']:
            sub_df[col+'LogDiff'] = np.log(sub_df[col])
            sub_df[col+'LogDiff'] = sub_df[col+'LogDiff'].diff()
        sub_df = sub_df.iloc[1:]
        if sub_df.shape[0] > 0:
            sample['history'] = sub_df[
                ['time_diff', 'Count', 'Open', 'High', 'Low', 'Close', 'Volume', 'VWAP', 'Asset_Name',
                 'OpenLogDiff',  'HighLogDiff',  'LowLogDiff',  'CloseLogDiff',  'VWAPLogDiff']
            ].to_json(orient='records')
        output.append(sample)
        
    return output


def processing(df, n):
    output = []
    for name, group in df.groupby('Asset_Name'):
        output += foo(group, n)
    return pd.DataFrame(output)
    

In [5]:
pro_train_df = processing(train_df, 60)
pro_train_df = pro_train_df.dropna()

100%|#############################################################################| 1273/1273 [00:04<00:00, 256.21it/s]
100%|#############################################################################| 1300/1300 [00:05<00:00, 259.73it/s]
100%|#############################################################################| 1300/1300 [00:05<00:00, 224.19it/s]
100%|#############################################################################| 1154/1154 [00:04<00:00, 256.77it/s]
100%|#############################################################################| 1299/1299 [00:05<00:00, 258.46it/s]
100%|#############################################################################| 1274/1274 [00:04<00:00, 257.18it/s]
100%|#############################################################################| 1299/1299 [00:05<00:00, 257.75it/s]
100%|#############################################################################| 1101/1101 [00:04<00:00, 259.50it/s]


In [6]:
pro_train_df, pro_valid_df = train_test_split(pro_train_df, test_size=0.2)

In [7]:
pro_train_df.to_csv('pro_train.csv', index=False)
pro_valid_df.to_csv('pro_valid.csv', index=False)

In [8]:
train_meta_df = ag.split_csv(
    src='pro_train.csv',
    dst='pro_train',
    chunksize=100,
    meta_cols=['timestamp', 'Asset_ID', 'Target']
)
valid_meta_df = ag.split_csv(
    src='pro_valid.csv',
    dst='pro_valid',
    chunksize=100,
    meta_cols=['timestamp', 'Asset_ID', 'Target']
)

11it [00:00, 11.44it/s, spliting csv]
3it [00:00, 11.91it/s, spliting csv]


In [9]:
train_meta_df = pd.read_csv('pro_train_meta.csv')
valid_meta_df = pd.read_csv('pro_valid_meta.csv')

In [10]:
data_parser = ag.json.JsonParser(
    json_col='history',
    time_col='timestamp',
    list_dst_degree=0,
    r_list_inner_degree=0
)

factory = ag.Factory(
    data_parser=data_parser,
    raw_graph_parser_config=dict(
        targets=['Target']
    )
)
factory.analyse('pro_train')

17it [00:02,  6.21it/s, reading csv]
100%|##########################################################| 13/13 [00:00<00:00, 296.14it/s, reducing meta numbers]
100%|###########################################################| 13/13 [00:00<?, ?it/s, reducing meta numbers phase 2]
100%|##############################################################| 2/2 [00:01<00:00,  1.44it/s, reducing meta string]
100%|######################################################| 2/2 [00:01<00:00,  1.52it/s, reducing meta string phase 2]
100%|##############################################################| 15/15 [00:01<00:00, 10.58it/s, reducing meta name]
100%|######################################################| 15/15 [00:01<00:00, 10.86it/s, reducing meta name phase 2]

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\cxt\AppData\Local\Temp\jieba.cache





Loading model cost 0.514 seconds.
Prefix dict has been built successfully.


MetaInfo(num_strings=MetaString(num_strings=2), num_numbers=13, num_keys=15, num_etypes=29)

In [11]:
model = factory.produce_model(
    d_model=8,
    nhead=2,
    num_layers=[1, 1, 1, 1, 1],
    mock=True
)
model.cuda()

EncoderMetaLearnerMocker(
  num_parameters=56704
  target_bias_0: (1, 16, 1)
  target_bias_1: (1, 16, 1)
  target_bias_2: (1, 16, 1)
  target_bias_3: (1, 16, 1)
  target_bias_4: (1, 16, 1)
  number_weight: (16, 2, 4)
  key_bias: (16, 2, 4)
  query: (5, 16, 2, 4)
  node_bias: (5, 16, 2, 4)
  src_key_bias: (5, 30, 2, 4)
  dst_key_bias: (5, 30, 2, 4)
  src_value_bias: (5, 30, 2, 4)
  dst_value_bias: (5, 30, 2, 4)
  target_weight_0: (1, 16, 2, 4)
  target_weight_1: (1, 16, 2, 4)
  target_weight_2: (1, 16, 2, 4)
  target_weight_3: (1, 16, 2, 4)
  target_weight_4: (1, 16, 2, 4)
  target_hidden_bias_0_0: (1, 16, 2, 4)
  target_hidden_bias_0_1: (1, 16, 2, 4)
  target_hidden_bias_0_2: (1, 16, 2, 4)
  target_hidden_bias_0_3: (1, 16, 2, 4)
  target_hidden_bias_0_4: (1, 16, 2, 4)
  node_weight: (5, 16, 2, 4, 8)
  src_key_weight: (5, 30, 2, 4, 8)
  dst_key_weight: (5, 30, 2, 4, 8)
  src_value_weight: (5, 30, 2, 4, 8)
  dst_value_weight: (5, 30, 2, 4, 8)
  target_hidden_weight_0_0: (1, 16, 2, 4, 8)


In [12]:
train_data = factory.produce_dataloader(
    meta_df=train_meta_df,
    batch_size=64,
    num_workers=1,
    shuffle=True
)

valid_data = factory.produce_dataloader(
    meta_df=valid_meta_df,
    batch_size=64,
    num_workers=1,
    shuffle=True
)

In [13]:
def get_metric(x):
    return x['r2_score']['Target']

In [14]:
early_stop = ag.nn.EarlyStop(rounds=5, higher=True, json_path=get_metric)
trainer = ag.nn.Trainer(
    module=model,
    loss=ag.nn.DictLoss(torch.nn.L1Loss()),
    data=train_data,
    valid_data=[valid_data],
    metrics={'r2_score': ag.Metric(r2_score, label_first=True)},
    check_point='check_point',
    early_stop = early_stop
)
trainer.fit(2)

epoch 1 train: 100%|########################################################| 17/17 [00:15<00:00,  1.08it/s, loss=0.35]
epoch 1 val 0: 100%|#####################################################################| 5/5 [00:03<00:00,  1.29it/s]
epoch 1 train metrics: {"r2_score": {"Target": -2835.294}}
epoch 1 val 0 metrics: {"r2_score": {"Target": -14.988}}
save at "check_point.0.2.2\20220111235005.313668\1.all2graph.trainer"
current_epoch=1, current_metric=-14.988, best_epoch=1, best_metric=-14.988
epoch 2 train: 100%|#######################################################| 17/17 [00:14<00:00,  1.17it/s, loss=0.277]
epoch 2 val 0: 100%|#####################################################################| 5/5 [00:03<00:00,  1.31it/s]
epoch 2 train metrics: {"r2_score": {"Target": -1811.489}}
epoch 2 val 0 metrics: {"r2_score": {"Target": -29.427}}
save at "check_point.0.2.2\20220111235005.313668\2.all2graph.trainer"
current_epoch=2, current_metric=-29.427, best_epoch=1, best_metric=-14.98