In [1]:
import sys
# Add ../src to the module search path (presumably where the local
# `heuristik` package lives — confirm against the repo layout).
sys.path.insert(1, '../src')
import argparse
import wandb
import os
# Set before `import heuristik` below; presumably the package reads this
# variable to locate its data folder — TODO confirm in the package source.
os.environ["heuristik_data_path"] = '~/ai/Heuristik/data'
import heuristik
import numpy as np
import torch


[nltk_data] Downloading package punkt to
[nltk_data]     /home/thomas/ai/Heuristik/data/...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/thomas/ai/Heuristik/data/...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:

# Arg Parser
#
# Help texts use argparse's ``%(default)s`` placeholder so the advertised
# default always matches the real one. argparse %-formats every help string,
# so a bare ``%`` (as in "Default: 5%") makes ``--help`` fail; the placeholder
# avoids that because the substituted value is inserted verbatim.

parser = argparse.ArgumentParser(description='Heuristik: Asset News Recommendations')
parser.add_argument('--data_version', default='3_large', type=str, help='Data version. Default: %(default)s')
parser.add_argument('--path', default='/home/thomas/ai/asset_sentiments/Examples/heuristik_data/', type=str, help='Path to data folder.')
parser.add_argument('--timeframe', default='3 days', type=str, help='Price data time window. Default: %(default)s')
parser.add_argument('--barriers', default='5%', type=str, help='Vertical price window barriers in percent. Default: %(default)s')
parser.add_argument('--nonbinary_sentiment', action='store_true', help='Enable positive/negative predictions for price (three classes instead of two).')
parser.add_argument('--seed', type=int, default = 6, help='Random seed for NumPy and PyTorch. Default: %(default)s')
parser.add_argument('--bert_model_name', default = 'bert-base-cased', help='Give Huggingface BERT model name. Default: %(default)s')
parser.add_argument('--max_len', type=int, default = 50, help='Maximum number of words to keep in sample. Default: %(default)s')
parser.add_argument('--batch_size', type=int, default = 32, help='Batch size. Default: %(default)s')
parser.add_argument('--epochs', type=int, default = 10, help='Number of epochs. Default: %(default)s')
parser.add_argument('--pretrained_model', default='None', type=str, help='Name of pretrained model. Options: base, AAPL, MSFT,... Default: %(default)s')
parser.add_argument('--print_freq', type=int, default = 50, help='Print frequency during training. Default: %(default)s')
parser.add_argument('--session', default='rezero_dev', type=str, help='Session name for W&B. Default: %(default)s')
parser.add_argument('--dryrun', action='store_true', help='Disable W&B logging.')

# Notebook stand-in for a command line: flag/value pairs handed to the parser.
cli_args = [
    '--data_version', '4_long',
    '--path', '/home/thomas/ai/Heuristik/data',
    '--timeframe', '3 days',
    '--barriers', '5%',
    '--epochs', '10',
    '--max_len', '100',
    '--batch_size', '32',
    '--pretrained_model', 'BaseModel_rezero',
]
# cli_args.append('--dryrun')  # uncomment to disable W&B logging

args = parser.parse_args(cli_args)

# With --dryrun, switch W&B to dry-run mode before the run is created.
if args.dryrun:
    os.environ['WANDB_MODE'] = 'dryrun'

# The run name combines the session label and the seed, e.g. "rezero_dev_6".
run_name = f"{args.session}_{args.seed}"
wandb.init(project='Heuristik', entity='tbachlechner', name=run_name)

# Derive the label setup from the CLI flag: two classes for binary sentiment,
# three otherwise.
args.binary_sentiment = not args.nonbinary_sentiment
args.num_classes = 2 if args.binary_sentiment else 3

# Echo the full configuration for the record.
print(args)

# Seed the NumPy and PyTorch random number generators from --seed
for seed_fn in (np.random.seed, torch.manual_seed):
    seed_fn(args.seed)

# Use the first CUDA device when one is available, otherwise the CPU
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


# Build the classifier with ReZero enabled and place it on the selected device.
# NOTE(review): pretrained='None' is hard-coded rather than taken from
# --pretrained_model; presumably this means "start without saved weights" — confirm.
model = heuristik.load_model(
    model_name=args.bert_model_name,
    n_classes=args.num_classes,
    pretrained='None',
    path=args.path,
    rezero=True,
).to(device)


# Ticker symbols whose samples are retrieved for this run
SYMBOLS = [
    'TWTR', 'AMD', 'BP', 'PTON', 'PYPL', 'ZM',
    'ACN', 'FB', 'AAPL', 'AMZN', 'MSFT', 'TSLA',
    'GOOG', 'NFLX', 'BAC', 'XOM', 'BA', 'IGT',
]

# Configure the dataset from the parsed arguments
data = heuristik.data(
    timeframe=args.timeframe,
    data_version=args.data_version,
    barriers=args.barriers,
    binary_sentiment=args.binary_sentiment,
)

# Retrieve the samples for all symbols into a dataframe
df = data.retrieve(symbols=SYMBOLS, download=True)

# Wrap the dataframe in loaders and split into train / validation / test
loaders = heuristik.prepare_loaders(
    df,
    bert_model_name=args.bert_model_name,
    max_len=args.max_len,
    batch_size=args.batch_size,
    seed=args.seed,
    test_size=0.1,
)
dl_train, dl_val, dl_test = loaders.train_val_test()


Namespace(barriers='5%', batch_size=32, bert_model_name='bert-base-cased', binary_sentiment=True, data_version='4_long', dryrun=False, epochs=10, max_len=100, nonbinary_sentiment=False, num_classes=2, path='/home/thomas/ai/Heuristik/data', pretrained_model='BaseModel_rezero', print_freq=50, seed=6, session='rezero_dev', timeframe='3 days')
Successfully retrieved 43.9k samples.


In [None]:
# Collect the training configuration first, then launch the run.
# 'BaseModel_rezero' is passed as file_name (presumably the checkpoint name — confirm).
train_kwargs = dict(
    epochs=args.epochs,
    model=model,
    dl_train=dl_train,
    dl_val=dl_val,
    file_name='BaseModel_rezero',
    device=device,
    print_freq=args.print_freq,
)
heuristik.train_model(**train_kwargs)

Correct ratio: 0.16
Epoch:  0 Val. Loss: 0.032.  Acc: 86. F1:  0.049
Epoch 1/10
----------
Batch:  50/1235. Train Loss: 0.038  Acc: 79 
Batch: 100/1235. Train Loss: 0.036  Acc: 81 
Batch: 150/1235. Train Loss: 0.033  Acc: 83 
Batch: 200/1235. Train Loss: 0.032  Acc: 84 
Batch: 250/1235. Train Loss: 0.030  Acc: 85 
Batch: 300/1235. Train Loss: 0.029  Acc: 86 
Batch: 350/1235. Train Loss: 0.029  Acc: 86 
Batch: 400/1235. Train Loss: 0.029  Acc: 86 
Batch: 450/1235. Train Loss: 0.028  Acc: 86 
Batch: 500/1235. Train Loss: 0.028  Acc: 86 
Batch: 550/1235. Train Loss: 0.028  Acc: 86 
Batch: 600/1235. Train Loss: 0.028  Acc: 86 
Batch: 650/1235. Train Loss: 0.027  Acc: 86 
Batch: 700/1235. Train Loss: 0.027  Acc: 86 
Batch: 750/1235. Train Loss: 0.027  Acc: 86 
Batch: 800/1235. Train Loss: 0.027  Acc: 87 
Batch: 850/1235. Train Loss: 0.027  Acc: 87 
Batch: 900/1235. Train Loss: 0.027  Acc: 87 
Batch: 950/1235. Train Loss: 0.027  Acc: 87 
Batch: 1000/1235. Train Loss: 0.026  Acc: 87 
Batch: 1