In [1]:
import sys
sys.path.insert(1, '../src')
import argparse
import wandb
import os
os.environ["heuristik_data_path"] = '~/ai/Heuristik/data'
import heuristik
import numpy as np
import torch


[nltk_data] Downloading package stopwords to /home/thomas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/thomas/ai/Heuristik/data/...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/thomas/ai/Heuristik/data/...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:

# Arg Parser
#
# Notebook-local configuration expressed as command-line style options.
# Help texts use argparse's ``%(default)s`` placeholder so the documented
# default always matches the real one. argparse %-formats every help string,
# so a literal percent sign in a help text would have to be written as ``%%``;
# a bare trailing ``%`` makes ``--help`` fail with "incomplete format".

parser = argparse.ArgumentParser(description='Heuristik: Asset News Recommendations')
parser.add_argument('--data_version', default='3_large', type=str, help='Data version. Default: %(default)s')
parser.add_argument('--path', default='/home/thomas/ai/asset_sentiments/Examples/heuristik_data/', type=str, help='Path to data folder.')
parser.add_argument('--timeframe', default='3 days', type=str, help='Price data time window. Default: %(default)s')
parser.add_argument('--barriers', default='5%', type=str, help='Vertical price window barriers in percent. Default: %(default)s')
parser.add_argument('--nonbinary_sentiment', action='store_true', help='Enable positive/negative predictions for price. Default: off')
parser.add_argument('--seed', type=int, default=4, help='Random seed. Default: %(default)s')
parser.add_argument('--bert_model_name', default='bert-base-cased', help='Give Huggingface BERT model name. Default: %(default)s')
parser.add_argument('--max_len', type=int, default=50, help='Maximum number of words to keep in sample. Default: %(default)s')
parser.add_argument('--batch_size', type=int, default=32, help='Batch size. Default: %(default)s')
parser.add_argument('--epochs', type=int, default=10, help='Number of epochs. Default: %(default)s')
parser.add_argument('--pretrained_model', default='None', type=str, help='Name of pretrained model. Options: base, AAPL, MSFT,... Default: %(default)s')
parser.add_argument('--print_freq', type=int, default=50, help='Print frequency during training. Default: %(default)s')
parser.add_argument('--session', default='Huggingface_dev', type=str, help='Session name for W&B.')
parser.add_argument('--dryrun', action='store_true', help='Disable W&B logging.')

# Inside a notebook there is no real command line, so the arguments for this
# run are passed explicitly; anything not listed keeps its parser default.
args = parser.parse_args(['--data_version',  '4_long',
                          '--path', '/home/thomas/ai/Heuristik/data',
                          '--timeframe','3 days',
                          '--barriers', '5%',
                          '--epochs','10',
                          '--max_len','100',
                          '--dryrun',
                          '--batch_size','32',
                          #'--pretrained_model','BaseModel_large'
                         ])

# A dry run switches W&B into its 'dryrun' mode via the environment; this has
# to happen before wandb.init() is called below.
if args.dryrun:
    os.environ['WANDB_MODE'] = 'dryrun'

# The run name is the session name followed by the seed.
wandb.init(
    project='Heuristik',
    entity='tbachlechner',
    name=f'{args.session}_{args.seed}',
)

# Derived settings: the CLI exposes the *non*-binary switch, the rest of the
# notebook works with its negation. Two classes for binary sentiment, three
# otherwise.
args.binary_sentiment = not args.nonbinary_sentiment
args.num_classes = 2 if args.binary_sentiment else 3

# Make sure the data path ends with a slash.
if args.path[-1] != '/':
    args.path = f'{args.path}/'

print(args)

# Seed the PyTorch and NumPy random number generators with the configured seed.
torch.manual_seed(args.seed)
np.random.seed(args.seed)

# Use the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


# Load the model through heuristik (BERT model name, number of classes,
# pretrained-model name and data path come from args) and move it to the device.
model = heuristik.load_model(
    model_name=args.bert_model_name,
    n_classes=args.num_classes,
    pretrained=args.pretrained_model,
    path=args.path,
).to(device)


# Load data into dataframe
#
# `data` is the heuristik data handle configured from args; `retrieve` is
# called with download=True (the recorded run log shows prices and news being
# fetched per symbol).

data = heuristik.data(
    timeframe=args.timeframe,
    data_version=args.data_version,
    barriers=args.barriers,
    binary_sentiment=args.binary_sentiment,
)

# Each ticker is listed exactly once, in the order of its first appearance.
# The earlier list repeated 'TWTR' and 'BA', which requested those symbols
# twice.
# NOTE(review): whether the repeats also duplicated rows in the resulting
# frame depends on heuristik's retrieve(), which is not visible here.
symbols = [
    'TWTR', 'AMD', 'BP', 'PTON', 'PYPL', 'ZM', 'ACN',
    'FB', 'AAPL', 'AMZN', 'MSFT', 'TSLA', 'GOOG', 'NFLX',
    'BAC', 'XOM', 'BA', 'IGT', 'HTZ', 'UPS', 'MAR',
]

df = data.retrieve(symbols=symbols, download=True)

# Load data into loaders
#
# prepare_loaders receives the dataframe plus BERT model name, maximum sample
# length, batch size and seed from args, with a fixed test_size of 0.1.

loaders = heuristik.prepare_loaders(
    df,
    bert_model_name=args.bert_model_name,
    max_len=args.max_len,
    batch_size=args.batch_size,
    seed=args.seed,
    test_size=0.1,
)

# Unpack the three loaders: training, validation and test.
(dl_train, dl_val, dl_test) = loaders.train_val_test()


Namespace(barriers='5%', batch_size=32, bert_model_name='bert-base-cased', binary_sentiment=True, data_version='4_long', dryrun=True, epochs=10, max_len=100, nonbinary_sentiment=False, num_classes=2, path='/home/thomas/ai/Heuristik/data/', pretrained_model='None', print_freq=50, seed=4, session='Huggingface_dev', timeframe='3 days')
Downloading data.
Getting prices for HTZ.
Loading prices from file.
Downloading 5 pages from StockNews. This may take a minute...
Save data.
Data downloaded for  HTZ .
Downloading data.
Getting prices for UPS.
Downloading prices from AlphaVantage.
Extract price sentiment. Timeframe: 3 days 00:00:00. Barriers : 5.0%.
Downloading 11 pages from StockNews. This may take a minute...
Save data.
Data downloaded for  UPS .
Downloading data.
Getting prices for MAR.
Downloading prices from AlphaVantage.
Extract price sentiment. Timeframe: 3 days 00:00:00. Barriers : 5.0%.
Downloading 7 pages from StockNews. This may take a minute...
Save data.
Data downloaded for  MA

In [3]:
# Train the model on dl_train, validating on dl_val, for args.epochs epochs.
# file_name is passed as an empty string.
# NOTE(review): the saved output of this cell ends in "CUDA out of memory"
# during epoch 1 (batch_size=32, max_len=100 on a 5.79 GiB GPU) — consider a
# smaller batch size or max_len, or freeing GPU memory held by other processes.
heuristik.train_model(
    epochs=args.epochs,
    model=model,
    dl_train=dl_train,
    dl_val=dl_val,
    file_name='',
    device=device,
    print_freq=args.print_freq,
)

Correct ratio: 0.12
Epoch:  0 Val. Loss: 0.044.  Acc: 44. F1:  0.213
Epoch 1/10
----------


RuntimeError: CUDA out of memory. Tried to allocate 20.00 MiB (GPU 0; 5.79 GiB total capacity; 2.44 GiB already allocated; 15.12 MiB free; 2.57 GiB reserved in total by PyTorch)