In [1]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import TrainingArguments, Trainer
from datasets import Dataset

from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd 
import numpy as np 
import tqdm
import yfinance as yf

import data_read
import preprocessing


In [3]:
# Load the pre-trained FinBERT checkpoint with a new 2-class classification head.
# This model is fine-tuned on the Financial PhraseBank split prepared in the next cell.
# num_labels configures the model head, so it is passed to the model only and not
# to the tokenizer (which has no such setting).
tokenizer = AutoTokenizer.from_pretrained('yiyanghkust/finbert-pretrain')
model = AutoModelForSequenceClassification.from_pretrained('yiyanghkust/finbert-pretrain', num_labels=2)


Some weights of the model checkpoint at yiyanghkust/finbert-pretrain were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [40]:
# Load the Financial PhraseBank sentences and hold out 20% of them for evaluation.
phrases, labels = data_read.load_fin_pharsebank()
fin_phrases_train, fin_phrases_test, fin_y_train, fin_y_test = train_test_split(
    phrases, labels, test_size=0.2, random_state=3
)


def _tokenize_fin_batch(examples):
    """Tokenize a batch of phrases, padded/truncated to exactly 100 tokens."""
    return tokenizer(
        examples["phrase"],
        padding="max_length",
        max_length=100,
        truncation=True,
        add_special_tokens=True,
    )


# Wrap the training split in a Dataset and add the tokenizer outputs as columns.
fin_dataset_train = Dataset.from_dict(
    {"labels": fin_y_train, "phrase": fin_phrases_train}
).map(_tokenize_fin_batch, batched=True)

training_args = TrainingArguments(output_dir="test_trainer")

from sklearn import metrics
import numpy as np 

def compute_metrics(eval_pred):
    """Compute classification accuracy for a Trainer evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        dict: ``{"accuracy": <float>}``. Trainer expects ``compute_metrics`` to
        return a mapping of metric name to value, so the score is wrapped in a
        dict instead of being returned as a bare float.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Accuracy = fraction of examples whose predicted class equals the label.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}

# Fine-tune the PhraseBank model on the tokenized training split.
# No evaluation dataset is passed, so compute_metrics is only registered here.
trainer = Trainer(model=model, args=training_args,
                  train_dataset=fin_dataset_train, compute_metrics=compute_metrics)
trainer.train()

100%|██████████| 4/4 [00:00<00:00,  4.02ba/s]


'\ntraining_args = TrainingArguments(output_dir="test_trainer")\n\nfrom sklearn import metrics\nimport numpy as np \n\ndef compute_metrics(eval_pred):\n    logits, labels = eval_pred\n    predictions = np.argmax(logits, axis=-1)\n    return metrics.accuracy_score(predictions, labels)\n\ntrainer = Trainer(\n    model=model,\n    args=training_args,\n    train_dataset=fin_dataset_train,\n    compute_metrics=compute_metrics,\n)\n\ntrainer.train()'

In [42]:
# Score the held-out PhraseBank sentences with the fine-tuned model.
# Inputs are padded/truncated to 100 tokens, the same setting used when the
# training split was tokenized, so inference sees inputs shaped like training.
finbert = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer,
                   max_length=100, truncation=True, padding="max_length")
results = finbert(fin_phrases_test)
# Show a small sample only; printing every prediction floods the notebook output.
print(results[:5])

In [None]:
# The model was created with num_labels=2 and no custom label names, so the
# pipeline reports the generic names "LABEL_0" / "LABEL_1". Comparing against
# 'positive' would never match and every prediction would collapse to 0.
# NOTE(review): assumes fin_y_test encodes the positive class as 1 - confirm
# against data_read.load_fin_pharsebank.
fin_y_pred = [1 if result['label'] == 'LABEL_1' else 0 for result in results]
print("classification report:\n", metrics.classification_report(fin_y_test, fin_y_pred))

In [None]:

# -- Generate our automatically-labelled Reuters dataset --
headers = data_read.parse_news_data()

# Fetch the S&P 500 ticker names and their daily price history from Yahoo Finance.
tickers = data_read.get_sp500_ticker_names()
sp500_tickers = yf.Tickers(" ".join(tickers.keys()))
market_data = sp500_tickers.history(
    interval="1d", start="2006-10-20", end="2013-11-20", actions=False
)
market_data.index = pd.to_datetime(market_data.index)

# Drop rows (days) and columns that contain no data at all.
market_data = market_data.dropna(axis=0, how="all").dropna(axis=1, how="all")

# Each column key is a pair; the second element is the ticker symbol.
_price_fields, symbols = zip(*market_data.columns)
traded_symbols = set(symbols)

# Per-stock intraday move in percent: 100 * (Close / Open - 1), stored under ("Change", symbol).
for ticker_symbol in traded_symbols:
    open_price = market_data[("Open", ticker_symbol)]
    close_price = market_data[("Close", ticker_symbol)]
    market_data[("Change", ticker_symbol)] = 100 * (close_price / open_price - 1)

# Unweighted index: for each day, the sum of the per-stock % changes divided by
# the number of stocks that have a (non-NaN) change on that day.
daily_changes = market_data["Change"]
stocks_with_data = daily_changes.notna().sum(axis=1)
snp_index = daily_changes.sum(axis=1) / stocks_with_data
market_data[("Change", "SNP_INDX")] = snp_index

# Build the labelled news dataset.
# Retrieve only the Reuters news that mention S&P 500 companies. A headline is
# kept only if, on its publication day, the company's stock moved by more than
# +-STOCK_MOVE_PCT percent while the unweighted S&P index did not itself move by
# INDEX_MOVE_PCT percent or more in the same direction (so the move is not just
# the whole market). Kept headlines carry the stock's % change in slot 2; all
# others get None there and are dropped afterwards. The sign of the change is
# later turned into the label: rose -> 1 (positive), fell -> 0 (negative).
STOCK_MOVE_PCT = 2      # minimum absolute open-to-close move of the stock, in percent
INDEX_MOVE_PCT = 1.2    # index moves at or beyond this (same direction) disqualify the headline

relevant_news = data_read.get_relevant_news(traded_symbols, tickers, headers)
for news_item in relevant_news:
    # Each item is a mutable 4-element record: date, symbol, day change, header.
    date, symbol, _, _ = news_item
    trading_day = pd.to_datetime(date)  # parsed once and reused for all lookups

    day_change = None
    if trading_day in market_data.index:
        ticker_day_change = market_data.loc[trading_day, ("Change", symbol)]
        sp_index_change = market_data.loc[trading_day, ("Change", "SNP_INDX")]
        rose = ticker_day_change > STOCK_MOVE_PCT and sp_index_change < INDEX_MOVE_PCT
        fell = ticker_day_change < -STOCK_MOVE_PCT and sp_index_change > -INDEX_MOVE_PCT
        if pd.notna(ticker_day_change) and (rose or fell):
            day_change = ticker_day_change

    # Always write the slot explicitly, so headlines that fail the thresholds are
    # marked None instead of keeping whatever placeholder the record came with.
    news_item[2] = day_change

# Assemble the labelled frame: one row per headline, indexed by date, keeping
# only rows where every field (including the day change) is present.
reuters_labeled = (
    pd.DataFrame(relevant_news, columns=["date", "symbol", "day_change", "header"])
    .set_index("date")
    .dropna(axis=0, how="any")
)

# Keep strictly positive or strictly negative moves. Take an explicit copy so the
# label assignments below write to an independent frame rather than to a filtered
# selection (which triggers pandas' SettingWithCopyWarning).
has_direction = (reuters_labeled["day_change"] > 0) | (reuters_labeled["day_change"] < 0)
reuters_labeled = reuters_labeled[has_direction].copy()

# Turn the % change into a binary label: rose -> 1, fell -> 0.
# Order matters: after the first assignment the positives equal 1, which is not < 0.
reuters_labeled.loc[reuters_labeled["day_change"] > 0, "day_change"] = 1
reuters_labeled.loc[reuters_labeled["day_change"] < 0, "day_change"] = 0


In [None]:
# Load a second, untouched copy of the pre-trained FinBERT checkpoint with a new
# 2-class head, to be fine-tuned on the auto-labelled Reuters headlines.
# num_labels configures the model head, so it is passed to the model only and not
# to the tokenizer (which has no such setting).
tokenizer = AutoTokenizer.from_pretrained('yiyanghkust/finbert-pretrain')
reuters_model = AutoModelForSequenceClassification.from_pretrained('yiyanghkust/finbert-pretrain', num_labels=2)

In [None]:
# Headlines and their 0/1 labels from the auto-labelled Reuters frame.
labeled_headers = reuters_labeled['header']
labels = reuters_labeled['day_change']

# Clean each headline: remove ticker mentions, then run preprocessing.NER_processing.
ticker_re = preprocessing.get_ticker_re()
reuters_headers = [
    preprocessing.NER_processing(preprocessing.remove_tickers(ticker_re, raw_header))
    for raw_header in labeled_headers
]

# Hold out 20% of the headlines; the rest is used to fine-tune BERT on the
# auto-labelled dataset.
docs_train, docs_test, y_train, y_test = train_test_split(
    reuters_headers, labels, test_size=0.2, random_state=3
)
y_train = list(map(int, y_train))
y_test = list(map(int, y_test))


def _tokenize_reuters_batch(examples):
    """Tokenize a batch of headlines, padded/truncated to exactly 100 tokens."""
    return tokenizer(
        examples["phrase"],
        padding="max_length",
        max_length=100,
        truncation=True,
    )


# Wrap the training split in a Dataset and add the tokenizer outputs as columns.
dataset_train = Dataset.from_dict(
    {"labels": y_train, "phrase": docs_train}
).map(_tokenize_reuters_batch, batched=True)

print(dataset_train)

training_args = TrainingArguments(output_dir="test_trainer")

def compute_metrics(eval_pred):
    """Compute classification accuracy for a Trainer evaluation pass.

    Args:
        eval_pred: A ``(logits, labels)`` pair; the predicted class of each
            example is the argmax of its logits over the last axis.

    Returns:
        dict: ``{"accuracy": <float>}``. Trainer expects ``compute_metrics`` to
        return a mapping of metric name to value, so the score is wrapped in a
        dict instead of being returned as a bare float.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    # Accuracy = fraction of examples whose predicted class equals the label.
    accuracy = float(np.mean(predictions == np.asarray(labels)))
    return {"accuracy": accuracy}

# Fine-tune the Reuters model on the tokenized auto-labelled training split.
# No evaluation dataset is passed, so compute_metrics is only registered here.
trainer = Trainer(model=reuters_model, args=training_args,
                  train_dataset=dataset_train, compute_metrics=compute_metrics)
trainer.train()

In [52]:
# Score the held-out Reuters headlines with the fine-tuned model, padding and
# truncating to 100 tokens exactly as the training split was tokenized.
tokenizer_options = dict(max_length=100, truncation=True, padding="max_length")
finbert = pipeline(
    "sentiment-analysis",
    model=reuters_model,
    tokenizer=tokenizer,
    **tokenizer_options,
)
results = finbert(docs_test)


In [54]:
# Map the pipeline's label names to integers: "LABEL_1" -> 1, anything else -> 0.
y_pred = [int(prediction['label'] == 'LABEL_1') for prediction in results]
report = metrics.classification_report(y_test, y_pred)
print("classification report:\n", report)


classification report:
               precision    recall  f1-score   support

           0       0.60      0.61      0.60       531
           1       0.54      0.53      0.54       467

    accuracy                           0.57       998
   macro avg       0.57      0.57      0.57       998
weighted avg       0.57      0.57      0.57       998

