In [1]:
import pandas as pd
import os
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, RobertaConfig
import sys

from smart_pytorch import SMARTLoss, kl_loss, sym_kl_loss
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
from torch.utils.data import (
    Dataset, 
    DataLoader, 
    RandomSampler, 
    SequentialSampler
)
import math 
from transformers.optimization import (
    AdamW, 
    get_linear_schedule_with_warmup
)
from sklearn.metrics import (
    confusion_matrix,
    matthews_corrcoef,
    accuracy_score,
    roc_curve,
    auc,
    average_precision_score,
    f1_score,
)
from scipy.special import softmax
from torch.nn import CrossEntropyLoss
from tqdm import tqdm
import datasets

In [2]:
print(torch.version.cuda)
print(torch.__version__)
print(torch.cuda.is_available())
print(torch.backends.cudnn.enabled)
print(sys.version)

12.1
2.2.1
True
True
3.11.5 | packaged by Anaconda, Inc. | (main, Sep 11 2023, 13:26:23) [MSC v.1916 64 bit (AMD64)]


In [3]:
train_data = datasets.load_dataset('zeroshot/twitter-financial-news-sentiment', split='train')
test_data = datasets.load_dataset('zeroshot/twitter-financial-news-sentiment', split='validation')

Found cached dataset csv (C:/Users/lenovo/.cache/huggingface/datasets/zeroshot___csv/zeroshot--twitter-financial-news-sentiment-a86f08210b79b812/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)
Found cached dataset csv (C:/Users/lenovo/.cache/huggingface/datasets/zeroshot___csv/zeroshot--twitter-financial-news-sentiment-a86f08210b79b812/0.0.0/6954658bab30a358235fa864b05cf819af0e179325c740e4bc853bcc7ec513e1)


In [4]:
train_data['text']

['$BYND - JPMorgan reels in expectations on Beyond Meat https://t.co/bd0xbFGjkT',
 '$CCL $RCL - Nomura points to bookings weakness at Carnival and Royal Caribbean https://t.co/yGjpT2ReD3',
 '$CX - Cemex cut at Credit Suisse, J.P. Morgan on weak building outlook https://t.co/KN1g4AWFIb',
 '$ESS: BTIG Research cuts to Neutral https://t.co/MCyfTsXc2N',
 '$FNKO - Funko slides after Piper Jaffray PT cut https://t.co/z37IJmCQzB',
 '$FTI - TechnipFMC downgraded at Berenberg but called Top Pick at Deutsche Bank https://t.co/XKcPDilIuU',
 '$GM - GM loses a bull https://t.co/tdUfG5HbXy',
 '$GM: Deutsche Bank cuts to Hold https://t.co/7Fv1ZiFZBS',
 '$GTT: Cowen cuts to Market Perform',
 '$HNHAF $HNHPD $AAPL - Trendforce cuts iPhone estimate after Foxconn delay https://t.co/rlnEwzlzzS',
 "$HOG - Moody's warns on Harley-Davidson https://t.co/LurHBEadeU",
 '$HXL - Citing aero ties, Wells slashes PT on Hexcel https://t.co/wU5P2i8WBU',
 '$I - Intelsat cut to Market Perform at Raymond James https://t.c

In [5]:
model_name = "roberta-large"

num_labels = 3
device = torch.device("cuda")

tokenizer_name = model_name

max_seq_length = 128
train_batch_size = 8
test_batch_size = 8
warmup_ratio = 0.06
weight_decay=0.0
gradient_accumulation_steps = 1
num_train_epochs = 15
learning_rate = 1e-05
adam_epsilon = 1e-08

In [6]:
class SMARTRobertaClassificationModel(nn.Module):
    
    def __init__(self, model, weight = 0.02):
        super().__init__()
        self.model = model 
        self.weight = weight

    def forward(self, input_ids, attention_mask, labels):

        # Get initial embeddings 
        embed = self.model.roberta.embeddings(input_ids) 

        # Define eval function 
        def eval(embed):
            outputs = self.model.roberta(inputs_embeds=embed, attention_mask=attention_mask)
            pooled = outputs[0] 
            logits = self.model.classifier(pooled) 
            return logits 
        
        # Define SMART loss
        smart_loss_fn = SMARTLoss(eval_fn = eval, loss_fn = kl_loss, loss_last_fn = sym_kl_loss)
        # Compute initial (unperturbed) state 
        state = eval(embed)
        # Apply classification loss 
        loss = F.cross_entropy(state.view(-1, 3), labels.view(-1))
        # Apply smart loss 
        loss += self.weight * smart_loss_fn(embed, state)
        
        return state, loss
    
tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained('./roberta_pretrained_fin')

config = RobertaConfig.from_pretrained(model_name, num_labels=num_labels)
model = AutoModelForSequenceClassification.from_pretrained('./roberta_pretrained_fin_0.5_e1', config = config)

model_smart = SMARTRobertaClassificationModel(model)

  torch.utils._pytree._register_pytree_node(
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./roberta_pretrained_fin_0.5_e1 and are newly initialized: ['classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
print('Model=\n',model_smart,'\n')

Model=
 SMARTRobertaClassificationModel(
  (model): RobertaForSequenceClassification(
    (roberta): RobertaModel(
      (embeddings): RobertaEmbeddings(
        (word_embeddings): Embedding(50265, 1024, padding_idx=1)
        (position_embeddings): Embedding(514, 1024, padding_idx=1)
        (token_type_embeddings): Embedding(1, 1024)
        (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): RobertaEncoder(
        (layer): ModuleList(
          (0-23): 24 x RobertaLayer(
            (attention): RobertaAttention(
              (self): RobertaSelfAttention(
                (query): Linear(in_features=1024, out_features=1024, bias=True)
                (key): Linear(in_features=1024, out_features=1024, bias=True)
                (value): Linear(in_features=1024, out_features=1024, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): RobertaS

In [8]:
class MyClassificationDataset(Dataset):
    
    def __init__(self, data, tokenizer):
        text, labels = data
        self.examples = tokenizer(text=text,text_pair=None,truncation=True,padding="max_length",
                                  max_length=max_seq_length,return_tensors="pt")
        self.labels = torch.tensor(labels, dtype=torch.long)
        

    def __len__(self):
        return len(self.examples["input_ids"])

    def __getitem__(self, index):
        return {key: self.examples[key][index] for key in self.examples}, self.labels[index]

train_examples = (train_data['text'], train_data['label'])
train_dataset = MyClassificationDataset(train_examples,tokenizer)

test_examples = (test_data['text'], test_data['label'])
test_dataset = MyClassificationDataset(test_examples,tokenizer)

In [9]:
def get_inputs_dict(batch):
    inputs = {key: value.squeeze(1).to(device) for key, value in batch[0].items()}
    inputs["labels"] = batch[1].to(device)
    return inputs

train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset,shuffle=True,batch_size=train_batch_size)

test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset,sampler=test_sampler, batch_size=test_batch_size)

#Extract a batch as sanity-check
# batch = get_inputs_dict(next(iter(train_dataloader)))
# input_ids = batch['input_ids'].to(device)
# attention_mask = batch['attention_mask'].to(device)
# labels = batch['labels'].to(device)

# print(batch)

In [10]:
t_total = len(train_dataloader) // gradient_accumulation_steps * num_train_epochs
optimizer_grouped_parameters = []
custom_parameter_names = set()
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters.extend(
    [
        {
            "params": [
                p
                for n, p in model_smart.named_parameters()
                if n not in custom_parameter_names and not any(nd in n for nd in no_decay)
            ],
            "weight_decay": weight_decay,
        },
        {
            "params": [
                p
                for n, p in model_smart.named_parameters()
                if n not in custom_parameter_names and any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
)

warmup_steps = math.ceil(t_total * warmup_ratio)
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)




In [11]:
def compute_metrics(preds, model_outputs, labels, eval_examples=None, multi_label=True):
    assert len(preds) == len(labels)
    mismatched = labels != preds
    #wrong = [i for (i, v) in zip(eval_examples, mismatched) if v.any()]
    mcc = matthews_corrcoef(labels, preds)
    acc = accuracy_score(labels, preds)
    f1 = f1_score(labels, preds, average='macro')
    con_m = confusion_matrix(labels, preds, labels=[0, 1, 2])
#     scores = np.array([softmax(element)[1] for element in model_outputs])
#     fpr, tpr, thresholds = roc_curve(labels, scores)
#     auroc = auc(fpr, tpr)
#     auprc = average_precision_score(labels, scores)
    return (
        {
            **{"mcc": mcc, "acc":acc, "f1": f1},
        },
        con_m
    )

def print_confusion_matrix(result):
    print('confusion matrix:')
    print('            predicted    ')
    print('          0     |     1')
    print('    ----------------------')
    print('   0 | ',format(result['tn'],'5d'),' | ',format(result['fp'],'5d'))
    print('gt -----------------------')
    print('   1 | ',format(result['fn'],'5d'),' | ',format(result['tp'],'5d'))
    print('---------------------------------------------------')


In [12]:
torch.cuda.empty_cache()
model_smart.to(device)

PATH = "SMART_Roberta_large_FinancialTweets/exp8-0.5/"+str(6)
state_dict = torch.load(PATH)
torch.cuda.empty_cache()
state_dict

OrderedDict([('model.roberta.embeddings.position_ids',
              tensor([[  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
                        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,
                        28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,
                        42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,
                        56,  57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,
                        70,  71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,
                        84,  85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,
                        98,  99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
                       112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
                       126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139,
               

In [None]:
# 查询模型中key值
# model = model_smart # Replace YourModelClass with your actual model class
# print("Model state_dict keys:")
# for key in model.state_dict().keys():
#     print(key)


model_smart.load_state_dict(state_dict, strict=False)
model_smart.zero_grad()

# Initialize GradScaler for mixed precision training
from torch.cuda.amp import autocast, GradScaler
scaler = GradScaler()

for epoch in range(num_train_epochs):

    model_smart.train()
    epoch_loss = []

    for batch in tqdm(train_dataloader):
        batch = get_inputs_dict(batch)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        # Original Code
        # logits, loss = model_smart(input_ids, attention_mask=attention_mask, labels=labels)
        # loss.backward()
        # optimizer.step()
        
        # Use mixed precision training
        with autocast():
            logits, loss = model_smart(input_ids, attention_mask=attention_mask, labels=labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        # Free up GPU memory
        torch.cuda.empty_cache()
        
        scheduler.step()
        model_smart.zero_grad()
        epoch_loss.append(loss.item())

#    SAVE
    PATH = "SMART_Roberta_large_FinancialTweets/exp8-0.5/"+str(epoch)
    torch.save(model_smart.state_dict(), PATH)

    print('epoch',epoch,'Training avg loss',np.mean(epoch_loss))


 21%|██        | 247/1193 [43:44<2:47:58, 10.65s/it]

In [None]:
model_smart.to(device)

for epoch in range(15):
    
    #evaluate model with test_df at the end of the epoch.
    eval_loss = 0.0
    nb_eval_steps = 0
    n_batches = len(test_dataloader)
    preds = np.empty((len(test_dataset), num_labels))
    out_label_ids = np.empty((len(test_dataset)))
    PATH = "SMART_Roberta_large_FinancialTweets/exp1/"+str(epoch)
    model_smart.load_state_dict(torch.load(PATH))
    model_smart.eval()
    
    for i,test_batch in enumerate(test_dataloader):
#         with torch.no_grad():
        test_batch = get_inputs_dict(test_batch)
        input_ids = test_batch['input_ids'].to(device)
        attention_mask = test_batch['attention_mask'].to(device)
        labels = test_batch['labels'].to(device)
        logits, tmp_eval_loss = model_smart(input_ids, attention_mask=attention_mask, labels=labels)
    #             tmp_eval_loss, logits = outputs[:2]
        eval_loss += tmp_eval_loss.item()
            
        nb_eval_steps += 1
        start_index = test_batch_size * i
        end_index = start_index + test_batch_size if i != (n_batches - 1) else len(test_dataset)
#         print(logits)
        preds[start_index:end_index] = logits.detach().cpu().numpy()
        out_label_ids[start_index:end_index] = test_batch["labels"].detach().cpu().numpy()
        
    eval_loss = eval_loss / nb_eval_steps
    model_outputs = preds
    preds = np.argmax(preds, axis=1)
    result, con_m = compute_metrics(preds, model_outputs, out_label_ids)
    
    #print('epoch',epoch,'Training avg loss',np.mean(epoch_loss))
    print('epoch',epoch,'Testing  avg loss',eval_loss)
    print(result)
    print(con_m)
    print('---------------------------------------------------\n')

In [None]:
# df_sample =  pd.read_csv("../data/tweets/stockerbot-export-test-2.csv")

df_sample =  pd.read_csv("../data/tweets.csv")
df_sample

new_labels=np.zeros(704)

# for l in df_sample['label'].tolist():
#     if l == 2:
#         new_labels.append(1)
#     elif l==1:
#         new_labels.append(2)
#     else:
#         new_labels.append(0)
# print(new_labels)
sample_examples = (df_sample['text'].astype(str).tolist(), new_labels)
# sample_examples = (df_sample['clean_text'].astype(str).tolist(), new_labels)
sample_dataset = MyClassificationDataset(sample_examples,tokenizer)

sample_dataloader = DataLoader(sample_dataset,shuffle=False,batch_size=test_batch_size)

print(sample_dataloader)

In [None]:
model_smart.to(device)
pred_final = []
for epoch in range(5,7):
    
    #evaluate model with test_df at the end of the epoch.
    eval_loss = 0.0
    nb_eval_steps = 0
    n_batches = len(sample_dataloader)
    preds = np.empty((len(sample_dataset), num_labels))
    out_label_ids = np.empty((len(sample_dataset)))
    
    PATH = "SMART_Roberta_large_FinancialTweets/exp8-0.5/"+str(epoch)
    model_smart.load_state_dict(torch.load(PATH))
    model_smart.eval()
    
    for i,test_batch in enumerate(sample_dataloader):
#         with torch.no_grad():
        test_batch = get_inputs_dict(test_batch)
        input_ids = test_batch['input_ids'].to(device)
        attention_mask = test_batch['attention_mask'].to(device)
        labels = test_batch['labels'].to(device)
        logits, tmp_eval_loss = model_smart(input_ids, attention_mask=attention_mask, labels=labels)
    #             tmp_eval_loss, logits = outputs[:2]
        eval_loss += tmp_eval_loss.item()
            
        nb_eval_steps += 1
        start_index = test_batch_size * i
        end_index = start_index + test_batch_size if i != (n_batches - 1) else len(sample_dataset)
#         print(logits)
        preds[start_index:end_index] = logits.detach().cpu().numpy()
        out_label_ids[start_index:end_index] = test_batch["labels"].detach().cpu().numpy()
        
    eval_loss = eval_loss / nb_eval_steps
    model_outputs = preds
    preds = np.argmax(preds, axis=1)
    result, con_m = compute_metrics(preds, model_outputs, out_label_ids)
    if epoch == 6:
        pred_final = preds 
    
    #print('epoch',epoch,'Training avg loss',np.mean(epoch_loss))
#     print('epoch',epoch,'Testing  avg loss',eval_loss)
#     print(result)
#     print(con_m)
#     print('---------------------------------------------------\n')

In [None]:
pred_final_df=pd.DataFrame(pred_final)
pred_final_df.columns = ['pred_sen']
df_sample = df_sample.join(pred_final_df)

In [None]:
df_sample.to_csv("../data/tweets.csv",index=False)

In [None]:
df_sample

In [None]:
model.to(device)

model.zero_grad()

for epoch in range(num_train_epochs):

    model.train()
    epoch_loss = []
    
    for batch in tqdm(train_dataloader):
        batch = get_inputs_dict(batch)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        scheduler.step()
        model.zero_grad()
        epoch_loss.append(loss.item())
        
#    SAVE
    PATH = "SMART_Roberta_large_FinancialTweets/exp_roberta/"+str(epoch)
    torch.save(model.state_dict(), PATH)
    
        
#     evaluate model with test_df at the end of the epoch.
    eval_loss = 0.0
    nb_eval_steps = 0
    n_batches = len(test_dataloader)
    preds = np.empty((len(test_dataset), num_labels))
    out_label_ids = np.empty((len(test_dataset)))
    model.eval()
    
    for i,test_batch in enumerate(test_dataloader):
        with torch.no_grad():
            test_batch = get_inputs_dict(test_batch)
            input_ids = test_batch['input_ids'].to(device)
            attention_mask = test_batch['attention_mask'].to(device)
            labels = test_batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            tmp_eval_loss, logits = outputs[:2]
            eval_loss += tmp_eval_loss.item()
            
        nb_eval_steps += 1
        start_index = test_batch_size * i
        end_index = start_index + test_batch_size if i != (n_batches - 1) else len(test_dataset)
        preds[start_index:end_index] = logits.detach().cpu().numpy()
        out_label_ids[start_index:end_index] = test_batch["labels"].detach().cpu().numpy()
        
    eval_loss = eval_loss / nb_eval_steps
    model_outputs = preds
    preds = np.argmax(preds, axis=1)
    result, con_m = compute_metrics(preds, model_outputs, out_label_ids)
    
    print('epoch',epoch,'Training avg loss',np.mean(epoch_loss))
    print('epoch',epoch,'Testing  avg loss',eval_loss)
    print(result) 
    print(con_m)
    print('---------------------------------------------------\n')
