In [28]:
import sys
sys.path.append('..')
from sentiment_predict.predict_sentiment_tools import *

In [None]:
posts = get_targets_posts("ETH")
result = get_sentiment_score_seq(posts)
result

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from datasets import load_dataset

# Load dataset from Hugging Face
dataset = load_dataset("arad1367/Crypto_Fundamental_News")
df = pd.DataFrame(dataset['train'])

label_mapping = {"positive": 2, "neutral": 1, "negative": 0}
df["label"] = df["label"].map(label_mapping)

train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df["text"].tolist(), df["label"].tolist(), test_size=0.2, random_state=42
)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts, temp_labels, test_size=0.5, random_state=42
)

print(f"Training samples: {len(train_texts)}")
print(f"Validation samples: {len(val_texts)}")
print(f"Test samples: {len(test_texts)}")

Training samples: 10200
Validation samples: 1275
Test samples: 1275


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from torch.utils.data import DataLoader

# Load tokenizer and model
model_name = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name, 
    num_labels=3,
    ignore_mismatched_sizes=True
)




Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=512)

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = CustomDataset(train_encodings, train_labels)
val_dataset = CustomDataset(val_encodings, val_labels)
test_dataset = CustomDataset(test_encodings, test_labels)

In [7]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    save_total_limit=1,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

comet_ml version 3.43.0 is installed, but version 3.43.2 or higher is required. Please update comet_ml to the latest version to enable Comet logging with pip install 'comet-ml>=3.43.2'.


In [8]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.6011,0.534296
2,0.4408,0.489859
3,0.2779,0.432045
4,0.2885,0.418756
5,0.4277,0.364132


TrainOutput(global_step=6375, training_loss=0.48120492317162306, metrics={'train_runtime': 875.3006, 'train_samples_per_second': 58.266, 'train_steps_per_second': 7.283, 'total_flos': 2751948476550000.0, 'train_loss': 0.48120492317162306, 'epoch': 5.0})

In [9]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.3646767735481262,
 'eval_runtime': 5.8423,
 'eval_samples_per_second': 218.237,
 'eval_steps_per_second': 27.387,
 'epoch': 5.0}

In [39]:
def predict(text):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {key: val.to(device) for key, val in inputs.items()}

    model.to(device)
    model.eval()

    with torch.no_grad():
        outputs = model(**inputs)

    probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
    predicted_index = torch.argmax(probabilities).item()
    confidence = probabilities[0, predicted_index].item()
    predicted_label = list(label_mapping.keys())[list(label_mapping.values()).index(predicted_index)]

    return predicted_label, confidence

In [12]:
from sentiment_predict.predict_sentiment_tools import *

In [13]:
post = get_targets_posts("btc")

In [14]:
texts = []
for item in post:
    content = item.__dict__['selftext']
    title = item.__dict__['title']
    texts.append((title, content))

In [26]:

predict(" ".join(texts[8]))

('negative', 0.9574872255325317)

In [33]:
texts[15]

('BCH Argentina reached 50% of its Flipstarter campaign! Help us complete it and promote the use of BCH in Argentina',
 '')

In [34]:
model.save_pretrained("./deberta_sentiment")
tokenizer.save_pretrained("./deberta_sentiment")

('./deberta_sentiment\\tokenizer_config.json',
 './deberta_sentiment\\special_tokens_map.json',
 './deberta_sentiment\\spm.model',
 './deberta_sentiment\\added_tokens.json',
 './deberta_sentiment\\tokenizer.json')

In [38]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Load tokenizer and model from the saved directory
tokenizer = AutoTokenizer.from_pretrained("./deberta_sentiment")
model = AutoModelForSequenceClassification.from_pretrained("./deberta_sentiment")

In [40]:
predict(" ".join(texts[8]))

('negative', 0.9574872255325317)