In [1]:
!pip install -q -U torch transformers accelerate datasets
!pip install -q flash-attn

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


# Load tokenizer and model
model_name = "tchen175/llama3.1-8b-newsmtsc"
model = AutoModelForCausalLM.from_pretrained(
    model_name, 
    torch_dtype=torch.float16, 
    device_map="cuda",
    attn_implementation="flash_attention_2"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)


2024-12-11 10:49:27.194341: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [7]:
alpaca_prompt = """Determine the sentiment of the following article as -1 (negative), 0 (neutral), or 1 (positive).

### Article:
{article}

### Semantic label:
"""

## Eval our finetuned model


In [10]:
def predict_sentiment_logits(text, device='cuda'):
    
    
    prompt = alpaca_prompt.format(article = text)

    # Prepare input
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=512).to(device)

    possible_outputs = ['-1', '0', '1']

    output_token_ids = [tokenizer(output, add_special_tokens=False).input_ids[0] for output in possible_outputs]
    
    # Get logits
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
    
    last_token_logits = logits[:, -1, :]  # Shape: (batch_size, vocab_size)

    # Filter logits for the possible outputs
    filtered_logits = last_token_logits[:, output_token_ids]  # Shape: (batch_size, len(possible_outputs))

    # Print filtered logits (comment this line for performance)
    # print("Logits for possible outputs:", filtered_logits)

    # Optionally, convert logits to probabilities
    probabilities = torch.softmax(filtered_logits, dim=-1)
    # print("Probabilities for possible outputs:", probabilities)  # Commented to avoid excessive I/O

    # Get the label with the highest probability
    predicted_label_index = torch.argmax(probabilities, dim=-1).item()
    predicted_label = possible_outputs[predicted_label_index]

    
    return predicted_label


In [18]:
import os
article_path = "../stock_articles/" 

result = {}

for filename in os.listdir(article_path):

    if filename.endswith(".txt"):
        # Build the full file path
        file_path = os.path.join(article_path, filename)
        
        # Open and read the text file line by line
        with open(file_path, 'r') as file:
            score, total = 0, 0

            for line in file:
                # Run invoke on each line
                sentiment = predict_sentiment_logits(line)
                if sentiment == '1':
                    score += 1
                elif sentiment == '0':
                    score += 0.5

                total += 1
            print(score/total, filename)
            result[filename] = score/total



0.58 AAPL.txt
0.7333333333333333 AI.txt
0.64 AMZN.txt
0.7 AVGO.txt
0.52 INTC.txt
0.5 LCID.txt
0.26 NVDA.txt
0.6666666666666666 PEP.txt
0.56 PLTR.txt
0.5869565217391305 TGT.txt
0.6 UBER.txt
0.24 UNH.txt
