In [None]:
import torch
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset, random_split

In [None]:
# 1. Data Preparation
# Load the training table from disk.
data = pd.read_csv("training.csv")

# Text column that is later passed to the tokenizer, as a plain Python list.
tweets = data.loc[:, "SocialMediaFeed"].tolist()

# One list of five values per row, in exactly this column order
# (presumably per-ticker sentiment targets -- confirm against the CSV).
ticker_columns = ["NVDA", "ING", "SAN", "PFE", "CSCO"]
labels = data.loc[:, ticker_columns].to_numpy().tolist()

In [None]:
# 2. Preprocess the Data
# Tokenize every text with the uncased BERT vocabulary. Sequences are truncated
# to at most 256 tokens and padded to the longest sequence in the batch, and the
# result is returned as PyTorch tensors.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
encoded_data = tokenizer(tweets, truncation=True, padding=True, max_length=256, return_tensors="pt")
input_ids = encoded_data['input_ids']
attention_masks = encoded_data['attention_mask']

# Force a float tensor: the targets are compared against float logits with
# MSELoss further down. Without an explicit dtype, integer-valued label columns
# would produce an int64 tensor, which MSELoss rejects.
labels = torch.tensor(labels, dtype=torch.float)

In [11]:
# Bundle token ids, attention masks and targets so they are indexed together.
dataset = TensorDataset(input_ids, attention_masks, labels)

# 80/20 train/validation split; the validation set takes the remainder so the
# two sizes always add up to len(dataset).
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size

# Seed the split so Restart & Run All reproduces the same train/validation
# membership (an unseeded split changes on every run, which makes validation
# losses incomparable between runs).
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

In [13]:
# Batch iterators over the two splits, 8 examples per batch.
# Only the training split is shuffled; validation is read in its stored order.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=8,
    shuffle=True,
)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=8,
)

In [14]:
# 3. Model Setup
# Pretrained uncased BERT with a 5-output head: one output per label column.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=5,
)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /home/ec2-user/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_

In [15]:
# 4. Training
# Use PyTorch's own AdamW: the `transformers.AdamW` class is deprecated upstream
# in favour of `torch.optim.AdamW`. `eps` and `weight_decay` are set explicitly
# to the defaults of the transformers class (1e-6 and 0.0), because PyTorch's
# defaults differ (1e-8 and 0.01) and would otherwise silently change training.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Mean squared error between the model's 5 logits and the 5 target values.
loss_fn = torch.nn.MSELoss()



In [None]:
epochs = 3

# Run every batch on whichever device currently holds the model's weights, so
# the loop keeps working if the model was moved (e.g. to a GPU) beforehand.
device = next(model.parameters()).device

for epoch in range(epochs):
    # --- Training pass: one optimizer step per mini-batch ---
    model.train()
    for batch in train_dataloader:
        # Use batch-local names. Rebinding `input_ids` here would overwrite the
        # notebook-level tensor of all token ids, so re-running the dataset
        # cell afterwards would silently build a dataset from one mini-batch.
        batch_input_ids, batch_attention_mask, batch_labels = (t.to(device) for t in batch)

        outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
        # MSELoss needs targets in the same floating dtype as the logits;
        # the cast is a no-op when the labels are already in that dtype.
        loss = loss_fn(outputs.logits, batch_labels.to(outputs.logits.dtype))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation
    # --- Evaluation pass: no gradients, loss averaged over validation batches ---
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for batch in val_dataloader:
            batch_input_ids, batch_attention_mask, batch_labels = (t.to(device) for t in batch)

            outputs = model(batch_input_ids, attention_mask=batch_attention_mask)
            loss = loss_fn(outputs.logits, batch_labels.to(outputs.logits.dtype))
            val_loss += loss.item()

    print(f"Epoch: {epoch+1}, Validation Loss: {val_loss/len(val_dataloader)}")

In [None]:
# Save the model
# Write the model and then the tokenizer into the same directory, so both can
# later be reloaded from that single path.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./sentiment_model")