In [1]:
import torch
from torch.utils.data import DataLoader, TensorDataset, Subset
from torch import nn
from transformers import BertTokenizer, BertModel, BertPreTrainedModel, BertConfig
from torch.optim import AdamW
import pandas as pd
from sklearn.model_selection import KFold, train_test_split
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Load and tokenize data

In [2]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
max_length = 128

def load_data(csv_file):
    df = pd.read_csv(csv_file)
    texts = df["text"].tolist()

    for text in texts:
        if type(text) != type("test"):
            print(text, type(text))
    
    labels = torch.tensor(df["label"].tolist(), dtype=torch.float)

    encodings = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    dataset = TensorDataset(encodings["input_ids"], encodings["attention_mask"], labels)
    return dataset

dataset = load_data("fineTuning3.csv")  # single CSV with all samples

## Define regression model

In [3]:
class BertForRegression(BertPreTrainedModel):
    def __init__(self, config):
        super().__init__(config)
        self.bert = BertModel(config)
        self.regressor = nn.Linear(config.hidden_size, 1)
        self.init_weights()

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        return self.regressor(pooled_output).squeeze(-1)

## Train/test split (80/20) + K-Fold CV

In [4]:
def run_training(dataset, k_folds=5, num_epochs=3, batch_size=16):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    
    kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)

    fold_results = []

    for fold, (train_ids, test_ids) in enumerate(kfold.split(dataset)):
        print(f"\n----- Fold {fold+1} / {k_folds} -----")

        train_subsampler = Subset(dataset, train_ids)
        test_subsampler = Subset(dataset, test_ids)

        train_loader = DataLoader(train_subsampler, batch_size=batch_size, shuffle=True)
        test_loader = DataLoader(test_subsampler, batch_size=batch_size)

        config = BertConfig.from_pretrained("bert-base-uncased")
        model = BertForRegression.from_pretrained("bert-base-uncased", config=config).to(device)
        optimizer = AdamW(model.parameters(), lr=2e-5)
        criterion = nn.MSELoss()

        # Training
        for epoch in range(num_epochs):
            model.train()
            total_loss = 0
            for input_ids, attention_mask, labels in tqdm(train_loader, desc=f"Fold {fold+1} Epoch {epoch+1}"):
                input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

                optimizer.zero_grad()
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
            print(f"Fold {fold+1} Epoch {epoch+1} Train Loss: {total_loss/len(train_loader):.4f}")

        # Evaluation
        model.eval()
        mse_total = 0
        with torch.no_grad():
            for input_ids, attention_mask, labels in test_loader:
                input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)
                outputs = model(input_ids, attention_mask)
                mse_total += criterion(outputs, labels).item() * input_ids.size(0)

        mse_total /= len(test_subsampler)
        fold_results.append(mse_total)
        print(f"Fold {fold+1} Test MSE: {mse_total:.4f}")

    avg_mse = sum(fold_results) / len(fold_results)
    print(f"\n==== Cross-validation MSE across {k_folds} folds: {avg_mse:.4f} ====")

    # Save last model + tokenizer
    model.save_pretrained("./bert_sentiment_regression")
    tokenizer.save_pretrained("./bert_sentiment_regression")

run_training(dataset, k_folds=5, num_epochs=5)


----- Fold 1 / 5 -----


Some weights of BertForRegression were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['regressor.bias', 'regressor.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Fold 1 Epoch 1: 100%|██████████| 118/118 [00:10<00:00, 11.40it/s]


Fold 1 Epoch 1 Train Loss: 6.7203


Fold 1 Epoch 2: 100%|██████████| 118/118 [00:09<00:00, 12.62it/s]


Fold 1 Epoch 2 Train Loss: 3.4182


Fold 1 Epoch 3: 100%|██████████| 118/118 [00:09<00:00, 12.56it/s]


Fold 1 Epoch 3 Train Loss: 1.8729


Fold 1 Epoch 4: 100%|██████████| 118/118 [00:09<00:00, 12.56it/s]


Fold 1 Epoch 4 Train Loss: 1.0219


Fold 1 Epoch 5: 100%|██████████| 118/118 [00:09<00:00, 12.56it/s]


Fold 1 Epoch 5 Train Loss: 0.6757


Some weights of BertForRegression were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['regressor.bias', 'regressor.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 1 Test MSE: 3.4709

----- Fold 2 / 5 -----


Fold 2 Epoch 1: 100%|██████████| 118/118 [00:09<00:00, 12.63it/s]


Fold 2 Epoch 1 Train Loss: 6.0989


Fold 2 Epoch 2: 100%|██████████| 118/118 [00:09<00:00, 12.53it/s]


Fold 2 Epoch 2 Train Loss: 3.3798


Fold 2 Epoch 3: 100%|██████████| 118/118 [00:09<00:00, 12.62it/s]


Fold 2 Epoch 3 Train Loss: 1.7535


Fold 2 Epoch 4: 100%|██████████| 118/118 [00:09<00:00, 12.56it/s]


Fold 2 Epoch 4 Train Loss: 0.9650


Fold 2 Epoch 5: 100%|██████████| 118/118 [00:09<00:00, 12.54it/s]


Fold 2 Epoch 5 Train Loss: 0.6238


Some weights of BertForRegression were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['regressor.bias', 'regressor.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 2 Test MSE: 2.9468

----- Fold 3 / 5 -----


Fold 3 Epoch 1: 100%|██████████| 118/118 [00:09<00:00, 12.52it/s]


Fold 3 Epoch 1 Train Loss: 6.1549


Fold 3 Epoch 2: 100%|██████████| 118/118 [00:09<00:00, 12.52it/s]


Fold 3 Epoch 2 Train Loss: 3.2476


Fold 3 Epoch 3: 100%|██████████| 118/118 [00:09<00:00, 12.53it/s]


Fold 3 Epoch 3 Train Loss: 1.8394


Fold 3 Epoch 4: 100%|██████████| 118/118 [00:09<00:00, 12.53it/s]


Fold 3 Epoch 4 Train Loss: 1.0789


Fold 3 Epoch 5: 100%|██████████| 118/118 [00:09<00:00, 12.53it/s]


Fold 3 Epoch 5 Train Loss: 0.7851


Some weights of BertForRegression were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['regressor.bias', 'regressor.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 3 Test MSE: 3.0721

----- Fold 4 / 5 -----


Fold 4 Epoch 1: 100%|██████████| 118/118 [00:09<00:00, 12.54it/s]


Fold 4 Epoch 1 Train Loss: 6.6075


Fold 4 Epoch 2: 100%|██████████| 118/118 [00:09<00:00, 12.54it/s]


Fold 4 Epoch 2 Train Loss: 4.2545


Fold 4 Epoch 3: 100%|██████████| 118/118 [00:09<00:00, 12.52it/s]


Fold 4 Epoch 3 Train Loss: 2.4112


Fold 4 Epoch 4: 100%|██████████| 118/118 [00:09<00:00, 12.52it/s]


Fold 4 Epoch 4 Train Loss: 1.2232


Fold 4 Epoch 5: 100%|██████████| 118/118 [00:09<00:00, 12.52it/s]


Fold 4 Epoch 5 Train Loss: 0.6889


Some weights of BertForRegression were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['regressor.bias', 'regressor.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Fold 4 Test MSE: 3.4138

----- Fold 5 / 5 -----


Fold 5 Epoch 1: 100%|██████████| 118/118 [00:09<00:00, 12.52it/s]


Fold 5 Epoch 1 Train Loss: 5.8137


Fold 5 Epoch 2: 100%|██████████| 118/118 [00:09<00:00, 12.53it/s]


Fold 5 Epoch 2 Train Loss: 3.4155


Fold 5 Epoch 3: 100%|██████████| 118/118 [00:09<00:00, 12.53it/s]


Fold 5 Epoch 3 Train Loss: 1.8283


Fold 5 Epoch 4: 100%|██████████| 118/118 [00:09<00:00, 12.53it/s]


Fold 5 Epoch 4 Train Loss: 1.0795


Fold 5 Epoch 5: 100%|██████████| 118/118 [00:09<00:00, 12.50it/s]


Fold 5 Epoch 5 Train Loss: 0.7559
Fold 5 Test MSE: 3.2020

==== Cross-validation MSE across 5 folds: 3.2211 ====


## Test

In [5]:
model = BertForRegression.from_pretrained("./bert_sentiment_regression")
tokenizer = BertTokenizer.from_pretrained("./bert_sentiment_regression")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

with open("./testTweets.txt", "r") as file:
    texts = file.readlines()

for text in texts:
    text = text[:-1]
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    
    # Keep only input_ids and attention_mask
    inputs = {k: v.to(device) for k, v in inputs.items() if k in ["input_ids", "attention_mask"]}
    
    with torch.no_grad():
        prediction = model(**inputs)
    
    print(f"Tweet: {text}\nPrediction: {prediction.item()/8:0.3f}\n")

Tweet: holy fucking shit the chiefs are so ass. why did taylor swift agree to marry this bum ass travis kelce? his old slow fat ass can’t do shit!
Prediction: 0.083

Tweet: Why the fuck did I watch that game, and why did I keep watching it, and why did I watch all of it
Prediction: 0.217

Tweet: They may not have been the best opponents, but through 2 weeks Dylan Raiola has thrown for 6 TDs with 0 interceptions and maintained a 78% completion percentag
Prediction: 0.776



In [None]:
import json

# Load data from full.json instead of testTweets.txt
with open("./full.json", "r") as file:
    data = json.load(file)

# List to store results for export
results = []

# Process each entry in the JSON data
for entry in data:
    text = entry["body_html"]
    inputs = tokenizer(
        text,
        padding=True,
        truncation=True,
        max_length=128,
        return_tensors="pt"
    )
    
    # Keep only input_ids and attention_mask
    inputs = {k: v.to(device) for k, v in inputs.items() if k in ["input_ids", "attention_mask"]}
    
    with torch.no_grad():
        prediction = model(**inputs)
    
    prediction_score = prediction.item()/8
    
    # Store result for export
    result_entry = {
        "text": text,
        "prediction": prediction_score,
        "timestamp": entry["created_utc"]
    }
    results.append(result_entry)
    
    print(f"Text: {text}\nPrediction: {prediction_score:0.3f}\nTimestamp: {entry['created_utc']}\n")

# Export results to JSON file
with open("./testResults.json", "w") as output_file:
    json.dump(results, output_file, indent=2)

print(f"Results exported to testResults.json with {len(results)} entries.")

Text: I’m seeing more commercial breaks and flags than football bro
Prediction: 0.541
Timestamp: 1759009329.0

Text: Who’s gonna tell Arnold that holding onto the ball so you don’t throw a pick is actually worse than throwing a pick
Prediction: 0.215
Timestamp: 1759009325.0

Text: went to grab more beers. what happened?
Prediction: 0.279
Timestamp: 1759009324.0

Text: Stinger?
Prediction: 0.522
Timestamp: 1759009284.0

Text: Watched replay and didn’t see anything on 24, just engaged with 2 blockers   Weird, hopefully just a wind knocker
Prediction: 0.490
Timestamp: 1759009257.0

Text: Can we just call the game now
Prediction: 0.432
Timestamp: 1759009237.0

Text: Man, I hate to see that. Hope he is ok.
Prediction: 0.300
Timestamp: 1759009233.0

Text: Arkansas has a brighter future  Florida has a brighter future  Clemson has a brighter future even if they keep Dabo for another 20 years  Oklahoma State had a brighter future if they kept Gundy
Prediction: 0.653
Timestamp: 1759009229.0

Tex