In [1]:
import pandas as pd
import numpy as np
import torch
import pickle, warnings, datetime, pytz
from transformers import AutoTokenizer, AutoModelForSequenceClassification
warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the intermediate GDELT object produced by the upstream cleaning step.
# NOTE(review): pickle.load can execute arbitrary code while deserialising;
# only point this at files produced by this project.
with open('../../data/processed/gdelt_intermediate_cleaned.pkl', 'rb') as pkl_in:
    df = pickle.load(pkl_in)

In [3]:
# Choose the compute device up front: CUDA when PyTorch can see a GPU, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Pretrained sequence-classification checkpoint and its matching tokenizer.
model_name = "tabularisai/robust-sentiment-analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Move the weights to the chosen device; as the cell's last expression this
# also displays the module summary.
model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [4]:
def predict_sentiment_batch(texts, batch_size=32):
    """Score each text with the sentiment model and return one float per input.

    Every string is tokenised (truncated to 512 tokens, padded to the longest
    item of its batch), run through the module-level ``model`` and reduced to
    the probability-weighted average of the class weights ``[-2, -1, 0, 1, 2]``,
    so each score lies in ``[-2, 2]``.
    NOTE(review): this assumes the model emits exactly five logits ordered from
    most negative to most positive -- confirm against the model's label mapping.

    Parameters
    ----------
    texts : iterable
        Texts to score. Any iterable is accepted (list, tuple, pandas Series,
        generator). Entries that are not ``str`` (e.g. ``None`` or NaN coming
        from a DataFrame column) are not sent to the tokenizer and receive a
        NaN score, so the output stays aligned with the input.
    batch_size : int, default 32
        Number of texts per forward pass. Must be at least 1.

    Returns
    -------
    list of float
        One score per input element, in input order.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # Materialise once so generators / Series can be indexed positionally.
    texts = list(texts)
    results = [float("nan")] * len(texts)

    # Only real strings can be tokenised; remember their positions so the
    # scores can be written back to the right slots.
    valid_positions = [pos for pos, text in enumerate(texts) if isinstance(text, str)]
    if not valid_positions:
        return results

    # Loop-invariant: build the class weights once and move them to the device once.
    class_weights = torch.tensor([-2, -1, 0, 1, 2], dtype=torch.float32).to(device)

    for start in range(0, len(valid_positions), batch_size):
        positions = valid_positions[start:start + batch_size]
        batch = [texts[pos] for pos in positions]
        inputs = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=512)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        # Keep the whole forward pass and reduction out of autograd.
        with torch.no_grad():
            logits = model(**inputs).logits
            probabilities = torch.nn.functional.softmax(logits, dim=-1)
            batch_scores = torch.matmul(probabilities, class_weights)
        for pos, score in zip(positions, batch_scores.tolist()):
            results[pos] = score
    return results

In [5]:
# Start from the record identifiers only, then attach one model score per
# article title as the `llm_sentiment` column.
record_ids = df['GKGRECORDID'].to_frame()
df_sentiment = record_ids.assign(
    llm_sentiment=predict_sentiment_batch(df['article_title'].tolist())
)

In [6]:
# Persist the (GKGRECORDID, llm_sentiment) frame so the scores can be reused
# without re-running the model.
with open('../../data/processed/gdelt_llm_sentiment.pkl', 'wb') as pkl_out:
    pickle.dump(df_sentiment, pkl_out)

In [7]:
# Attach the model scores to the main frame and compare them with the `Tone` column.
# The cell rebinds `df`, so any `llm_sentiment` column left over from a previous
# run is dropped first; without that a re-run would produce
# `llm_sentiment_x` / `llm_sentiment_y` and the column selection below would
# raise KeyError.
df = pd.merge(
    df.drop(columns='llm_sentiment', errors='ignore'),
    df_sentiment,
    on='GKGRECORDID',
    how='left',
)
# Pairwise correlation between the model-based score and `Tone`.
df[['llm_sentiment', 'Tone']].corr()

Unnamed: 0,llm_sentiment,Tone
llm_sentiment,1.0,0.489605
Tone,0.489605,1.0
