# Imports

In [None]:
!pip install -q transformers

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
import glob
import re
import ast

In [None]:
# Load the annotator workbook that is scored in this run.
df = pd.read_excel(
    io='/content/drive/MyDrive/MA thesis/Data collection/Data for 2nd round/Diveded by annotator/Final files for annotators/Twitter_Round2_Annotator_4.xlsx',
)

# Sentiment scores for all annotated files
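Each annotated dataset from the main file had to be analysed separately, so the cells in this section are run once per annotator file (only the input and output paths change between runs).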

In [None]:
# Single source of truth for the checkpoint, so the tokenizer and the model
# can never be loaded from two different ids by accident.
MODEL_NAME = 'cardiffnlp/twitter-roberta-base-sentiment-latest'

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# The model is only used for inference here; .eval() makes the evaluation mode
# (dropout disabled) explicit and returns the model itself.
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME).eval()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you e

In [None]:
def split_text_into_chunks(text, tokenizer, max_length=512):
    """Split ``text`` into string chunks that each fit into ``max_length`` tokens.

    The text is tokenized once without special tokens, cut into consecutive
    slices and every slice is decoded back to a string.

    The slices are sized ``max_length`` minus the number of special tokens the
    tokenizer adds to a single sequence. Without that reserve, a full chunk
    re-encoded with special tokens exceeds ``max_length`` and its last tokens
    are silently truncated away.

    Args:
        text: The string to split.
        tokenizer: Object providing ``encode``, ``decode`` and
            ``num_special_tokens_to_add`` (e.g. a Hugging Face tokenizer).
        max_length: Token budget per chunk, special tokens included.

    Returns:
        A list of decoded chunk strings; empty if ``text`` yields no tokens.

    Raises:
        ValueError: If ``max_length`` leaves no room for any content token.
    """
    reserved = tokenizer.num_special_tokens_to_add(pair=False)
    chunk_size = max_length - reserved
    if chunk_size <= 0:
        raise ValueError(
            f"max_length={max_length} leaves no room for content after "
            f"reserving {reserved} special tokens"
        )

    tokens = tokenizer.encode(text, add_special_tokens=False)
    return [
        tokenizer.decode(tokens[start:start + chunk_size], skip_special_tokens=True)
        for start in range(0, len(tokens), chunk_size)
    ]

In [None]:
def analyze_sentiment_score(text):
    """Score ``text`` chunk by chunk with the notebook-level ``tokenizer`` and ``model``.

    The text is split with ``split_text_into_chunks``; every chunk is
    tokenized, passed through the model, and its logits are turned into
    probabilities with a softmax over the last dimension.

    Args:
        text: The string to score.

    Returns:
        A list with one entry per chunk, each entry being the list of softmax
        probabilities for that chunk. Empty if the text produces no chunks.
    """
    chunks = split_text_into_chunks(text, tokenizer)
    all_probs = []
    # Inference only: no_grad avoids building an autograd graph for every
    # forward pass, which would otherwise cost memory and time on each row.
    with torch.no_grad():
        for chunk in chunks:
            # truncation is kept as a safety net in case a decoded chunk
            # re-tokenizes to more tokens than the model accepts.
            inputs = tokenizer(chunk, return_tensors='pt', truncation=True, max_length=512)
            outputs = model(**inputs)
            probs = torch.nn.functional.softmax(outputs.logits, dim=-1).tolist()[0]
            all_probs.append(probs)
    return all_probs

In [None]:
# Missing texts are replaced by '' before the string cast. A bare .astype(str)
# would turn NaN into the literal word 'nan' and give it a sentiment score;
# filling with '' first leaves the handling of empty texts to analyze_sentiment_score.
df['sentiment_scores'] = df['Text'].fillna('').astype(str).apply(analyze_sentiment_score)

# Save next to the input workbook with a "_sentiment" suffix.
df.to_excel('/content/drive/MyDrive/MA thesis/Data collection/Data for 2nd round/Diveded by annotator/Final files for annotators/Twitter_Round2_Annotator_4_sentiment.xlsx', index=False)

## Merging into 1 file with sentiment scores

In [None]:
# Annotator number (as a string) -> annotator initials
annotator_map = dict(zip(('1', '2', '3', '4'), ('BS', 'FS', 'IR', 'WL')))

In [None]:
# Finding all sentiment files for Round2 Annotators 1–4 on Reddit & Twitter
file_pattern = '/content/drive/MyDrive/MA thesis/Data collection/Data for 2nd round/Diveded by annotator/Final files for annotators/*_Round2_Annotator_*sentiment*.xlsx'
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# the row order of the merged file reproducible between runs.
files = sorted(glob.glob(file_pattern))
print(f"Found {len(files)} files:", files)

Found 8 files: ['/content/drive/MyDrive/MA thesis/Data collection/Data for 2nd round/Diveded by annotator/Final files for annotators/Reddit_Round2_Annotator_1_sentiment.xlsx', '/content/drive/MyDrive/MA thesis/Data collection/Data for 2nd round/Diveded by annotator/Final files for annotators/Reddit_Round2_Annotator_2_sentiment.xlsx', '/content/drive/MyDrive/MA thesis/Data collection/Data for 2nd round/Diveded by annotator/Final files for annotators/Reddit_Round2_Annotator_3_sentiment.xlsx', '/content/drive/MyDrive/MA thesis/Data collection/Data for 2nd round/Diveded by annotator/Final files for annotators/Reddit_Round2_Annotator_4_sentiment.xlsx', '/content/drive/MyDrive/MA thesis/Data collection/Data for 2nd round/Diveded by annotator/Final files for annotators/Twitter_Round2_Annotator_1_sentiment.xlsx', '/content/drive/MyDrive/MA thesis/Data collection/Data for 2nd round/Diveded by annotator/Final files for annotators/Twitter_Round2_Annotator_2_sentiment.xlsx', '/content/drive/MyDriv

In [None]:
# 3. Loading files: read every workbook and tag each row with its provenance
dfs = []
for fp in files:
    # A dedicated name is used so the `df` loaded earlier is not clobbered.
    frame = pd.read_excel(fp)
    # Offset of 2 on the 0-based index — presumably the spreadsheet row number
    # (1-based rows plus one header row); confirm against the workbooks.
    frame['row'] = frame.index + 2

    # The annotator number is taken from the "Annotator_<digit>" part of the path.
    m = re.search(r'Annotator_?(\d)', fp)
    if m is None or m.group(1) not in annotator_map:
        # Fail with the offending path instead of an opaque AttributeError/KeyError.
        raise ValueError(f"Cannot determine the annotator from file path: {fp}")
    frame['annotator_name'] = annotator_map[m.group(1)]

    # Platform is the file-name part in front of "_Round2".
    frame['platform'] = fp.split('_Round2')[0].split('/')[-1]

    dfs.append(frame)

In [None]:
# 4. Stack the per-file frames on top of each other under a fresh 0..n-1 index.
# NOTE(review): the recorded output shows both a `text` and a `Text` column after
# the merge, so the post text ends up in two different columns — confirm this is intended.
merged = pd.concat(objs=dfs, axis=0, ignore_index=True)

In [None]:
# Show the merged frame as the cell output to eyeball the result of the concat.
merged

Unnamed: 0,title,post_id_x,comment_id,comment_author,text,affect_h,availability_h,representativeness_h,confirmation_b,all_or_nothing,sentiment_scores,row,annotator_name,platform,Conversation ID,Tweet ID,Author ID,Created At,Text
0,,1asc8ph,kqs4l9c,fiaanaut,About that 30 year trend....\n\n[Scientific Co...,,,,,,"[[0.22058026492595673, 0.7385219931602478, 0.0...",2,BS,Reddit,,,,,
1,,1asc8ph,kqwpku1,NewyBluey,Then the latitude adjacent ones will be equal ...,,,,,,"[[0.011720871552824974, 0.8318186402320862, 0....",3,BS,Reddit,,,,,
2,,1asc8ph,kqwch1a,unsquashable74,I'm a terrible slinker...\n\n\nWhen did I ever...,,,,,,"[[0.8873007893562317, 0.10176332294940948, 0.0...",4,BS,Reddit,,,,,
3,,1asc8ph,kqwecbe,fiaanaut,That which is presented without evidence can b...,,,,,,"[[0.7376958727836609, 0.24709975719451904, 0.0...",5,BS,Reddit,,,,,
4,,1asc8ph,kr2men7,Molire,"Now you've busted yourself twice, LOL, somewha...",,,,,,"[[0.8078227043151855, 0.17769768834114075, 0.0...",6,BS,Reddit,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39983,,,,,,,,,,,"[[0.006789323873817921, 0.2374216765165329, 0....",2714,WL,Twitter,'1783568557294473372,'1783569436349251763,'757704596747853826,2024-04-25 18:51:05,"@CryptoWizardd Talking bout Ai token, don’t mi..."
39984,,,,,,,,,,,"[[0.0032622581347823143, 0.034042902290821075,...",2715,WL,Twitter,'1783812604252823721,'1783812604252823721,'1256632208783278085,2024-04-26 10:57:21,I'm farming $BEYOND because I want that juicy ...
39985,,,,,,,,,,,"[[0.004185492172837257, 0.046283334493637085, ...",2716,WL,Twitter,'1783812503941906442,'1783812503941906442,'1256632208783278085,2024-04-26 10:56:57,I'm farming $BEYOND because I want that juicy ...
39986,,,,,,,,,,,"[[0.03261750936508179, 0.8758566975593567, 0.0...",2717,WL,Twitter,'1783764727526457619,'1783764727526457619,'1422776426445553665,2024-04-26 07:47:06,Connection of human woman and artificial intel...


In [None]:
# Persist the merged frame (without the index column) and report where it went.
out_path = '/content/drive/MyDrive/MA thesis/Results/Iteration 2/merged_Round2_all_sentiment.xlsx'
merged.to_excel(excel_writer=out_path, index=False)
print(
    f" Done! Merged DataFrame has {len(merged)} rows and is saved to {out_path}"
)

✅ Done! Merged DataFrame has 39988 rows and is saved to /content/drive/MyDrive/MA thesis/Results/Iteration 2/merged_Round2_all_sentiment.xlsx


## Splitting sentiment scores into separate columns

In [None]:
# Reload the merged workbook from disk so this section can be run on its own.
merged = pd.read_excel(
    io='/content/drive/MyDrive/MA thesis/Results/Iteration 2/merged_Round2_all_sentiment.xlsx',
)

In [None]:
def first_chunk_scores(lst):
    """Return the first element of ``lst``, or ``[0, 0, 0]`` as a fallback.

    The fallback is used whenever ``lst`` is not a ``list`` or is an empty
    list (e.g. ``[]``, ``None``, a string or a tuple).
    """
    if not isinstance(lst, list) or not lst:
        return [0, 0, 0]
    return lst[0]

In [None]:
# The score lists are parsed from their string form back into Python lists.
# Only strings are parsed: values that are already lists (cell re-run) or that
# are missing pass through unchanged instead of making literal_eval raise.
merged['sentiment_scores'] = merged['sentiment_scores'].apply(
    lambda cell: ast.literal_eval(cell) if isinstance(cell, str) else cell
)

In [None]:
# Take the first chunk's scores of every row and spread them over three columns.
# NOTE(review): the neg/neu/pos order is assumed to match the model's label
# order — confirm against the model configuration.
first_chunks = merged['sentiment_scores'].apply(first_chunk_scores).tolist()
scores_df = pd.DataFrame(first_chunks, columns=['neg', 'neu', 'pos'], index=merged.index)
scores_df = scores_df.round(6)

merged[['neg', 'neu', 'pos']] = scores_df

In [None]:
# Spot-check one row: the text, the parsed score list and the split-out columns.
idx = 0
preview_columns = ['text', 'sentiment_scores', 'neg', 'neu', 'pos']
print(merged.loc[idx, preview_columns])

text                About that 30 year trend....\n\n[Scientific Co...
sentiment_scores    [[0.22058026492595673, 0.7385219931602478, 0.0...
neg                                                           0.22058
neu                                                          0.738522
pos                                                          0.040898
Name: 0, dtype: object


In [None]:
# Overwriting the same file; the path is named once so the file written and
# the path reported are guaranteed to be the same.
merged_path = '/content/drive/MyDrive/MA thesis/Results/Iteration 2/merged_Round2_all_sentiment.xlsx'
merged.to_excel(merged_path, index=False)
print(f"✅ Done! Overwrote {merged_path}")

✅ Done! Overwrote /content/drive/MyDrive/MA thesis/Results/Iteration 2/merged_Round2_all_sentiment.xlsx
