In [1]:
%load_ext autoreload
%autoreload 2

In [None]:
from utils.ds_utils import load_dataset_from_huggingface, login_huggingface
import os

login_huggingface(os.getenv("hf_token"))

ds = load_dataset_from_huggingface()

Logged in to Hugging Face Hub


Generating train split: 100%|██████████| 45091/45091 [00:02<00:00, 20043.24 examples/s]

Loaded dataset: dragonslayer631/ci2_allsides, split: train





In [2]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cpu'

In [3]:
from sentence_transformers import SentenceTransformer

# 1. Load a pretrained Sentence Transformer model
model = SentenceTransformer("all-mpnet-base-v2")

model = model.to(device)

In [37]:
max_size = model.get_max_seq_length() - 2
max_size

382

In [53]:
import time

def embed_long_text(text, model, chunk_size, device):
    tokenizer = model.tokenizer
    tokens = tokenizer.tokenize(text)
    chunks = [" ".join(tokens[i:i+chunk_size]) for i in range(0, len(tokens), chunk_size)]
    
    chunk_embeddings = model.encode(
        chunks,
        batch_size=32,
        device=device,
        convert_to_tensor=False,
    )
    
    return sum(chunk_embeddings) / len(chunk_embeddings)

def encode_text(batch):
    text_encoded = [
        embed_long_text(x, model=model, chunk_size=max_size, device=device)
        for x in batch['text']
    ]

    summary_100_encoded = model.encode(
        batch['summary_100'], batch_size=64, device=device, convert_to_tensor=False
    )
    summary_50_encoded = model.encode(
        batch['summary_50'], batch_size=64, device=device, convert_to_tensor=False
    )

    return {
        "text_encoded": text_encoded,
        "summary_100_encoded": summary_100_encoded,
        "summary_50_encoded": summary_50_encoded
    }

In [54]:
mapped = ds.map(
    encode_text, 
    batched=True,
    batch_size=64,
)

Map: 100%|██████████| 45091/45091 [2:53:31<00:00,  4.33 examples/s]  


In [55]:
from utils.ds_utils import save_dataset_to_huggingface

save_dataset_to_huggingface(mapped)

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]
[Aating parquet from Arrow format:   0%|          | 0/23 [00:00<?, ?ba/s]
[Aating parquet from Arrow format:   4%|▍         | 1/23 [00:00<00:05,  3.73ba/s]
[Aating parquet from Arrow format:   9%|▊         | 2/23 [00:00<00:05,  3.96ba/s]
[Aating parquet from Arrow format:  13%|█▎        | 3/23 [00:00<00:04,  4.32ba/s]
[Aating parquet from Arrow format:  17%|█▋        | 4/23 [00:00<00:03,  4.80ba/s]
[Aating parquet from Arrow format:  22%|██▏       | 5/23 [00:01<00:03,  4.79ba/s]
[Aating parquet from Arrow format:  26%|██▌       | 6/23 [00:01<00:03,  4.66ba/s]
[Aating parquet from Arrow format:  30%|███       | 7/23 [00:01<00:03,  4.83ba/s]
[Aating parquet from Arrow format:  35%|███▍      | 8/23 [00:01<00:03,  5.00ba/s]
[Aating parquet from Arrow format:  39%|███▉      | 9/23 [00:01<00:02,  4.91ba/s]
[Aating parquet from Arrow format:  43%|████▎     | 10/23 [00:02<00:02,  4.97ba/s]
[Aating parquet from Arro

Saved dataset: dragonslayer631/ci2_allsides, split: train


In [56]:
ds

Dataset({
    features: ['topic', 'tags', 'text', 'int_bias', 'summary_5', 'summary_50', 'summary_100', 'text_entity_sentiments', 'text_topic_to_sentiment', 'summary_100_entity_sentiments', 'summary_100_topic_to_sentiment', 'summary_50_entity_sentiments', 'summary_50_topic_to_sentiment', 'id'],
    num_rows: 45091
})

In [103]:
from collections import Counter
import json

def get_top_topics_sentiments(dcts: list[dict], top_k: int) -> tuple:        
    batch_keys = []
    batch_values = []
    for dct in dcts:
        
        dct = json.loads(dct)
        sorted_items = sorted(dct.items(), key=lambda item: abs(item[1]), reverse=True)
        sorted_dict = dict(sorted_items)
        keys, values = list(sorted_dict.keys()), list(sorted_dict.values())

        if len(keys) > top_k:
            keys = keys[:top_k]
            values = values[:top_k]

        if len(keys) < top_k:
            extend = [None] * (top_k - len(keys))
            keys.extend(extend)
            values.extend(extend)

        batch_keys.append(keys)
        batch_values.append(values)
        
    return batch_keys, batch_values

def categorize_topics_sentiments(batch):
    topic_sentiments = batch['summary_100_topic_to_sentiment']
    top_k_keys, top_k_values = get_top_topics_sentiments(topic_sentiments, 5)
    return {
        "summary_100_topic_0" : [k[0] for k in top_k_keys],
        "summary_100_sentiment_0" : [v[0] for v in top_k_values],
        "summary_100_topic_1" : [k[1] for k in top_k_keys],
        "summary_100_sentiment_1" : [v[1] for v in top_k_values],
        "summary_100_topic_2" : [k[2] for k in top_k_keys],
        "summary_100_sentiment_2" : [v[2] for v in top_k_values],
        "summary_100_topic_3" : [k[3] for k in top_k_keys],
        "summary_100_sentiment_3" : [v[3] for v in top_k_values],
        "summary_100_topic_4" : [k[4] for k in top_k_keys],
        "summary_100_sentiment_4" : [v[4] for v in top_k_values]
    }

In [84]:
sample = ds.select(range(100))

In [85]:
sampled = sample.map(
    categorize_topics_sentiments, 
    batched=True,
    batch_size=64,
)

Map: 100%|██████████| 100/100 [00:00<00:00, 7316.33 examples/s]


In [86]:
sampled

Dataset({
    features: ['topic', 'tags', 'text', 'int_bias', 'summary_5', 'summary_50', 'summary_100', 'text_entity_sentiments', 'text_topic_to_sentiment', 'summary_100_entity_sentiments', 'summary_100_topic_to_sentiment', 'summary_50_entity_sentiments', 'summary_50_topic_to_sentiment', 'id', 'text_topic_0', 'text_sentiment_0', 'text_topic_1', 'text_sentiment_1', 'text_topic_2', 'text_sentiment_2', 'text_topic_3', 'text_sentiment_3', 'text_topic_4', 'text_sentiment_4'],
    num_rows: 100
})

In [93]:
df = mapped.to_pandas()

df

Unnamed: 0,topic,tags,text,int_bias,summary_5,summary_50,summary_100,text_entity_sentiments,text_topic_to_sentiment,summary_100_entity_sentiments,summary_100_topic_to_sentiment,summary_50_entity_sentiments,summary_50_topic_to_sentiment,id,text_encoded,summary_100_encoded,summary_50_encoded
0,economy_and_jobs,"Economy And Jobs, Economic Policy, Budget Reco...",We use cookies and similar methods to recogniz...,0,Infrastructure deal could transform America.,This week is pivotal for Biden's $1.2 trillion...,This week marks a crucial juncture for Senate ...,"{""President Biden"": 0.2, ""The Senate"": 0.1, ""D...","{""Economy And Jobs"": 0, ""Economic Policy"": 0, ...","{""Biden"": 0.6666666666666666, ""America"": 0.5, ...","{""Economy And Jobs"": 0, ""Economic Policy"": 0, ...","{""Axios"": 0.2, ""Sarah Grillo"": 0, ""President B...",,0002a7129f586dc1,"[-0.028916225, 0.10918158, -0.0073403805, 0.00...","[-0.041866116, 0.17918587, -0.022695199, 0.038...","[-0.055015147, 0.18244913, -0.031295512, 0.000..."
1,fiscal_cliff,"Fiscal Cliff, Banking And Finance, Economy And...",Congress careened over the edge of the fiscal ...,2,Senate passes fiscal cliff deal.,"As Congress approached the fiscal cliff, a bip...","At midnight, Congress teetered on the fiscal c...","{""Obama"": 0.0, ""White House"": 0.0, ""Democrats""...","{""Fiscal Cliff"": -0.10000000149011612, ""Bankin...","{""Congress"": 0.0, ""fiscal cliff"": -0.100000001...","{""Fiscal Cliff"": 0.20000000298023224, ""Banking...","{""Congress"": 0.1, ""Senate"": 0.2, ""House"": -0.1...","{""Economic Policy"": 0.0, ""Fiscal Cliff"": 0, ""B...",0005c1b511659a98,"[-0.051092006, 0.08444972, 0.0005916564, -0.00...","[-0.05243456, 0.059714366, -0.0005695005, -0.0...","[-0.03622693, 0.04714141, 0.017436901, -0.0024..."
2,immigration,Immigration,Even before the Trump administration announced...,0,Immigrants fear future under Trump.,"In Trump’s America, immigrants face unpreceden...",The Trump administration's immigration policie...,"{""Aurea Galvan"": -0.7, ""Trump"": -0.9, ""Seydi S...","{""Immigration"": -0.5}","{""Trump administration"": -0.7, ""Aurea Galvan"":...","{""Immigration"": -0.7}","{""Aurea Galvan"": -0.5, ""Yasmin"": -0.7, ""Seydi ...",,0006fd160e0839d0,"[-0.046644386, 0.07658547, -0.011063736, 0.001...","[-0.051978186, 0.05757335, -0.017256187, -0.02...","[-0.059706487, 0.0722755, -0.0048332238, -0.01..."
3,immigration,"Immigration, Crime, Border Crisis, ICE, Migran...",The Trump campaign and allies of the former pr...,0,Trump distorts immigrant crime statistics.,The Trump campaign misuses old Homeland Securi...,The Trump campaign is misleadingly using Homel...,"{""administration"": 0.0, ""Donald Trump"": -0.100...","{""Immigration"": -0.4000000059604645, ""Crime"": ...","{""campaign"": 0.0, ""thousands"": 0.0, ""Trump"": 0...","{""Immigration"": 0, ""Crime"": 0, ""Border Crisis""...","{""Donald Trump"": -0.8, ""Kamala Harris"": 0.6, ""...",,0007f8a47be4b6ee,"[-0.039676305, 0.08279843, -0.024409575, 0.044...","[-0.048077773, 0.11102984, -0.027754998, 0.043...","[-0.07323033, 0.11256627, -0.029350888, 0.0628..."
4,elections,"Election 2020, Donald Trump, Presidential Elec...",When President Trump speaks in front of some 2...,2,Trump's re-election campaign launches tonight.,As President Trump officially launches his 202...,As President Trump officially kicks off his 20...,"{""Trump"": 0.0, ""campaign"": 0.0, ""re-election c...","{""Election2020"": 0, ""Donald Trump"": 0.79999998...","{""Donald Trump"": 0.2, ""Joe Biden"": 0.3, ""Repub...","{""Election2020"": 0, ""Donald Trump"": 0.2, ""Pres...","{""Donald Trump"": 0.2, ""RNC"": 0.3, ""Joe Biden"":...",,00083375b852e7ac,"[-0.028865773, 0.09790309, 0.004893719, 0.0025...","[-0.002960503, 0.13613103, 0.0022070042, -0.01...","[-0.017735977, 0.115911745, 0.0124983005, -0.0..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45086,immigration,"Immigration, Mitch McConnell",Senate Republican Leader Mitch McConnell said ...,2,Immigration reform lacks bipartisan consensus.,Senate Republican Leader Mitch McConnell state...,Senate Republican Leader Mitch McConnell asser...,"{""Mitch McConnell"": -0.5, ""Matt Bevin"": -0.7, ...","{""Immigration"": 0, ""Mitch McConnell"": -0.5}","{""Mitch McConnell"": -0.6, ""Matt Bevin"": -0.8, ...","{""Immigration"": 0, ""Mitch McConnell"": -0.6}","{""Mitch McConnell"": -0.6, ""Matt Bevin"": -0.8, ...",,fffd6f68b5bcec3a,"[-0.0409906, 0.09073715, -0.019516913, -0.0134...","[-0.037168715, 0.09610114, -0.01898526, -0.022...","[-0.058798987, 0.1086956, -0.020934915, -0.015..."
45087,science,"Science, NASA, Space Exploration",Sign up for CNN’s Wonder Theory science newsle...,0,NASA's Artemis I launch postponed again.,NASA’s Artemis I moon mission launch was postp...,NASA's Artemis I moon mission launch has been ...,"{""time"": 0.0, ""Florida.NASA"": 0.0, ""Mike Saraf...","{""Science"": 0.10000000149011612, ""NASA"": 0.200...","{""mission launch"": 0.0, ""engine issue"": 0.0, ""...","{""Science"": 0.10000000149011612, ""NASA"": 0, ""S...","{""NASA"": 0.5, ""Mike Sarafin"": 0.2, ""Kamala Har...","{""Science"": 0.5, ""NASA"": 0.5, ""Space Explorati...",fffe2ebca1999ce4,"[0.014063582, 0.007860853, 0.0025481053, -0.00...","[0.02583513, 0.059887465, -0.03318025, -0.0011...","[0.011474235, 0.046107873, -0.02424741, 0.0018..."
45088,coronavirus,"Role Of Government, Healthcare, Public Health,...",President Donald Trump speaks about the corona...,1,Trump outlines state-specific reopening guidel...,"On April 15, 2020, President Trump discussed e...","On April 15, 2020, President Trump announced t...","{""Donald Trump"": 0.2, ""White House"": 0, ""Washi...","{""Role Of Government"": 0.1, ""Healthcare"": -0.5...","{""President Trump"": 0.2, ""U.S."": 0.5, ""busines...","{""Role Of Government"": 0, ""Healthcare"": 0, ""Pu...","{""Donald Trump"": 0.2, ""Dr. Deborah Birx"": 0.1,...",,fffe5dea333bf896,"[-0.033034544, 0.07097604, -0.009781277, -0.00...","[-0.032852333, 0.077151954, -0.042602945, -0.0...","[-0.0017861387, 0.06939759, -0.014608841, -0.0..."
45089,economy_and_jobs,"Economy And Jobs, Supply Chains, Coronavirus, ...",Quotes displayed in real-time or delayed by at...,2,Inflation and supply chains hinder recovery.,"In today’s tumultuous market landscape, surgin...","In its latest World Economic Outlook, the IMF ...","{""Mike Murphy"": 0.0, ""International Monetary F...","{""Economy And Jobs"": -1.7, ""Supply Chains"": 0,...","{""IMF"": 0.0, ""U.S."": -0.5, ""Federal Reserve"": ...","{""Economy And Jobs"": -0.5, ""GDP"": -0.5, ""Infla...","{""Mike Murphy"": 0.0, ""International Monetary F...",,ffff366708595386,"[-0.02462098, 0.057189632, -0.028621837, -0.02...","[-0.034127224, 0.050960753, -0.045273542, -0.0...","[-0.047483053, 0.059424736, -0.044047564, -0.0..."


In [95]:
df.dropna(subset=["text_topic_to_sentiment", "summary_100_topic_to_sentiment"], inplace=True)

In [100]:
from datasets import Dataset

mapped = Dataset.from_pandas(df, preserve_index=False)

In [102]:
text_split = mapped.map(
    categorize_topics_sentiments, 
    batched=True,
    batch_size=128,
)

Map: 100%|██████████| 45089/45089 [00:02<00:00, 17186.00 examples/s]


In [105]:
text_split

Dataset({
    features: ['topic', 'tags', 'text', 'int_bias', 'summary_5', 'summary_50', 'summary_100', 'text_entity_sentiments', 'text_topic_to_sentiment', 'summary_100_entity_sentiments', 'summary_100_topic_to_sentiment', 'summary_50_entity_sentiments', 'summary_50_topic_to_sentiment', 'id', 'text_encoded', 'summary_100_encoded', 'summary_50_encoded', 'text_topic_0', 'text_sentiment_0', 'text_topic_1', 'text_sentiment_1', 'text_topic_2', 'text_sentiment_2', 'text_topic_3', 'text_sentiment_3', 'text_topic_4', 'text_sentiment_4'],
    num_rows: 45089
})

In [106]:
text_split = text_split.map(
    categorize_topics_sentiments, 
    batched=True,
    batch_size=128,
)

Map: 100%|██████████| 45089/45089 [00:03<00:00, 12531.93 examples/s]


In [107]:
text_split

Dataset({
    features: ['topic', 'tags', 'text', 'int_bias', 'summary_5', 'summary_50', 'summary_100', 'text_entity_sentiments', 'text_topic_to_sentiment', 'summary_100_entity_sentiments', 'summary_100_topic_to_sentiment', 'summary_50_entity_sentiments', 'summary_50_topic_to_sentiment', 'id', 'text_encoded', 'summary_100_encoded', 'summary_50_encoded', 'text_topic_0', 'text_sentiment_0', 'text_topic_1', 'text_sentiment_1', 'text_topic_2', 'text_sentiment_2', 'text_topic_3', 'text_sentiment_3', 'text_topic_4', 'text_sentiment_4', 'summary_100_topic_0', 'summary_100_sentiment_0', 'summary_100_topic_1', 'summary_100_sentiment_1', 'summary_100_topic_2', 'summary_100_sentiment_2', 'summary_100_topic_3', 'summary_100_sentiment_3', 'summary_100_topic_4', 'summary_100_sentiment_4'],
    num_rows: 45089
})

In [108]:
save_dataset_to_huggingface(text_split)

Uploading the dataset shards:   0%|          | 0/2 [00:00<?, ?it/s]
[Aating parquet from Arrow format:   0%|          | 0/23 [00:00<?, ?ba/s]
[Aating parquet from Arrow format:   4%|▍         | 1/23 [00:00<00:05,  3.70ba/s]
[Aating parquet from Arrow format:   9%|▊         | 2/23 [00:00<00:04,  4.26ba/s]
[Aating parquet from Arrow format:  13%|█▎        | 3/23 [00:00<00:04,  4.60ba/s]
[Aating parquet from Arrow format:  17%|█▋        | 4/23 [00:00<00:04,  4.40ba/s]
[Aating parquet from Arrow format:  22%|██▏       | 5/23 [00:01<00:05,  3.27ba/s]
[Aating parquet from Arrow format:  26%|██▌       | 6/23 [00:01<00:04,  3.48ba/s]
[Aating parquet from Arrow format:  30%|███       | 7/23 [00:01<00:03,  4.01ba/s]
[Aating parquet from Arrow format:  35%|███▍      | 8/23 [00:01<00:03,  4.34ba/s]
[Aating parquet from Arrow format:  39%|███▉      | 9/23 [00:02<00:03,  4.58ba/s]
[Aating parquet from Arrow format:  43%|████▎     | 10/23 [00:02<00:02,  4.86ba/s]
[Aating parquet from Arro

Saved dataset: dragonslayer631/ci2_allsides, split: train
