In [None]:
from huggingface_hub import login
from datasets import load_dataset
import os

login(os.getenv("hf_token"))

train = load_dataset("dragonslayer631/ci2_allsides", split="train")
test = load_dataset("dragonslayer631/ci2_allsides", split="test")

train = train.to_pandas()
test = test.to_pandas()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split

In [38]:
joined = pd.concat([train, test], axis=0)

joined.shape

(45089, 47)

In [39]:
joined.columns

Index(['topic', 'tags', 'text', 'int_bias', 'summary_5', 'summary_50',
       'summary_100', 'text_entity_sentiments', 'text_topic_to_sentiment',
       'summary_100_entity_sentiments', 'summary_100_topic_to_sentiment',
       'summary_50_entity_sentiments', 'summary_50_topic_to_sentiment', 'id',
       'text_encoded', 'summary_100_encoded', 'summary_50_encoded',
       'text_topic_0', 'text_sentiment_0', 'text_topic_1', 'text_sentiment_1',
       'text_topic_2', 'text_sentiment_2', 'text_topic_3', 'text_sentiment_3',
       'text_topic_4', 'text_sentiment_4', 'summary_100_topic_0',
       'summary_100_sentiment_0', 'summary_100_topic_1',
       'summary_100_sentiment_1', 'summary_100_topic_2',
       'summary_100_sentiment_2', 'summary_100_topic_3',
       'summary_100_sentiment_3', 'summary_100_topic_4',
       'summary_100_sentiment_4', 'summary_50_topic_0',
       'summary_50_sentiment_0', 'summary_50_topic_1',
       'summary_50_sentiment_1', 'summary_50_topic_2',
       'summary_

In [40]:
joined = joined.drop([
    'topic', 'tags', 'summary_100', 'summary_5', 'summary_100', 'summary_100', 'summary_100_entity_sentiments', 'summary_100_topic_to_sentiment','summary_100_entity_sentiments', 'summary_100_topic_to_sentiment','text_entity_sentiments', 'text_topic_to_sentiment', 'id',
    'text_encoded', 'summary_100_encoded',
    'summary_100_topic_0', 'summary_100_sentiment_0', 
    'summary_100_topic_1',
   'summary_100_sentiment_1', 'summary_100_topic_2',
   'summary_100_sentiment_2', 'summary_100_topic_3',
   'summary_100_sentiment_3', 'summary_100_topic_4',
   'summary_100_sentiment_4', 'text_topic_0',
   'text_sentiment_0', 'text_topic_1',
   'text_sentiment_1', 'text_topic_2',
   'text_sentiment_2', 'text_topic_3',
   'text_sentiment_3', 'text_topic_4',
   'text_sentiment_4', 
], axis=1)

In [41]:
joined.columns

Index(['text', 'int_bias', 'summary_50', 'summary_50_entity_sentiments',
       'summary_50_topic_to_sentiment', 'summary_50_encoded',
       'summary_50_topic_0', 'summary_50_sentiment_0', 'summary_50_topic_1',
       'summary_50_sentiment_1', 'summary_50_topic_2',
       'summary_50_sentiment_2', 'summary_50_topic_3',
       'summary_50_sentiment_3', 'summary_50_topic_4',
       'summary_50_sentiment_4'],
      dtype='object')

In [42]:
import pandas as pd

def multi_column_one_hot_2(df, topic_columns, sentiment_columns) -> pd.DataFrame:
    """
    One-hot encode multiple categorical topic columns and their associated sentiment values.
    
    Returns a single DataFrame with:
    - One-hot vectors for each topic (prefix 'topic ')
    - Sentiment-weighted vectors (prefix 'sentiment ')
    """
    # Stack topics and sentiments into long format
    topic_long = df[topic_columns].copy()
    sentiment_long = df[sentiment_columns].copy()

    topic_long.columns = range(len(topic_columns))  # avoid duplicate column names
    sentiment_long.columns = range(len(sentiment_columns))

    topic_series = topic_long.stack()
    sentiment_series = sentiment_long.stack()

    # Align manually by index
    combined = pd.DataFrame({
        'topic': topic_series.values,
        'sentiment': sentiment_series.values
    }, index=topic_series.index)

    # One-hot encode
    one_hot = pd.get_dummies(combined['topic'])
    weighted = one_hot.mul(combined['sentiment'], axis=0)

    # Aggregate back to row-level (level 0)
    one_hot_topics = one_hot.groupby(level=0).max().add_prefix('topic ')
    one_hot_sentiments = weighted.groupby(level=0).sum().add_prefix('sentiment ')

    # Final concatenation
    res = pd.concat([one_hot_topics, one_hot_sentiments], axis=1)
    final = pd.concat([
    df.drop(columns=topic_columns + sentiment_columns).reset_index(drop=True),
    res.reset_index(drop=True)
], axis=1)

    return final


In [44]:
joined = multi_column_one_hot_2(joined, topic_columns=['summary_50_topic_0', 'summary_50_topic_1', 'summary_50_topic_2', 'summary_50_topic_3', 'summary_50_topic_4'], sentiment_columns=['summary_50_sentiment_0', 'summary_50_sentiment_1', 'summary_50_sentiment_2', 'summary_50_sentiment_3', 'summary_50_sentiment_4'])

In [45]:
joined.shape

(45089, 4974)

In [46]:
summary_50_encoding_df = pd.DataFrame(joined['summary_50_encoded'].tolist())
joined = pd.concat([joined.drop('summary_50_encoded', axis=1), summary_50_encoding_df], axis=1)

In [58]:
# split back into test and train

train_idx = train.index
test_idx = test.index

X_train = joined.loc[train_idx]
Y_train = X_train[['int_bias']]
X_train.drop(["int_bias"], axis=1, inplace=True)

X_test = joined.loc[test_idx]
Y_test = X_test[['int_bias']]
X_test.drop(["int_bias"], axis=1, inplace=True)

X_train = X_train.convert_dtypes()
X_test = X_test.convert_dtypes()
Y_test = Y_test.convert_dtypes()
Y_train = Y_train.convert_dtypes()



print(X_train.shape, Y_train.shape, X_test.shape, Y_test.shape)

(36071, 5736) (36071, 1) (9018, 5736) (9018, 1)


In [59]:
from xgboost import XGBClassifier

model = XGBClassifier(
    objective='multi:softmax',  # outputs class probabilities
    num_class=3,
    max_depth=8,                # Avoid extreme depth like 128
    tree_method='hist',     # Use GPU if available
    predictor='gpu_predictor',  # Use GPU for inference
    use_label_encoder=False,    # suppress warning
    eval_metric='mlogloss',
    device=device
)

model.fit(X_train, Y_train)

# # Predict probabilities for each class
# probs = model.predict_proba(X_test_updated)

# Predict class labels
preds = model.predict(X_test)

print(classification_report(Y_test, preds, digits=4))

Parameters: { "predictor", "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


              precision    recall  f1-score   support

         0.0     1.0000    1.0000    1.0000      3677
         1.0     1.0000    1.0000    1.0000      2208
         2.0     1.0000    1.0000    1.0000      3133

    accuracy                         1.0000      9018
   macro avg     1.0000    1.0000    1.0000      9018
weighted avg     1.0000    1.0000    1.0000      9018



In [60]:
model.save_model("xgb_summary_50.json")  # or .bin