### Initial Implementation

##### 1. Imports

In [1]:
import numpy as np
import pandas as pd
from datasets import load_dataset

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.metrics import classification_report, accuracy_score

##### 2. Load Dataset

In [2]:
data = load_dataset("shainar/BEAD", "Full_Annotations", split="full")
print(data)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/7.08k [00:00<?, ?B/s]

Full.csv:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

Generating full split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['text', 'dimension', 'biased_words', 'aspect', 'label', 'sentiment', 'toxic', 'identity_mention'],
    num_rows: 3674927
})


In [3]:
df = data.to_pandas()
print(df.head())

                                                text             dimension  \
0  Where will this leave the kids??? That my frie...  Ageism and sentiment   
1   I had a feeling of impending doom &amp; then ...  Ageism and sentiment   
2                                 I no feel good ..   Ageism and sentiment   
3  Back in Hotlanta for a about a week, then off ...  Ageism and sentiment   
4  Kids toys &amp; stamping stuff cleaned up... N...  Ageism and sentiment   

            biased_words          aspect            label sentiment  toxic  \
0                     []     Transphobia  Slightly Biased  Negative    0.0   
1  ['impending', 'doom']  Climate change    Highly Biased  Negative    1.0   
2                     []    Body-shaming          Neutral  Positive    0.0   
3                     []       Lifestyle  Slightly Biased  Negative    0.0   
4                     []           Hobby          Neutral   Neutral    1.0   

  identity_mention  
0               NO  
1               NO  

##### 3. Text Pre-processing

In [4]:
print(df.isnull().sum())

text                     2
dimension                0
biased_words             0
aspect              177970
label                    0
sentiment                0
toxic                    0
identity_mention         0
dtype: int64


In [5]:
df = df.dropna(subset=['text'])

In [6]:
df = df.drop_duplicates(subset='text')

In [7]:
print("Min text snippet length:", df['text'].astype('string').apply(len).min())
print("Max text snippet length:", df['text'].astype('string').apply(len).max())
print("Avg text snippet length:", df['text'].astype('string').apply(len).mean())

Min text snippet length: 1
Max text snippet length: 37903
Avg text snippet length: 211.07703640651337


In [8]:
text_len = df['text'].str.len()
ind = text_len.idxmin()
row = df.loc[ind]
print(row)

text                           .
dimension               toxicity
biased_words                  []
aspect              Perspective 
label                    Neutral
sentiment                Neutral
toxic                        1.0
identity_mention              NO
Name: 746630, dtype: object


In [9]:
min_len = 5
ind_drop = df[df['text'].str.len() < min_len].index
print(f"Dropped {len(ind_drop)} rows.")
df.drop(ind_drop, inplace=True)

Dropped 825 rows.


In [10]:
print("Min text snippet length:", df['text'].astype('string').apply(len).min())
print("Max text snippet length:", df['text'].astype('string').apply(len).max())
print("Avg text snippet length:", df['text'].astype('string').apply(len).mean())

Min text snippet length: 5
Max text snippet length: 37903
Avg text snippet length: 211.12367156175208


In [11]:
import re

text = df['text']

text = text.apply(lambda x: x.lower())
def edit(text):
	text = re.sub(r'<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});', '', text)
	text = re.sub(r'https?://\S+|www\.\S+', '', text)
	text = re.sub(r'[^a-z0-9\s.,!?]', '', text)
	text = re.sub(r'\s+', ' ', text).strip()
	return text
text = text.apply(edit)

df['text'] = text

In [12]:
min_len = 5
ind_drop = df[df['text'].str.len() < min_len].index
print(f"Dropped {len(ind_drop)} rows.")
df.drop(ind_drop, inplace=True)

Dropped 2637 rows.


In [14]:
df['text'] = df['text'].astype(str)
df['text'] = df['text'].fillna('')

##### 4. Dataset Characteristics

In [13]:
print("Min text snippet length:", df['text'].astype('string').apply(len).min())
print("Max text snippet length:", df['text'].astype('string').apply(len).max())
print("Avg text snippet length:", df['text'].astype('string').apply(len).mean())

Min text snippet length: 5
Max text snippet length: 32191
Avg text snippet length: 203.62309087739558


In [16]:
print(f"{len(df)} rows.")

3671320


In [17]:
print(df['label'].value_counts())

label
Neutral            1882864
Slightly Biased    1041122
Highly Biased       747334
Name: count, dtype: int64


In [18]:
print(df['sentiment'].value_counts())

sentiment
Positive    1678404
Negative    1224348
Neutral      768568
Name: count, dtype: int64


##### 5. One-Hot Encoding

In [19]:
label_encoder = LabelEncoder()
df['label_encoded'] = label_encoder.fit_transform(df['label'])
num_classes = len(label_encoder.classes_)
y = to_categorical(df['label_encoded'], num_classes=num_classes)

##### 6. Tokenisation & Padding

In [20]:
MAX_VOCAB = 10000
MAX_LEN = 100

tokenizer = Tokenizer(num_words=MAX_VOCAB, oov_token="<OOV>")
tokenizer.fit_on_texts(df['text'])

X = tokenizer.texts_to_sequences(df['text'])
X = pad_sequences(X, maxlen=MAX_LEN, padding='post', truncating='post')

##### 7. Train-Test Split

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
X_test, X_valid, y_test, y_valid = train_test_split(X_test, y_test, test_size=0.5, random_state=42, stratify=y_test)

##### 8. Model Architecture

In [22]:
embedding_dim = 128

model = Sequential([
    Embedding(input_dim=MAX_VOCAB, output_dim=embedding_dim, input_length=MAX_LEN),
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.5),
    Dense(64, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),
    Dense(num_classes, activation='softmax')
])

model.build(input_shape=(None, MAX_LEN))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()



##### 9. Model Training

In [23]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3)

history = model.fit(
    X_train, y_train,
    validation_data=(X_valid, y_valid),
    epochs=10,
    batch_size=32,
    verbose=1,
    callbacks=[early_stopping]
)

Epoch 1/10
[1m91783/91783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1217s[0m 13ms/step - accuracy: 0.7956 - loss: 0.5055 - val_accuracy: 0.8520 - val_loss: 0.3831
Epoch 2/10
[1m91783/91783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1211s[0m 13ms/step - accuracy: 0.8500 - loss: 0.3964 - val_accuracy: 0.8568 - val_loss: 0.3705
Epoch 3/10
[1m91783/91783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1206s[0m 13ms/step - accuracy: 0.8551 - loss: 0.3827 - val_accuracy: 0.8574 - val_loss: 0.3669
Epoch 4/10
[1m91783/91783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1232s[0m 13ms/step - accuracy: 0.8586 - loss: 0.3741 - val_accuracy: 0.8586 - val_loss: 0.3646
Epoch 5/10
[1m91783/91783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1215s[0m 13ms/step - accuracy: 0.8602 - loss: 0.3700 - val_accuracy: 0.8581 - val_loss: 0.3648
Epoch 6/10
[1m91783/91783[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1211s[0m 13ms/step - accuracy: 0.8617 - loss: 0.3668 - val_accuracy: 0.8580

##### 10. Model Evaluation

In [24]:
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

print("Accuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:\n", classification_report(y_true, y_pred, target_names=label_encoder.classes_))

[1m11473/11473[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 4ms/step
Accuracy: 0.8588518570977196

Classification Report:
                  precision    recall  f1-score   support

  Highly Biased       0.85      0.78      0.82     74734
        Neutral       0.91      0.94      0.93    188286
Slightly Biased       0.76      0.77      0.77    104112

       accuracy                           0.86    367132
      macro avg       0.84      0.83      0.84    367132
   weighted avg       0.86      0.86      0.86    367132

