# Preperation

In [1]:
!pip install transformers
!gdown --folder "1cszZtjGiWoS5kJEU3cF-VZaXDYMiV7KR"

Retrieving folder list
Processing file 1ZzikX7CWHoFFHMZKEclZSuusKMy6JgfD bias-filtered-reduced.csv
Processing file 1kPk4UsXsTxHkAjnklxbbLsSQqM2ZikqP bias-filtered.csv
Processing file 1OIKLoXWjU2PX7-53E6bwcZjzlzB8I3u7 bias-neutrality.csv
Processing file 1VZOwfmloJxDUXTZgk_uzt_fm5U56QPCa netr-filtered-reduced.csv
Processing file 1VuiJdcgtDInrlKDdqGABoIR4Luib6LJc netr-filtered.csv
Processing file 1MeRfyhEXB1BhW4JK_VPTrcqWEKuGnknC subjectivity-MPQA-All-News.csv
Processing file 1egTv57Du45zPTS17DklrJgisRoG6Uwe2 subjectivity-MPQA-All.csv
Processing file 1XZ4ulTvgN3yxWuEZG8BqTrhJyh2c62bT subjectivity-Rotten IMDB.csv
Retrieving folder list completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1ZzikX7CWHoFFHMZKEclZSuusKMy6JgfD
To: /content/Clean Dataset/bias-filtered-reduced.csv
100% 230k/230k [00:00<00:00, 78.1MB/s]
Downloading...
From: https://drive.google.com/uc?id=1kPk4UsXsTxHkAjnklxbbLsSQqM2ZikqP
To: /content/Clean

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from transformers import BertTokenizer, TFBertModel, TFRobertaModel, RobertaTokenizer
import tensorflow as tf
import keras
from keras import layers, metrics
import pandas as pd
import numpy as np
import math
import matplotlib as plt
from google.colab import files

# Load Dataset

## Function & Global Variables

In [4]:
TRAIN_SIZE = 0.6
VAL_SIZE = 0.2
TEST_SIZE = 0.2
SUBJ_BATCH_SIZE = 1
BIAS_BATCH_SIZE = 16
NETR_BATCH_SIZE = 16

bert_base = 'indolem/indobert-base-uncased'
sentence_bert_base = 'firqaaa/indo-sentence-bert-base'
tokenizer = BertTokenizer.from_pretrained(bert_base)

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/234k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.01k [00:00<?, ?B/s]

In [5]:
def encode(text, max_len):
    encoded_dict = tokenizer(text, add_special_tokens = True, max_length = max_len,
                             padding='max_length', return_attention_mask = True,
                             truncation=True, return_tensors = 'tf')

    input_ids = encoded_dict['input_ids']
    attention_masks = encoded_dict['attention_mask']
    return (input_ids, attention_masks)

In [6]:
def split_input_labels(features, labels):
    return {'input_ids': features[0], 'attention_mask': features[1]}, labels

In [7]:
def split_dataset(dataset):
  train_val_dataset, test_dataset = tf.keras.utils.split_dataset(
    dataset, left_size=TRAIN_SIZE+VAL_SIZE, right_size=TEST_SIZE, shuffle=True, seed=42
  )

  train_dataset, val_dataset = tf.keras.utils.split_dataset(
      train_val_dataset, left_size=TRAIN_SIZE, right_size=VAL_SIZE, shuffle=True, seed=42
  )

  return (train_dataset.map(split_input_labels),
          val_dataset.map(split_input_labels),
          test_dataset.map(split_input_labels))

In [8]:
def dataset_preperation(text, labels, max_len, batch_size):
  features_tensor = encode(text.values.tolist(), max_len)
  labels_tensor = tf.constant(labels, dtype=tf.int32)
  dataset = tf.data.Dataset.from_tensor_slices((features_tensor, labels_tensor))

  train, val, test = split_dataset(dataset)

  return (train.batch(batch_size),
          val.batch(batch_size),
          test.batch(batch_size))

## Subjectivity

In [9]:
df_subjectivity = pd.read_csv('/content/Clean Dataset/subjectivity-MPQA-All-News.csv')
features = df_subjectivity['content']
max_len_subjectivity = 2048
labels = df_subjectivity['is_subjective']

In [10]:
train_subjectivity, val_subjectivity, test_subjectivity = dataset_preperation(features, labels, max_len_subjectivity, SUBJ_BATCH_SIZE)

## Bias

In [11]:
df_bias = pd.read_csv('Clean Dataset/bias-filtered.csv')
features = df_bias['text']
max_len_bias = 256
labels = df_bias['is_biased']

In [12]:
train_bias, val_bias, test_bias = dataset_preperation(features, labels, max_len_bias, BIAS_BATCH_SIZE)

## Neutrality

In [13]:
df_neutrality = pd.read_csv('Clean Dataset/netr-filtered.csv')
features = df_neutrality['text']
max_len_neutrality = 256
labels = df_neutrality[['is_left', 'is_center', 'is_right']]

In [14]:
train_neutrality, val_neutrality, test_neutrality = dataset_preperation(features, labels, max_len_neutrality, NETR_BATCH_SIZE)

# Modeling

## Subjectivity

In [None]:
def create_subjectivity_model(max_len):
  input_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
  attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

  bert = TFBertModel.from_pretrained(bert_base, from_pt=True).bert(input_ids, attention_mask)

  classifier = keras.Sequential([
        layers.Dense(256, input_shape=(768,), activation="relu"),
        layers.Dense(32, activation="relu"),
        layers.Dense(32, activation="relu"),
        layers.Dense(1, activation='sigmoid')
  ])(bert.pooler_output)

  model = keras.Model(inputs=[input_ids, attention_mask],
                     outputs=classifier)

  optimizer = tf.keras.optimizers.Adam(epsilon=1)

  model.compile(loss='binary_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])

  return model

In [None]:
SUBJ_EPOCHS = 10
SUBJ_PATIENCE = 3
SUBJ_VERSION = 1

In [None]:
subjectivity_model = create_subjectivity_model(max_len_subjectivity)
subjectivity_model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predict

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 2048)]               0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 2048)]               0         []                            
 )                                                                                                
                                                                                                  
 bert (TFBertMainLayer)      TFBaseModelOutputWithPooli   1105582   ['input_ids[0][0]',           
                             ngAndCrossAttentions(last_   08         'attention_mask[0][0]']      
                             hidden_state=(None, 2048,                                      

In [None]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=SUBJ_PATIENCE,
                                                  restore_best_weights=True)

subjectivity_model.fit(train_subjectivity, epochs=SUBJ_EPOCHS, validation_data = val_subjectivity,
                       callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


<keras.src.callbacks.History at 0x790f4655afe0>

In [None]:
subjectivity_model.save(f'/content/drive/MyDrive/Capstone/Model/Subjectivity/SV{SUBJ_VERSION}')

In [None]:
subjectivity_model.predict(test_subjectivity)



array([[0.31855616],
       [0.2903657 ],
       [0.36654255],
       [0.30079728],
       [0.34997845],
       [0.29499784],
       [0.3222464 ],
       [0.31842774],
       [0.28660727],
       [0.40144923],
       [0.29611614],
       [0.30407473],
       [0.30407473],
       [0.4438432 ],
       [0.42791292],
       [0.29853344],
       [0.34909645],
       [0.41414958],
       [0.3259278 ],
       [0.40534478],
       [0.2562536 ],
       [0.39036724],
       [0.3066061 ],
       [0.44139937],
       [0.28023285],
       [0.5213559 ],
       [0.4118632 ],
       [0.4345877 ],
       [0.2697495 ],
       [0.3918496 ],
       [0.38712373],
       [0.32888538],
       [0.29503125],
       [0.34603542],
       [0.3043619 ],
       [0.30412477],
       [0.35236812],
       [0.31367445],
       [0.3043619 ],
       [0.32854506],
       [0.30905625],
       [0.30839154],
       [0.33698857],
       [0.3251465 ],
       [0.3556957 ],
       [0.36404327],
       [0.36504796],
       [0.303

In [None]:
subjectivity_model.evaluate(test_subjectivity)



[0.617942214012146, 0.6462264060974121]

## Bias

In [29]:
def create_bias_model(max_len):
  input_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
  attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

  bert = TFBertModel.from_pretrained(bert_base, from_pt=True).bert(input_ids, attention_mask)

  classifier = keras.Sequential([
        layers.Dense(256, input_shape=(768,), activation="relu"),
        layers.Dense(32, activation="relu"),
        layers.Dense(32, activation="relu"),
        layers.Dense(1, activation='sigmoid')
  ])(bert.pooler_output)

  model = keras.Model(inputs=[input_ids, attention_mask],
                     outputs=classifier)

  optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
  model.compile(loss='binary_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])

  return model

In [30]:
BIAS_EPOCHS = 10
BIAS_PATIENCE = 3
BIAS_VERSION = 1

In [31]:
bias_model = create_bias_model(max_len_bias)
bias_model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predict

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 256)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 256)]                0         []                            
 )                                                                                                
                                                                                                  
 bert (TFBertMainLayer)      TFBaseModelOutputWithPooli   1105582   ['input_ids[0][0]',           
                             ngAndCrossAttentions(last_   08         'attention_mask[0][0]']      
                             hidden_state=(None, 256, 7                                     

In [32]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=BIAS_PATIENCE,
                                                      restore_best_weights=True)
bias_model.fit(train_bias, epochs=BIAS_EPOCHS, validation_data = val_bias,
                       callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fd9f45547f0>

In [33]:
bias_model.save(f'/content/drive/MyDrive/Capstone/Model/Bias/BV{BIAS_VERSION}')

In [34]:
bias_model.predict(test_bias)



array([[0.8908661 ],
       [0.12256102],
       [0.32424453],
       [0.20777075],
       [0.69645315],
       [0.44803062],
       [0.92570794],
       [0.9520474 ],
       [0.3835133 ],
       [0.8744836 ],
       [0.13153808],
       [0.24574172],
       [0.6816459 ],
       [0.9034042 ],
       [0.85513645],
       [0.8267587 ],
       [0.78159213],
       [0.305634  ],
       [0.7204285 ],
       [0.18764143],
       [0.07929206],
       [0.15461597],
       [0.1295766 ],
       [0.07907502],
       [0.09996358],
       [0.08622323],
       [0.11408448],
       [0.8868539 ],
       [0.1462071 ],
       [0.5147197 ],
       [0.94697344],
       [0.08437754],
       [0.39240244],
       [0.77824163],
       [0.12817374],
       [0.9034042 ],
       [0.9143661 ],
       [0.55915236],
       [0.71685064],
       [0.93765706],
       [0.23199877],
       [0.33797377],
       [0.49714872],
       [0.56076133],
       [0.12788354],
       [0.940671  ],
       [0.6999482 ],
       [0.666

In [35]:
bias_model.evaluate(test_bias)



[0.5304142236709595, 0.75]

## Neutrality

In [22]:
def create_neutrality_model(max_len):
  input_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
  attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

  bert = TFBertModel.from_pretrained(bert_base, from_pt=True).bert(input_ids, attention_mask)
  classifier = keras.Sequential([
        layers.Dense(32, input_shape=(768,), activation="relu"),
        layers.Dense(32, activation="relu"),
        layers.Dense(3, activation='softmax')
  ])(bert.pooler_output)

  model = keras.Model(inputs=[input_ids, attention_mask],
                     outputs=classifier)

  optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)

  model.compile(loss='binary_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])

  return model

In [23]:
NETR_EPOCHS = 10
NETR_PATIENCE = 3
NETR_VERSION = 1

In [24]:
neutrality_model = create_neutrality_model(max_len_neutrality)
neutrality_model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predict

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 256)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 256)]                0         []                            
 )                                                                                                
                                                                                                  
 bert (TFBertMainLayer)      TFBaseModelOutputWithPooli   1105582   ['input_ids[0][0]',           
                             ngAndCrossAttentions(last_   08         'attention_mask[0][0]']      
                             hidden_state=(None, 256, 7                                     

In [25]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=NETR_PATIENCE,
                                                      restore_best_weights=True)
neutrality_model.fit(train_neutrality, epochs=NETR_EPOCHS, validation_data = val_neutrality,
                       callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7fd9f4aaec20>

In [26]:
neutrality_model.save(f'/content/drive/MyDrive/Capstone/Model/Neutrality/NV{NETR_VERSION}')

In [27]:
neutrality_model.predict(test_neutrality)



array([[0.5676932 , 0.00590714, 0.42639962],
       [0.0286252 , 0.00247293, 0.9689019 ],
       [0.11495032, 0.8634309 , 0.02161878],
       ...,
       [0.38594294, 0.46870086, 0.14535625],
       [0.7080511 , 0.00697089, 0.28497803],
       [0.01442005, 0.01192094, 0.97365904]], dtype=float32)

In [28]:
neutrality_model.evaluate(test_neutrality)



[0.4602760374546051, 0.6979655623435974]