# Preparation

In [None]:
# Install the Hugging Face libraries used by this notebook, then download the
# dataset folder from Google Drive with gdown.
# NOTE(review): package versions are not pinned, so a fresh runtime may pull
# newer releases than the ones this notebook was developed against.
!pip install transformers
!pip install sentence-transformers
!gdown --folder "1cszZtjGiWoS5kJEU3cF-VZaXDYMiV7KR"

Retrieving folder list
Processing file 1OIKLoXWjU2PX7-53E6bwcZjzlzB8I3u7 bias-neutrality.csv
Processing file 1MeRfyhEXB1BhW4JK_VPTrcqWEKuGnknC subjectivity-MPQA-All-News.csv
Processing file 1egTv57Du45zPTS17DklrJgisRoG6Uwe2 subjectivity-MPQA-All.csv
Processing file 1HjTFb9-LlJ6QRpgBAt6tl8Io-UqOA_TA subjectivity-MPQA-News.csv
Processing file 1BtvFvMO9Vd7tMIqhpTnaFmV2glKticbk subjectivity-MPQA.csv
Processing file 1XZ4ulTvgN3yxWuEZG8BqTrhJyh2c62bT subjectivity-Rotten IMDB.csv
Retrieving folder list completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1OIKLoXWjU2PX7-53E6bwcZjzlzB8I3u7
To: /content/Clean Dataset/bias-neutrality.csv
100% 788k/788k [00:00<00:00, 7.07MB/s]
Downloading...
From: https://drive.google.com/uc?id=1MeRfyhEXB1BhW4JK_VPTrcqWEKuGnknC
To: /content/Clean Dataset/subjectivity-MPQA-All-News.csv
100% 2.59M/2.59M [00:00<00:00, 16.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1egTv57Du45z

In [None]:
# Mount Google Drive at /content/drive; the model-saving cells later write
# under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from transformers import BertTokenizer, TFBertModel, TFRobertaModel, RobertaTokenizer
from sentence_transformers import SentenceTransformer
from nltk.cluster import KMeansClusterer
from nltk.cluster.util import cosine_distance
from sklearn.metrics import silhouette_score

import tensorflow as tf
import keras
from keras import layers, metrics
import pandas as pd
import numpy as np
import math
# NOTE(review): this binds the top-level `matplotlib` package to `plt`; the
# conventional target of the `plt` alias is `matplotlib.pyplot`. Confirm which
# one is intended before using `plt.<plotting function>` anywhere.
import matplotlib as plt
from google.colab import files

# Load Dataset

## Functions & Global Variables

In [None]:
# Fractions of the full dataset assigned to the train / validation / test splits.
TRAIN_SIZE = 0.6
VAL_SIZE = 0.2
TEST_SIZE = 0.2
# Per-task batch sizes (subjectivity, bias, neutrality).
SUBJ_BATCH_SIZE = 4
BIAS_BATCH_SIZE = 4
NETR_BATCH_SIZE = 4
# Fixed token length: encode() pads/truncates every text to this many tokens,
# and the model builders are called with it as their input length.
MAX_LEN = 512

# Hugging Face checkpoint names for the BERT encoder/tokenizer and for the
# sentence-embedding model.
bert_base = 'indolem/indobert-base-uncased'
sentence_bert_base = 'firqaaa/indo-sentence-bert-base'
# Loaded once here and used as module-level globals by the helper functions.
tokenizer = BertTokenizer.from_pretrained(bert_base)
sentence_transformer = SentenceTransformer(sentence_bert_base)

In [None]:
def sentence_encode(text):
    """Embed text with the module-level sentence-transformer model.

    Args:
        text: A sentence or a list of sentences to embed.

    Returns:
        Whatever ``sentence_transformer.encode`` returns for ``text`` (the
        sentence embeddings).
    """
    # Use `.encode()`: calling the SentenceTransformer object directly invokes
    # the underlying module's forward pass, which expects already-tokenized
    # feature dicts rather than raw strings.
    return sentence_transformer.encode(text)

In [None]:
def encode(text):
    """Tokenize text into fixed-length BERT inputs.

    Every text is padded or truncated to ``MAX_LEN`` tokens (special tokens
    included) by the module-level ``tokenizer``.

    Args:
        text: The text, or list of texts, to tokenize.

    Returns:
        A ``(input_ids, attention_mask)`` tuple taken from the tokenizer
        output, requested as TensorFlow tensors.
    """
    tokenizer_options = dict(
        add_special_tokens=True,
        max_length=MAX_LEN,
        padding='max_length',
        return_attention_mask=True,
        truncation=True,
        return_tensors='tf',
    )
    batch = tokenizer(text, **tokenizer_options)
    return batch['input_ids'], batch['attention_mask']

In [None]:
def split_input_labels(features, labels):
    """Re-pack a ``(features, labels)`` element as ``(model_inputs, labels)``.

    Args:
        features: Indexable pair whose item 0 is the input ids and item 1 the
            attention mask.
        labels: Labels for the element; passed through unchanged.

    Returns:
        A tuple of a dict keyed by ``'input_ids'`` / ``'attention_mask'`` and
        the untouched ``labels``.
    """
    ids = features[0]
    mask = features[1]
    model_inputs = dict(input_ids=ids, attention_mask=mask)
    return model_inputs, labels

In [None]:
def split_dataset(dataset):
  """Split a dataset into train / validation / test parts.

  The proportions come from the module-level TRAIN_SIZE, VAL_SIZE and
  TEST_SIZE fractions, which are expressed relative to the *full* dataset.
  Both splits shuffle with a fixed seed (42).

  Args:
    dataset: A dataset of ``(features, labels)`` elements accepted by
      ``tf.keras.utils.split_dataset``.

  Returns:
    A ``(train, val, test)`` tuple of datasets, each mapped through
    ``split_input_labels``.
  """
  train_val_dataset, test_dataset = tf.keras.utils.split_dataset(
    dataset, left_size=TRAIN_SIZE+VAL_SIZE, right_size=TEST_SIZE, shuffle=True, seed=42
  )

  # Float sizes are fractions of the dataset being split. The second split
  # operates on the train+val subset only, so the train share has to be
  # rescaled to that subset; passing TRAIN_SIZE / VAL_SIZE directly would
  # shrink both parts and silently discard the remainder of the subset.
  # With right_size omitted, validation receives everything not in train.
  train_fraction = TRAIN_SIZE / (TRAIN_SIZE + VAL_SIZE)
  train_dataset, val_dataset = tf.keras.utils.split_dataset(
      train_val_dataset, left_size=train_fraction, shuffle=True, seed=42
  )

  return (train_dataset.map(split_input_labels),
          val_dataset.map(split_input_labels),
          test_dataset.map(split_input_labels))

In [None]:
def dataset_preperation(text, labels, batch_size):
  """Tokenize texts, pair them with labels, then split and batch the result.

  Args:
    text: Object exposing ``.values.tolist()`` that yields the raw texts
      (the notebook passes a DataFrame column).
    labels: Label values; converted to an int32 tensor.
    batch_size: Batch size applied to each of the three splits.

  Returns:
    A ``(train, val, test)`` tuple of batched datasets.
  """
  encoded_inputs = encode(text.values.tolist())
  label_tensor = tf.constant(labels, dtype=tf.int32)
  full_dataset = tf.data.Dataset.from_tensor_slices((encoded_inputs, label_tensor))

  train_split, val_split, test_split = split_dataset(full_dataset)

  return tuple(part.batch(batch_size)
               for part in (train_split, val_split, test_split))

## Subjectivity

In [None]:
# Load the subjectivity CSV; the `content` column is the input text and
# `is_subjective` is the label column.
# NOTE(review): `features` / `labels` are reassigned by the bias and neutrality
# sections, so these names always hold the most recently loaded dataset.
df_subjectivity = pd.read_csv('/content/Clean Dataset/subjectivity-MPQA-All-News.csv')
features = df_subjectivity['content']
labels = df_subjectivity['is_subjective']

In [None]:
# Tokenize, split and batch the subjectivity data into train / val / test datasets.
train_subjectivity, val_subjectivity, test_subjectivity = dataset_preperation(features, labels, SUBJ_BATCH_SIZE)

## Bias

In [None]:
# Load the bias CSV; `text` is the input column and `is_biased` the label column.
# NOTE(review): this path is relative, whereas the subjectivity cell reads from
# the absolute '/content/Clean Dataset/...' location, and this file name is not
# among the files listed in the (truncated) download log — confirm the file
# exists relative to the working directory.
df_bias = pd.read_csv('Clean Dataset/bias-filtered-reduced.csv')
features = df_bias['text']
labels = df_bias['is_biased']

In [None]:
# Tokenize, split and batch the bias data into train / val / test datasets.
train_bias, val_bias, test_bias = dataset_preperation(features, labels, BIAS_BATCH_SIZE)

## Neutrality

In [None]:
# Load the neutrality CSV; `text` is the input column and the three
# `is_left` / `is_center` / `is_right` columns together form the label.
# NOTE(review): presumably these three columns are a one-hot encoding of the
# class (the neutrality model uses a 3-way softmax) — verify against the data.
# NOTE(review): relative path and a file name not listed in the (truncated)
# download log — confirm the file exists relative to the working directory.
df_neutrality = pd.read_csv('Clean Dataset/netr-filtered.csv')
features = df_neutrality['text']
labels = df_neutrality[['is_left', 'is_center', 'is_right']]

In [None]:
# Tokenize, split and batch the neutrality data into train / val / test datasets.
train_neutrality, val_neutrality, test_neutrality = dataset_preperation(features, labels, NETR_BATCH_SIZE)

# Modeling

## Subjectivity

In [None]:
def create_subjectivity_model(max_len):
  """Build and compile the subjectivity classifier (BERT encoder + Conv1D head).

  Args:
    max_len: Token length of the `input_ids` / `attention_mask` inputs.

  Returns:
    A compiled `keras.Model` that takes `[input_ids, attention_mask]` and
    emits one sigmoid output per example, trained with binary cross-entropy.
  """
  ids_in = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
  mask_in = layers.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

  # Only the inner `.bert` main layer of the pretrained wrapper is used; its
  # `trainable` flag is not modified here.
  encoder = TFBertModel.from_pretrained(bert_base, from_pt=True).bert
  encoded = encoder(ids_in, mask_in)

  # NOTE(review): the head declares input_shape=(768, 1) but is fed
  # `pooler_output`, presumably shaped (batch, 768) — confirm Keras supplies
  # the trailing channel axis as intended.
  head_layers = [
      layers.Conv1D(256, 3, input_shape=(768, 1), activation="relu"),
      layers.MaxPooling1D(),
      layers.Conv1D(256, 3, activation="relu"),
      layers.MaxPooling1D(),
      layers.Conv1D(256, 3, activation="relu"),
      layers.GlobalMaxPooling1D(),
      layers.Dense(256, activation="relu"),
      layers.Dense(32, activation="relu"),
      layers.Dense(32, activation="relu"),
      layers.Dense(1, activation='sigmoid'),
  ]
  head = keras.Sequential(head_layers)
  probability = head(encoded.pooler_output)

  model = keras.Model(inputs=[ids_in, mask_in], outputs=probability)
  model.compile(
      loss='binary_crossentropy',
      optimizer=tf.keras.optimizers.Adam(epsilon=1),
      metrics=['accuracy'],
  )
  return model

In [None]:
# Training settings for the subjectivity model: maximum epochs, early-stopping
# patience, and the version number embedded in the saved model's path.
SUBJ_EPOCHS = 10
SUBJ_PATIENCE = 3
SUBJ_VERSION = 2

In [None]:
# Build the subjectivity model for MAX_LEN-token inputs and print its layer summary.
subjectivity_model = create_subjectivity_model(MAX_LEN)
subjectivity_model.summary()

In [None]:
# Stop once val_loss has not improved for SUBJ_PATIENCE epochs and restore the
# weights from the best epoch.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                  patience=SUBJ_PATIENCE,
                                                  restore_best_weights=True)

# Train for at most SUBJ_EPOCHS epochs, validating on the validation split.
subjectivity_model.fit(train_subjectivity, epochs=SUBJ_EPOCHS, validation_data = val_subjectivity,
                       callbacks=[early_stopping])

In [None]:
# Save the trained model under the mounted Drive folder, versioned by SUBJ_VERSION.
subjectivity_model.save(f'/content/drive/MyDrive/Capstone/Model/SV{SUBJ_VERSION}')

In [None]:
# Run the model on the test split; the returned predictions are the cell output.
subjectivity_model.predict(test_subjectivity)

In [None]:
# Evaluate the compiled loss and accuracy metric on the test split.
subjectivity_model.evaluate(test_subjectivity)

## Bias

In [None]:
def create_bias_model(max_len):
  """Build and compile the bias classifier (BERT encoder + dense head).

  Args:
    max_len: Token length of the `input_ids` / `attention_mask` inputs.

  Returns:
    A compiled `keras.Model` that takes `[input_ids, attention_mask]` and
    emits one sigmoid output per example, trained with binary cross-entropy
    and the default 'adam' optimizer.
  """
  ids_in = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
  mask_in = layers.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

  # Only the inner `.bert` main layer of the pretrained wrapper is used; its
  # `trainable` flag is not modified here.
  encoder = TFBertModel.from_pretrained(bert_base, from_pt=True).bert
  encoded = encoder(ids_in, mask_in)

  # Dense head applied to the encoder's pooled output.
  head_layers = [
      layers.Dense(256, input_shape=(768,), activation="relu"),
      layers.Dense(32, activation="relu"),
      layers.Dense(32, activation="relu"),
      layers.Dense(1, activation='sigmoid'),
  ]
  head = keras.Sequential(head_layers)
  probability = head(encoded.pooler_output)

  model = keras.Model(inputs=[ids_in, mask_in], outputs=probability)
  model.compile(
      loss='binary_crossentropy',
      optimizer='adam',
      metrics=['accuracy'],
  )
  return model

In [None]:
# Training settings for the bias model: maximum epochs, early-stopping
# patience, and the version number embedded in the saved model's path.
BIAS_EPOCHS = 10
BIAS_PATIENCE = 3
BIAS_VERSION = 2

In [None]:
# Build the bias model for MAX_LEN-token inputs and print its layer summary.
bias_model = create_bias_model(MAX_LEN)
bias_model.summary()

In [None]:
# Stop once val_loss has not improved for BIAS_PATIENCE epochs and restore the
# weights from the best epoch. Note this rebinds the `early_stopping` name
# also used by the other training cells.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=BIAS_PATIENCE,
                                                      restore_best_weights=True)
# Train for at most BIAS_EPOCHS epochs, validating on the validation split.
bias_model.fit(train_bias, epochs=BIAS_EPOCHS, validation_data = val_bias,
                       callbacks=[early_stopping])

In [None]:
# Save the trained model under the mounted Drive folder, versioned by BIAS_VERSION.
bias_model.save(f'/content/drive/MyDrive/Capstone/Model/BV{BIAS_VERSION}')

In [None]:
# Run the model on the test split; the returned predictions are the cell output.
bias_model.predict(test_bias)

In [None]:
# Evaluate the compiled loss and accuracy metric on the test split.
bias_model.evaluate(test_bias)

## Neutrality

In [None]:
def create_neutrality_model(max_len):
  """Build and compile the 3-class neutrality classifier on a frozen BERT encoder.

  Args:
    max_len: Token length of the `input_ids` / `attention_mask` inputs.

  Returns:
    A compiled `keras.Model` that takes `[input_ids, attention_mask]` and
    emits a 3-way softmax, trained with categorical cross-entropy (so labels
    are expected as one vector of three values per example).
  """
  input_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
  attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

  # Keep a handle on the encoder *layer* so it can be frozen. `trainable` has
  # to be set on the layer itself; assigning it on the object returned by
  # calling the layer freezes nothing. Freezing happens before compile().
  bert_layer = TFBertModel.from_pretrained(bert_base, from_pt=True).bert
  bert_layer.trainable = False
  bert = bert_layer(input_ids, attention_mask)

  # Small dense head on top of the encoder's pooled output.
  classifier = keras.Sequential([
        layers.Dense(32, input_shape=(768,), activation="relu"),
        layers.Dense(32, activation="relu"),
        layers.Dense(3, activation='softmax')
  ])(bert.pooler_output)

  model = keras.Model(inputs=[input_ids, attention_mask],
                     outputs=classifier)

  model.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

  return model

In [None]:
# Training settings for the neutrality model: maximum epochs, early-stopping
# patience, and the version number embedded in the saved model's path.
NETR_EPOCHS = 10
NETR_PATIENCE = 3
NETR_VERSION = 2

In [None]:
# Build the neutrality model for MAX_LEN-token inputs and print its layer summary.
neutrality_model = create_neutrality_model(MAX_LEN)
neutrality_model.summary()

In [None]:
# Stop once val_loss has not improved for NETR_PATIENCE epochs and restore the
# weights from the best epoch. Note this rebinds the `early_stopping` name
# also used by the other training cells.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss',
                                                      patience=NETR_PATIENCE,
                                                      restore_best_weights=True)
# Train for at most NETR_EPOCHS epochs, validating on the validation split.
neutrality_model.fit(train_neutrality, epochs=NETR_EPOCHS, validation_data = val_neutrality,
                       callbacks=[early_stopping])

In [None]:
# Save the trained model under the mounted Drive folder, versioned by NETR_VERSION.
neutrality_model.save(f'/content/drive/MyDrive/Capstone/Model/NV{NETR_VERSION}')

In [None]:
# Run the model on the test split; the returned predictions are the cell output.
neutrality_model.predict(test_neutrality)

In [None]:
# Evaluate the compiled loss and accuracy metric on the test split.
neutrality_model.evaluate(test_neutrality)