# Preparation

In [None]:
!pip install transformers
!pip install sentence-transformers
!gdown --folder "1cszZtjGiWoS5kJEU3cF-VZaXDYMiV7KR"

In [None]:
from transformers import BertTokenizer, TFBertModel, TFRobertaModel, RobertaTokenizer
from sentence_transformers import SentenceTransformer
from nltk.cluster import KMeansClusterer
from nltk.cluster.util import cosine_distance
from sklearn.metrics import silhouette_score

import tensorflow as tf
import keras
from keras import layers, metrics
import pandas as pd
import numpy as np
import math
import matplotlib as plt
from google.colab import files

# Load Dataset

## Functions & Global Variables

In [None]:
# Split fractions read by split_dataset (they sum to 1.0).
TRAIN_SIZE = 0.6
VAL_SIZE = 0.2
TEST_SIZE = 0.2
# Batch size applied to every split in dataset_preperation.
BATCH_SIZE = 1

# Hugging Face identifiers for the BERT encoder and the sentence-embedding model.
bert_base = 'indolem/indobert-base-uncased'
sentence_bert_base = 'firqaaa/indo-sentence-bert-base'
# Shared instances, loaded once here and read as globals by the helper functions.
tokenizer = BertTokenizer.from_pretrained(bert_base)
sentence_transformer = SentenceTransformer(sentence_bert_base)

In [None]:
def sentence_encode(text):
    """Embed text with the module-level sentence-transformer model.

    Args:
        text: A single string, or an iterable of strings (list, pandas
            Series, ...).

    Returns:
        Whatever ``SentenceTransformer.encode`` returns for the input; with
        its default arguments that is a NumPy array of embeddings.
    """
    # Calling the model object itself runs the raw forward pass, which expects
    # already-tokenized features; `encode` is the entry point for raw text.
    if not isinstance(text, str):
        # Materialise to a plain list so positional access inside `encode`
        # cannot be confused by a non-default pandas index.
        text = list(text)
    return sentence_transformer.encode(text)

In [None]:
def get_max_len(texts, cap=2048):
  """Pick the padded sequence length for a collection of texts.

  Each text is tokenized (special tokens included) with the module-level
  tokenizer; the longest token count is rounded up to the next power of two
  and then limited to `cap`.

  Args:
    texts: Iterable of strings to measure.
    cap: Upper bound for the returned length. Defaults to 2048, the value
      that was previously hard-coded.
      NOTE(review): BERT-base checkpoints usually provide 512 position
      embeddings - confirm against the model config and pass a smaller cap
      if that applies to `bert_base`.

  Returns:
    int: min(next power of two >= longest token count, cap).

  Raises:
    ValueError: If `texts` is empty (there is nothing to measure).
  """
  longest = max(
      (len(tokenizer(text, add_special_tokens=True)['input_ids']) for text in texts),
      default=0,
  )
  if longest == 0:
    # Previously this surfaced as an opaque "math domain error" from log2(0).
    raise ValueError('get_max_len() received no texts to measure')

  # Next power of two >= longest, in exact integer arithmetic
  # (same result as 2 ** ceil(log2(longest)) without going through floats).
  padded = 1 << (longest - 1).bit_length()
  return min(padded, cap)

In [None]:
def encode(text, max_len):
    """Tokenize text into fixed-length TensorFlow tensors.

    Inputs are padded to `max_len` and truncated when longer; special tokens
    are added.

    Args:
        text: Input accepted by the module-level tokenizer (a string or a
            list of strings).
        max_len: Target sequence length used for padding and truncation.

    Returns:
        tuple: ``(input_ids, attention_mask)`` as returned by the tokenizer
        with ``return_tensors='tf'``.
    """
    tokenized = tokenizer(
        text,
        add_special_tokens=True,
        padding='max_length',
        truncation=True,
        max_length=max_len,
        return_attention_mask=True,
        return_tensors='tf',
    )
    return (tokenized['input_ids'], tokenized['attention_mask'])

In [None]:
def split_input_labels(features, labels):
    """Repackage a ``(features, labels)`` element into Keras model inputs.

    Args:
        features: Indexable whose item 0 is the input ids and item 1 the
            attention mask.
        labels: Target value(s), passed through untouched.

    Returns:
        tuple: ``(inputs, labels)`` where ``inputs`` is a dict keyed by
        ``'input_ids'`` and ``'attention_mask'``.
    """
    model_inputs = dict(input_ids=features[0], attention_mask=features[1])
    return model_inputs, labels

In [None]:
def split_dataset(dataset):
  """Shuffle and split a dataset into train / validation / test parts.

  The split follows TRAIN_SIZE / VAL_SIZE / TEST_SIZE, interpreted as
  fractions of the full dataset. Both splits shuffle with seed 42.

  Args:
    dataset: A ``tf.data.Dataset`` whose elements are ``(features, labels)``
      with ``features`` indexable as (input_ids, attention_mask).

  Returns:
    tuple: ``(train, val, test)`` datasets, each mapped through
    ``split_input_labels`` so elements are ``(inputs_dict, labels)``.
  """
  train_val_dataset, test_dataset = tf.keras.utils.split_dataset(
      dataset, left_size=TRAIN_SIZE + VAL_SIZE, right_size=TEST_SIZE,
      shuffle=True, seed=42
  )

  # The second split only sees the train+val subset, so the train share must
  # be expressed relative to that subset. Re-using the absolute 0.6 / 0.2
  # fractions here would yield a 48/16/20 split and silently drop the rest.
  # Only left_size is given; the validation split is its complement, so no
  # sample is lost to rounding.
  train_share = TRAIN_SIZE / (TRAIN_SIZE + VAL_SIZE)
  train_dataset, val_dataset = tf.keras.utils.split_dataset(
      train_val_dataset, left_size=train_share, shuffle=True, seed=42
  )

  return (train_dataset.map(split_input_labels),
          val_dataset.map(split_input_labels),
          test_dataset.map(split_input_labels))

In [None]:
def dataset_preperation(text, labels, max_len):
  """Turn raw texts and labels into batched train / val / test datasets.

  Args:
    text: Object exposing ``.values.tolist()`` (e.g. a pandas Series) that
      holds the input strings.
    labels: Targets convertible to an int32 tensor.
    max_len: Sequence length forwarded to ``encode``.

  Returns:
    tuple: ``(train, val, test)`` datasets, each batched with BATCH_SIZE.
  """
  encoded_features = encode(text.values.tolist(), max_len)
  label_tensor = tf.constant(labels, dtype=tf.int32)
  full_dataset = tf.data.Dataset.from_tensor_slices((encoded_features, label_tensor))

  # split_dataset yields (train, val, test); batch each part in that order.
  return tuple(part.batch(BATCH_SIZE) for part in split_dataset(full_dataset))

## Clustering

In [None]:
# Load news.csv and pass its 'text' column through sentence_encode;
# train_clustering is the input later handed to get_clusters.
df_clustering = pd.read_csv('news.csv')
train_clustering = sentence_encode(df_clustering['text'])

## Subjectivity

In [None]:
# Subjectivity data: 'content' is the model input, 'is_subjective' the target.
df_subjectivity = pd.read_csv('/content/Clean Dataset/subjectivity-MPQA-News.csv')
features, labels = df_subjectivity['content'], df_subjectivity['is_subjective']
max_len_subjectivity = get_max_len(features)

In [None]:
# Tokenize, split and batch the subjectivity data.
(train_subjectivity,
 val_subjectivity,
 test_subjectivity) = dataset_preperation(features, labels, max_len_subjectivity)

## Bias

In [None]:
# Bias data: 'teks' is the model input, 'is_biased' the target.
df_bias = pd.read_csv('Clean Dataset/bias-neutrality.csv')
features, labels = df_bias['teks'], df_bias['is_biased']
max_len_bias = get_max_len(features)

In [None]:
# Tokenize, split and batch the bias data.
(train_bias,
 val_bias,
 test_bias) = dataset_preperation(features, labels, max_len_bias)

## Neutrality

In [None]:
# Neutrality data: 'teks' is the model input; the three is_* columns form the target.
df_neutrality = pd.read_csv('Clean Dataset/bias-neutrality.csv')
features, labels = df_neutrality['teks'], df_neutrality[['is_left', 'is_center', 'is_right']]
max_len_neutrality = get_max_len(features)

In [None]:
# Tokenize, split and batch the neutrality data.
(train_neutrality,
 val_neutrality,
 test_neutrality) = dataset_preperation(features, labels, max_len_neutrality)

# Modeling

## Clustering

In [None]:
def get_clusters(data, num_of_centroid=10):
  """Cluster vectors with NLTK k-means using cosine distance.

  Args:
    data: Vectors to cluster, passed straight to ``KMeansClusterer.cluster``.
    num_of_centroid: Number of means requested from the clusterer.

  Returns:
    The result of ``cluster(..., assign_clusters=True)``, i.e. the cluster
    assignment for the input vectors.
  """
  kmeans = KMeansClusterer(
      num_of_centroid,
      distance=cosine_distance,
      repeats=25,
      avoid_empty_clusters=True,
  )
  assignments = kmeans.cluster(data, assign_clusters=True)
  return assignments

In [None]:
# One clustering per candidate centroid count k = 1..10 (list index = k - 1).
clusters = [get_clusters(train_clustering, k) for k in range(1, 11)]

In [None]:
# The notebook-level `plt` alias is bound to the bare matplotlib package,
# which has no plot(); rebind it to pyplot before drawing.
import matplotlib.pyplot as plt

# Score each clustering against the embeddings that were actually clustered
# (train_clustering); the raw text column is not a numeric feature matrix.
# silhouette_score is only defined for 2 <= n_clusters <= n_samples - 1, so
# degenerate clusterings (e.g. k = 1) get the metric's lower bound of -1.
# That keeps the list aligned with `clusters` and guarantees such an entry
# is never picked as the best clustering.
silhouette = [
    silhouette_score(train_clustering, assignment, metric='cosine')
    if 1 < len(set(assignment)) < len(assignment) else -1.0
    for assignment in clusters
]
# x-axis: candidate centroid counts, starting at k = 1 like the clusters list.
plt.plot(range(1, len(clusters) + 1), silhouette)

In [None]:
# Pick the clustering with the highest silhouette score and store its labels on the frame.
best_idx = np.argmax(np.array(silhouette))
df_clustering['cluster'] = pd.Series(data=clusters[best_idx], index=df_clustering.index)

## Subjectivity

In [None]:
def create_subjectivity_model(max_len):
  """Build and compile the binary subjectivity classifier.

  The pretrained `bert_base` encoder (loaded from PyTorch weights) feeds its
  pooled output into a dense head ending in a single sigmoid unit. Nothing
  here freezes the encoder, so its weights stay trainable.

  Args:
    max_len: Length of the `input_ids` / `attention_mask` input sequences.

  Returns:
    A ``keras.Model`` compiled with binary cross-entropy, the 'adam'
    optimizer and an accuracy metric.
  """
  token_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
  token_mask = layers.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

  encoder = TFBertModel.from_pretrained(bert_base, from_pt=True)
  encoded = encoder(token_ids, token_mask)

  # Dense head applied to the 768-wide pooled output.
  head = keras.Sequential([
      layers.Dense(256, input_shape=(768,), activation="relu"),
      layers.Dense(32, activation="relu"),
      layers.Dense(32, activation="relu"),
      layers.Dense(1, activation='sigmoid'),
  ])
  probability = head(encoded.pooler_output)

  model = keras.Model(inputs=[token_ids, token_mask], outputs=probability)
  model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
  return model

In [None]:
# Build the subjectivity classifier for the padded length max_len_subjectivity and print its layer summary.
subjectivity_model = create_subjectivity_model(max_len_subjectivity)
subjectivity_model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predict

Model: "model_7"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 2048)]               0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 2048)]               0         []                            
 )                                                                                                
                                                                                                  
 tf_bert_model_7 (TFBertMod  TFBaseModelOutputWithPooli   1105582   ['input_ids[0][0]',           
 el)                         ngAndCrossAttentions(last_   08         'attention_mask[0][0]']      
                             hidden_state=(None, 2048,                                      

In [None]:
# Train for 30 epochs, evaluating on the validation split after each epoch.
subjectivity_model.fit(
    train_subjectivity,
    validation_data=val_subjectivity,
    epochs=30,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30

In [None]:
# Save the trained model to a .keras file, then hand it to the google.colab download helper.
subjectivity_model.save('subjectivity_model.keras')
files.download('subjectivity_model.keras')

## Bias

In [None]:
def create_bias_model(max_len):
  """Build and compile the binary bias classifier.

  The pretrained `bert_base` encoder (loaded from PyTorch weights) feeds its
  pooled output into a dense head ending in a single sigmoid unit. Nothing
  here freezes the encoder, so its weights stay trainable.

  Args:
    max_len: Length of the `input_ids` / `attention_mask` input sequences.

  Returns:
    A ``keras.Model`` compiled with binary cross-entropy, the 'adam'
    optimizer and an accuracy metric.
  """
  ids_in = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
  mask_in = layers.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

  bert_encoder = TFBertModel.from_pretrained(bert_base, from_pt=True)
  bert_outputs = bert_encoder(ids_in, mask_in)

  # Dense head applied to the 768-wide pooled output.
  dense_head = keras.Sequential([
      layers.Dense(256, input_shape=(768,), activation="relu"),
      layers.Dense(32, activation="relu"),
      layers.Dense(32, activation="relu"),
      layers.Dense(1, activation='sigmoid'),
  ])
  bias_probability = dense_head(bert_outputs.pooler_output)

  model = keras.Model(inputs=[ids_in, mask_in], outputs=bias_probability)
  model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
  return model

In [None]:
# Build the bias classifier for the padded length max_len_bias and print its layer summary.
bias_model = create_bias_model(max_len_bias)
bias_model.summary()

pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predict

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 128)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 128)]                0         []                            
 )                                                                                                
                                                                                                  
 tf_bert_model (TFBertModel  TFBaseModelOutputWithPooli   1105582   ['input_ids[0][0]',           
 )                           ngAndCrossAttentions(last_   08         'attention_mask[0][0]']      
                             hidden_state=(None, 128, 7                                       

In [None]:
# Train for 30 epochs, evaluating on the validation split after each epoch.
bias_model.fit(
    train_bias,
    validation_data=val_bias,
    epochs=30,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30


In [None]:
# Save the trained model to a .keras file, then hand it to the google.colab download helper.
bias_model.save('bias_model.keras')
files.download('bias_model.keras')

## Neutrality

In [None]:
def create_neutrality_model(max_len):
  """Build and compile the 3-class neutrality classifier on a frozen BERT.

  The pretrained `bert_base` encoder (loaded from PyTorch weights) is frozen
  and its pooled output feeds a small dense head ending in a 3-way softmax.

  Args:
    max_len: Length of the `input_ids` / `attention_mask` input sequences.

  Returns:
    A ``keras.Model`` compiled with categorical cross-entropy, the 'adam'
    optimizer and an accuracy metric.
  """
  input_ids = layers.Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
  attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

  # Freeze the encoder on the layer object itself, before the model is built
  # and compiled. Setting `trainable` on the value returned by *calling* the
  # layer only attaches an attribute to that output container and leaves
  # every BERT weight trainable.
  bert_layer = TFBertModel.from_pretrained(bert_base, from_pt=True)
  bert_layer.trainable = False
  bert = bert_layer(input_ids, attention_mask)

  classifier = keras.Sequential([
        layers.Dense(32, input_shape=(768,), activation="relu"),
        layers.Dense(32, activation="relu"),
        layers.Dense(3, activation='softmax')
  ])(bert.pooler_output)

  model = keras.Model(inputs=[input_ids, attention_mask],
                     outputs=classifier)

  model.compile(loss='categorical_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

  return model

In [None]:
# Build the neutrality classifier for the padded length max_len_neutrality and print its layer summary.
neutrality_model = create_neutrality_model(max_len_neutrality)
neutrality_model.summary()

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predict

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_ids (InputLayer)      [(None, 128)]                0         []                            
                                                                                                  
 attention_mask (InputLayer  [(None, 128)]                0         []                            
 )                                                                                                
                                                                                                  
 tf_bert_model_1 (TFBertMod  TFBaseModelOutputWithPooli   1105582   ['input_ids[0][0]',           
 el)                         ngAndCrossAttentions(last_   08         'attention_mask[0][0]']      
                             hidden_state=(None, 128, 7                                     

In [None]:
# Train for 30 epochs, evaluating on the validation split after each epoch.
neutrality_model.fit(
    train_neutrality,
    validation_data=val_neutrality,
    epochs=30,
)