In [1]:
import tensorflow as tf
from tensorflow.keras import optimizers, losses
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification
from tensorflow.keras.callbacks import EarlyStopping

2023-12-01 17:16:59.822641: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-12-01 17:16:59.871051: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-01 17:17:00.137906: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-01 17:17:00.137951: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-01 17:17:00.185205: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to

In [38]:
# Configuration for the ideology model (IM_ prefix); these values are used as the
# default arguments of the helper functions defined further down in this notebook.
IM_MODEL_NAME = "distilbert-base-uncased"   # checkpoint name passed to from_pretrained for tokenizer and model
IM_BATCH_SIZE = 2   # batch size applied when the train / validation / test datasets are batched
IM_LEARNING_RATE = 3e-5   # learning rate handed to the Adam optimizer
IM_TOKEN_MAX_LEN = 50   ### currently set at 50 to speed up basic model training
IM_TEST_SPLIT = 0.2   # fraction of the whole dataset held out as the test set
IM_VALIDATION_SPLIT = 0.3   ### refers to the split within the training data (not the whole dataset)
IM_EPOCHS = 5   ### currently set to 5 to speed up basic model training
IM_PATIENCE = 2   ### currently set to 2 due to the low number of epochs (5)

In [2]:
import pandas as pd

In [12]:
# Load the cleaned articles; index_col=0 uses the first CSV column as the DataFrame index.
# NOTE(review): this is an absolute, machine-specific path, so the cell only runs where this
# exact file exists — consider a configurable data directory.
df = pd.read_csv('/home/connor/code/zulu-tango/news_and_echo_bubbles/raw_data/cleaned.csv', index_col=0)

In [13]:
# drop irrelevant columns and take a sample of 50 to test
# random_state pins the sample so that re-running the notebook selects the same 50 rows;
# the score lists attached in later cells are assigned to these rows by position, so an
# unseeded sample would pair them with different articles on every run.
df = df.drop(columns = ["link", "pdate", "title", "author", "text", "keywords", "tags", "compound",
                        "neg", "neu", "pos", "polarity", "subjectivity", "time", "urls"]).sample(50, random_state = 42)

In [14]:
df.head()

Unnamed: 0,classifier,pre_process_text
245,0,a trump statue has caught on with china’s onli...
2219,1,sen ben sasse on capitol hill in washington ...
2347,1,rising domestic supply and great efficiency ga...
2117,1,republican south dakota gov kristi noem on sa...
2418,1,president biden’s build back better campaign...


In [15]:
# 50 continuous bias scores, one per sampled row; they are attached to df positionally
# as the "continuous_bias_score" column in a later cell.
# NOTE(review): the values do not appear to be derived from the data — presumably
# placeholders for testing the pipeline; confirm before drawing conclusions from them.
continuous_list = [-1.89, -1.02, 1.6, 3.65, 2.8, -2.47, 1.39, 2.78, -3.65, -3.4, -3.48, 2.44, -4.07, -4.28, 4.71, 3.03, -0.68, 3.05, 0.06,\
                   4.86, 0.31, 2.18, -0.31, -3.47, -1.99, 1.81, -0.91, -1.17, 3.34, 1.41, -2.23, -3.17, 2.58, -3.13, -1.23, 0.56, 4.48, 3.48,\
                   -2.48, 3.37, -4.81, -1.84, -0.42, 4.94, 4.14, -1.59, 0.16, -4.29, -4.24, -4.23]

In [16]:
len(continuous_list)

50

In [17]:
# 50 discrete bias scores in the range -2..2, one per sampled row; they are attached to df
# positionally as the "discrete_bias_score" column in a later cell.
# NOTE(review): these do not line up with continuous_list (e.g. 3.65 is paired with 0), so
# they are presumably independent placeholders — confirm.
discrete_list = [-1, -2, 2, 0, 2, 0, 1, 0, 0, 2, 2, 1, 2, 0, -1, 2, -2, -2, -1, -2, 2, 1, 2, 1, 1, -1, -2, 2, 1, -2, -2, 2, -1, 0, 0,\
                 -1, -1, -2, 0, 1, -1, 1, 2, -1, 0, 1, 2, 2, 0, 0]

In [18]:
len(discrete_list)

50

In [19]:
# Attach the two score lists as new columns. Assignment from a plain list is positional,
# so each list must have exactly as many entries as df has rows (50 after sampling).
df["continuous_bias_score"] = continuous_list
df["discrete_bias_score"] = discrete_list
df.head()

Unnamed: 0,classifier,pre_process_text,continuous_bias_score,discrete_bias_score
245,0,a trump statue has caught on with china’s onli...,-1.89,-1
2219,1,sen ben sasse on capitol hill in washington ...,-1.02,-2
2347,1,rising domestic supply and great efficiency ga...,1.6,2
2117,1,republican south dakota gov kristi noem on sa...,3.65,0
2418,1,president biden’s build back better campaign...,2.8,2


In [20]:
### TO DO: MAKE THE MODEL WORK USING THE "CONTINUOUS_BIAS_SCORE" AND "DISCRETE_BIAS_SCORE" COLUMNS
### THE OUTPUT SHOULD STILL BE A PROBABILITY FROM 0 TO 1

In [21]:
from tensorflow.keras.utils import to_categorical

In [22]:
# Maps each discrete bias score (-2..2) to a 5-element one-hot list:
# position 0 stands for score -2 and position 4 for score +2.
one_hot_dict = {-2:[1,0,0,0,0],
                -1:[0,1,0,0,0],
                0:[0,0,1,0,0],
                1:[0,0,0,1,0],
                2:[0,0,0,0,1]}

In [23]:
df["one_hot_discrete"] = df["discrete_bias_score"].map(one_hot_dict)

In [24]:
df.head()

Unnamed: 0,classifier,pre_process_text,continuous_bias_score,discrete_bias_score,one_hot_discrete
245,0,a trump statue has caught on with china’s onli...,-1.89,-1,"[0, 1, 0, 0, 0]"
2219,1,sen ben sasse on capitol hill in washington ...,-1.02,-2,"[1, 0, 0, 0, 0]"
2347,1,rising domestic supply and great efficiency ga...,1.6,2,"[0, 0, 0, 0, 1]"
2117,1,republican south dakota gov kristi noem on sa...,3.65,0,"[0, 0, 1, 0, 0]"
2418,1,president biden’s build back better campaign...,2.8,2,"[0, 0, 0, 0, 1]"


In [None]:
def NEW_get_X_and_y(df):
    """
    Pull the model inputs and the one-hot targets out of our dataset.

    Returns a tuple (X, y) of plain Python lists, as needed for use in our model:
    X holds the values of the "pre_process_text" column (the pre-processed text) and
    y holds the values of the "one_hot_discrete" column (the one-hot encoded bias class).
    """

    features, targets = (df[column].tolist()
                         for column in ("pre_process_text", "one_hot_discrete"))

    return features, targets

In [None]:
X, y = NEW_get_X_and_y(df)

In [None]:
tokenizer = instantiate_tokenizer()

In [None]:
tokens = text_tokenizer(X, tokenizer)

In [None]:
tfdataset = tf_dataset_constructor(tokens, y)

In [None]:
tfdataset_train, tfdataset_val, tfdataset_test = train_test_split(X, tfdataset)

In [None]:
def NEW_ideology_model(tfdataset_train,
                   tfdataset_val,
                   model_name = IM_MODEL_NAME,
                   learning_rate = IM_LEARNING_RATE,
                   batch_size = IM_BATCH_SIZE,
                   epochs = IM_EPOCHS,
                   patience = IM_PATIENCE):

    """
    Set up and run a 5-class DistilBert model on our TensorFlow training dataset.

    tfdataset_train / tfdataset_val are expected to be already-batched datasets whose
    targets are 5-element one-hot vectors. model_name is the pretrained checkpoint,
    learning_rate is given to Adam, epochs caps the number of training epochs and
    patience is the EarlyStopping patience (best weights are restored).

    batch_size is kept for backward compatibility but is not forwarded to fit():
    the datasets are batched before they reach this function, and Keras asks callers
    not to specify batch_size for dataset inputs.

    Returns the fitted model.
    """

    num_labels = 5  ################

    # set up model
    model = TFDistilBertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    # define loss function: the targets are mutually exclusive one-hot classes and the
    # predictions are later turned into probabilities with a softmax, so the matching
    # objective is categorical (not binary) cross-entropy on the logits
    loss = losses.CategoricalCrossentropy(from_logits=True)

    # define optimizer to be used to minimise loss
    optimizer = optimizers.Adam(learning_rate)

    # compile model
    model.compile(optimizer = optimizer,
                  loss = loss,
                  metrics = ["accuracy"])

    # fit model (no batch_size here: the datasets are already batched)
    model.fit(tfdataset_train,
              epochs = epochs,
              validation_data = tfdataset_val,
              callbacks = [EarlyStopping(patience = patience, restore_best_weights = True)])

    return model

In [None]:
model = NEW_ideology_model(tfdataset_train,
                   tfdataset_val)

In [None]:
accuracy = ideology_model_evaluator(model, tfdataset_test)

In [None]:
pred_probas = ideology_model_predictor(model,tokens)

In [None]:
pred_probas

In [None]:
import numpy as np

In [None]:
# class index -> label for the 5 bias classes (index 0 = "left" ... index 4 = "right");
# built once instead of being rebuilt on every loop iteration
conversion_dict = {0 : "left",
                   1 : "leans left",
                   2 : "centre",
                   3 : "leans right",
                   4 : "right"}

# label every article with the class that has the highest predicted probability
top_class_list = [conversion_dict[np.argmax(probas)] for probas in pred_probas]

print(top_class_list)



## Testing the full n-class pipeline end to end

In [25]:
df = df.drop(columns = ["continuous_bias_score", "one_hot_discrete"])

In [26]:
df["5_step_classifier"] = df["discrete_bias_score"]

In [27]:
df.head()

Unnamed: 0,classifier,pre_process_text,discrete_bias_score,5_step_classifier
245,0,a trump statue has caught on with china’s onli...,-1,-1
2219,1,sen ben sasse on capitol hill in washington ...,-2,-2
2347,1,rising domestic supply and great efficiency ga...,2,2
2117,1,republican south dakota gov kristi noem on sa...,0,0
2418,1,president biden’s build back better campaign...,2,2


In [28]:
import numpy as np

In [29]:
# presumably the number of bias classes for the full test run
# NOTE(review): the later call full_n_class_ideology_model(df,5) passes a literal 5
# rather than this variable — confirm whether n is still needed.
n = 5

# One-hot encodings keyed by discrete bias score, 3-class version:
# position 0 stands for score -1, position 1 for 0 and position 2 for +1.
three_hot_dict = {-1 : [1,0,0],
                  0 : [0,1,0],
                  1 : [0,0,1]}

# 5-class version: position 0 stands for score -2 and position 4 for score +2.
five_hot_dict = {-2 : [1,0,0,0,0],
                 -1 : [0,1,0,0,0],
                 0 : [0,0,1,0,0],
                 1 : [0,0,0,1,0],
                 2 : [0,0,0,0,1]}

In [39]:
def bias_score_encoding(df, n):
    """
    Add a "one_hot_discrete" target column built from the n-step bias score column.

    The scores are read from the column named "<n>_step_classifier" (for example
    "5_step_classifier") and must be the integers -(n // 2) ... +(n // 2). Each score is
    replaced by an n-element one-hot list in which position 0 stands for the most
    negative score and position n - 1 for the most positive one. For n = 3 and n = 5
    this yields the same encodings as the three_hot_dict / five_hot_dict lookups.

    df is modified in place and also returned. n must be an odd integer >= 3.

    Raises ValueError if n is not an odd integer >= 3, or if the score column holds
    values outside the expected range (which would otherwise become silent NaN labels).
    Raises KeyError if the "<n>_step_classifier" column is missing.
    """

    if n != int(n) or n < 3 or n % 2 == 0:
        raise ValueError(f"n must be an odd integer >= 3 (e.g. 3 or 5), got {n!r}")
    n = int(n)

    # scores are centred on zero, so shifting by half_range gives the one-hot position
    half_range = n // 2
    one_hot_by_score = {score : [int(position == score + half_range) for position in range(n)]
                        for score in range(-half_range, half_range + 1)}

    scores = df[f"{n}_step_classifier"]
    encoded = scores.map(one_hot_by_score)

    # Series.map leaves NaN for any score that is not a key of the lookup
    unmapped = encoded.isna()
    if unmapped.any():
        raise ValueError(f"{n}_step_classifier contains scores outside "
                         f"{-half_range}..{half_range}: {scores[unmapped].unique().tolist()}")

    df["one_hot_discrete"] = encoded

    return df

In [40]:
def n_class_get_X_and_y(df):
    """
    Pull the model inputs and the one-hot targets out of our dataset.

    Returns a tuple (X, y) of plain Python lists: X holds the values of the
    "pre_process_text" column and y holds the values of the "one_hot_discrete" column.
    """

    features, targets = (df[column].tolist()
                         for column in ("pre_process_text", "one_hot_discrete"))

    return features, targets

In [41]:
def instantiate_tokenizer(model_name = IM_MODEL_NAME):
    """
    Load the DistilBert tokenizer that belongs to the pretrained checkpoint `model_name`
    and return it for use in our modelling.
    """

    return DistilBertTokenizer.from_pretrained(model_name)

In [42]:
def text_tokenizer(X,
                   tokenizer,
                   max_len = IM_TOKEN_MAX_LEN,
                   truncation = True,
                   padding = "max_length"):
    """
    Tokenize the texts in X by calling `tokenizer` on them.

    max_len is forwarded as the tokenizer's max_length; truncation and padding are
    forwarded unchanged. The tokenizer's result is returned as-is; per the notebook's
    original notes it provides the "input_ids" and "attention_mask" entries required
    as input to the DistilBert model.
    """

    tokenizer_options = {"max_length" : max_len,
                         "truncation" : truncation,
                         "padding" : padding}

    return tokenizer(X, **tokenizer_options)

In [43]:
def tf_dataset_constructor(tokens,
                           y):
    """
    Build a TensorFlow dataset of (features, target) pairs for the DistilBert model.

    `tokens` is the tokenized input from the text_tokenizer function; it is converted
    to a plain dict and sliced together with the targets `y`, one element per example.
    """

    features = dict(tokens)

    return tf.data.Dataset.from_tensor_slices((features, y))

In [44]:
def train_test_split(X,
                     tfdataset,
                     test_split = IM_TEST_SPLIT,
                     val_split = IM_VALIDATION_SPLIT,
                     batch_size = IM_BATCH_SIZE,
                     seed = None):
    """
    This function splits the TensorFlow object created in the tf_dataset_constructor function
    into disjoint train, validation and test sets, and batches each of them.

    X is only used for its length (the number of examples in tfdataset). test_split is the
    fraction of all examples held out for testing; val_split is the fraction of the remaining
    (non-test) examples that is held out for validation. seed optionally makes the shuffle
    reproducible.

    Returns (tfdataset_train, tfdataset_val, tfdataset_test), each batched with batch_size.
    """

    # get the sizes of the non-test portion, the validation set and the actual training set
    n_examples = len(X)
    train_size = int(n_examples * (1-test_split))
    val_size = int(train_size * val_split)
    fit_size = train_size - val_size

    # shuffle the full dataset ONCE: with the default reshuffle_each_iteration=True the order
    # would change every time a derived dataset is iterated, so the take/skip subsets below
    # would contain different examples on every epoch and test examples would leak into training
    tfdataset = tfdataset.shuffle(n_examples, seed = seed, reshuffle_each_iteration = False)

    # from the full dataset, get out non-overlapping train, validation and test sets
    # (the validation examples are carved out of the non-test portion, not shared with train)
    tfdataset_train = tfdataset.take(fit_size)
    tfdataset_val = tfdataset.skip(fit_size).take(val_size)
    tfdataset_test = tfdataset.skip(train_size)

    # still present the training examples in a fresh order on every epoch
    if fit_size > 0:
        tfdataset_train = tfdataset_train.shuffle(fit_size)

    # batch the train, validation and test sets
    tfdataset_train = tfdataset_train.batch(batch_size)
    tfdataset_val = tfdataset_val.batch(batch_size)
    tfdataset_test = tfdataset_test.batch(batch_size)

    return tfdataset_train, tfdataset_val, tfdataset_test

In [45]:
def n_class_ideology_model(tfdataset_train,
                           tfdataset_val,
                           n,
                           model_name = IM_MODEL_NAME,
                           learning_rate = IM_LEARNING_RATE,
                           batch_size = IM_BATCH_SIZE,
                           epochs = IM_EPOCHS,
                           patience = IM_PATIENCE):

    """
    Set up and run an n-class DistilBert model on our TensorFlow training dataset.

    tfdataset_train / tfdataset_val are expected to be already-batched datasets whose
    targets are n-element one-hot vectors; n is passed to the model as num_labels.
    model_name is the pretrained checkpoint, learning_rate is given to Adam, epochs caps
    the number of training epochs and patience is the EarlyStopping patience (best
    weights are restored).

    batch_size is kept for backward compatibility but is not forwarded to fit():
    the datasets are batched before they reach this function, and Keras asks callers
    not to specify batch_size for dataset inputs.

    Returns the fitted model.
    """

    # set up model
    model = TFDistilBertForSequenceClassification.from_pretrained(model_name, num_labels = n)

    # define loss function: the targets are mutually exclusive one-hot classes and the
    # predictions are later turned into probabilities with a softmax, so the matching
    # objective is categorical (not binary) cross-entropy on the logits
    loss = losses.CategoricalCrossentropy(from_logits=True)

    # define optimizer to be used to minimise loss
    optimizer = optimizers.Adam(learning_rate)

    # compile model
    model.compile(optimizer = optimizer,
                  loss = loss,
                  metrics = ["accuracy"])

    # fit model (no batch_size here: the datasets are already batched)
    model.fit(tfdataset_train,
              epochs = epochs,
              validation_data = tfdataset_val,
              callbacks = [EarlyStopping(patience = patience, restore_best_weights = True)])

    return model

In [46]:
def ideology_model_predictor(model,
                             tokens,
                             batch_size = IM_BATCH_SIZE):
    """
    This function uses a fitted model to output, for each individual article, the
    predicted probability of every class. As the model spits out logits rather than
    probabilities, these are converted into probabilities with a softmax over the
    class axis.

    tokens is the tokenized text from the text_tokenizer function. batch_size controls
    how many articles are fed to the model at once; the prediction dataset is batched
    so that the model receives inputs with the same batch dimension it was trained on.

    Returns a numpy array with one row per article and one column per class.
    """

    # firstly create a batched TensorFlow version of our tokenized dataset without our y
    tfdataset_no_y = tf.data.Dataset.from_tensor_slices(dict(tokens)).batch(batch_size)

    # use this to get out the logits for our model (first entry of the prediction output)
    pred_logits = model.predict(tfdataset_no_y)[0]

    # convert these into probabilities
    pred_probas = tf.nn.softmax(pred_logits).numpy()

    return pred_probas

In [47]:
def top_class(pred_probas):
    """
    Convert per-class probabilities into human-readable ideology labels.

    pred_probas is a 2-D array-like with one row per article and one column per class.
    Rows must have 5 entries (left, leans left, centre, leans right, right) or
    3 entries (left, centre, right), matching the 5-step and 3-step one-hot encodings
    in which index 0 is the most negative bias score.

    Returns a list with the label of the highest-scoring class of each row, in row
    order; an empty input gives an empty list.

    Raises ValueError if the rows do not have 3 or 5 entries, instead of silently
    applying the 5-class labels to a different number of classes.
    """

    # class index -> label, ordered from the most negative to the most positive score
    labels_by_class_count = {3 : ("left", "centre", "right"),
                             5 : ("left", "leans left", "centre", "leans right", "right")}

    probas = np.asarray(pred_probas)

    # nothing to label
    if probas.size == 0:
        return []

    if probas.ndim != 2 or probas.shape[1] not in labels_by_class_count:
        raise ValueError("pred_probas must have shape (n_articles, 3) or (n_articles, 5), "
                         f"got {probas.shape}")

    labels = labels_by_class_count[probas.shape[1]]

    return [labels[index] for index in probas.argmax(axis = 1)]

In [48]:
def full_n_class_ideology_model(df, n):
    """
    Run the n-class pipeline end to end: encode the targets, tokenize the text, build and
    split the TensorFlow dataset, train the model and predict a class for every row of df.

    The model accuracy is not evaluated here, so the test split is produced but unused.
    Predictions are made on all rows of df, including the rows the model was trained on.

    Returns df with the added 'pred_class' column (the target column written by
    bias_score_encoding is present as well).
    """

    # one-hot targets for the requested number of classes
    df = bias_score_encoding(df, n)
    X, y = n_class_get_X_and_y(df)

    # tokenized text, paired with the targets in a TensorFlow dataset
    tokens = text_tokenizer(X,
                            instantiate_tokenizer(model_name = IM_MODEL_NAME),
                            max_len = IM_TOKEN_MAX_LEN,
                            truncation = True,
                            padding = "max_length")
    tfdataset = tf_dataset_constructor(tokens, y)

    # the third returned dataset is the test set, which is deliberately ignored here
    tfdataset_train, tfdataset_val, _ = train_test_split(X,
                                                         tfdataset,
                                                         test_split = IM_TEST_SPLIT,
                                                         val_split = IM_VALIDATION_SPLIT,
                                                         batch_size = IM_BATCH_SIZE)

    model = n_class_ideology_model(tfdataset_train,
                                   tfdataset_val,
                                   n,
                                   model_name = IM_MODEL_NAME,
                                   learning_rate = IM_LEARNING_RATE,
                                   batch_size = IM_BATCH_SIZE,
                                   epochs = IM_EPOCHS,
                                   patience = IM_PATIENCE)

    # label every article with its most probable class
    df['pred_class'] = top_class(ideology_model_predictor(model, tokens))

    return df

In [49]:
output_df = full_n_class_ideology_model(df,5)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [50]:
output_df

Unnamed: 0,classifier,pre_process_text,discrete_bias_score,5_step_classifier,one_hot_discrete,pred_class
245,0,a trump statue has caught on with china’s onli...,-1,-1,"[0, 1, 0, 0, 0]",leans left
2219,1,sen ben sasse on capitol hill in washington ...,-2,-2,"[1, 0, 0, 0, 0]",centre
2347,1,rising domestic supply and great efficiency ga...,2,2,"[0, 0, 0, 0, 1]",right
2117,1,republican south dakota gov kristi noem on sa...,0,0,"[0, 0, 1, 0, 0]",centre
2418,1,president biden’s build back better campaign...,2,2,"[0, 0, 0, 0, 1]",right
1955,1,happy anniversary it s one year since america...,0,0,"[0, 0, 1, 0, 0]",centre
1196,0,let our journalists help you make sense of the...,1,1,"[0, 0, 0, 1, 0]",leans right
393,0,during the trump years there was an excessive...,0,0,"[0, 0, 1, 0, 0]",centre
2602,1,when it comes time to retire and enjoy those g...,0,0,"[0, 0, 1, 0, 0]",centre
272,0,crises have a way of sorting the good presiden...,2,2,"[0, 0, 0, 0, 1]",right


## The below is a working model based on binary classification only

In [None]:
# Configuration for the binary ideology model (IM_ prefix); these values are used as the
# default arguments of the helper functions defined in this section.
# NOTE(review): this cell repeats the constants cell near the top of the notebook.
IM_MODEL_NAME = "distilbert-base-uncased"   # checkpoint name passed to from_pretrained for tokenizer and model
IM_BATCH_SIZE = 2   # batch size applied when the train / validation / test datasets are batched
IM_LEARNING_RATE = 3e-5   # learning rate handed to the Adam optimizer
IM_TOKEN_MAX_LEN = 50   ### currently set at 50 to speed up basic model training
IM_TEST_SPLIT = 0.2   # fraction of the whole dataset held out as the test set
IM_VALIDATION_SPLIT = 0.3   ### refers to the split within the training data (not the whole dataset)
IM_EPOCHS = 5   ### currently set to 5 to speed up basic model training
IM_PATIENCE = 2   ### currently set to 2 due to the low number of epochs (5)

In [None]:
def get_X_and_y(df):
    """
    Pull the feature and the target out of our dataset as plain Python lists.

    Returns a tuple (X, y): X holds the values of the "pre_process_text" column
    (the pre-processed text) and y holds the values of the "classifier" column
    (the ideology: left wing = 0 / right wing = 1).
    """

    features, targets = (df[column].tolist()
                         for column in ("pre_process_text", "classifier"))

    return features, targets

In [None]:
def instantiate_tokenizer(model_name = IM_MODEL_NAME):
    """
    Load the DistilBert tokenizer that belongs to the pretrained checkpoint `model_name`
    and return it for use in our modelling.
    """

    return DistilBertTokenizer.from_pretrained(model_name)

In [None]:
def text_tokenizer(X,
                   tokenizer,
                   max_len = IM_TOKEN_MAX_LEN,
                   truncation = True,
                   padding = "max_length"):
    """
    Tokenize the texts in X by calling `tokenizer` on them.

    max_len is forwarded as the tokenizer's max_length; truncation and padding are
    forwarded unchanged. The tokenizer's result is returned as-is; per the notebook's
    original notes it provides the "input_ids" and "attention_mask" entries required
    as input to the DistilBert model.
    """

    tokenizer_options = {"max_length" : max_len,
                         "truncation" : truncation,
                         "padding" : padding}

    return tokenizer(X, **tokenizer_options)

In [None]:
def tf_dataset_constructor(tokens,
                           y):
    """
    Build a TensorFlow dataset of (features, target) pairs for the DistilBert model.

    `tokens` is the tokenized input from the text_tokenizer function; it is converted
    to a plain dict and sliced together with the targets `y`, one element per example.
    """

    features = dict(tokens)

    return tf.data.Dataset.from_tensor_slices((features, y))


In [None]:
def train_test_split(X,
                     tfdataset,
                     test_split = IM_TEST_SPLIT,
                     val_split = IM_VALIDATION_SPLIT,
                     batch_size = IM_BATCH_SIZE,
                     seed = None):
    """
    This function splits the TensorFlow object created in the tf_dataset_constructor function
    into disjoint train, validation and test sets, and batches each of them.

    X is only used for its length (the number of examples in tfdataset). test_split is the
    fraction of all examples held out for testing; val_split is the fraction of the remaining
    (non-test) examples that is held out for validation. seed optionally makes the shuffle
    reproducible.

    Returns (tfdataset_train, tfdataset_val, tfdataset_test), each batched with batch_size.
    """

    # get the sizes of the non-test portion, the validation set and the actual training set
    n_examples = len(X)
    train_size = int(n_examples * (1-test_split))
    val_size = int(train_size * val_split)
    fit_size = train_size - val_size

    # shuffle the full dataset ONCE: with the default reshuffle_each_iteration=True the order
    # would change every time a derived dataset is iterated, so the take/skip subsets below
    # would contain different examples on every epoch and test examples would leak into training
    tfdataset = tfdataset.shuffle(n_examples, seed = seed, reshuffle_each_iteration = False)

    # from the full dataset, get out non-overlapping train, validation and test sets
    # (the validation examples are carved out of the non-test portion, not shared with train)
    tfdataset_train = tfdataset.take(fit_size)
    tfdataset_val = tfdataset.skip(fit_size).take(val_size)
    tfdataset_test = tfdataset.skip(train_size)

    # still present the training examples in a fresh order on every epoch
    if fit_size > 0:
        tfdataset_train = tfdataset_train.shuffle(fit_size)

    # batch the train, validation and test sets
    tfdataset_train = tfdataset_train.batch(batch_size)
    tfdataset_val = tfdataset_val.batch(batch_size)
    tfdataset_test = tfdataset_test.batch(batch_size)

    return tfdataset_train, tfdataset_val, tfdataset_test

In [None]:
def ideology_model(tfdataset_train,
                   tfdataset_val,
                   model_name = IM_MODEL_NAME,
                   learning_rate = IM_LEARNING_RATE,
                   batch_size = IM_BATCH_SIZE,
                   epochs = IM_EPOCHS,
                   patience = IM_PATIENCE):

    """
    Set up and run a DistilBert model on our TensorFlow training dataset.

    tfdataset_train / tfdataset_val are expected to be already-batched datasets with
    integer class targets (sparse categorical cross-entropy is used on the logits).
    model_name is the pretrained checkpoint, learning_rate is given to Adam, epochs caps
    the number of training epochs and patience is the EarlyStopping patience (best
    weights are restored).

    batch_size is kept for backward compatibility but is not forwarded to fit():
    the datasets are batched before they reach this function, and Keras asks callers
    not to specify batch_size for dataset inputs.

    Returns the fitted model.
    """

    # set up model (no num_labels given, so the checkpoint / library default is used —
    # presumably two labels; confirm)
    model = TFDistilBertForSequenceClassification.from_pretrained(model_name)

    # define loss function
    loss = losses.SparseCategoricalCrossentropy(from_logits=True)

    # define optimizer to be used to minimise loss
    optimizer = optimizers.Adam(learning_rate)

    # compile model
    model.compile(optimizer = optimizer,
                  loss = loss,
                  metrics = ["accuracy"])

    # fit model (no batch_size here: the datasets are already batched)
    model.fit(tfdataset_train,
              epochs = epochs,
              validation_data = tfdataset_val,
              callbacks = [EarlyStopping(patience = patience, restore_best_weights = True)])

    return model

In [None]:
def ideology_model_evaluator(model,
                             tfdataset_test,
                             batch_size = IM_BATCH_SIZE):
    """
    Evaluate our model on the TensorFlow test dataset and return its accuracy.

    tfdataset_test is expected to be an already-batched dataset, so batch_size is kept
    for backward compatibility only and is not forwarded to evaluate(): Keras asks
    callers not to specify batch_size for dataset inputs.

    Returns the value stored under "accuracy" in the evaluation results.
    """

    benchmarks = model.evaluate(tfdataset_test, return_dict = True)

    return benchmarks["accuracy"]

In [None]:
def ideology_model_predictor(model,
                             tokens,
                             batch_size = IM_BATCH_SIZE):
    """
    This function uses the model output from the ideology_model function to output the
    probabilities of each individual article being left or right wing (0 = left wing,
    1 = right wing). As the model spits out logits rather than probabilities, these
    are converted into probabilities with a softmax over the class axis.

    tokens is the tokenized text from the text_tokenizer function. batch_size controls
    how many articles are fed to the model at once; the prediction dataset is batched
    so that the model receives inputs with the same batch dimension it was trained on.

    Returns a numpy array with one row per article and one column per class.
    """

    # firstly create a batched TensorFlow version of our tokenized dataset without our y
    tfdataset_no_y = tf.data.Dataset.from_tensor_slices(dict(tokens)).batch(batch_size)

    # use this to get out the logits for our model (first entry of the prediction output)
    pred_logits = model.predict(tfdataset_no_y)[0]

    # convert these into probabilities
    pred_probas = tf.nn.softmax(pred_logits).numpy()

    return pred_probas

In [None]:
def full_ideology_model(df):
    """
    Run the binary ideology pipeline end to end: extract X and y, tokenize the text,
    build and split the TensorFlow dataset, train the model and predict on every row.

    The ideology_model_evaluator function is left out, as the accuracy output is not
    needed here; the test split is therefore produced but unused. Predictions are made
    on all rows of df, including the rows the model was trained on.

    Returns df with an added 'pred_probas' column.
    """

    X, y = get_X_and_y(df)

    # tokenized text, paired with the targets in a TensorFlow dataset
    tokens = text_tokenizer(X,
                            instantiate_tokenizer(model_name = IM_MODEL_NAME),
                            max_len = IM_TOKEN_MAX_LEN,
                            truncation = True,
                            padding = "max_length")
    tfdataset = tf_dataset_constructor(tokens, y)

    # the third returned dataset is the test set, which is deliberately ignored here
    tfdataset_train, tfdataset_val, _ = train_test_split(X,
                                                         tfdataset,
                                                         test_split = IM_TEST_SPLIT,
                                                         val_split = IM_VALIDATION_SPLIT,
                                                         batch_size = IM_BATCH_SIZE)

    model = ideology_model(tfdataset_train,
                           tfdataset_val,
                           model_name = IM_MODEL_NAME,
                           learning_rate = IM_LEARNING_RATE,
                           batch_size = IM_BATCH_SIZE,
                           epochs = IM_EPOCHS,
                           patience = IM_PATIENCE)

    # keep only the second column of the predicted probabilities: the probability of the
    # article being right-wing (near 1 = very right wing, near 0 = very left wing)
    right_wing_probas = ideology_model_predictor(model, tokens)[:,1]
    df['pred_probas'] = right_wing_probas

    return df

In [None]:
output_df = full_ideology_model(df)

In [None]:
output_df.head(10)