## Creating functions

### (0) import necessary modules / packages

In [24]:
!pip install tensorflow transformers



In [25]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras import activations, optimizers, losses
from transformers import DistilBertTokenizer, TFDistilBertForSequenceClassification

In [26]:
# Default number of examples per batch (used by the split, training and evaluation functions).
BATCH_SIZE = 2
# Default learning rate handed to the Adam optimizer in ideology_model.
LEARNING_RATE = 3e-5
# Default max_length given to the tokenizer in text_tokenizer.
TOKEN_MAX_LEN = 50
# Default fraction of the data that train_test_split puts into the test set.
TEST_SPLIT = 0.2

In [27]:
def get_X_and_y(df):
    """
    Pulls the model inputs and targets out of the pre-processed dataframe.
    Returns two plain Python lists: the texts (column "pre_process_text")
    and their labels (column "classifier").
    """
    texts, labels = (df[column].tolist() for column in ("pre_process_text", "classifier"))
    return texts, labels

### change "pre_process_text" to "preprocessed_text" or similar (in underlying pre-processed dataset)
### change "classifier" to "ideology" or similar (in underlying pre-processed dataset)
### these need to be converted to lists for use in later functions

### (1)	text_tokenizer
Tokenizes our pre-processed text.


In [28]:
# The tokenizer is defined once here, outside the functions, so that it can be changed later;
# text_tokenizer receives it as an argument rather than creating its own.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

In [29]:
def text_tokenizer(X,
                   tokenizer,
                   max_len = TOKEN_MAX_LEN,
                   truncation = True,
                   padding = "max_length"):
    """
    Runs the given tokenizer over the texts in X and hands back whatever it produces.
    For the DistilBert tokenizer this is a dictionary with 2 keys, "input_ids" and
    "attention_mask", which are the inputs the DistilBert model requires.
    """

    # collect the tokenizer settings in one place before making the call
    tokenizer_options = {"max_length": max_len,
                         "truncation": truncation,
                         "padding": padding}

    return tokenizer(X, **tokenizer_options)

### (2)	tf_dataset_constructor
Takes the tokenized output from (1) and converts it into TensorFlow objects for use in the DistilBert model


In [30]:
def tf_dataset_constructor(tokens,
                           y = None):
    """
    Wraps the tokenized input from the text_tokenizer function in a tf.data.Dataset for use in the DistilBert model.

    tokens: mapping of tokenizer outputs (e.g. "input_ids" and "attention_mask"); it is converted with dict().
    y:      optional labels, one per text. When given, the dataset is built from the pair (features, y),
            which is the form used for training and evaluation. When None, the dataset is built from the
            features alone, which is the case used when making predictions on unseen samples after training.

    Returns the tf.data.Dataset produced by Dataset.from_tensor_slices.
    """

    features = dict(tokens)

    # Compare against None explicitly: "if y:" raises on multi-element numpy arrays / pandas Series
    # (ambiguous truth value) and would silently drop the labels if y were an empty list.
    if y is None:
        return tf.data.Dataset.from_tensor_slices(features)

    return tf.data.Dataset.from_tensor_slices((features, y))
    

### (3) train_test_split
Splits our data into train and test sets for modelling purposes

In [31]:
def train_test_split(X,
                     tfdataset,
                     test_split = TEST_SPLIT,
                     batch_size = BATCH_SIZE):
    """
    This function splits the TensorFlow dataset created in the tf_dataset_constructor function into train and test sets.

    X:          the texts the dataset was built from; only len(X) is used.
    tfdataset:  the (not yet batched) dataset to split.
    test_split: fraction of the examples, between 0 and 1, that goes into the test set.
    batch_size: batch size applied to both resulting datasets.

    The split is positional: the first train_size elements become the train set and the rest the test set.
    Returns (tfdataset_train, tfdataset_test), both batched.

    Raises ValueError if test_split is not between 0 and 1.
    """

    if not 0 <= test_split <= 1:
        # Out-of-range values give a negative train_size, for which take() returns the whole dataset.
        raise ValueError(f"test_split must be between 0 and 1, got {test_split}")

    # Round away floating point noise before truncating: 10 * (1 - 0.8) evaluates to
    # 1.9999999999999996, which a bare int() would turn into 1 rather than 2.
    train_size = int(round(len(X) * (1 - test_split), 6))

    # NOTE: no shuffling is done here. Dataset.shuffle reshuffles on every iteration by default,
    # so shuffling before take/skip would move examples between the train and test sets from one
    # pass to the next. If shuffling is needed, shuffle the underlying data before building the dataset.
    tfdataset_train = tfdataset.take(train_size).batch(batch_size)
    tfdataset_test = tfdataset.skip(train_size).batch(batch_size)

    return tfdataset_train, tfdataset_test

###### WE ALSO NEED TO MAKE SURE THAT WE HAVE THE INDICES OF THE ITEMS IN THE TRAIN AND TEST DATASETS SO WE CAN MERGE THEIR PREDICTED PROBABILITIES
###### BACK INTO THE PRE-PROCESSED DATASET LATER!!!!

### (4)	ideology_model
This is the DistilBert model which we run on tfdataset_train to identify whether texts are left wing or right wing

In [32]:
def ideology_model(tfdataset_train,
                   model_name = "distilbert-base-uncased",
                   learning_rate = LEARNING_RATE,
                   batch_size = BATCH_SIZE,
                   epochs = 2):
    """
    Set up and run a DistilBert model on our TensorFlow training dataset.

    tfdataset_train: the training dataset (expected to be already batched, e.g. the first
                     output of train_test_split).
    model_name:      name of the pretrained checkpoint to load.
    learning_rate:   learning rate for the Adam optimizer.
    batch_size:      kept so existing calls keep working, but not used: the batch size is
                     determined by how the dataset itself was batched.
    epochs:          number of epochs to train for.

    Returns the fitted model.
    """

    # set up model
    model = TFDistilBertForSequenceClassification.from_pretrained(model_name)

    # define loss function (from_logits=True because the loss is computed on the model's raw logits)
    loss = losses.SparseCategoricalCrossentropy(from_logits=True)

    # define optimizer to be used to minimise loss
    optimizer = optimizers.Adam(learning_rate = learning_rate)

    # compile model (metrics is documented as a list)
    model.compile(optimizer = optimizer,
                  loss = loss,
                  metrics = ["accuracy"])

    # fit model; batch_size is deliberately not forwarded, since Keras documents that it
    # should not be specified when the input is a dataset (datasets generate their own batches)
    model.fit(tfdataset_train, epochs = epochs)

    return model

## see if can add callbacks (patience) and validation splits etc.


### (5)	ideology_model_evaluator
Takes the model output from the ideology_model function and evaluates it on tfdataset_test to see how well the model predicts whether unseen texts are left wing or right wing


In [33]:
def ideology_model_evaluator(model,
                             tfdataset_test,
                             batch_size = BATCH_SIZE):
    """
    Evaluate our model on the TensorFlow test dataset and return its accuracy.

    model:          the fitted model; it must have been compiled with the "accuracy" metric.
    tfdataset_test: the test dataset (expected to be already batched).
    batch_size:     kept so existing calls keep working, but not used: the batch size is
                    determined by how the dataset itself was batched.

    Returns the value stored under "accuracy" in the evaluation results.
    """

    # batch_size is deliberately not forwarded, since Keras documents that it should not be
    # specified when the input is a dataset (datasets generate their own batches)
    benchmarks = model.evaluate(tfdataset_test, return_dict = True)

    return benchmarks["accuracy"]
    

### (6)	ideology_model_predictor
Using the model output from the ideology_model function, output the probabilities of each individual article being left or right wing (0 = left wing, 1 = right wing)

In [34]:
### NOTE: currently the predictor does not work on the whole dataset in one go,
### only when we input the train and test datasets individually

def ideology_model_predictor(model,
                             tfdataset_train,
                             tfdataset_test):
    """
    Runs the model from the ideology_model function over the train and test datasets and returns,
    for each of them, the probabilities of every individual article being left or right wing
    (0 = left wing, 1 = right wing). The model outputs logits rather than probabilities, so a
    softmax is applied to convert them.
    """

    def logits_to_probas(tfdataset):
        # the logits are the first item of the prediction output
        logits = model.predict(tfdataset)[0]
        return tf.nn.softmax(logits).numpy()

    train_probas = logits_to_probas(tfdataset_train)
    test_probas = logits_to_probas(tfdataset_test)

    return train_probas, test_probas

#### need to find out a way to link these probabilities back to the individual articles they are relating to
