In [1]:
!unzip /content/my_folder.zip -d /content/output_dir

In [2]:
!pip install -r requirements.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import os
import json
import csv
import pandas as pd

def process_data(input_csv, json_directory, output_csv):
    json_files = {}

    for root, _, files in os.walk(json_directory):
        for _file in files:
            if _file.endswith(".json"):
                _file_name = os.path.splitext(_file)[0]
                json_files[_file_name] = os.path.join(root, _file)

    with open(input_csv) as f_in, open(output_csv, 'w') as f_out:
        reader = csv.reader(f_in)
        writer = csv.writer(f_out)
        next(reader)

        # Add labels to the header
        labels = ['pair_id', 'text1', 'text2', 'Geography', 'Entities', 'Time', 'Narrative', 'Overall', 'Style', 'Tone']
        writer.writerow(labels)

        for row in reader:
            id1, id2 = row[2].split("_")
            text_1 = text_2 = None

            if id1 in json_files:
                with open(json_files[id1]) as f1:
                    data1 = json.load(f1)
                    text_1 = [data1['title'] + '  [SEP]  ' + data1['text']]

            if id2 in json_files:
                with open(json_files[id2]) as f2:
                    data2 = json.load(f2)
                    text_2 = [data2['title'] + '  [SEP]  ' + data2['text']]

            if text_1 and text_2:
                writer.writerow(row[:3] + text_1 + text_2 + row[7:])


process_data('/content/semeval-2022_task8_train-data_batch.csv', '/content/output_dir', 'train.csv')

train_data = pd.read_csv('/content/train.csv')
train_data = train_data.dropna(subset=['text1', 'text2']).astype({'text1': 'str', 'text2': 'str'})

In [4]:
def save_model_summary_to_file(model, file_name):
    with open(file_name, 'w') as f:
        # Define a function to write the summary lines to the file
        def write_to_file(line):
            f.write(line + '\n')

        # Call the model.summary() method with the custom print function
        model.summary(print_fn=write_to_file)

In [5]:
import re


class Preprocessor:
    def __init__(self, punctuation=True, url=True, number=True):
        self.punctuation = punctuation
        self.url = url
        self.number = number

    def apply(self, sentence: str) -> str:
        sentence = sentence.lower()
        sentence = sentence.replace('<unk>', '')
        if self.url:
            sentence = Preprocessor.remove_url(sentence)
        if self.punctuation:
            sentence = Preprocessor.remove_punctuation(sentence)
        if self.number:
            sentence = Preprocessor.remove_number(sentence)
        sentence = re.sub(r'\s+', ' ', sentence)
        return sentence

    @staticmethod
    def remove_punctuation(sentence: str) -> str:
        sentence = re.sub(r'[^\w\s]', ' ', sentence)
        return sentence

    @staticmethod
    def remove_url(sentence: str) -> str:
        sentence = re.sub(r'(https|http)?://(\w|\.|/|\?|=|&|%)*\b', ' ', sentence)
        return sentence

    @staticmethod
    def remove_number(sentence: str) -> str:
        sentence = re.sub(r'\d+', ' ', sentence)
        return sentence

# Clean the text data
preprocessor = Preprocessor()
train_data['text1'] = train_data['text1'].apply(preprocessor.apply)
train_data['text2'] = train_data['text2'].apply(preprocessor.apply)

In [6]:
!pip install transformers
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import tensorflow as tf
from transformers import TFRobertaModel, XLMRobertaTokenizer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from matplotlib import pyplot as plt

# Load the XLM-RoBERTa model and tokenizer
roberta_model = TFRobertaModel.from_pretrained('xlm-roberta-base')
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Split the training data into training and validation sets
train_set, valid_set = train_test_split(train_data, test_size=0.2, random_state=42)

# Tokenize and encode the news articles using the XLM-RoBERTa tokenizer
encoded_articles1_train = tokenizer(train_set['text1'].tolist(), padding=True, truncation=True, max_length=512, return_tensors='tf')
encoded_articles2_train = tokenizer(train_set['text2'].tolist(), padding=True, truncation=True, max_length=512, return_tensors='tf')
encoded_articles1_valid = tokenizer(valid_set['text1'].tolist(), padding=True, truncation=True, max_length=512, return_tensors='tf')
encoded_articles2_valid = tokenizer(valid_set['text2'].tolist(), padding=True, truncation=True, max_length=512, return_tensors='tf')

# Pass the encoded articles through the XLM-RoBERTa model
outputs1_train = roberta_model(encoded_articles1_train.input_ids)
outputs2_train = roberta_model(encoded_articles2_train.input_ids)
outputs1_valid = roberta_model(encoded_articles1_valid.input_ids)
outputs2_valid = roberta_model(encoded_articles2_valid.input_ids)

# Extract the embedding vectors for each article
embeddings1_train = outputs1_train[0][:, 0, :]
embeddings2_train = outputs2_train[0][:, 0, :]
embeddings1_valid = outputs1_valid[0][:, 0, :]
embeddings2_valid = outputs2_valid[0][:, 0, :]

# Define the similarity function
similarity_function = tf.keras.layers.Dot(axes=1, normalize=True)

learning_rate = 0.001
hidden_units = 128
num_of_epochs = 100
size_of_batch = 32

# Define the Siamese network architecture
input1 = tf.keras.layers.Input(shape=(roberta_model.config.hidden_size,))
input2 = tf.keras.layers.Input(shape=(roberta_model.config.hidden_size,))
x1 = tf.keras.layers.Dense(hidden_units, activation='relu', kernel_regularizer='l2')(input1)
x2 = tf.keras.layers.Dense(hidden_units, activation='relu', kernel_regularizer='l2')(input2)
x = similarity_function([x1, x2])
output = tf.keras.layers.Dense(1, activation='linear')(x)
model = tf.keras.models.Model(inputs=[input1, input2], outputs=output)

# Compile the model
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate), loss='mse')

# Train the model on the training data
history = model.fit([embeddings1_train, embeddings2_train], train_set['Overall'], epochs=num_of_epochs, batch_size=size_of_batch, validation_data=([embeddings1_valid, embeddings2_valid], valid_set['Overall']))

plt.plot(np.array(history.history['loss']), label="train")
plt.plot(np.array(history.history['val_loss']), label="validation")
plt.xlabel("Epochs")
plt.ylabel("Loss")
plt.title("Training with Dropout: 0.01 Epoch: 500, learning Rate:1e-5, momentum:0.99 ,epsilon:1e-3, Adam optimizer")
plt.legend()
plt.show()

similarity_scores_train = model.predict([embeddings1_train, embeddings2_train])
similarity_scores_train = np.round(similarity_scores_train).clip(1, 4)
mae_train = mean_absolute_error(train_set['Overall'], similarity_scores_train)

# Predict the similarity scores for the validation data
similarity_scores_valid = model.predict([embeddings1_valid, embeddings2_valid])
similarity_scores_valid = np.round(similarity_scores_valid).clip(1, 4)
mae_valid = mean_absolute_error(valid_set['Overall'], similarity_scores_valid)

save_model_summary_to_file(model, 'network.txt')

model.save('/content/saved_model')

You are using a model of type xlm-roberta to instantiate a model of type roberta. This is not supported for all configurations of models and can yield errors.
All model checkpoint layers were used when initializing TFRobertaModel.

All the layers of TFRobertaModel were initialized from the model checkpoint at xlm-roberta-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFRobertaModel for predictions without further training.
