# Installation and imports

In [None]:
!pip install transformers
!pip install -q sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
from torch import nn, Tensor
from torch.utils.data import DataLoader, random_split
from typing import Union, Tuple, List, Iterable, Dict

from transformers import BertTokenizer, BertModel
from sentence_transformers import SentenceTransformer, LoggingHandler, models, evaluation, losses
import sentence_transformers.util
from sentence_transformers.datasets import ParallelSentencesDataset
from sentence_transformers import SentenceTransformer, util
from sklearn.model_selection import train_test_split

from datetime import datetime
import random
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging
import csv
import gzip
from tqdm.autonotebook import tqdm
import zipfile
import io
import time

# OPTIONAL: configure the root logger at INFO level so that library log messages
# emitted while the cells below run are shown in the notebook output.
logging.basicConfig(level=logging.INFO)

# Load pre-trained model tokenizer (vocabulary) for 'bert-base-uncased'.
# NOTE(review): `tokenizer` does not appear to be referenced by any later cell
# visible in this notebook — confirm whether this load is still needed.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


In [None]:
# Mount Google Drive in the Colab runtime and switch the working directory to the
# project folder, so that relative paths such as "./Data/ELCo.csv" resolve against it.
from google.colab import drive
drive.mount('/content/drive')
Drive_path = '/content/drive/MyDrive/CS4248'  # same folder that `output_folder` points to further down
os.chdir(Drive_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Mock test:

In [None]:
# Model identifiers used when the teacher and student models are constructed below.
teacher_model_name = 'paraphrase-distilroberta-base-v2'   # Monolingual teacher model: provides the target embeddings for the English phrases
student_model_name = 'bert-base-multilingual-cased'       # Multilingual base model that is trained to imitate the teacher's embeddings

In [None]:
# Load the ELCo dataset. The cells below use the 'English' column (the phrase) and the
# 'Emoji' column (emoji names separated by the "[EM]" marker).
data = pd.read_csv("./Data/ELCo.csv")
data  # bare expression: displays the frame as the cell output

Unnamed: 0,English,Emoji,emoji_list
0,big business,[EM]necktie[EM]chart increasing[EM],"[':necktie:', ':chart_increasing:']"
1,big business,[EM]office building[EM]money-mouth face[EM]mon...,"[':office_building:', ':money-mouth_face:', ':..."
2,big business,[EM]man technologist[EM]handshake[EM],"[':man_technologist:', ':handshake:']"
3,big business,[EM]office building[EM]people holding hands[EM...,"[':office_building:', ':people_holding_hands:'..."
4,big business,[EM]woman technologist[EM]money-mouth face[EM],"[':woman_technologist:', ':money-mouth_face:']"
...,...,...,...
1658,sound advice,[EM]speaker high volume[EM]speaking head[EM],"[':speaker_high_volume:', ':speaking_head:']"
1659,sound advice,[EM]thumbs up light skin tone[EM]studio microp...,"[':thumbs_up_light_skin_tone:', ':studio_micro..."
1660,sound advice,[EM]boy[EM]speaking head[EM]woman[EM]thinking ...,"[':boy:', ':speaking_head:', ':woman:', ':thin..."
1661,sound advice,[EM]thumbs up[EM]deaf woman[EM],"[':thumbs_up:', ':deaf_woman:']"


## Pre-process:

In [None]:
# Phrase-level train/test split (no shuffling)
def my_train_test_split(data, test_size):
  """Split `data` by distinct English phrase, keeping the original row order.

  The distinct values of the 'English' column are taken in order of first
  appearance; the leading ``floor(n - n * test_size)`` phrases go to the
  training part and the remaining phrases to the test part, so every row of a
  given phrase lands on the same side.

  Args:
    data: DataFrame with an 'English' column.
    test_size: fraction of the distinct phrases assigned to the test part.

  Returns:
    Tuple ``(train, test)`` of DataFrames selected from `data`.
  """
  phrases = data['English'].unique()
  n_phrases = len(phrases)
  cut = int(np.floor(n_phrases - n_phrases * test_size))

  in_train = data['English'].isin(phrases[:cut])
  in_test = data['English'].isin(phrases[cut:])

  return data[in_train], data[in_test]

# To test the model performance, generate several negative samples for each English phrase and calculate its matching accuracy
def generate_negative_samples(data, num_negative_samples=3):
    """Build multiple-choice items: the correct emoji sequence plus random distractors.

    For every row of `data`, distractors are drawn at random (via the global
    `random` module state) from the distinct 'Emoji' values of rows whose
    'English' phrase differs from the row's phrase. The row's own emoji
    sequence is excluded from the candidates, so a distractor can never be
    identical to the correct answer even when the same sequence is also paired
    with another phrase.

    Args:
        data: DataFrame with 'English' and 'Emoji' columns.
        num_negative_samples: number of distractors drawn per row.

    Returns:
        A list with one ``[english_phrase, emojis]`` entry per row, where
        ``emojis[0]`` is the correct sequence and ``emojis[1:]`` are the
        distractors.

    Raises:
        ValueError: if fewer than `num_negative_samples` distinct distractor
            candidates are available for some row.
    """
    phrases = data['English']
    emojis = data['Emoji']

    # The candidate pool depends only on the phrase, so build it once per distinct
    # phrase instead of re-filtering the whole frame for every row.
    candidates_by_phrase = {}
    test_list = []
    for eng_phrase, correct_Emoji in zip(phrases, emojis):
        if eng_phrase not in candidates_by_phrase:
            candidates_by_phrase[eng_phrase] = list(emojis[phrases != eng_phrase].unique())
        # Never offer the correct answer as a distractor.
        Emoji_sets = [e for e in candidates_by_phrase[eng_phrase] if e != correct_Emoji]
        if len(Emoji_sets) < num_negative_samples:
            raise ValueError(
                "Not enough distinct negative emoji sequences for phrase {!r}: "
                "need {}, have {}".format(eng_phrase, num_negative_samples, len(Emoji_sets))
            )
        incorrect_Emojis = random.sample(Emoji_sets, num_negative_samples)
        test_list.append([eng_phrase, [correct_Emoji] + incorrect_Emojis])
    print(data.shape)
    return test_list

# Evaluate the accuracy of our model with test dataset
def Evaluate_performance(model, test):
  """Return the fraction of test items for which the correct candidate scores highest.

  Each item of `test` is ``[phrase, candidates]`` with the correct candidate at
  index 0 (the layout produced by `generate_negative_samples`). The phrase and
  every candidate are encoded with ``model.encode`` and compared with
  ``util.cos_sim``; an item counts as correct when the first candidate has the
  highest similarity (ties resolve to the lowest index, as with ``np.argmax``).

  Args:
    model: object exposing ``encode(text)``.
    test: list of ``[phrase, candidates]`` items.

  Returns:
    Accuracy as a float in [0, 1]; 0.0 when `test` is empty.
  """
  # An empty test set would otherwise fail on indexing / division by zero.
  if len(test) == 0:
    return 0.0

  correct = 0
  for item in test:
    embed_phrase = model.encode(item[0])
    # Convert each similarity to a plain float so argmax works on ordinary numbers.
    scores = [float(util.cos_sim(embed_phrase, model.encode(candidate)))
              for candidate in item[1]]
    if np.argmax(scores) == 0:
      correct += 1
  return correct / len(test)

In [None]:
# built-in train test split (with shuffle)
# First call: hold out 20% of all rows as `val`. Second call: hold out 20% of the
# remaining rows as `test`, so the result is roughly 64% train / 20% val / 16% test.
# The split is made on individual rows, so different emoji renderings of the same
# English phrase can land on both sides of a split.
train, val = train_test_split(data, test_size=0.2, random_state=42)
train, test = train_test_split(train, test_size=0.2, random_state=42)

# defined train test split (without shuffle)
# train, test = my_train_test_split(data, test_size=0.2)
# train, val = my_train_test_split(train, test_size=0.2)

In [None]:
output_folder = '/content/drive/MyDrive/CS4248'


def write_parallel_tsv(frame, path, label):
  """Write the 'English' / 'Emoji' pairs of `frame` to a gzip-compressed TSV file.

  Each output line has three tab-separated columns: the English phrase, the
  emoji description, and the English phrase again. In the parallel data loader
  the teacher model provides the embeddings for the first column, while the
  student model tries to mimic those embeddings with the other columns.

  Args:
    frame: DataFrame with 'English' and 'Emoji' columns.
    path: destination file path (created or overwritten).
    label: word used in the printed summary, e.g. 'training'.
  """
  n_examples = frame.shape[0]
  # The `with` block closes the file on exit; no explicit close() is needed.
  with gzip.open(path, 'wt', encoding='utf8') as fOut:
    print(f'{n_examples} {label} examples')
    for en, em in zip(frame['English'].tolist(), frame['Emoji'].tolist()):
      fOut.write("{}\t{}\t{}\n".format(en, em, en))


write_parallel_tsv(train, os.path.join(output_folder, 'my_train.tsv.gz'), 'training')
write_parallel_tsv(val, os.path.join(output_folder, 'my_val.tsv.gz'), 'validation')

1064 training examples
333 validation examples


## Teacher and Student Model:

In [None]:
# Teacher Model: pre-trained sentence encoder loaded by name; it supplies the target
# embeddings that the student is trained to reproduce.
teacher_model = SentenceTransformer(teacher_model_name)

# Student Model: a transformer encoder (max_seq_length=128 caps the tokenized input
# length) followed by a pooling layer, wrapped together as a SentenceTransformer.
word_embedding_model = models.Transformer(student_model_name, max_seq_length=128)
# Apply mean pooling to get one fixed sized sentence vector
pooling_model = models.Pooling(word_embedding_model.get_word_embedding_dimension())
student_model = SentenceTransformer(modules=[word_embedding_model, pooling_model])

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/714M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

In [None]:
# Add [EM] token to the vocabulary
# The 'Emoji' column uses "[EM]" as the separator between emoji names, so it is
# registered with the student's tokenizer as a special token.
word_embedding_model = student_model._first_module()  # the Transformer module passed first in `modules` above
tokens = ["[EM]"]
word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
# Resize the embedding matrix to the enlarged vocabulary; the value returned by this
# call is what the cell displays.
word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

Embedding(119548, 768)

## Read Parallel Sentences Dataset

In [None]:
###### Read Parallel Sentences Dataset ######
# The dataset pairs sentences from the training file with embeddings produced by the
# teacher model; the student is later trained to reproduce those embeddings.
# NOTE(review): use_embedding_cache=True presumably caches teacher embeddings for
# repeated sentences — confirm against the sentence-transformers documentation.
train_data = ParallelSentencesDataset(student_model=student_model, teacher_model=teacher_model, batch_size=64, use_embedding_cache=True)

# Files written by the export cell above (one gzip TSV per entry).
train_files = []
train_files.append(os.path.join(output_folder, 'my_train.tsv.gz'))

for train_file in train_files:
    train_data.load_data(train_file, max_sentences=500000, max_sentence_length=250)
  
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=64)
# NOTE(review): the same `train_loss` assignment is repeated in the next cell.
train_loss = losses.MSELoss(model=student_model)

In [None]:
# Mean-squared-error loss used as the training objective for the student model.
# NOTE(review): this repeats the assignment at the end of the previous cell; one of
# the two is redundant.
train_loss = losses.MSELoss(model=student_model)

## Evaluator:

In [None]:
# Define evaluator and save the validation results
evaluators = []
val_files = []
val_files.append(os.path.join(output_folder, 'my_val.tsv.gz'))

for val_file in val_files:
    en_sentences = []
    em_sentences = []
    with gzip.open(val_file, 'rt', encoding='utf8') as fIn:
        for line in fIn:
            splits = line.strip().split('\t')
            # Skip malformed lines (fewer than two columns) as well as empty cells.
            if len(splits) >= 2 and splits[0] != "" and splits[1] != "":
                en_sentences.append(splits[0])
                em_sentences.append(splits[1])

    # Each evaluator gets its own name: the name identifies the evaluator's results
    # (e.g. its results file), so sharing one name would mix the EN->EM and EN->EN
    # scores together.
    val_name = os.path.basename(val_file)

    # Mean Squared Error (MSE) measures the (euclidean) distance between teacher(EN) and student(EM) embeddings
    em_mse = evaluation.MSEEvaluator(en_sentences, em_sentences, name='em_' + val_name, teacher_model=teacher_model, batch_size=64)
    evaluators.append(em_mse)

    # Mean MSE between teacher(EN) and student(EN) embeddings
    en_mse = evaluation.MSEEvaluator(en_sentences, en_sentences, name='en_' + val_name, teacher_model=teacher_model, batch_size=64)
    evaluators.append(en_mse)

    # # TranslationEvaluator computes the embeddings for all parallel sentences. It then check if the embedding of source[i] is the closest to target[i] out of all available target sentences
    # dev_trans_acc = evaluation.TranslationEvaluator(en_sentences, em_sentences, name=os.path.basename(val_file),batch_size=64)
    # evaluators.append(dev_trans_acc)



## Train:

In [None]:
# model_path="/content/drive/MyDrive/CS4248/output/test-2023-04-02_09-57-01"
# student_model = SentenceTransformer(model_path)

In [None]:
# Timestamped output directory, so every training run is written to its own folder.
output_path = '/content/drive/MyDrive/CS4248/output/test'+"-"+datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Train the model

# Thoughts: Making two train objectives: 1 is for teacher-student english, 2 is for student eng -> student em
#          :param train_objectives: Tuples of (DataLoader, LossFunction). Pass more than one for multi-task learning
# Wall-clock timing of the whole fit() call; the elapsed time is printed in the last cell.
start_time = time.time()
# The evaluators built above are run in sequence and their scores are averaged into
# the single score passed back to fit().
student_model.fit(train_objectives=[(train_dataloader, train_loss)],
          evaluator=evaluation.SequentialEvaluator(evaluators, main_score_function=lambda scores: np.mean(scores)),
          epochs=100,
          warmup_steps=100,
          evaluation_steps=100,
          output_path=output_path,
          save_best_model=True,
          show_progress_bar = True,
          # optimizer_params= {'lr': 2e-5, 'eps': 1e-6, 'correct_bias': False}
          optimizer_params= {'lr': 2e-5, 'eps': 1e-6}
          )
end_time = time.time()

Epoch:   0%|          | 0/100 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

  labels = torch.tensor(labels)


Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

Iteration:   0%|          | 0/20 [00:00<?, ?it/s]

## Test:

In [None]:
# Seed Python's RNG so the randomly drawn negative samples (and therefore the reported
# accuracy) are reproducible across runs; 42 matches the random_state used for the splits.
random.seed(42)
test_phrases = generate_negative_samples(test)

In [None]:
# Score the trained student on the multiple-choice test items and report the
# wall-clock time the evaluation took (in seconds).
inference_start=time.time()
print(Evaluate_performance(student_model, test_phrases))
inference_end = time.time()
print("Inference time: ", inference_end - inference_start)

In [None]:
# Training duration in seconds, from the timestamps taken around student_model.fit().
print("Time: ", end_time-start_time)