# Sentence Transformer

In [72]:
import copy
import torch
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import DebertaV2Tokenizer, DebertaV2Model

from torch.utils.data import Dataset, DataLoader
from datasets import load_from_disk

In [2]:
import re
import string

In [3]:
import textstat
from spellchecker import SpellChecker
from lexical_density import lexical_density as Lexical_Density
from TF_IDF import calc_tf_idf
from Add_Periods import Add_Periods
from avg_word_sentence_length import average_word_length, average_sentence_length


## GPU

In [4]:
# Shell escape: print the GPU, driver and CUDA versions reported by `nvidia-smi`.
!nvidia-smi

Sat Sep 30 16:06:11 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 536.99                 Driver Version: 536.99       CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce GTX 1070      WDDM  | 00000000:01:00.0  On |                  N/A |
| 28%   39C    P8              11W / 151W |    701MiB /  8192MiB |      1%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [5]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

device(type='cpu')

## Utils

In [6]:
def spell_checker_score(text, debug = False, spell = None):
    """Count the words of ``text`` that the spell checker reports as unknown.

    Every character in ``string.punctuation`` is removed, the remainder is
    split on whitespace, and the tokens are passed to ``spell.unknown``. The
    score is the size of the collection it returns.

    Args:
        text: Text to check.
        debug: When true, print each unknown word together with the checker's
            suggested correction, followed by the total.
        spell: Optional checker object exposing ``unknown(words)`` and
            ``correction(word)``. When omitted, a single ``SpellChecker()`` is
            created on the first call and reused afterwards, instead of being
            rebuilt for every text.

    Returns:
        int: ``len()`` of the result of ``spell.unknown``.
    """
    if spell is None:
        # Cache the default checker on the function object so repeated calls
        # (one per summary during inference) do not rebuild it each time.
        spell = getattr(spell_checker_score, "_default_spell", None)
        if spell is None:
            spell = SpellChecker()
            spell_checker_score._default_spell = spell

    # Strip punctuation before tokenising on whitespace.
    words = text.translate(str.maketrans('', '', string.punctuation)).split()
    misspelled = spell.unknown(words)

    if debug:
        for word in misspelled:
            # Get the one `most likely` answer
            print(f"{word} => {spell.correction(word)}")

        print(f"misspelled: {len(misspelled)}")
    return len(misspelled)

In [73]:
class SentenceTransformer(torch.nn.Module):
    """DeBERTa-v3 wrapper that returns one embedding vector per input sentence.

    NOTE(review): this class reuses the name ``SentenceTransformer`` that the
    notebook also imports from ``sentence_transformers``. Once this cell has
    run, the bare name refers to this class, whose constructor takes no
    arguments.
    """

    def __init__(self):
        super().__init__()
        # Load the tokenizer and model
        self.tokenizer = DebertaV2Tokenizer.from_pretrained('microsoft/deberta-v3-base')
        self.model = DebertaV2Model.from_pretrained('microsoft/deberta-v3-base')

    def encode(self, sentences):
        """Embed ``sentences`` with the hidden state of the first token.

        Args:
            sentences: A string or a list of strings, passed to the tokenizer.

        Returns:
            torch.Tensor: ``last_hidden_state[:, 0, :]`` of the model output,
            i.e. one row per input sentence, computed without gradients.
        """
        # `truncation=True` alone does not truncate when the tokenizer has no
        # predefined maximum length, so take the limit from the model config
        # when it is available.
        max_length = getattr(getattr(self.model, 'config', None), 'max_position_embeddings', None)
        encoded_input = self.tokenizer(
            sentences,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt',
        )
        # Inputs must live on the same device as the weights, otherwise the
        # forward pass fails once the module has been moved to a GPU.
        model_device = next(self.model.parameters()).device
        encoded_input = encoded_input.to(model_device)

        # Compute token embeddings
        with torch.no_grad():
            model_output = self.model(**encoded_input)

        # Only take the embeddings of the [CLS] token (or use the mean of the token embeddings if desired)
        sentence_embeddings = model_output.last_hidden_state[:, 0, :]

        return sentence_embeddings

In [74]:
# Instantiate the wrapper class defined in the previous cell; its constructor loads the
# pretrained 'microsoft/deberta-v3-base' tokenizer and model.
deberta = SentenceTransformer()

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [75]:

# Example usage: embed one sentence and show the first five embedding values.
text = "Hugging Face is creating a tool that democratizes AI."
# `get_embedding` is not defined anywhere in this notebook, so on a fresh kernel the
# original call raised NameError; the wrapper's own `encode` is used instead.
embedding = deberta.encode(text)
print(embedding[:, :5])
print(deberta.encode(text)[:, :5])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


tensor([[ 0.1804,  0.2592,  0.0357, -0.1198,  0.1036]])
tensor([[ 0.1804,  0.2592,  0.0357, -0.1198,  0.1036]])


In [78]:
# Inspect the raw tokenizer output for the example sentence: token count, then the encoding.
(
    len(deberta.tokenizer(text)['input_ids']),
    deberta.tokenizer(text),
)

(14,
 {'input_ids': [1, 88419, 510, 8834, 269, 1512, 266, 1637, 272, 92146, 268, 5536, 260, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})

## Generate dataset and batch

In [None]:
class PromptDataset(Dataset):
    """Map-style dataset that converts stored rows into tensors on ``device``.

    ``dataset`` must support integer indexing (each row behaving like a dict)
    and expose ``num_rows``.
    """

    # Columns stored as raw bytes and decoded as float32 vectors of length 768.
    _VECTOR_KEYS = ('embeddings_question', 'embeddings_text', 'text')
    # Columns converted with ``torch.tensor`` as they are.
    _TARGET_KEYS = ('content', 'wording')
    # Columns converted with ``torch.tensor`` and given a leading axis of size 1.
    _FEATURE_KEYS = (
        'normalized_lexical_density',
        'normalized_spell_checker',
        'normalized_tf_idf_question_score',
        'normalized_avg_word_length',
        'normalized_smog_index',
        'normalized_coleman_liau_index',
        'normalized_flesch_reading_ease',
    )

    def __init__(self, dataset, device):
        self.device = device
        self.dataset = dataset

    def deserialize_array(self, binary_string, dtype, shape):
        """Decode ``binary_string`` as a ``dtype`` buffer and reshape it to ``shape``."""
        flat = np.frombuffer(binary_string, dtype=dtype)
        return flat.reshape(shape)

    def __getitem__(self, index):
        """Return row ``index`` as a dict of identifiers and tensors on ``self.device``."""
        row = self.dataset[index]

        # Identifiers are passed through unchanged.
        item = {
            'student_id': row['student_id'],
            'prompt_id': row['prompt_id'],
        }

        for key in self._VECTOR_KEYS:
            vector = self.deserialize_array(row[key], np.float32, (768,))
            item[key] = torch.tensor(vector).to(self.device)

        for key in self._TARGET_KEYS:
            item[key] = torch.tensor(row[key]).to(self.device)

        for key in self._FEATURE_KEYS:
            item[key] = torch.tensor(row[key]).unsqueeze(0).to(self.device)

        return item

    def __len__(self) -> int :
        """Number of rows reported by the wrapped dataset."""
        return self.dataset.num_rows

In [None]:
class DataLoaderFactory():
    """Load a dataset saved on disk and expose train/validation batch generators.

    The ``'train'`` split of the stored dataset is divided into a training and
    a validation part; each part is wrapped in a ``PromptDataset`` and a
    shuffling ``DataLoader``.

    Args:
        path: Directory passed to ``load_from_disk``.
        batch_size: Batch size used by both loaders.
        device: Device handed to ``PromptDataset`` and used by ``get_batch``.
        test_size: Fraction of the data held out for validation.
        seed: Optional seed forwarded to ``train_test_split`` so the split can
            be reproduced; ``None`` keeps the previous unseeded behaviour.
    """

    def __init__(self, path:str = './data/hugging_face', batch_size = 12, device = 'cpu', test_size = 0.2, seed = None):
        self.batch_size = batch_size
        self.device = device

        # The dataset is read once; it used to be loaded a second time into an
        # unused local, doubling load time and memory with keep_in_memory=True.
        print("1. Loading dataset: ...", end="")
        self.dataset = load_from_disk(path, keep_in_memory=True)
        print("\r1. Loading dataset: done ✔️")

        print("2. Split datasets: ...", end="")
        train_validation_splits = self.dataset['train'].train_test_split(test_size=test_size, seed=seed)
        print("\r2. Split datasets: done ✔️")

        print("3. Wrap datasets: ...", end="")
        self.train_data = PromptDataset(train_validation_splits['train'], self.device)
        self.val_data = PromptDataset(train_validation_splits['test'], self.device)
        print("\r3. Wrap datasets: done ✔️")

        self.dataloader_train = DataLoader(self.train_data, batch_size=batch_size, shuffle=True)
        self.dataloader_val = DataLoader(self.val_data, batch_size=batch_size, shuffle=True)


    def __len__(self) -> int :
        """Print the size of each split (rows and rows / batch_size) and return the total row count."""
        print("\033[95m\033[1m\033[4mNumber of data by datasets splits\033[0m")
        print(f"Train\t\t: {len(self.train_data)}\t-> {len(self.train_data)/self.batch_size}")
        print(f"Validation\t: {len(self.val_data)}\t\t-> {len(self.val_data)/self.batch_size}")
        total = len(self.train_data) + len(self.val_data)
        print(f"Total\t\t: {total}")
        return total

    def get_batch(self, split):
        """Yield batches from the requested split.

        Args:
            split: ``'train'`` selects the training loader; any other value
                selects the validation loader.

        Yields:
            dict: The collated batch, with every tensor value moved to
            ``self.device``; non-tensor values are passed through unchanged.
        """
        # choose the correct dataloader
        if split == 'train':
            dataloader = self.dataloader_train
        else:
            dataloader = self.dataloader_val

        for batch in dataloader:
            # Move tensors to device (a no-op when they are already there).
            batch_on_device = {
                k: v.to(self.device) if isinstance(v, torch.Tensor) else v
                for k, v in batch.items()
            }
            yield batch_on_device

### Load the dataset

In [None]:
# Build the train/validation loaders on the selected device.
dataset = DataLoaderFactory(device=device)
# `len()` on the factory prints the per-split sizes and returns the total row count.
len(dataset)

### Testing the dataset

In [None]:
# `get_batch` returns a generator; `next` pulls a single training batch from it.
batch = dataset.get_batch('train')
nb = next(batch)

In [None]:
# Summarise the batch: number of ids, then the shape of every tensor field.
# The label column is sized from the longest label; the previous fixed width of 25 was
# shorter than several labels, so the values did not line up.
id_fields = ('student_id', 'prompt_id')
tensor_fields = (
    'embeddings_question',
    'embeddings_text',
    'text',
    'content',
    'wording',
    'normalized_lexical_density',
    'normalized_spell_checker',
    'normalized_tf_idf_question_score',
    'normalized_avg_word_length',
    'normalized_smog_index',
    'normalized_coleman_liau_index',
    'normalized_flesch_reading_ease',
)
label_width = max(len(name) for name in id_fields + tensor_fields) + 2

for name in id_fields:
    print(f"{name + ':':<{label_width}}{len(nb[name])}")
for name in tensor_fields:
    print(f"{name + ':':<{label_width}}{nb[name].shape}")

## Kaggle evaluation

### Paths

In [None]:
# Prompt CSV files (train / test)
prompts_train_path = "./data/prompts_train.csv"
prompts_test_path = "./data/prompts_test.csv"

# Summary CSV files (train / test)
summaries_train_path = "./data/summaries_train.csv"
summaries_test_path = "./data/summaries_test.csv"

# TorchScript model archive read by `torch.jit.load`
model_path = "./out/best_model_script.pt"

# CSV file the predictions are written to
submission_path = "./out/submission.csv"

### Loading Model

In [None]:
# Pull one training batch to serve as example input for the loaded model.
exemple_batches = dataset.get_batch('train')
ex_batch = next(exemple_batches)

# Embedding vectors; the batch key 'text' is used as the answer embedding.
embeddings_question, embeddings_text, embeddings_answer = (
    ex_batch[key] for key in ('embeddings_question', 'embeddings_text', 'text')
)

# Targets
content, wording = ex_batch['content'], ex_batch['wording']

# Normalized scalar features
normalized_lexical_density = ex_batch['normalized_lexical_density']
normalized_spell_checker = ex_batch['normalized_spell_checker']
normalized_tf_idf_question_score = ex_batch['normalized_tf_idf_question_score']
normalized_avg_word_length = ex_batch['normalized_avg_word_length']
normalized_smog_index = ex_batch['normalized_smog_index']
normalized_coleman_liau_index = ex_batch['normalized_coleman_liau_index']
normalized_flesch_reading_ease = ex_batch['normalized_flesch_reading_ease']

# Model input: text, question and answer embeddings concatenated along dim 1.
example_input_tensor = torch.cat(
    [embeddings_text, embeddings_question, embeddings_answer],
    dim=1,
)

# Feature tuple, in the order used when calling the model.
features = tuple(
    feature.to(device)
    for feature in (
        normalized_lexical_density,
        normalized_spell_checker,
        normalized_tf_idf_question_score,
        normalized_avg_word_length,
        normalized_smog_index,
        normalized_coleman_liau_index,
        normalized_flesch_reading_ease,
    )
)

In [14]:
# Sanity check: shape of the answer embeddings, then of each feature tensor.
print(embeddings_answer.shape)
for x in features:
    print(x.shape)

torch.Size([12, 768])
torch.Size([12, 1])
torch.Size([12, 1])
torch.Size([12, 1])
torch.Size([12, 1])
torch.Size([12, 1])
torch.Size([12, 1])
torch.Size([12, 1])


In [15]:
# Load the TorchScript archive directly onto `device`. `map_location` remaps the stored
# tensors while loading, so an archive saved from a GPU can still be opened on a
# CPU-only machine (a plain load followed by `.to(device)` can fail in that case).
loaded_model = torch.jit.load(model_path, map_location=device)
# Switch to evaluation mode before running predictions.
loaded_model.eval()

RecursiveScriptModule(
  original_name=QA_Score_Model
  (fc_text_question_answer): RecursiveScriptModule(original_name=Linear)
  (fc_lexical): RecursiveScriptModule(original_name=Linear)
  (fc_spell): RecursiveScriptModule(original_name=Linear)
  (fc_tfidf): RecursiveScriptModule(original_name=Linear)
  (fc_avg_word_length): RecursiveScriptModule(original_name=Linear)
  (fc_smog): RecursiveScriptModule(original_name=Linear)
  (fc_coleman): RecursiveScriptModule(original_name=Linear)
  (fc_flesch): RecursiveScriptModule(original_name=Linear)
  (dropout): RecursiveScriptModule(original_name=Dropout)
  (fc1): RecursiveScriptModule(original_name=Linear)
  (fc2): RecursiveScriptModule(original_name=Linear)
  (out_content): RecursiveScriptModule(original_name=Linear)
  (out_wording): RecursiveScriptModule(original_name=Linear)
)

In [None]:
# loaded_model(example_input_tensor.to(device))

In [17]:
# Compare the model's predictions with the targets of the example batch.
# The input is sent to `device`, the same device the model and `features` were moved to;
# it used to be forced to 'cpu', which fails as soon as `device` is a GPU.
with torch.no_grad():
    tensor1 = loaded_model(example_input_tensor.to(device), features)
tensor2 = torch.stack((content, wording), dim=1)

# One line per sample: [prediction => target = difference] for content | wording.
for (col1_tensor1, col2_tensor1), (col1_tensor2, col2_tensor2) in zip(tensor1, tensor2):
    print(f"[{col1_tensor1:>6.3f} => {col1_tensor2:>6.3f} = {col1_tensor2-col1_tensor1:>6.3f}] \t | \t [{col2_tensor1:>6.3f} => {col2_tensor2:>6.3f} = {col2_tensor2 - col2_tensor1:>6.3f}]")

[-0.196 => -0.249 = -0.053] 	 | 	 [-0.497 =>  0.383 =  0.880]
[-0.589 => -0.457 =  0.132] 	 | 	 [ 0.015 => -0.043 = -0.058]
[ 0.318 =>  0.376 =  0.058] 	 | 	 [ 0.166 =>  0.464 =  0.297]
[ 0.415 =>  1.076 =  0.662] 	 | 	 [-0.062 =>  0.074 =  0.136]
[-0.678 => -0.564 =  0.114] 	 | 	 [-0.740 =>  0.544 =  1.284]
[-0.209 => -0.393 = -0.184] 	 | 	 [-0.129 =>  0.627 =  0.756]
[ 0.842 =>  1.268 =  0.426] 	 | 	 [ 0.392 =>  0.579 =  0.187]
[ 1.542 =>  1.782 =  0.240] 	 | 	 [ 1.032 =>  0.912 = -0.120]
[-0.936 => -1.264 = -0.328] 	 | 	 [-0.899 => -1.505 = -0.606]
[-0.993 => -0.794 =  0.199] 	 | 	 [-1.179 => -1.545 = -0.366]
[-0.517 => -0.564 = -0.046] 	 | 	 [-0.789 => -1.127 = -0.338]
[-0.323 => -0.993 = -0.671] 	 | 	 [ 0.336 => -0.367 = -0.703]


### Load SentenceTransformer model

In [None]:
# The bare name `SentenceTransformer` is rebound earlier in this notebook to the local
# DeBERTa wrapper class, whose constructor takes no model name. Import the library class
# under an unambiguous alias so this cell loads the intended pretrained model.
from sentence_transformers import SentenceTransformer as SbertSentenceTransformer

sentence_transformer_model = SbertSentenceTransformer('all-mpnet-base-v2')

### Loading files

In [None]:
# Read the prompt tables (train and test) from CSV.
prompts_train = pd.read_csv(prompts_train_path)
prompts_test = pd.read_csv(prompts_test_path)

In [None]:
# Read the student summary tables (train and test) from CSV.
summaries_train = pd.read_csv(summaries_train_path)
summaries_test = pd.read_csv(summaries_test_path)

In [None]:
# When the test prompt file has exactly two rows, run on copies of the training data instead.
# NOTE(review): presumably a two-row file is a placeholder test set — confirm.
if len(prompts_test) == 2:
    prompts_test = prompts_train.copy()
    summaries_test = summaries_train.copy()

### Visualize datasets

In [None]:
# Display the prompts that will be used for inference.
prompts_test

In [None]:
# Preview the first rows of the summaries that will be scored.
summaries_test.head()

### Inference

### Normalization function
Useful for normalizing values such as content and wording to the range -1 to 1.

In [None]:
def normalize_value(x, min=-2, max=4):
    """Linearly rescale ``x`` so that ``min`` maps to -1 and ``max`` maps to 1.

    Values outside ``[min, max]`` are not clipped and therefore map outside
    ``[-1, 1]``.
    """
    span = max - min
    fraction = (x - min) / span
    return 2 * fraction - 1

Inverse normalization (kept for reference, currently commented out)

In [None]:
# def rescale(value, old_min=-1, old_max=1, new_min=-2, new_max=4):
#     return ((value - old_min) / (old_max - old_min)) * (new_max - new_min) + new_min

### Helpers

In [None]:
def to_numpy(var):
    """Convert a ``torch.Tensor`` to a NumPy array; return anything else unchanged.

    Args:
        var: A tensor on any device, or an arbitrary object.

    Returns:
        A NumPy array for tensor input, otherwise ``var`` itself.
    """
    if isinstance(var, torch.Tensor):
        # detach() first: numpy() raises on tensors that require grad.
        return var.detach().cpu().numpy()
    return var

### Preprocess_prompts
We do this separately so that each prompt row is preprocessed only once.

In [None]:
def preprocess_prompts(prompts_df: pd.DataFrame) -> pd.DataFrame:
    """Compute embeddings and TF-IDF terms once per prompt.

    For every row of ``prompts_df`` the question and text are passed to
    ``calc_tf_idf`` and encoded in one batch by the global
    ``sentence_transformer_model``.

    Args:
        prompts_df: Frame with the columns ``prompt_id``, ``prompt_question``
            and ``prompt_text``.

    Returns:
        pd.DataFrame: One row per prompt with the columns ``prompt_id``,
        ``embeddings_question``, ``embeddings_text``, ``tf_idf``,
        ``prompt_question`` and ``prompt_text``. An empty input yields an empty
        frame with the same columns.
    """
    columns = [
        'prompt_id',
        'embeddings_question',
        'embeddings_text',
        'tf_idf',
        'prompt_question',
        'prompt_text',
    ]

    # Collect plain records and build the frame once; concatenating a one-row
    # frame per iteration copies the accumulated data every time.
    records = []
    for _, row in prompts_df.iterrows():
        # retrieve columns (the title is not needed)
        prompt_question = row['prompt_question']
        prompt_text = row['prompt_text']

        tf_idf = calc_tf_idf(prompt_question, prompt_text, 10)

        # Encode question and text together as one batch of two sentences.
        embeddings = sentence_transformer_model.encode([prompt_question, prompt_text])

        records.append({
            'prompt_id': row['prompt_id'],
            'embeddings_question': embeddings[0],
            'embeddings_text': embeddings[1],
            'tf_idf': tf_idf,
            'prompt_question': prompt_question,
            'prompt_text': prompt_text,
        })

    return pd.DataFrame(records, columns=columns)

### Preprocess one row to create the input

In [None]:
def preprocess(row, prompts_df: pd.DataFrame):
    """Build the model input and the normalized feature tensors for one summary.

    Args:
        row: Summary row providing ``"prompt_id"`` and ``"text"``.
        prompts_df: Preprocessed prompts with the columns ``prompt_id``,
            ``embeddings_question``, ``embeddings_text``, ``tf_idf`` and
            ``prompt_question``. Exactly one row must match the summary's
            ``prompt_id``, because ``.item()`` is called on the selection.

    Returns:
        tuple: ``(input, features)``. ``input`` is the concatenation of the
        text, question and answer embeddings with a leading batch axis, on
        ``device``. ``features`` is a tuple of seven ``(1, 1)`` tensors on
        ``device``.
    """
    prompt_id = row["prompt_id"]

    # Text clean-up: put a space before a capital letter that directly follows a period
    # ("Exemple end.Start new sentence" -> "Exemple end. Start new sentence"), then run Add_Periods.
    text = re.sub(r'(?<=\.)[A-Z]', r' \g<0>', row["text"])
    text = Add_Periods(text)

    # Readability scores from textstat
    smog_index = textstat.smog_index(text)
    coleman_liau_index = textstat.coleman_liau_index(text)
    flesch_reading_ease = textstat.flesch_reading_ease(text)

    lexical_density = Lexical_Density(text)
    spell_checker = spell_checker_score(text)
    avg_word_length = average_word_length(text)

    # The answer embedding is computed from the raw row text, not the cleaned copy.
    answer_batch = sentence_transformer_model.encode([row["text"]])
    embeddings_answer = torch.tensor(answer_batch[0])

    # Look up the values precomputed for this summary's prompt.
    prompt_row = prompts_df.loc[prompts_df['prompt_id'] == prompt_id]
    embeddings_question = torch.tensor(prompt_row['embeddings_question'].item())
    embeddings_text = torch.tensor(prompt_row['embeddings_text'].item())

    # Lower-cased term lists: the prompt's stored terms, and the terms computed from
    # the prompt question together with the cleaned summary text.
    prompt_terms = [word.lower() for word in prompt_row['tf_idf'].item()]
    summary_terms = [
        word.lower()
        for word in calc_tf_idf(prompt_row["prompt_question"].item(), text, 10)
    ]

    # Number of prompt terms that also appear among the summary terms.
    tf_idf_question_score = sum(word in summary_terms for word in prompt_terms)

    # (raw value, lower bound, upper bound) for each feature, in model order.
    feature_ranges = (
        (lexical_density, 0, 1),
        (spell_checker, 0, 10),
        (tf_idf_question_score, 0, 10),
        (avg_word_length, 3, 10),
        (smog_index, 0, 17),
        (coleman_liau_index, 0, 17),
        (flesch_reading_ease, 1, 100),
    )
    features = tuple(
        torch.tensor([[normalize_value(value, min=low, max=high)]]).to(device)
        for value, low, high in feature_ranges
    )

    # Create input
    model_input = torch.cat(
        [embeddings_text, embeddings_question, embeddings_answer],
        dim=0,
    ).unsqueeze(0).to(device)

    return model_input, features



### Inference

In [None]:
def inference(summaries_df: pd.DataFrame, prompts_df: pd.DataFrame):
    """Score every summary with ``loaded_model`` and write the submission CSV.

    Args:
        summaries_df: Summaries to score; each row needs ``student_id`` plus
            the columns read by ``preprocess``.
        prompts_df: Raw prompts; they are run through ``preprocess_prompts``
            once before the loop.

    Side effects:
        Prints a ``current/total`` progress counter and writes a CSV with the
        columns ``student_id``, ``content`` and ``wording`` to
        ``submission_path``.
    """
    # preprocess prompt
    prompts_df = preprocess_prompts(prompts_df)

    total = len(summaries_df)
    # Collect records and build the frame once instead of concatenating a
    # one-row frame per summary.
    records = []

    # No gradients are needed for prediction.
    with torch.no_grad():
        # enumerate() keeps the counter correct even when the frame's index
        # does not start at 0 or is not an integer range.
        for position, (_, row) in enumerate(summaries_df.iterrows(), start=1):
            print(f"\r{position}/{total}", end="")

            model_input, features = preprocess(row, prompts_df)

            outputs = loaded_model(model_input, features)
            content, wording = outputs[0].detach().cpu().numpy()

            records.append({
                'student_id': row['student_id'],
                'content': float(content),
                'wording': float(wording),
            })

    new_data = pd.DataFrame(records, columns=['student_id', 'content', 'wording'])
    new_data.to_csv(submission_path, index=False)

In [None]:
# Score the test summaries; the predictions are written to `submission_path`.
inference(summaries_test, prompts_test)

## Checking histograms

In [None]:
import matplotlib.pyplot as plt

In [None]:
# Shared axis limits applied to the histograms below via plt.xlim / plt.ylim.
xmin, xmax = -2, 4
ymin, ymax = 0, 2000

In [None]:
# NOTE(review): `df3` is not defined anywhere in this notebook, so this cell fails on a
# fresh kernel — restore the cell that builds it.
# Each series is drawn once; the earlier extra unlabeled `.hist()` pair plotted both
# series a second time on the same axes.
df3['content'].hist(label='Content', alpha=0.5, edgecolor='k')
df3['wording'].hist(label='Wording', alpha=0.5, edgecolor='k')
# Adding a legend
plt.legend()
plt.title('Error')
plt.show()

In [None]:
# Calculate the overall average of 'content' and 'wording' columns
avg_overall = df3[['content', 'wording']].mean().mean()
print(f"Overall Average: {avg_overall}")


In [None]:
# Read the written predictions back and overlay the two score distributions.
submission_file = pd.read_csv(submission_path)

fig, ax = plt.subplots()
for column, label in (('content', 'Content'), ('wording', 'Wording')):
    submission_file[column].hist(ax=ax, label=label, alpha=0.5, edgecolor='k')
ax.set_xlim([xmin, xmax])  # Set x-axis limits
ax.set_ylim([ymin, ymax])  # Set y-axis limits
# Adding a legend
ax.legend()
ax.set_title('Prediction')
plt.show()

In [None]:
# Overlay the distributions of the 'content' and 'wording' columns of `summaries_test`.
# NOTE(review): presumably these columns exist only when the training data was
# substituted for the test data earlier — confirm.
fig, ax = plt.subplots()
for column, label in (('content', 'Content'), ('wording', 'Wording')):
    summaries_test[column].hist(ax=ax, label=label, alpha=0.5, edgecolor='k')
ax.set_xlim([xmin, xmax])  # Set x-axis limits
ax.set_ylim([ymin, ymax])  # Set y-axis limits
# Adding a legend
ax.legend()
ax.set_title('Real')
plt.show()