### ENTRAINEMENT D'UN MODELE  D'EMBEDDING (POUR LA CLASSIFICATION DES TEXTES):
-   on va partir d'une description textuelle d'une publication et par la suite faire une classification sur le dégré de succés de la vidéo en question.

In [1]:
# !pip install sentence-transformers
!pip install hf_xet



## How Sentence Transformers models work


In [1]:
from sentence_transformers import SentenceTransformer, models


# #Mean Pooling - Take attention mask into account for correct averaging
# def mean_pooling(model_output, attention_mask):
#     token_embeddings = model_output[0] #First element of model_output contains all token embeddings
#     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
#     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)

## Join steps 1 and 2 using the modules argument
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

  from .autonotebook import tqdm as notebook_tqdm


### PREPARATION OF THE DATASET

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Load the dataset

train_data=pd.read_csv("../dataset/train_val.csv")


In [3]:
#valeur maximale et minimale des valeurs de la colonne views
print("valeur minimale de la colonne views: ",train_data["views"].min())
print("valeur maximale de la colonne views: ",train_data["views"].max())
print("moyenne des valeurs de la colonne views: ",train_data["views"].mean())
print("médiane des valeurs de la colonne views: ",train_data["views"].median())

#segementation des valeurs en 10 classes en fonction du succès de la vidéo
bins = [0, 100, 1_000, 5_000, 10_000, 30_000,
        100_000, 1_000_000, 5_000_000, 100_000_000, 190_000_000]

labels = ["Double-digit views", "Triple-digit views", "1 K Views Club",
          "5 K Views Club", "10 K Views Club", "30 K + Zone",
          "100 K Club", "1 M Club", "Multi-Million", "100 M Club"]

train_data["success"] = pd.cut(train_data["views"], bins=bins, labels=labels,
                      right=True, include_lowest=True)

#suppression des lignes avec des valeurs manquantes
train_data.dropna(inplace=True)


valeur minimale de la colonne views:  0
valeur maximale de la colonne views:  190150188
moyenne des valeurs de la colonne views:  562777.6693579641
médiane des valeurs de la colonne views:  29294.5


In [4]:
train_data.head()

Unnamed: 0.1,Unnamed: 0,id,channel,title,date,description,views,year,success
0,0,--2s6hjGrm4,UC-1rx8j9Ggp8mp4uD0ZdEIA,"CGI & VFX Breakdowns: ""Warzone"" - by Ramesh Th...",2020-12-15 05:00:01+00:00,"Check out this revealing VFX Breakdown ""Warzon...",12299,2020,10 K Views Club
1,1,--DnfroyKQ8,UC-1rx8j9Ggp8mp4uD0ZdEIA,"A Sci-Fi Short Film: ""Exit"" - by Ng King Kwan ...",2020-07-01 16:00:00+00:00,"TheCGBros Presents ""Exit"" by Ng King Kwan - Th...",7494,2020,5 K Views Club
2,2,--aiU7VQKEw,UC-1rx8j9Ggp8mp4uD0ZdEIA,"CGI 3D Animated Short: ""Lost Love"" - by Akash ...",2019-02-18 20:30:00+00:00,"TheCGBros Presents ""Lost Love"" by Akash Manack...",11831,2019,10 K Views Club
3,6,-0SrlZAvSVM,UCW6NyJ6oFLPTnx7iGRZXDDg,Jo Goes Hunting - Careful | Animated music vid...,2020-03-10 14:30:01+00:00,"On the borderless map of a magical planet, lit...",2248,2020,1 K Views Club
4,10,-13Y2Pe7kFs,UC-1rx8j9Ggp8mp4uD0ZdEIA,"CGI VFX Breakdown: ""Logan (Wolverine): Digital...",2017-09-20 20:13:52+00:00,Check out this outstanding behind-the-scenes l...,113806,2017,100 K Club


In [5]:
from datasets import Dataset
# construction du dataset
dataset_frame=train_data[["title","success"]]

dataset=Dataset.from_pandas(dataset_frame)

#construction du dataset d'entrainement et de test
split=dataset.train_test_split(test_size=0.2,seed=42)
train_dataset=split["train"]
test_dataset=split["test"]


#construction du dataset d'entrainement et de validation
split=train_dataset.train_test_split(test_size=0.2,seed=42)
final_train_dataset=split["train"]
val_dataset=split["test"]


In [6]:
print(final_train_dataset)
print(val_dataset)
print(test_dataset)


Dataset({
    features: ['title', 'success', '__index_level_0__'],
    num_rows: 9712
})
Dataset({
    features: ['title', 'success', '__index_level_0__'],
    num_rows: 2428
})
Dataset({
    features: ['title', 'success', '__index_level_0__'],
    num_rows: 3035
})


## We will start by the constractive learning : 
This approach help to train a semantic embedding model,so that texts in the same class are “close” in embedding space, and texts from different classes are “far” apart, then you use triplet loss or similar. \
Each triplet:

- Anchor: A sample from class X.
- Positive: Another sample from (the same) class X.
- Negative: A sample from a different class Y (Y ≠ X).

In [7]:
from sentence_transformers import InputExample, losses
import random
from collections import defaultdict
#convert labels to index
label_to_indices=defaultdict(list)

for i,row in enumerate(final_train_dataset):
    label_to_indices[row["success"]].append(i)

train_examples=[]
for anchor_idx, anchor in enumerate(final_train_dataset):
    anchor_label = anchor["success"]
    anchor_desc = anchor["title"]

    #candidat positif
    positive_idx=random.choice(label_to_indices[anchor_label]) if len(label_to_indices[anchor_label])>1 else anchor_idx
    positive_desc=final_train_dataset[positive_idx]["title"]

    #candidat négatif
    negative_idx=random.choice(label_to_indices[anchor_label]) if len(label_to_indices[anchor_label])>1 else anchor_idx
    negative_desc=final_train_dataset[negative_idx]["title"]

    train_examples.append(InputExample(texts=[anchor_desc,positive_desc,negative_desc]))



print(f"Number of triplets: {len(train_examples)}")

Number of triplets: 9712


In [8]:
print(train_examples[0])


<InputExample> label: 0, texts: CGI 3D Animated Short: "Lost Love" - by Akash Manackchand | TheCGBros; CGI VFX Animated TVC : "SIDI ALI: Dancing Robot" by Digital Golem; CGI 3D Animated Shorts : "Milezim" - by ESMA


We wrap our training dataset into a Pytorch `Dataloader` to shuffle examples and get batch sizes.

In [9]:
from torch.utils.data import DataLoader

train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)

## INFERENCE BEFORE TRAINING

In [10]:
from sentence_transformers.util import cos_sim
def inference(model, list_title_dict):
    """
    list_title_dict=[
    {
    "title":"title 1",
    "success":"success 1"
    },
    {
    "title":"title 2",
    "success":"success 2"
    }
    """
    list_title=[value["title"] for value in list_title_dict]
    embedding=model.encode(list_title , convert_to_tensor=True)
    print(embedding.shape)
    similarity=cos_sim(embedding[0],embedding[1])
    print("similarity between the two titles: ",similarity)
    print(" the label of the first title is: ",list_title_dict[0]["success"])
    print(" the label of the second title is: ",list_title_dict[1]["success"])
    return similarity

list_title_dict=[
    {
        "title":train_data["title"][0],
        "success":train_data["success"][0]
    },
    {
        "title":train_data["title"][1],
        "success":train_data["success"][1]
    }
]
inference(model,list_title_dict)

torch.Size([2, 384])
similarity between the two titles:  tensor([[0.3063]], device='cuda:0')
 the label of the first title is:  10 K Views Club
 the label of the second title is:  5 K Views Club


tensor([[0.3063]], device='cuda:0')

## Loss functions for training a Sentence Transformers model


In [11]:
from sentence_transformers import losses

train_loss = losses.TripletLoss(model=model)

In [12]:
from sentence_transformers import LoggingHandler, SentenceTransformer, losses
import logging , os 
from datasets import Dataset

logging.basicConfig(
    format="%(asctime)s -   %(message)s",
    level=logging.INFO,
    handlers=[LoggingHandler()]
)
logger=logging.getLogger(__name__)

num_epochs = 50

warmup_steps = int(len(train_dataloader) * num_epochs * 0.1) #10% of train data

model.fit(train_objectives=[(train_dataloader, train_loss)],
          epochs=num_epochs,
          warmup_steps=warmup_steps,)


[34m[1mwandb[0m: Currently logged in as: [33mdehayemkenfouo[0m ([33mdehayemkenfouo-st[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,5.0003
1000,4.9955
1500,4.9898
2000,4.9741
2500,4.9602
3000,4.9196
3500,4.8714
4000,4.8077
4500,4.7834
5000,4.7543


In [13]:
list_title_dict=[
    {
        "title":train_data["title"][30],
        "success":train_data["success"][30]
    },
    {
        "title":train_data["title"][10],
        "success":train_data["success"][10]
    }
]
inference(model,list_title_dict)

Batches: 100%|██████████| 1/1 [00:00<00:00, 274.05it/s]

torch.Size([2, 384])
similarity between the two titles:  tensor([[1.0000]], device='cuda:0')
 the label of the first title is:  30 K + Zone
 the label of the second title is:  100 K Club





tensor([[1.0000]], device='cuda:0')

In [14]:
print(train_dataset["title"][6])
print(train_dataset["success"][6])

Papa Aap Sunn Rahe Hai Na? | Heart Touching Short Film Hindi | @SocialFootage
1 K Views Club


In [15]:
print(train_dataset["title"][0])
print(train_dataset["success"][0])

CGI 3D Animated Branding Vignettes: "Verizon Animations" - by AssemblyLTD
5 K Views Club


In [None]:
!huggingface-cli login
# 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

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)



    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To log in, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): Traceback (most recent call last):
  File [35m"/users/eleves-b/2023/sy

In [16]:
model.push_to_hub(
    "all-MiniLM-L6-v2_embedder_train", 
    # organization="embedding-data",
    # train_datasets=["embedding-data/QQP_triplets"],
    exist_ok=True, 
    )

2025-05-05 23:51:13,040 -   Save model to /tmp/tmpo6ph2aqm


model.safetensors: 100%|██████████| 90.9M/90.9M [00:05<00:00, 16.1MB/s]


'https://huggingface.co/Syldehayem/all-MiniLM-L6-v2_embedder_train/commit/252863ccc854c62c3fd77ff712cd59e976eba7bb'