<a href="https://colab.research.google.com/github/zayedupal/Hugging_Face_Movie_Genre_Prediction_Public/blob/main/HF_movie_genre_sentence_transformer_embedding_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the libraries used by this notebook into the runtime. The cells below
# import sentence_transformers, datasets, torch and tqdm directly; evaluate and
# accelerate are installed here but not imported in the visible cells.
!pip install sentence-transformers datasets evaluate accelerate torch

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-2.13.1-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.2/486.2 kB[0m [31m45.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m227.6/227.6 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
Collecting transformers<5.0.0,>=4.6.0 (from sentence-transformers)
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB

In [None]:
# Show the Hugging Face login widget. The dataset is later loaded with
# use_auth_token=True, so a stored Hub token is expected to be available.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import time
from pathlib import Path

import numpy as np
import pandas as pd
import torch.nn.functional as F
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from torch import Tensor
from tqdm import tqdm


def preprocess_dataset(data):
    """Add the model input text and the target label to every row of ``data``.

    Each row receives a ``text`` field of the form ``"<movie_name>: <synopsis>"``
    and a ``label`` field copied from ``genre``. The per-row function is applied
    through ``data.map``; the mapped result is printed and returned.
    """
    def _add_text_and_label(row):
        title, plot = row['movie_name'], row['synopsis']
        row['text'] = title + ': ' + plot
        row['label'] = row['genre']
        return row

    mapped = data.map(_add_text_and_label)
    print(f"preprocessed data: {mapped}")
    return mapped


def generate_embeddings(input_texts, model):
    """Encode ``input_texts`` with ``model`` and return whatever ``model.encode`` yields."""
    return model.encode(input_texts)


def generate_embeddings_batch(input_texts, model, batch_size=50):
    """Encode ``input_texts`` in consecutive batches and stack the results.

    Args:
        input_texts: Sequence of texts; must support ``len()`` and slicing.
        model: Object passed through to ``generate_embeddings`` for each batch.
        batch_size: Maximum number of texts per batch. Must be positive.

    Returns:
        The per-batch results joined along axis 0, in input order. For an empty
        ``input_texts`` an empty array of shape ``(0, 0)`` is returned, because
        the embedding width is not known without encoding anything.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size}')

    # Stepping by batch_size covers the trailing partial batch as well, so no
    # separate remainder branch is needed.
    batch_starts = range(0, len(input_texts), batch_size)

    # Collect the batches and join them once at the end: appending to a growing
    # array inside the loop re-copies everything accumulated so far.
    batches = [
        generate_embeddings(input_texts[start:start + batch_size], model)
        for start in tqdm(batch_starts)
    ]

    # np.concatenate rejects an empty list, so handle "no input" explicitly.
    embeddings = np.concatenate(batches, axis=0) if batches else np.empty((0, 0))

    print(f'len(embeddings): {len(embeddings)}')
    print(f'len(input_texts): {len(input_texts)}')

    return embeddings

def create_embedding_df(data, embeddings):
    """Build a DataFrame pairing each embedding with its row id and label.

    The shape of ``embeddings`` is printed first. The returned frame has the
    columns ``embeddings`` (each row of ``embeddings`` converted via
    ``tolist()``), ``id`` (from ``data['id']``) and ``label`` (from
    ``data['label']``), in that order.
    """
    print(f'embeddings: {embeddings.shape}')
    vectors_as_lists = embeddings.tolist()
    return pd.DataFrame({'embeddings': vectors_as_lists}).assign(
        id=data['id'],
        label=data['label'],
    )

def generate_embedding_files_from_HF_dataset(model,
                                             dataset, train_emb_output, test_emb_output,
                                             train_split_percent=100, test_split_percent=100):
    """Embed the train and test splits of ``dataset`` and write each to a CSV file.

    Both splits are loaded with ``load_dataset`` (``use_auth_token=True``),
    restricted to the leading ``train_split_percent`` / ``test_split_percent``
    percent of their rows. Their sizes are printed, both are run through
    ``preprocess_dataset``, and then, train first and test second, the ``text``
    column is encoded with ``generate_embeddings_batch`` and the frame from
    ``create_embedding_df`` is saved with ``to_csv(..., index=False)``.

    Args:
        model: Encoder forwarded to ``generate_embeddings_batch``.
        dataset: Dataset identifier forwarded to ``load_dataset``.
        train_emb_output: Destination of the train embeddings CSV.
        test_emb_output: Destination of the test embeddings CSV.
        train_split_percent: Percentage used in the ``train[:N%]`` split spec.
        test_split_percent: Percentage used in the ``test[:N%]`` split spec.
    """
    train_split = f'train[:{train_split_percent}%]'
    test_split = f'test[:{test_split_percent}%]'
    train_data = load_dataset(dataset, use_auth_token=True, split=train_split)
    test_data = load_dataset(dataset, use_auth_token=True, split=test_split)

    print(f'train_data: {len(train_data)}')
    print(f'test_data: {len(test_data)}')

    train_data = preprocess_dataset(train_data)
    test_data = preprocess_dataset(test_data)

    # Handle one split at a time so its embeddings are released before the
    # next split is encoded.
    for split_data, output_path in ((train_data, train_emb_output),
                                    (test_data, test_emb_output)):
        split_embeddings = generate_embeddings_batch(split_data['text'], model)
        create_embedding_df(split_data, split_embeddings).to_csv(output_path, index=False)
        del split_embeddings

In [None]:
# MAIN
# Loads the sentence-transformer model, embeds both dataset splits and writes
# one CSV per split, then reports the elapsed wall-clock time in seconds.
start_time = time.time()

model_name = "sentence-transformers/gtr-t5-xxl"
# "/" in the model id would otherwise be read as a path separator in the file name.
formatted_model_name = model_name.replace("/","_")

# Fail fast when the output directory is missing (no cell in this notebook
# mounts Google Drive): otherwise the error only surfaces when the first CSV
# is written, after the model download and the whole train-split encoding.
output_dir = Path("/content/drive/MyDrive/Colab Notebooks")
if not output_dir.is_dir():
    raise FileNotFoundError(
        f"Output directory {str(output_dir)!r} does not exist; "
        "mount Google Drive (or change output_dir) before running this cell."
    )

model = SentenceTransformer(model_name)

generate_embedding_files_from_HF_dataset(
    model=model,
    dataset="datadrivenscience/movie-genre-prediction",
    train_emb_output=str(output_dir / f"train_embeddings_{formatted_model_name}.csv"),
    test_emb_output=str(output_dir / f"test_embeddings_{formatted_model_name}.csv"),
    train_split_percent=100,
    test_split_percent=100
)

print(f"Runtime: {time.time() - start_time}")

Downloading (…)9313b/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)/2_Dense/config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/3.15M [00:00<?, ?B/s]

Downloading (…)edce49313b/README.md:   0%|          | 0.00/1.89k [00:00<?, ?B/s]

Downloading (…)ce49313b/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/9.73G [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)9313b/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading (…)e49313b/modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/datadrivenscience___parquet/datadrivenscience--movie-genre-prediction-01acd85570f2b187/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/7.16M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.74M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/54000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/36000 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/datadrivenscience___parquet/datadrivenscience--movie-genre-prediction-01acd85570f2b187/0.0.0/14a00e99c0d15a23649d0db8944380ac81082d4b021f398733dd84f3a6c569a7. Subsequent calls will reuse this data.




train_data: 54000
test_data: 36000


Map:   0%|          | 0/54000 [00:00<?, ? examples/s]

preprocessed data: Dataset({
    features: ['id', 'movie_name', 'synopsis', 'genre', 'text', 'label'],
    num_rows: 54000
})


Map:   0%|          | 0/36000 [00:00<?, ? examples/s]

preprocessed data: Dataset({
    features: ['id', 'movie_name', 'synopsis', 'genre', 'text', 'label'],
    num_rows: 36000
})


100%|██████████| 1080/1080 [36:42<00:00,  2.04s/it]


len(embeddings): 54000
len(input_texts): 54000
embeddings: (54000, 768)


100%|██████████| 720/720 [24:10<00:00,  2.02s/it]


len(embeddings): 36000
len(input_texts): 36000
embeddings: (36000, 768)
Runtime: 4428.202921390533
