<a href="https://colab.research.google.com/github/zayedupal/Hugging_Face_Movie_Genre_Prediction_Public/blob/main/HF_movie_genre_flan_t5_embedding_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install third-party dependencies into the runtime (no version specifiers, so pip resolves whatever is current).
!pip install transformers datasets evaluate accelerate scikit-learn torch sentencepiece

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m59.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.0-py3-none-any.whl (492 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m492.2/492.2 kB[0m [31m43.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting accelerate
  Downloading accelerate-0.21.0-py3-none-any.whl (244 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.2/244.2 kB[0m [31m25.5 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

In [None]:
# Mount Google Drive at /content/drive; the embedding CSVs are written under this mount point later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Log in to the Hugging Face Hub interactively; the dataset is later loaded with use_auth_token=True.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
import time

import numpy as np
import pandas as pd
import torch.nn.functional as F
from datasets import load_dataset
import torch
from torch import Tensor
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, T5EncoderModel


def preprocess_dataset(data):
    """Add the ``text`` and ``label`` columns used for embedding generation.

    ``text`` is ``"<movie_name>: <synopsis>"`` and ``label`` is a copy of
    ``genre``. The transformation is applied through ``data.map`` and the
    mapped dataset is printed before being returned.

    Args:
        data: Object exposing ``map(fn)`` whose rows provide the
            ``movie_name``, ``synopsis`` and ``genre`` keys.

    Returns:
        Whatever ``data.map`` returns for the row-level transformation.
    """
    def _add_text_and_label(example):
        title, plot = example['movie_name'], example['synopsis']
        example['text'] = title + ': ' + plot
        example['label'] = example['genre']
        return example

    processed = data.map(_add_text_and_label)
    print(f"preprocessed data: {processed}")
    return processed


def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Mean-pool hidden states over dim 1, ignoring masked-out positions.

    Positions where ``attention_mask`` is zero are filled with 0.0 before
    summing, and the sum is divided by the number of non-zero mask entries
    per row.

    Args:
        last_hidden_states: Hidden states; pooled along ``dim=1``.
        attention_mask: Mask with one entry per position of dim 1; it is
            broadcast over the trailing dimension of ``last_hidden_states``.

    Returns:
        Tensor with dim 1 reduced away.
    """
    padding_positions = ~attention_mask.unsqueeze(-1).bool()
    masked_states = last_hidden_states.masked_fill(padding_positions, 0.0)
    token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return masked_states.sum(dim=1) / token_counts


# def generate_embeddings(input_texts, tokenizer, model, device):
#     # Tokenize the input texts
#     # batch_dict = tokenizer(input_texts, max_length=512, padding=True,
#     #                        truncation=True, return_tensors='pt')
#     embeddings = []
#     torch.cuda.empty_cache()
#     for text in input_texts:
#       tokens = tokenizer(text, max_length=512, return_tensors='pt', truncation=True)['input_ids'].to(device)

#       embedding = model(input_ids=tokens)
#       # embedding = embedding
#       embeddings.append(embedding)
#       del embedding

#       # (Optionally) normalize embeddings
#       # embeddings = F.normalize(embeddings, p=2, dim=1)

#     return embeddings
#     # return embeddings

def generate_embeddings(input_texts, tokenizer, model, device):
    """Embed a batch of texts as L2-normalised, mean-pooled encoder states.

    The forward pass runs under ``torch.no_grad()``: this function is used for
    inference only, so no autograd graph needs to be built or kept alive for
    each batch.

    Args:
        input_texts: Text(s) handed straight to ``tokenizer``.
        tokenizer: Callable tokenizer; invoked with ``max_length=512``,
            padding and truncation enabled, returning PyTorch tensors.
        model: Callable model whose output exposes ``last_hidden_state``.
        device: Device the tokenised batch is moved to before the forward pass.

    Returns:
        NumPy array of the pooled, normalised embeddings (one row per
        tokenised input).
    """
    # Tokenize the input texts
    batch_dict = tokenizer(input_texts, max_length=512, padding=True,
                           truncation=True, return_tensors='pt').to(device)

    # Inference only: skip gradient tracking so activations are not retained.
    with torch.no_grad():
        outputs = model(**batch_dict)
        embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

        # Normalise each embedding to unit L2 norm.
        embeddings = F.normalize(embeddings, p=2, dim=1)

    return embeddings.detach().cpu().numpy()

def generate_embeddings_batch(input_texts, model, tokenizer, device, batch_size=1):
    """Embed ``input_texts`` in consecutive slices of ``batch_size`` items.

    Per-batch results are collected in a list and concatenated once at the
    end, so the accumulated matrix is not recopied on every iteration.

    Note the argument order: this function takes ``model`` before
    ``tokenizer``, while ``generate_embeddings`` takes them the other way
    round.

    Args:
        input_texts: Sliceable sequence of texts.
        model: Passed through to ``generate_embeddings``.
        tokenizer: Passed through to ``generate_embeddings``.
        device: Passed through to ``generate_embeddings``.
        batch_size: Number of texts per call; the last batch may be smaller.

    Returns:
        NumPy array with the per-batch results stacked along axis 0. For
        empty input an empty ``(0, 0)`` float32 array is returned.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    total = len(input_texts)
    chunks = []
    # A strided range covers the trailing partial batch too; no special case needed.
    for start in tqdm(range(0, total, batch_size)):
        batch = input_texts[start:start + batch_size]
        chunks.append(generate_embeddings(batch, tokenizer, model, device=device))

    if chunks:
        embeddings = np.concatenate(chunks, axis=0)
    else:
        # Nothing to embed: hidden size is unknown, so return a 0x0 array.
        embeddings = np.empty((0, 0), dtype=np.float32)

    print(f'len(embeddings): {len(embeddings)}')
    print(f'len(input_texts): {len(input_texts)}')

    return embeddings

def create_embedding_df(data, embeddings):
    """Build a DataFrame with ``embeddings``, ``id`` and ``label`` columns.

    Args:
        data: Mapping-like object indexable by ``'id'`` and ``'label'``.
        embeddings: Array exposing ``shape`` and ``tolist()``; each row
            becomes one Python list stored in the ``embeddings`` column.

    Returns:
        DataFrame whose columns are, in order, ``embeddings``, ``id``, ``label``.
    """
    print(f'embeddings: {embeddings.shape}')

    # Each cell of the 'embeddings' column holds a whole vector as a list.
    columns = {'embeddings': embeddings.tolist()}
    embeddings_df = pd.DataFrame(columns)
    for column in ('id', 'label'):
        embeddings_df[column] = data[column]

    return embeddings_df

def generate_embedding_files_from_HF_dataset(model, tokenizer,
                                             dataset, train_emb_output, test_emb_output, device,
                                             train_split_percent=100, test_split_percent=100,
                                             batch_size=1):
    """Embed the train and test splits of a Hub dataset and save each as CSV.

    Both splits are loaded and preprocessed up front, then each is embedded
    with ``generate_embeddings_batch`` and written via
    ``create_embedding_df(...).to_csv(..., index=False)``.

    Args:
        model: Passed through to ``generate_embeddings_batch``.
        tokenizer: Passed through to ``generate_embeddings_batch``.
        dataset: Dataset identifier passed to ``load_dataset``.
        train_emb_output: Destination path for the train-split CSV.
        test_emb_output: Destination path for the test-split CSV.
        device: Passed through to ``generate_embeddings_batch``.
        train_split_percent: Leading percentage of the train split to load.
        test_split_percent: Leading percentage of the test split to load.
        batch_size: Number of texts embedded per forward pass. Defaults to 1,
            which matches the previous hard-wired behaviour.

    Returns:
        None. The results are the two CSV files.
    """
    # NOTE(review): newer `datasets` releases appear to favour `token=` over
    # `use_auth_token=` — confirm against the installed version before changing.
    train_data = load_dataset(dataset, use_auth_token=True, split=f'train[:{train_split_percent}%]')
    test_data = load_dataset(dataset, use_auth_token=True, split=f'test[:{test_split_percent}%]')

    print(f'train_data: {len(train_data)}')
    print(f'test_data: {len(test_data)}')

    train_data = preprocess_dataset(train_data)
    test_data = preprocess_dataset(test_data)

    # Same embed-then-save pipeline for both splits, train first.
    for split_data, output_path in ((train_data, train_emb_output),
                                    (test_data, test_emb_output)):
        split_embeddings = generate_embeddings_batch(split_data['text'], model, tokenizer,
                                                     device=device, batch_size=batch_size)
        create_embedding_df(split_data, split_embeddings).to_csv(output_path, index=False)
        # Drop the large matrix before embedding the next split.
        del split_embeddings

In [None]:
# Ask PyTorch to release unused cached CUDA memory before the model is loaded.
torch.cuda.empty_cache()
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# device.reset()

In [None]:
# MAIN
start_time = time.time()

model_name = "google/flan-t5-xxl"
# The hub id contains a slash; flatten it so it can be embedded in a file name.
formatted_model_name = model_name.replace("/","_")

# T5EncoderModel is used because the embeddings come from `last_hidden_state`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5EncoderModel.from_pretrained(model_name).to(device)

# Output CSVs live on the mounted Google Drive, one file per split.
train_csv = f"/content/drive/MyDrive/Colab Notebooks/data/train_embeddings_{formatted_model_name}.csv"
test_csv = f"/content/drive/MyDrive/Colab Notebooks/data/test_embeddings_{formatted_model_name}.csv"

generate_embedding_files_from_HF_dataset(
    model=model,
    tokenizer=tokenizer,
    dataset="datadrivenscience/movie-genre-prediction",
    train_emb_output=train_csv,
    test_emb_output=test_csv,
    device=device,
    train_split_percent=100,
    test_split_percent=100
)

# Wall-clock seconds, including model download/load time.
elapsed_seconds = time.time() - start_time
print(f"Runtime: {elapsed_seconds}")

Downloading (…)lve/main/config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/9.60G [00:00<?, ?B/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/6.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



Downloading readme:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/7.16M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/4.74M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/54000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/36000 [00:00<?, ? examples/s]

train_data: 54000
test_data: 36000


Map:   0%|          | 0/54000 [00:00<?, ? examples/s]

preprocessed data: Dataset({
    features: ['id', 'movie_name', 'synopsis', 'genre', 'text', 'label'],
    num_rows: 54000
})


Map:   0%|          | 0/36000 [00:00<?, ? examples/s]

preprocessed data: Dataset({
    features: ['id', 'movie_name', 'synopsis', 'genre', 'text', 'label'],
    num_rows: 36000
})


100%|██████████| 54000/54000 [3:03:18<00:00,  4.91it/s]


len(embeddings): 54000
len(input_texts): 54000
embeddings: (54000, 4096)


100%|██████████| 36000/36000 [1:11:11<00:00,  8.43it/s]


len(embeddings): 36000
len(input_texts): 36000
embeddings: (36000, 4096)
Runtime: 15990.029606103897
