# Imports

In [None]:
# import torch
# if torch.backends.mps.is_available():
#     mps_device = torch.device("mps")
#     x = torch.ones(1, device=mps_device)
#     print (x)
# else:
#     print ("MPS device not found.")

In [None]:
import os
# Configure the MPS allocator through its environment variable. This cell runs
# before the cell that imports `torch`.
# NOTE(review): "0.0" presumably disables the upper memory limit -- confirm
# against the PyTorch MPS environment-variable documentation.
os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"

# Read the value back so the cell output records the setting in effect. The
# variable was assigned on the line above, so no "not set" branch is needed.
pytorch_mps_high_watermark_ratio = os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"]
print(f"PYTORCH_MPS_HIGH_WATERMARK_RATIO = {pytorch_mps_high_watermark_ratio}")

In [None]:

import pandas as pd
import numpy as np
import ast
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from transformers import AutoModelForSequenceClassification, AutoTokenizer, DataCollatorWithPadding, PreTrainedTokenizerFast, Trainer, TrainingArguments
from datasets import Dataset
import torch
from torch import nn
import evaluate

## Download data from Kaggle

In [None]:
# download dataset via Kaggle API
# https://towardsdatascience.com/how-to-search-and-download-data-using-kaggle-api-f815f7b98080
# os.system(
#     "kaggle datasets download -d rounakbanik/the-movies-dataset -p ./data/ --unzip"
# )

In [None]:
# Load only the two columns this notebook uses.
df = pd.read_csv("./data/movies_metadata.csv", usecols=["overview", "genres"])
# Drop movies without a plot summary: tokenize_function stringifies every text,
# so a missing overview would otherwise be fed to the model as a placeholder string.
df = df.dropna(subset=["overview"])

In [None]:
# Parse the stringified genre lists into Python objects. Values that are no
# longer strings are left untouched, so re-running this cell does not raise.
df["genres"] = df["genres"].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [None]:
# Keep only the movies that have at least one genre entry.
has_genre = df["genres"].map(len) > 0
df = df.loc[has_genre]

In [None]:
# One row per (movie, genre) pair; the pre-explode row index identifies the movie.
df_exp = df.explode("genres")
# Expand the genre dicts into columns in a single construction. Building one
# pd.Series per row via .apply(pd.Series) is very slow on large frames.
genre_cols = pd.DataFrame(df_exp["genres"].tolist(), index=df_exp.index)
df_exp = (
    pd.concat([df_exp[["overview"]], genre_cols], axis=1)
    .reset_index(names=["movie_id"])   # original movie row -> movie_id
    .reset_index(names=["input_id"])   # running row number -> input_id
)
# df_exp = df_exp.groupby(["movie_id"]).first().reset_index()

In [None]:
# Count rows per genre and keep the genres that occur more than once.
genre_sizes = df_exp.groupby(["id", "name"]).size().to_frame("cnt")
df_genre_cnts = genre_sizes.query("cnt > 1").reset_index().sort_values(by="id")
# Consecutive integer class labels, assigned in ascending genre-id order.
df_genre_cnts["label"] = range(len(df_genre_cnts))
# The inner join attaches the label and drops rows whose genre was filtered out.
df_exp = df_exp.merge(df_genre_cnts[["id", "label"]], on="id", how="inner")

In [None]:
# Split on unique movies rather than on exploded (movie, genre) rows. A row-level
# split puts the ids of most multi-genre movies on both sides, so the isin()
# filters that build train_data/test_data would select the same overviews for
# training and evaluation. Per-row genre stratification cannot be applied to a
# movie-level split, so it is dropped here.
unique_movie_ids = df_exp["movie_id"].drop_duplicates()
mov_train, mov_test = train_test_split(unique_movie_ids, test_size=0.2, random_state=42)

In [None]:
# Build the model inputs: the integer class label plus the overview text,
# with the overview column renamed to "text".
dct_rename = {"overview": "text"}
model_columns = ["label", "overview"]
in_train = df_exp["movie_id"].isin(mov_train)
in_test = df_exp["movie_id"].isin(mov_test)
train_data = df_exp.loc[in_train, model_columns].rename(columns=dct_rename)
test_data = df_exp.loc[in_test, model_columns].rename(columns=dct_rename)

In [None]:
# Smaller subsets for quick experiments. The sample size is capped at the number
# of available rows (DataFrame.sample raises when n exceeds it without
# replacement), and the seed is fixed so a kernel restart gives the same subset.
train_data_sample = train_data.sample(n=min(20000, len(train_data)), random_state=42)
test_data_sample = test_data.sample(n=min(5000, len(test_data)), random_state=42)

In [None]:
# Name of the pretrained checkpoint handed to the tokenizer and model loaders.
model_label = "distilbert-base-uncased"

In [None]:
# `use_fast` is the AutoTokenizer.from_pretrained option that requests the fast
# tokenizer implementation; `fast_tokenizer` is not one of its parameters.
tokenizer = AutoTokenizer.from_pretrained(model_label, use_fast=True)
# fast_tokenizer = PreTrainedTokenizerFast(tokenizer)

In [None]:
# def tokenize_function(examples):
#     result = tokenizer(
#         [str(txt) for txt in examples["text"]], 
#         padding="max_length", 
#         truncation=True, 
#         max_length=512, 
#         return_overflowing_tokens=True
#     )
    
#     sample_map = result.pop("overflow_to_sample_mapping")
#     for key, values in examples.items():
#         result[key] = np.array([values[i] for i in sample_map])
#     return result
# Creating a function for tokenization
def tokenize_function(examples):
    """Tokenize a batch of examples, padding/truncating each text to 512 tokens.

    Every entry of ``examples["text"]`` is passed through ``str`` first, so
    non-string values are tokenized as their string form. Uses the
    module-level ``tokenizer``.
    """
    texts = list(map(str, examples["text"]))
    return tokenizer(texts, padding="max_length", truncation=True, max_length=512)

In [None]:
# Padding collator built from the tokenizer. The trainer cell leaves its
# `data_collator=` argument commented out, so this object is not passed to it.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Sampled variant, kept for quick runs:
# train_dataset = Dataset.from_pandas(train_data_sample)
# test_dataset = Dataset.from_pandas(test_data_sample)
# Full train/test frames converted to `datasets.Dataset` objects.
train_dataset = Dataset.from_pandas(train_data)
test_dataset = Dataset.from_pandas(test_data)

In [None]:
# Apply tokenize_function to both datasets; batched=True hands it batches of
# examples rather than one example at a time.
train_dataset_tokenized = train_dataset.map(tokenize_function, batched=True)
test_dataset_tokenized = test_dataset.map(tokenize_function, batched=True)

In [None]:
# Distribution of real (non-padding) token counts per overview. tokenize_function
# pads every example to max_length, so len(input_ids) is identical for all rows
# and its histogram is a single bar. The attention mask is summed instead
# (assumes 1 marks a real token and 0 marks padding -- TODO confirm).
train_dataset_tokenized.to_pandas()["attention_mask"].apply(lambda mask: int(np.sum(mask))).hist()

In [None]:
# Use the MPS backend when torch reports it as available, otherwise the CPU.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
# Classification head sized to one output per row of df_genre_cnts.
model = AutoModelForSequenceClassification.from_pretrained(
    model_label,
    num_labels=df_genre_cnts.shape[0],
)
model = model.to(device)

In [None]:
# NOTE(review): newer transformers releases rename or deprecate
# `evaluation_strategy` and `use_mps_device` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir="test_trainer",
    # Evaluate and checkpoint on the same schedule (once per epoch).
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    # weight_decay=0.01,
    load_best_model_at_end=True,
    # Only request MPS when it is actually available, matching the CPU fallback
    # used when the model is moved to its device.
    use_mps_device=torch.backends.mps.is_available(),
)

In [None]:
# Accuracy metric object; compute_metrics calls its .compute() method.
metric = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Score arg-max class predictions with the module-level ``metric``.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class for
    each row is the index of its largest logit.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=references)

In [None]:
# Inverse-frequency class weights: each label's weight is the mean label share
# divided by that label's own share, so rarer labels get larger weights.
pct = train_data["label"].value_counts(normalize=True).sort_index()
mean_share = pct.mean()
class_weights = (mean_share / pct).to_numpy(dtype=np.float32)

In [None]:
# # Compute metrics
# # Source: https://medium.com/cometheartbeat/building-a-text-classifier-app-with-hugging-face-bert-and-comet-278e4cd0d0aa (Step 6)
# # Indexing to example function
# # def get_example(index):
# #   return test_dataset_tokenized[index]["text"]

def weighted_cross_entropy_loss(labels, logits, class_weights):
    """Cross-entropy loss with a per-class weight applied to each target class.

    Args:
        labels: Integer class indices; flattened to 1-D before the loss is taken.
        logits: Raw scores whose last dimension indexes the classes.
        class_weights: One weight per class (array-like or tensor).

    Returns:
        Scalar tensor holding the weighted-mean loss.
    """
    # Put the weights on the logits' own device and dtype instead of reading the
    # notebook-global `model`, so the function does not depend on kernel state.
    weights = torch.as_tensor(class_weights, dtype=logits.dtype, device=logits.device)
    # Take the class count from the logits; a weight vector of the wrong length
    # then raises inside CrossEntropyLoss instead of silently reshaping the logits.
    num_classes = logits.shape[-1]
    loss_fct = nn.CrossEntropyLoss(weight=weights)
    return loss_fct(logits.reshape(-1, num_classes), labels.reshape(-1))

# # Creating a function to compute metrics
# def compute_metrics(pred):
#     # experiment = comet_ml.get_global_experiment()

#     labels = pred.label_ids
#     logits = pred.predictions
#     preds = logits.argmax(-1)
#     precision, recall, f1, _ = precision_recall_fscore_support(
#         labels, preds, average="macro"
#     )
#     acc = accuracy_score(labels, preds)
#     # cross_entropy = weighted_cross_entropy_loss(labels=labels, logits=logits, class_weights=class_weights)

#     # if experiment:
#     #     epoch = int(experiment.curr_epoch) if experiment.curr_epoch is not None else 0
#     #     experiment.set_epoch(epoch)
#     #     experiment.log_confusion_matrix(
#     #         y_true=labels,
#     #         y_predicted=preds,
#     #         file_name=f"confusion-matrix-epoch-{epoch}.json",
#     #         labels=["negative", "positive"],
#     #         index_to_example_function=get_example,
#     #     )

#     return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}

In [None]:
# CustomTrainer for imbalanced dataset
# Source: https://huggingface.co/docs/transformers/main_classes/trainer
class CustomTrainer(Trainer):
    """Trainer that replaces the default loss with class-weighted cross-entropy."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: The model being trained; called with the remaining inputs.
            inputs: Batch dict; its "labels" entry is removed before the forward pass.
            return_outputs: When true, return ``(loss, outputs)`` instead of the loss.
            num_items_in_batch: Accepted (and ignored) so the override stays
                compatible with Trainer versions that pass this keyword.
        """
        # Pop the labels so the model does not compute its own unweighted loss.
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Uses the notebook-level class_weights array.
        loss = weighted_cross_entropy_loss(labels=labels, logits=logits, class_weights=class_weights)
        return (loss, outputs) if return_outputs else loss

In [None]:
# Memory problems have been seen at this point; see:
# https://github.com/AUTOMATIC1111/stable-diffusion-webui/issues/9133
# Possible workaround: restart the notebook every time after it crashes?
# model = model.to("mps")
# Trainer with the class-weighted loss; evaluates on the tokenized test set.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset_tokenized,
    eval_dataset=test_dataset_tokenized,
    compute_metrics=compute_metrics,
    # data_collator=data_collator
)

In [None]:
# Run fine-tuning with the settings from training_args.
trainer.train()

In [None]:
# Save the trained model under "model".
trainer.save_model("model")

In [None]:
# test_dataset_tokenized = test_dataset.map(tokenize_function, batched=True)
# Predict on the tokenized test set; `.predictions` is later passed to softmax().
test_predictions = trainer.predict(test_dataset_tokenized)

In [None]:
# my (correct) solution:
def softmax(z):
    """Row-wise softmax of a 2-D array.

    Each row's maximum is subtracted before exponentiating; this leaves the
    result unchanged while keeping ``np.exp`` from overflowing on large scores.
    """
    assert len(z.shape) == 2
    # keepdims=True keeps the reduced axis so the row statistics broadcast back.
    shifted = z - np.max(z, axis=1, keepdims=True)
    exp_scores = np.exp(shifted)
    return exp_scores / np.sum(exp_scores, axis=1, keepdims=True)

# Convert the predicted logits into per-class probabilities (one row per example).
preds = softmax(test_predictions.predictions)

In [None]:
import matplotlib.pyplot as plt
# Histogram of the predicted probability for the class at label index 3.
plt.hist(preds[:,3])

In [None]:
# How often each class index is the arg-max prediction.
np.unique(preds.argmax(1), return_counts=True)

In [None]:
# Display the probability matrix.
preds

In [None]:
# Confusion table (rows: predicted class, columns: true class). The true labels
# come from the prediction output itself so they line up row-for-row with
# `preds`. `test_data_sample` is a random subset of `test_data`, whereas the
# predictions were made on the full tokenized test set.
pd.crosstab(
    index=pd.Series(preds.argmax(axis=1), name="predicted"),
    columns=pd.Series(test_predictions.label_ids, name="label"),
)

In [None]:
# Same confusion table, normalised per predicted class (rows) and with margins.
# The true labels are taken from the prediction output so they align with
# `preds`. `test_data_sample` is a random subset of `test_data`, not the set
# that was predicted on.
pd.crosstab(
    index=pd.Series(preds.argmax(axis=1), name="predicted"),
    columns=pd.Series(test_predictions.label_ids, name="label"),
    normalize=0,
    margins=True,
)

In [None]:
# test_movie_tokenized = tokenizer(
#     test_movie, 
#     padding="max_length", 
#     truncation=True,   
#     max_length=512, 
#     return_overflowing_tokens=True
# )
def get_prediction(test_movie):
    """Rank every genre by predicted probability for a single overview text.

    The text is wrapped in a one-row dataset (with a placeholder label of 0),
    tokenized with ``tokenize_function`` and scored via ``trainer.predict``.

    Returns:
        A frame with ``label``, ``name`` and ``probs`` columns, sorted by
        descending probability.
    """
    single_row = pd.DataFrame({"text": [str(test_movie)], "label": [0]})
    tokenized = Dataset.from_pandas(single_row).map(tokenize_function, batched=True)
    genre_probs = softmax(trainer.predict(tokenized).predictions)[0]
    ranking = df_genre_cnts[["label", "name"]].assign(probs=genre_probs)
    return ranking.sort_values(by="probs", ascending=False)

In [None]:
# Example overviews; exactly one assignment should be uncommented at a time.
# test_movie = "A teenage boy with a sex therapist mother teams up with a high school classmate to set up an underground sex therapy clinic at school."
test_movie = "An orphaned boy enrolls in a school of wizardry, where he learns the truth about himself, his family and the terrible evil that haunts the magical world."
# test_movie = "After uncovering a mysterious artifact buried beneath the Lunar surface, a spacecraft is sent to Jupiter to find its origins: a spacecraft manned by two men and the supercomputer HAL 9000."
# Show the genre ranking for the selected overview.
get_prediction(test_movie)