**Collegamento Drive**

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Prima versione del dataset** *(non utilizzare)*

In [None]:
```python
# Legacy dataset builder: joins the movies table with the tags table and keeps
# only the rows whose tag is one of four target classes.
import pandas as pd
from google.colab import files

tags_csv = '/content/tags.csv'
movies_csv = '/content/movies.csv'
output_csv = 'filtered_mt.csv'

# Read both tables and join them on the shared "movieId" key.
tags = pd.read_csv(tags_csv)
movies = pd.read_csv(movies_csv)
movies_tags = movies.merge(tags, on="movieId")

# Exploratory only: the 15 most frequent tags in the joined table.
top_classes_full = movies_tags["tag"].value_counts().head(15)

# Tags kept as target classes for the classification task.
desired_tags = ["action", "funny", "romance", "comedy"]

# Keep rows carrying a wanted tag, drop the identifier/timestamp columns,
# remove duplicate rows and renumber the index from zero.
filtered_mt = (
    movies_tags.loc[movies_tags['tag'].isin(desired_tags)]
    .drop(columns=["movieId", "userId", "timestamp"])
    .drop_duplicates()
    .reset_index(drop=True)
)

# Persist the result and hand it to the browser as a download.
filtered_mt.to_csv(output_csv, index=False)
files.download(output_csv)
print("------------------------------------------------------------------")

# Show the resulting table.
print("Filtered and merged dataset:\n", filtered_mt)
```

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

------------------------------------------------------------------
Filtered and merged dataset:
                              title  \
0                 Toy Story (1995)   
1                 Toy Story (1995)   
2                 Toy Story (1995)   
3          Grumpier Old Men (1995)   
4                      Heat (1995)   
...                            ...   
4541      The Addams Family (2019)   
4542           Private Road (1971)   
4543  Terminator: Dark Fate (2019)   
4544            White Snake (2019)   
4545            White Snake (2019)   

                                           genres      tag  
0     Adventure|Animation|Children|Comedy|Fantasy   comedy  
1     Adventure|Animation|Children|Comedy|Fantasy    funny  
2     Adventure|Animation|Children|Comedy|Fantasy   action  
3                                  Comedy|Romance    funny  
4                           Action|Crime|Thriller   action  
...                                           ...      ...  
4541     Animation|

In [None]:
import pandas as pd
from google.colab import files  # imported here so the cell also runs on a fresh kernel

# To upload the file manually instead, call files.upload() before reading it.

# Load the CSV file containing the filtered dataset
df = pd.read_csv('/content/filtered_mt.csv')

# Remove the "(YYYY)" production year from the title, together with the
# whitespace in front of it, so no stray space is left behind.
df['title'] = df['title'].str.replace(r'\s*\(\d{4}\)', '', regex=True)

# Combine the 'title' and 'genres' columns into a new 'texts' column
df['texts'] = df['title'] + ' ' + df['genres']

# Rename the 'tag' column to 'labels'
df = df.rename(columns={'tag': 'labels'})

# Define the desired tags and map each one to its position in the list
# (stored as a string; it is written to CSV as a plain digit).
desired_tags = ["action", "funny", "romance", "comedy"]
tag_mapping = {tag: str(i) for i, tag in enumerate(desired_tags)}

# Map the tags in the 'labels' column using the created mapping
df['labels'] = df['labels'].map(tag_mapping)

# Keep only the model inputs, selected by name rather than by column position
# so the selection does not depend on the column order of the input file.
df = df[['labels', 'texts']]

# Save the resulting DataFrame to a new CSV file
df.to_csv('final_dataset.csv', index=False)

# Download the created CSV file
files.download('final_dataset.csv')

# Print the resulting DataFrame
print(df)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

     labels                                              texts
0         3  Toy Story  Adventure|Animation|Children|Comedy...
1         1  Toy Story  Adventure|Animation|Children|Comedy...
2         0  Toy Story  Adventure|Animation|Children|Comedy...
3         1                   Grumpier Old Men  Comedy|Romance
4         0                        Heat  Action|Crime|Thriller
...     ...                                                ...
4541      1  The Addams Family  Animation|Children|Comedy|F...
4542      2                                Private Road  Drama
4543      0               Terminator: Dark Fate  Action|Sci-Fi
4544      0             White Snake  Animation|Fantasy|Romance
4545      2             White Snake  Animation|Fantasy|Romance

[4546 rows x 2 columns]


In [None]:
# Import libraries, in particular from transformer libraries we import BertTokenizer

import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from transformers import BertTokenizer, get_linear_schedule_with_warmup
from torch.optim import AdamW

# Load the preprocessed dataset from disk.
df = pd.read_csv("/content/final_dataset.csv")
# Pull the model inputs and their targets out as plain Python lists.
texts, labels = df['texts'].to_list(), df['labels'].to_list()

**BERT classic (v1)**, dropout=0.1

In [None]:
from torch import nn
from transformers import BertModel


class BERTClassifier(nn.Module):
    """BERT encoder followed by dropout (p=0.1) and one linear classification layer.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        num_classes: Number of output scores produced per input sequence.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        # Pre-trained encoder; its configured hidden size fixes the input width of the head.
        self.bert = BertModel.from_pretrained(bert_model_name)
        self.dropout = nn.Dropout(0.1)
        self.fc = nn.Linear(self.bert.config.hidden_size, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the encoder's pooled output.

        Args:
            input_ids: Token ids forwarded to the encoder.
            attention_mask: Attention mask forwarded to the encoder.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return self.fc(self.dropout(encoded.pooler_output))


**BERT con più FCL (v2)**

In [None]:
from torch import nn
from transformers import BertModel

class BERTClassifier(nn.Module):
    """BERT encoder with a two-layer classification head and dropout set to zero.

    The head is ``hidden_size -> 256 -> num_classes``. A ReLU sits between the
    two linear layers: without a non-linearity the two layers compose into a
    single linear map and the extra layer adds no modelling capacity.

    NOTE: the ReLU has no parameters, so a checkpoint saved with a purely
    linear head still loads, but its predictions will differ; retrain instead
    of reusing such a checkpoint.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        num_classes: Number of output scores produced per input sequence.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        self.bert = BertModel.from_pretrained(bert_model_name)
        # p=0.0 makes this layer a no-op; kept so the variants share the same structure.
        self.dropout = nn.Dropout(0.0)
        self.fc1 = nn.Linear(self.bert.config.hidden_size, 256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the encoder's pooled output."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.pooler_output
        x = self.dropout(pooled_output)
        # Non-linearity between the two fully connected layers.
        hidden = nn.functional.relu(self.fc1(x))
        return self.fc2(hidden)

**BERT con dropout rate a 0.5 (v3)**

In [None]:
from torch import nn
from transformers import BertModel


class BERTClassifier(nn.Module):
    """BERT encoder with a heavier dropout (p=0.5) in front of one linear classification layer.

    Args:
        bert_model_name: Name or path handed to ``BertModel.from_pretrained``.
        num_classes: Number of output scores produced per input sequence.
    """

    def __init__(self, bert_model_name, num_classes):
        super().__init__()
        encoder = BertModel.from_pretrained(bert_model_name)
        self.bert = encoder
        self.dropout = nn.Dropout(p=0.5)
        self.fc = nn.Linear(in_features=encoder.config.hidden_size, out_features=num_classes)

    def forward(self, input_ids, attention_mask):
        """Return the logits computed from the encoder's pooled output."""
        pooled = self.bert(input_ids=input_ids, attention_mask=attention_mask).pooler_output
        regularised = self.dropout(pooled)
        return self.fc(regularised)

**Training, evaluation e prediction functions**

In [None]:
import torch
from sklearn.metrics import accuracy_score, classification_report
from torch import nn

def train(model, data_loader, optimizer, scheduler, device):
    """Run one pass over ``data_loader``, updating ``model`` after every batch.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` and
            returning per-class scores.
        data_loader: Iterable of batches; each batch is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch, after the optimizer.
        device: Device the batch tensors are moved to.

    Returns:
        None. The model, optimizer and scheduler are updated in place.
    """
    # Set the model to training mode
    model.train()

    # The criterion is stateless, so build it once instead of once per batch.
    criterion = nn.CrossEntropyLoss()

    for batch in data_loader:
        # Zero out the gradients left over from the previous step
        optimizer.zero_grad()

        # Move input data to the specified device (e.g., GPU)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Cross-entropy expects int64 class indices; same cast as in evaluate().
        labels = batch['label'].to(device, dtype=torch.long)

        # Forward pass and cross-entropy loss
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)

        # Backward pass and optimization step
        loss.backward()
        optimizer.step()

        # Advance the learning-rate schedule
        scheduler.step()

def evaluate(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader`` without computing gradients.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)`` and
            returning per-class scores.
        data_loader: Iterable of batches; each batch is a mapping with the keys
            ``'input_ids'``, ``'attention_mask'`` and ``'label'``.
        device: Device the batch tensors are moved to.

    Returns:
        A ``(accuracy, report)`` tuple: the result of ``accuracy_score`` and the
        text produced by ``classification_report`` over all batches.
    """
    # Set the model to evaluation mode
    model.eval()

    # Predictions and ground-truth labels accumulated over all batches
    predictions = []
    actual_labels = []

    # Disable gradient computation during evaluation
    with torch.no_grad():
        for batch in data_loader:
            # Move input data to the specified device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device, dtype=torch.long)

            # Forward pass through the model
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)

            # The predicted class is the index of the highest score in each row
            preds = outputs.argmax(dim=1)
            predictions.extend(preds.cpu().tolist())
            actual_labels.extend(labels.cpu().tolist())

    accuracy = accuracy_score(actual_labels, predictions)
    # zero_division=0 keeps reporting 0.0 for classes that were never predicted,
    # but without emitting an undefined-metric warning for each of them.
    report = classification_report(actual_labels, predictions, zero_division=0)
    return accuracy, report

def prediction(text, model, tokenizer, device, max_length=512):
    """Classify ``text`` and return the index of the highest-scoring class.

    Args:
        text: Input passed to ``tokenizer``.
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        device: Device the encoded tensors are moved to.
        max_length: Length the input is padded or truncated to.

    Returns:
        Tensor with the arg-max class index for each encoded row.
    """
    model.eval()  # inference mode: disables dropout

    encoded = tokenizer(
        text,
        return_tensors='pt',
        max_length=max_length,
        padding='max_length',
        truncation=True,
    )

    # No gradients are needed for inference.
    with torch.no_grad():
        logits = model(
            input_ids=encoded['input_ids'].to(device),
            attention_mask=encoded['attention_mask'].to(device),
        )

    return logits.argmax(dim=1)

**Custom dataset class**

In [None]:
import torch
from torch.utils.data import Dataset

# TODO: to be adapted
class MovieDataset(Dataset):
    """Dataset that tokenizes one text per item and pairs it with its integer label.

    Args:
        texts: Indexable collection of input texts.
        labels: Indexable collection of labels, aligned with ``texts``; each
            label must be convertible with ``int()``.
        tokenizer: Callable returning a mapping with ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        max_length: Length every text is padded or truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the flattened ``input_ids``/``attention_mask`` and the ``label`` of item ``idx``."""
        text = self.texts[idx]
        label = self.labels[idx]
        # Pass max_length explicitly so the value given to the constructor is
        # honoured instead of silently falling back to the tokenizer's default.
        encoding = self.tokenizer(
            text,
            return_tensors='pt',
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(int(label), dtype=torch.long),
        }


**Import new dataset**

In [None]:
import pandas as pd

# Read the raw dataset from the mounted Drive folder and show it.
mpst_csv_path = '/content/drive/MyDrive/Database/mpst_full_data.csv'
data = pd.read_csv(mpst_csv_path)
print(data)

         imdb_id                                          title  \
0      tt0057603                        I tre volti della paura   
1      tt1733125  Dungeons & Dragons: The Book of Vile Darkness   
2      tt0033045                     The Shop Around the Corner   
3      tt0113862                             Mr. Holland's Opus   
4      tt0086250                                       Scarface   
...          ...                                            ...   
14823  tt0219952                                  Lucky Numbers   
14824  tt1371159                                     Iron Man 2   
14825  tt0063443                                     Play Dirty   
14826  tt0039464                                      High Wall   
14827  tt0235166                               Against All Hope   

                                           plot_synopsis  \
0      Note: this synopsis is for the orginal Italian...   
1      Two thousand years ago, Nhagruul the Foul, a s...   
2      Matusche

**Preprocessing**

In [None]:
# Drop the columns that are not used downstream. errors='ignore' makes the
# cell safe to re-run: on a second run the columns are already gone and a
# plain drop would raise KeyError.
data = data.drop(columns=['imdb_id', 'split', 'synopsis_source'], errors='ignore')

# Exploratory only: the 15 most frequent values of the 'tags' column
tag_class = data["tags"].value_counts().head(15)

# Tags kept as target classes; the position in this list becomes the numeric label later on
desired_tags = ["murder", "romantic", "violence", "psychedelic", "comedy"]

# Keep only rows whose 'tags' value is exactly one of the desired tags.
# NOTE(review): isin compares the whole cell value, so a row whose 'tags' cell
# holds several tags in one string would be excluded; confirm this is intended.
filtered_mt = data[data['tags'].isin(desired_tags)]

# Reset the index of the resulting DataFrame and drop the old index
filtered_mt = filtered_mt.reset_index(drop=True)

# Save the filtered DataFrame to a CSV file on the mounted Drive
filtered_mt.to_csv('/content/drive/MyDrive/Database/filtered_mt.csv', index=False)

# Print the resulting filtered DataFrame
print(filtered_mt)

                                              title  \
0     Dungeons & Dragons: The Book of Vile Darkness   
1                        The Shop Around the Corner   
2                                     Little Caesar   
3                                  The Last Emperor   
4                                     Taste of Fear   
...                                             ...   
2908                                Saddle the Wind   
2909                          The Bridge at Remagen   
2910                 The Curse of the Jade Scorpion   
2911                              One Night of Love   
2912                                      High Wall   

                                          plot_synopsis      tags  
0     Two thousand years ago, Nhagruul the Foul, a s...  violence  
1     Matuschek's, a gift store in Budapest, is the ...  romantic  
2     Small-time Italian-American criminals Caesar E...  violence  
3     Arrival.\nA train pulls into a station in Nort...    murder  

In [None]:
# Each desired tag is encoded as its position in 'desired_tags', stored as a string.
tag_mapping = {name: str(position) for position, name in enumerate(desired_tags)}

# Build the model input ('texts' = title + space + synopsis), drop the two
# source columns, rename 'tags' to 'labels' and encode the labels.
filtered_mt = (
    filtered_mt
    .assign(texts=filtered_mt['title'] + ' ' + filtered_mt['plot_synopsis'])
    .drop(columns=['title', 'plot_synopsis'])
    .rename(columns={'tags': 'labels'})
    .assign(labels=lambda frame: frame['labels'].map(tag_mapping))
)

# Show the transformed table.
print(filtered_mt)

     labels                                              texts
0         2  Dungeons & Dragons: The Book of Vile Darkness ...
1         1  The Shop Around the Corner Matuschek's, a gift...
2         2  Little Caesar Small-time Italian-American crim...
3         0  The Last Emperor Arrival.\nA train pulls into ...
4         0  Taste of Fear ** CONTAINS SPOILERSIn England, ...
...     ...                                                ...
2908      0  Saddle the Wind Retired gunslinger and former ...
2909      0  The Bridge at Remagen The film opens with the ...
2910      4  The Curse of the Jade Scorpion In 1940, C.W. B...
2911      1  One Night of Love Opera singer Mary Barrett (G...
2912      0  High Wall Steven Kenet catches his unfaithful ...

[2913 rows x 2 columns]


In [None]:
# Materialise the model inputs and their targets as plain Python lists.
texts, labels = (filtered_mt[column].tolist() for column in ('texts', 'labels'))

# Release the GPU memory cached by PyTorch before training starts.
torch.cuda.empty_cache()

**Training model**

In [None]:
# Import libraries
import random

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import BertTokenizer, get_linear_schedule_with_warmup

# Seed every random number generator involved in the run. Seeding NumPy alone
# leaves the torch-driven parts (classification head initialisation, dropout,
# DataLoader shuffling) different on every execution.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Set up parameters
bert_model_name = 'bert-base-cased'
num_classes = 5
batch_size = 16
num_epochs = 10
learning_rate = 2e-5
max_length = 512

# Split the dataset into training and validation sets (80% / 20%)
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=seed)

# Instantiate a BERT tokenizer from the pre-trained BERT model
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Create datasets and dataloaders for training and validation
train_dataset = MovieDataset(train_texts, train_labels, tokenizer, max_length)
val_dataset = MovieDataset(val_texts, val_labels, tokenizer, max_length)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Check and set the device to CUDA (GPU) if available, otherwise use CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Instantiate the BERT-based classifier model and move it to the specified device
model = BERTClassifier(bert_model_name, num_classes).to(device)

# Set up the AdamW optimizer for model training
optimizer = AdamW(model.parameters(), lr=learning_rate)

# The scheduler is stepped once per batch, so it needs the total number of batches over all epochs
total_steps = len(train_dataloader) * num_epochs

# Create a linear learning rate scheduler without warm-up steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)

# Clear the GPU memory to ensure sufficient space
torch.cuda.empty_cache()

# Best validation accuracy seen so far; used to decide when to checkpoint
best_accuracy = 0.0

# Iterate over epochs for model training
for epoch in range(num_epochs):
    print(f"Epoch {epoch + 1}/{num_epochs}")

    # Train the model on the training dataset
    train(model, train_dataloader, optimizer, scheduler, device)

    # Evaluate the model on the validation dataset and get accuracy and classification report
    accuracy, report = evaluate(model, val_dataloader, device)

    # Keep only the weights of the best-scoring epoch on disk
    if accuracy > best_accuracy:
        torch.save(model.state_dict(), "bert_classifier.pth")
        best_accuracy = accuracy

    # Print the current validation accuracy and classification report
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(report)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Epoch 1/10


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.6295
              precision    recall  f1-score   support

           0       0.62      0.83      0.71       218
           1       0.67      0.61      0.64       135
           2       0.75      0.36      0.48       109
           3       0.55      0.77      0.64        86
           4       0.00      0.00      0.00        35

    accuracy                           0.63       583
   macro avg       0.52      0.51      0.49       583
weighted avg       0.61      0.63      0.60       583

Epoch 2/10


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.6192
              precision    recall  f1-score   support

           0       0.71      0.55      0.62       218
           1       0.69      0.70      0.70       135
           2       0.45      0.76      0.56       109
           3       0.70      0.74      0.72        86
           4       0.00      0.00      0.00        35

    accuracy                           0.62       583
   macro avg       0.51      0.55      0.52       583
weighted avg       0.61      0.62      0.60       583

Epoch 3/10


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Validation Accuracy: 0.6895
              precision    recall  f1-score   support

           0       0.69      0.74      0.72       218
           1       0.72      0.74      0.73       135
           2       0.58      0.73      0.65       109
           3       0.84      0.71      0.77        86
           4       0.00      0.00      0.00        35

    accuracy                           0.69       583
   macro avg       0.56      0.58      0.57       583
weighted avg       0.66      0.69      0.67       583

Epoch 4/10
Validation Accuracy: 0.6861
              precision    recall  f1-score   support

           0       0.70      0.75      0.73       218
           1       0.66      0.73      0.69       135
           2       0.65      0.66      0.66       109
           3       0.72      0.76      0.74        86
           4       1.00      0.03      0.06        35

    accuracy                           0.69       583
   macro avg       0.75      0.58      0.57       583
weighted a

**Test BERT on sample movie descriptions**

In [None]:
# Same function as in the training-utilities cell, declared again in this cell.
def prediction(text, model, tokenizer, device, max_length=512):
    """Return the predicted class index for ``text``.

    The text is tokenized (padded/truncated to ``max_length``), moved to
    ``device`` and scored by ``model`` in evaluation mode with gradients
    disabled; the index of the largest score in each row is returned as a tensor.
    """
    model.eval()

    tokens = tokenizer(text, return_tensors='pt', max_length=max_length, padding='max_length', truncation=True)
    ids_on_device = tokens['input_ids'].to(device)
    mask_on_device = tokens['attention_mask'].to(device)

    with torch.no_grad():
        scores = model(input_ids=ids_on_device, attention_mask=mask_on_device)
        best = torch.max(scores, dim=1).indices

    return best

# Configuration used to rebuild the tokenizer and the classifier.
bert_model_name = 'bert-base-cased'
num_classes = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = BertTokenizer.from_pretrained(bert_model_name)

# Rebuild the architecture, restore the saved weights and switch to evaluation mode.
model = BERTClassifier(bert_model_name, num_classes).to(device)
model.load_state_dict(torch.load('/content/bert_classifier.pth', map_location=device))
model.eval()

# Sample descriptions to classify; 'tags' lists the class names in label order.
test_text = ["\"Titanic\" is a 1997 epic romance and disaster film directed by James Cameron. ...",
             "\"Rocky\" is a 1976 American sports drama film written and starring Sylvester Stallone. ...",
             "\"Airplane!\" (1980), directed by Jim Abrahams and the Zucker brothers. ..."]
tags = ["murder", "romantic", "violence", "psychedelic", "comedy"]

# Classify every sample and print the name of the winning class.
for number, description in enumerate(test_text, start=1):
    print(f"{number}) {description}")

    predicted_tags = prediction(description, model, tokenizer, device)

    # The model emits num_classes scores, so the predicted index addresses 'tags' directly.
    tag = tags[int(predicted_tags.item())]
    print(f"    The best tag for this film is: {tag}")

1) "Titanic" is a 1997 epic romance and disaster film directed by James Cameron. The story revolves around the ill-fated maiden voyage of the RMS Titanic in 1912. The film follows the romance between Jack Dawson, a penniless artist played by Leonardo DiCaprio, and Rose DeWitt Bukater, an upper-class woman engaged to a wealthy industrialist, played by Kate Winslet. The love story unfolds against the backdrop of the luxurious but doomed ocean liner. As the ship collides with an iceberg and tragedy strikes, Jack and Rose must navigate the chaos and danger to survive.
    The best tag for this film is: romantic
2) "Rocky" is a 1976 American sports drama film written and starring Sylvester Stallone. The film follows the story of Rocky Balboa, a small-time boxer from Philadelphia, who gets a shot at the world heavyweight championship. Despite being an underdog, Rocky seizes the opportunity to train rigorously and face the reigning champion, Apollo Creed, in a match that becomes a symbol of d