# Movie Character Moral Inference

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn as sk
import json
import torch

Turning the JSON file into pandas dataframe

In [10]:
# First, read the JSON file
with open("../data/dialogue.json", "r") as file:
    dialogue_dict = json.load(file)

# Create a list to store all dialogues
dialogue_list = []

# Handle the nested structure correctly
for movie, data in dialogue_dict.items():
    # Check if data is a dictionary or list
    if isinstance(data, dict):
        # If it's a dictionary, iterate through characters
        for character, lines in data.items():
            if isinstance(lines, list):
                for line in lines:
                    dialogue_list.append({
                        'movie': movie,
                        'character': character,
                        'dialogue': line
                    })
    elif isinstance(data, list):
        # If it's directly a list of lines
        for line in data:
            dialogue_list.append({
                'movie': movie,
                'dialogue': line
            })

# Create the DataFrame
dialogue_df = pd.DataFrame(dialogue_list)

Turning the json into python dictionary

In [23]:
# Read the JSON file
with open("../data/dialogue.json", "r") as file:
    raw_dialogue = json.load(file)

# Create a nested dictionary structure
dialogue = {}

for movie, data in raw_dialogue.items():
    # Initialize movie dictionary if not exists
    if movie not in dialogue:
        dialogue[movie] = {}
        
    # If data is a dictionary (contains character information)
    if isinstance(data, dict):
        for character, lines in data.items():
            if isinstance(lines, list):
                dialogue[movie][character] = lines
    # If data is a list (direct dialogues without character info)
    elif isinstance(data, list):
        dialogue[movie]['unknown'] = data

In [11]:
dialogue_df.head()

Unnamed: 0,movie,character,dialogue
0,10 Things I Hate About You,KAT,Leave it
1,10 Things I Hate About You,KAT,Why didn't we just read the Hardy Boys?
2,10 Things I Hate About You,KAT,This book is about a guy and his fishing habi...
3,10 Things I Hate About You,KAT,"(continuing) Frankly, I'm baffled as to why w..."
4,10 Things I Hate About You,KAT,I guess the school board thinks because Hemin...


In [15]:
dialogue_df[dialogue_df["movie"]== "10 Things I Hate About You"]["character"].value_counts()

character
KAT           219
PATRICK       187
BIANCA        131
CAMERON       105
MICHAEL        89
JOEY           62
WALTER         54
MANDELLA       40
MISS PERKY     24
MRS            12
CHASTITY       11
SHARON         11
Name: count, dtype: int64

In [30]:
# Loading moral foundations dictionary
moral_dict = pd.read_csv("../data/mfd_v2.csv")

# Convert DataFrame to dictionary for O(1) lookup
moral_word_dict = dict(zip(moral_dict['word'].str.lower(), moral_dict['category']))

In [35]:
# Let's count the number of characters we have in the dataset
count = 0
for movie, data in dialogue.items():
    count += len(data)
print("Number of Movies", len(dialogue))
print("Number of characters", count)
print("Number of dialogues", dialogue_df.shape[0])

Number of Movies 1134
Number of characters 12853
Number of dialogues 788415


In [41]:
# Extract unique movie names from the dataframe
unique_movies = dialogue_df['movie'].unique()

# Print the names of the movies
for movie in unique_movies:
    print(movie)

10 Things I Hate About You
12
12 and Holding
12 Monkeys
12 Years a Slave
127 Hours
1492: Conquest of Paradise
15 Minutes
17 Again
187
2001: A Space Odyssey
2012
28 Days Later
30 Minutes or Less
42
44 Inch Chest
48 Hrs.
50-50
500 Days of Summer
8MM
A Few Good Men
A Most Violent Year
A Prayer Before Dawn
A Quiet Place
A Scanner Darkly
A Serious Man
Above the Law
Absolute Power
Abyss, The
Ace Ventura: Pet Detective
Adaptation
Adjustment Bureau, The
Adventures of Buckaroo Banzai Across the Eighth Dimension, The
Affliction
After School Special
After.Life
Agnes of God
Air Force One
Airplane
Airplane 2: The Sequel
Ali
Alien
Alien 3
Alien Nation
Alien vs. Predator
Aliens
All About Eve
All About Steve
All the King's Men
All the President's Men
Almost Famous
Alone in the Dark
Amadeus
Amelia
American Beauty
American Gangster
American Graffiti
American History X
American Hustle
American Milkshake
American Pie
American President, The
American Psycho
American Shaolin: King of Kickboxers II
American 

In [31]:
print(moral_dict.head())
print("\nData types of columns:")
print(moral_dict.dtypes)

   category        word
0         1  compassion
1         1     empathy
2         1    kindness
3         1      caring
4         1  generosity

Data types of columns:
category     int64
word        object
dtype: object


#### Movies I've seen
- Zootopia
- How to Train Your Dragon
- How to Train Your Dragon 2
- Frozen
- Cars 2
- Chronicles of Narnia: The Lion, the Witch and the Wardrobe
- Interstellar
- John Wick
- Up
- Wall-E

#### Moral foundations
1. Care/Virtue (compassion, empathy, kindness)
2. Harm/Vice (harm, suffer, hurt)
3. Fairness/Virtue (equality, fairness, justice)
4. Cheating/Vice (cheat, unfair, cheating)
5. Loyalty/Virtue (loyalty, patriot, team player)
6. Betrayal/Vice (traitor, disloyal, treason)
7. Authority/Virtue (respect, obey, authority)
8. Subversion/Vice (disrespect, disobey, chaos)
9. Purity/Virtue (sanctity, sacred, purity)
10. Degradation/Vice (impurity, degradation, depravity)

### Clustering 

We will try to use simple to advanced methods to cluster the moral of the characters based on their speech

Methods to try:
- Frequency based clustering
- K-means
- Autoencoders
- BERT-based model
- Sentence-BERT
- word Embeddings
- LLM representation 

#### Frequency Based Clustering

In [38]:
from collections import defaultdict

def get_category_frequencies(speeches):
    category_counts = [0] * 10  # For categories 1-10
    total_words = 0
    
    # Process all speeches
    for speech in speeches:
        words = speech.lower().split()
        total_words += len(words)
        
        # Count words that appear in moral dictionary
        for word in words:
            if word in moral_word_dict:
                category = moral_word_dict[word]
                category_counts[category-1] += 1
    
    # Calculate frequencies
    frequencies = [count/total_words if total_words > 0 else 0 for count in category_counts]
    return frequencies, total_words

def process_characters(minimum_total_words = 50, movie="" ):
    character_categories = []
    
    if movie:
        for character, speeches in dialogue[movie].items():
            frequencies, total_words = get_category_frequencies(speeches)
            
            # Only include characters with substantial dialogue
            if total_words >= minimum_total_words:  # Minimum word threshold
                dominant_category = np.argmax(frequencies) + 1
                character_categories.append({
                    'movie': movie,
                    'character': character,
                    'dominant_category': dominant_category,
                    'frequency': frequencies[dominant_category-1],
                    'total_words': total_words,
                    'all_frequencies': frequencies
                })
    else:
        # Process characters
        character_categories = []

        for movie, characters in dialogue.items():
            for character, speeches in characters.items():
                frequencies, total_words = get_category_frequencies(speeches)
                
                # Only include characters with substantial dialogue
                if total_words >= minimum_total_words:  # Minimum word threshold
                    dominant_category = np.argmax(frequencies) + 1
                    character_categories.append({
                        'movie': movie,
                        'character': character,
                        'dominant_category': dominant_category,
                        'frequency': frequencies[dominant_category-1],
                        'total_words': total_words,
                        'all_frequencies': frequencies
                    })

    # Group and display results
    category_groups = {i: [] for i in range(1, 11)}
    for char in character_categories:
        category_groups[char['dominant_category']].append(char)

    # Print results
    for category in range(1, 11):
        chars = category_groups[category]
        if chars:
            print(f"\nMoral Category {category}:")
            print(f"Total characters: {len(chars)}")
            
            # Sort just the top 5
            top_chars = sorted(chars, key=lambda x: x['frequency'], reverse=True)[:5]
            print("\nTop characters:")
            for char in top_chars:
                print(f"- {char['character']} from {char['movie']}")
                print(f"  Frequency: {char['frequency']:.5f}")
    
    return character_categories

In [89]:
dialogue["Cars 2"]["ENGINE VOICE"]

[' I wish I could be with you on this very special day but... my clutch assembly broke. You know how it is. ',
 " We are here to celebrate. Today all your hard work pays off. The world turned their backs on cars like us. They stopped manufacturing us, stopped making our parts. The only thing they haven't stopped doing is laughing at us. They've called us terrible names... ",
 ' Jalopy. Rustbucket. ',
 ' Heap. Clunker. ',
 ' Junker, beater, wreck. ',
 ' Rattletrap. ',
 ' Lemon. But their insults just give us strength. Because today, my friends... ',
 ' ...that all ends. ',
 " They laughed at us. But now it's our turn to laugh back. ",
 ' Embrace your inner lemon! Let it drive you! ',
 " This was meant to be alternative fuel's greatest moment. ",
 ' After today everyone will race back to gasoline. ',
 " And we, the owners of the world's largest untapped oil reserve, will become the most powerful cars in the world! ",
 ' They will come to us and they will have no choice, `cause they will 

In [42]:
# Test the function
process_characters(50, "Zootopia")


Moral Category 1:
Total characters: 8

Top characters:
- JUDY from Zootopia
  Frequency: 0.01190
- LIONHEART from Zootopia
  Frequency: 0.00797
- DUKE WEASELTON from Zootopia
  Frequency: 0.00637
- BELLWETHER from Zootopia
  Frequency: 0.00370
- HOPPS from Zootopia
  Frequency: 0.00341

Moral Category 2:
Total characters: 2

Top characters:
- GAZELLE from Zootopia
  Frequency: 0.00877
- GIDEON GREY from Zootopia
  Frequency: 0.00621

Moral Category 3:
Total characters: 1

Top characters:
- MR from Zootopia
  Frequency: 0.00755

Moral Category 4:
Total characters: 1

Top characters:
- BOGO from Zootopia
  Frequency: 0.00173

Moral Category 5:
Total characters: 1

Top characters:
- YOUNG JUDY from Zootopia
  Frequency: 0.00472

Moral Category 7:
Total characters: 1

Top characters:
- BONNIE HOPPS from Zootopia
  Frequency: 0.01158

Moral Category 9:
Total characters: 1

Top characters:
- STU HOPPS from Zootopia
  Frequency: 0.00471


[{'movie': 'Zootopia',
  'character': 'YOUNG JUDY',
  'dominant_category': np.int64(5),
  'frequency': 0.0047169811320754715,
  'total_words': 212,
  'all_frequencies': [0.0,
   0.0,
   0.0,
   0.0,
   0.0047169811320754715,
   0.0,
   0.0047169811320754715,
   0.0,
   0.0,
   0.0]},
 {'movie': 'Zootopia',
  'character': 'GIDEON GREY',
  'dominant_category': np.int64(2),
  'frequency': 0.006211180124223602,
  'total_words': 161,
  'all_frequencies': [0.0,
   0.006211180124223602,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0,
   0.0]},
 {'movie': 'Zootopia',
  'character': 'STU HOPPS',
  'dominant_category': np.int64(9),
  'frequency': 0.004705882352941176,
  'total_words': 425,
  'all_frequencies': [0.002352941176470588,
   0.0,
   0.0,
   0.002352941176470588,
   0.0,
   0.0,
   0.002352941176470588,
   0.0,
   0.004705882352941176,
   0.0]},
 {'movie': 'Zootopia',
  'character': 'BONNIE HOPPS',
  'dominant_category': np.int64(7),
  'frequency': 0.011583011583011582,
  'tota

#### BERT

In [57]:
from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from collections import defaultdict

# 1. Load BERT model and tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# 2. Create training data from moral dictionary
def create_moral_examples():
    # Group words by category
    category_words = defaultdict(list)
    for _, row in moral_dict.iterrows():
        category_words[row['category']].append(row['word'])
    return category_words

# 3. Function to get BERT embeddings
def get_bert_embedding(text):
    # Tokenize and get BERT embeddings
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)
    # Use [CLS] token embedding (first token)
    return outputs.last_hidden_state[:, 0, :].numpy()

# 4. Process characters
def embedding(movie="", minimum_total_speech=0):
    character_embeddings = []
    character_info = []

    if movie:
        characters = dialogue[movie]
        for character, speeches in characters.items():
            if len(speeches) >= minimum_total_speech:  # Minimum dialogue threshold
                # Process each speech
                speech_embeddings = []
                for speech in speeches:
                    embedding = get_bert_embedding(speech)
                    speech_embeddings.append(embedding[0])  # Remove batch dimension
                
                # Average embeddings for the character
                character_embedding = np.mean(speech_embeddings, axis=0)
                character_embeddings.append(character_embedding)
                character_info.append((movie, character))
    else:
        for movie, characters in dialogue.items():
            for character, speeches in characters.items():
                if len(speeches) >= minimum_total_speech:  # Minimum dialogue threshold
                    # Process each speech
                    speech_embeddings = []
                    for speech in speeches:
                        embedding = get_bert_embedding(speech)
                        speech_embeddings.append(embedding[0])  # Remove batch dimension
                    
                    # Average embeddings for the character
                    character_embedding = np.mean(speech_embeddings, axis=0)
                    character_embeddings.append(character_embedding)
                    character_info.append((movie, character))
    
    print(f"Total characters: {len(character_embeddings)}")

    # 5. Create moral category centroids
    moral_categories = create_moral_examples()
    category_centroids = {}

    for category, words in moral_categories.items():
        # Get embeddings for each word in category
        word_embeddings = []
        for word in words:
            embedding = get_bert_embedding(word)
            word_embeddings.append(embedding[0])
        
        # Average to get category centroid
        category_centroids[category] = np.mean(word_embeddings, axis=0)
    
    return character_embeddings, character_info, category_centroids

# 6. Assign characters to categories
def assign_category(embedding, centroids):
    # Calculate distance to each centroid
    distances = {}
    for category, centroid in centroids.items():
        distance = np.linalg.norm(embedding - centroid)
        distances[category] = distance
    
    # Return category with minimum distance
    return min(distances.items(), key=lambda x: x[1])[0]

def classify_categories(character_embeddings, character_info, category_centroids):
    # 7. Classify characters
    character_categories = []
    for idx, embedding in enumerate(character_embeddings):
        movie, character = character_info[idx]
        category = assign_category(embedding, category_centroids)
        character_categories.append({
            'movie': movie,
            'character': character,
            'category': category
        })

    # 8. Analyze results
    for category in range(1, 11):
        chars = [c for c in character_categories if c['category'] == category]
        if chars:
            print(f"\nMoral Category {category}:")
            print(f"Total characters: {len(chars)}")
            print("\nExample characters:")
            for char in chars[:5]:
                print(f"- {char['character']} from {char['movie']}")

def cluster_characters_bert(movie="", minimum_total_speech=0):
    # Get character embeddings and moral category centroids
    character_embeddings, character_info, category_centroids = embedding(movie, minimum_total_speech)
    
    # Classify characters into moral categories
    classify_categories(character_embeddings, character_info, category_centroids)

In [88]:
cluster_characters_bert("Cars 2", 0)

Total characters: 19
[{'movie': 'Cars 2', 'character': 'FINN', 'category': 7}, {'movie': 'Cars 2', 'character': 'PROFESSOR ZUNDAPP', 'category': 2}, {'movie': 'Cars 2', 'character': 'GREM', 'category': 7}, {'movie': 'Cars 2', 'character': 'ACER', 'category': 4}, {'movie': 'Cars 2', 'character': 'MATER', 'category': 10}, {'movie': 'Cars 2', 'character': 'OTIS', 'category': 10}, {'movie': 'Cars 2', 'character': 'LUIGI', 'category': 10}, {'movie': 'Cars 2', 'character': 'MCQUEEN', 'category': 8}, {'movie': 'Cars 2', 'character': 'SALLY', 'category': 1}, {'movie': 'Cars 2', 'character': 'MEL DORADO', 'category': 7}, {'movie': 'Cars 2', 'character': 'MILES AXLEROD', 'category': 4}, {'movie': 'Cars 2', 'character': 'FRANCESCO', 'category': 2}, {'movie': 'Cars 2', 'character': 'HOLLEY', 'category': 7}, {'movie': 'Cars 2', 'character': 'BRENT MUSTANGBURGER', 'category': 7}, {'movie': 'Cars 2', 'character': 'DAVID HOBBSCAP', 'category': 7}, {'movie': 'Cars 2', 'character': 'DARRELL CARTRIP', 'c

#### Sentence-BERT

In [86]:
from sentence_transformers import SentenceTransformer
import numpy as np
from collections import defaultdict

# 1. Load SBERT model
sbert_model = SentenceTransformer('all-mpnet-base-v2')  # Better performance than base BERTe BERTe BERT

# 2. Create training data from moral dictionary
def create_moral_examples():
    category_words = defaultdict(list)
    for _, row in moral_dict.iterrows():
        category_words[row['category']].append(row['word'])
    return category_words

# 3. Process characters
def embedding(movie="", minimum_total_speech=0):
    character_embeddings = []
    character_info = []

    if movie:
        characters = dialogue[movie]
        for character, speeches in characters.items():
            if len(speeches) >= minimum_total_speech:  # Minimum dialogue threshold
                # Process all speeches at once (SBERT is optimized for batch processing)
                speech_embeddings = sbert_model.encode(speeches)
                
                # Average embeddings for the character
                character_embedding = np.mean(speech_embeddings, axis=0)
                character_embeddings.append(character_embedding)
                character_info.append((movie, character))
    else:
        for movie, characters in dialogue.items():
            for character, speeches in characters.items():
                if len(speeches) >= minimum_total_speech:  # Minimum dialogue threshold
                    # Process all speeches at once (SBERT is optimized for batch processing)
                    speech_embeddings = sbert_model.encode(speeches)
                    
                    # Average embeddings for the character
                    character_embedding = np.mean(speech_embeddings, axis=0)
                    character_embeddings.append(character_embedding)
                    character_info.append((movie, character))

    print(f"Total characters: {len(character_embeddings)}")

    # 4. Create moral category centroids
    moral_categories = create_moral_examples()
    category_centroids = {}

    for category, words in moral_categories.items():
        # Encode all words in category at once
        word_embeddings = sbert_model.encode(words)
        
        # Average to get category centroid
        category_centroids[category] = np.mean(word_embeddings, axis=0)
    
    return character_embeddings, character_info, category_centroids

# 5. Assign characters to categories
def assign_category(embedding, centroids):
    # Calculate cosine similarity instead of Euclidean distance
    similarities = {}
    for category, centroid in centroids.items():
        similarity = np.dot(embedding, centroid) / (np.linalg.norm(embedding) * np.linalg.norm(centroid))
        similarities[category] = similarity
    
    # Return category with highest similarity
    return max(similarities.items(), key=lambda x: x[1])[0]


def classify_categories(character_embeddings, character_info, category_centroids):

    # 6. Assign characters to categories
    character_categories = []
    for idx, embedding in enumerate(character_embeddings):
        movie, character = character_info[idx]
        category = assign_category(embedding, category_centroids)
        character_categories.append({
            'movie': movie,
            'character': character,
            'category': category
        })

    # 7. Analyze results
    for category in range(1, 11):
        chars = [c for c in character_categories if c['category'] == category]
        if chars:
            print(f"\nMoral Category {category}:")
            print(f"Total characters: {len(chars)}")
            print("\nExample characters:")
            for char in chars[:5]:
                print(f"- {char['character']} from {char['movie']}")

def cluster_characters_sbert(movie="", minimum_total_speech=0):
    # Get character embeddings and moral category centroids
    character_embeddings, character_info, category_centroids = embedding(movie, minimum_total_speech)
    
    # Classify characters into moral categories
    classify_categories(character_embeddings, character_info, category_centroids)

In [87]:
cluster_characters_sbert("Cars 2", 0)

Total characters: 19
[{'movie': 'Cars 2', 'character': 'FINN', 'category': 7}, {'movie': 'Cars 2', 'character': 'PROFESSOR ZUNDAPP', 'category': 2}, {'movie': 'Cars 2', 'character': 'GREM', 'category': 7}, {'movie': 'Cars 2', 'character': 'ACER', 'category': 4}, {'movie': 'Cars 2', 'character': 'MATER', 'category': 10}, {'movie': 'Cars 2', 'character': 'OTIS', 'category': 10}, {'movie': 'Cars 2', 'character': 'LUIGI', 'category': 10}, {'movie': 'Cars 2', 'character': 'MCQUEEN', 'category': 8}, {'movie': 'Cars 2', 'character': 'SALLY', 'category': 1}, {'movie': 'Cars 2', 'character': 'MEL DORADO', 'category': 7}, {'movie': 'Cars 2', 'character': 'MILES AXLEROD', 'category': 4}, {'movie': 'Cars 2', 'character': 'FRANCESCO', 'category': 2}, {'movie': 'Cars 2', 'character': 'HOLLEY', 'category': 7}, {'movie': 'Cars 2', 'character': 'BRENT MUSTANGBURGER', 'category': 7}, {'movie': 'Cars 2', 'character': 'DAVID HOBBSCAP', 'category': 7}, {'movie': 'Cars 2', 'character': 'DARRELL CARTRIP', 'c

#### Auto-encoders

In [105]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np

# 1. Load SBERT model
model = SentenceTransformer('all-mpnet-base-v2')

def cluster_characters_sbert_autoencoder(movie="", minimum_total_speech=0):
    # 2. Get character embeddings
    character_embeddings = []
    character_info = []

    if movie:
        characters = dialogue[movie]
        for character, speeches in characters.items():
            if len(speeches) >= minimum_total_speech:  # Minimum dialogue threshold
                # Process all speeches at once (SBERT is optimized for batch processing)
                speech_embeddings = sbert_model.encode(speeches)
                
                # Average embeddings for the character
                character_embedding = np.mean(speech_embeddings, axis=0)
                character_embeddings.append(character_embedding)
                character_info.append((movie, character))
    else:
        for movie, characters in dialogue.items():
            for character, speeches in characters.items():
                if len(speeches) >= minimum_total_speech:  # Minimum dialogue threshold
                    # Process all speeches at once (SBERT is optimized for batch processing)
                    speech_embeddings = sbert_model.encode(speeches)
                    
                    # Average embeddings for the character
                    character_embedding = np.mean(speech_embeddings, axis=0)
                    character_embeddings.append(character_embedding)
                    character_info.append((movie, character))

        print(f"Total characters: {len(character_embeddings)}")

    # 1. Process moral dictionary first
    moral_categories = defaultdict(list)
    for _, row in moral_dict.iterrows():
        moral_categories[row['category']].append(row['word'])

    # Get embeddings for moral dictionary words
    category_embeddings = {}
    for category, words in moral_categories.items():
        word_embeddings = model.encode(words)  # Shape: (num_words, 768)
        category_embedding = np.mean(word_embeddings, axis=0)  # Shape: (768,)
        category_embeddings[category] = category_embedding

    # Create a tensor of all category embeddings
    category_embeddings_tensor = torch.FloatTensor(list(category_embeddings.values()))  # Shape: (10, 768)

    # 2. Create dataset class with both character embeddings and moral category embeddings
    class MoralDataset(Dataset):
        def __init__(self, char_embeddings, category_embeddings):
            self.char_embeddings = torch.FloatTensor(char_embeddings)
            self.category_embeddings = torch.FloatTensor(list(category_embeddings.values()))
        
        def __len__(self):
            return len(self.char_embeddings)
        
        def __getitem__(self, idx):
            return self.char_embeddings[idx], self.category_embeddings

    # 3. Modified autoencoder to consider moral categories
    class MoralAutoencoder(nn.Module):
        def __init__(self, input_dim, num_categories=10):
            super(MoralAutoencoder, self).__init__()
            
            self.encoder = nn.Sequential(
                nn.Linear(input_dim, 256),
                nn.BatchNorm1d(256),
                nn.ReLU(),
                nn.Linear(256, 128),
                nn.BatchNorm1d(128),
                nn.ReLU(),
                nn.Linear(128, num_categories)
            )
            
            self.decoder = nn.Sequential(
                nn.Linear(num_categories, 128),
                nn.BatchNorm1d(128),
                nn.ReLU(),
                nn.Linear(128, 256),
                nn.BatchNorm1d(256),
                nn.ReLU(),
                nn.Linear(256, input_dim)
            )
        
        def forward(self, x):
            encoded = self.encoder(x)
            decoded = self.decoder(encoded)
            return encoded, decoded

    def moral_loss(decoded, original, encoded, category_embeddings_tensor):
        reconstruction_loss = nn.MSELoss()(decoded, original)
        
        # Moral category alignment loss using category_embeddings_tensor
        moral_loss = 0
        for i in range(encoded.size(0)):
            similarities = torch.matmul(encoded[i], category_embeddings_tensor)
            moral_loss += -torch.log_softmax(similarities, dim=0).mean()
        
        return reconstruction_loss + moral_loss

    # 5. Training process
    input_dim = character_embeddings[0].shape[0]
    autoencoder = MoralAutoencoder(input_dim)
    optimizer = optim.Adam(autoencoder.parameters())

    dataset = MoralDataset(character_embeddings, category_embeddings)
    dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

    # Training loop
    n_epochs = 100
    for epoch in range(n_epochs):
        total_loss = 0
        for batch_chars, batch_categories in dataloader:
            # Forward pass
            encoded, decoded = autoencoder(batch_chars)
            
            # Compute loss
            loss = moral_loss(decoded, batch_chars, encoded, batch_categories)
            
            # Backward pass
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
        
        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {total_loss/len(dataloader):.4f}')

    # 6. Get encodings and classify
    autoencoder.eval()
    character_categories = []

    with torch.no_grad():
        for idx, embedding in enumerate(character_embeddings):
            embedding_tensor = torch.FloatTensor(embedding)
            encoded, _ = autoencoder(embedding_tensor)
            
            # Get the encoded representation (should be 10-dimensional)
            encoded = encoded.squeeze()  # Remove batch dimension if present
            
            # The encoded vector itself should represent moral categories
            # Each dimension corresponds to a category 1-10
            best_category = int(torch.argmax(encoded)) + 1  # +1 because categories are 1-based
            confidence = float(torch.softmax(encoded, dim=0)[best_category-1])
            
            character_categories.append({
                'movie': character_info[idx][0],
                'character': character_info[idx][1],
                'category': best_category,
                'confidence': confidence
            })
    
    print(character_categories)

    # 7. Analyze results
    for category in range(1, 11):
        chars = [c for c in character_categories if c['category'] == category]
        if chars:
            print(f"\nMoral Category {category}:")
            print(f"Total characters: {len(chars)}")
            print("\nExample characters:")
            for char in chars[:5]:
                print(f"- {char['character']} from {char['movie']}")

In [106]:
cluster_characters_sbert_autoencoder("Cars 2", 0)

Epoch [10/100], Loss: 55.9446
Epoch [20/100], Loss: 55.9445
Epoch [30/100], Loss: 55.9445
Epoch [40/100], Loss: 55.9444
Epoch [50/100], Loss: 55.9444
Epoch [60/100], Loss: 55.9444
Epoch [70/100], Loss: 55.9444
Epoch [80/100], Loss: 55.9444
Epoch [90/100], Loss: 55.9444
Epoch [100/100], Loss: 55.9444
[{'movie': 'Cars 2', 'character': 'FINN', 'category': 6, 'confidence': 0.11400000005960464}, {'movie': 'Cars 2', 'character': 'PROFESSOR ZUNDAPP', 'category': 6, 'confidence': 0.11202491074800491}, {'movie': 'Cars 2', 'character': 'GREM', 'category': 6, 'confidence': 0.11451436579227448}, {'movie': 'Cars 2', 'character': 'ACER', 'category': 6, 'confidence': 0.11374667286872864}, {'movie': 'Cars 2', 'character': 'MATER', 'category': 6, 'confidence': 0.11372475326061249}, {'movie': 'Cars 2', 'character': 'OTIS', 'category': 6, 'confidence': 0.11273546516895294}, {'movie': 'Cars 2', 'character': 'LUIGI', 'category': 6, 'confidence': 0.11392683535814285}, {'movie': 'Cars 2', 'character': 'MCQUE

[{'movie': '10 Things I Hate About You', 'character': 'KAT', 'dominant_category': np.int64(10), 'frequency': 0.0037433155080213902, 'total_words': 1870, 'all_frequencies': [0.0005347593582887701, 0.0021390374331550803, 0.0010695187165775401, 0.0, 0.0005347593582887701, 0.0, 0.00267379679144385, 0.0, 0.0005347593582887701, 0.0037433155080213902]}, {'movie': '10 Things I Hate About You', 'character': 'BIANCA', 'dominant_category': np.int64(7), 'frequency': 0.003189792663476874, 'total_words': 1254, 'all_frequencies': [0.0023923444976076554, 0.001594896331738437, 0.0007974481658692185, 0.0007974481658692185, 0.0007974481658692185, 0.0, 0.003189792663476874, 0.0, 0.0, 0.0007974481658692185]}, {'movie': '10 Things I Hate About You', 'character': 'CHASTITY', 'dominant_category': np.int64(1), 'frequency': 0.0, 'total_words': 107, 'all_frequencies': [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}, {'movie': '10 Things I Hate About You', 'character': 'MISS PERKY', 'dominant_category': np.in