# Baseline using FastText Embeddings

This acts as a baseline to be compared with the models using SBERT. 

Methodology:
- For each sentence, we average the FastText embeddings of all its words to obtain a sentence embedding
- We then apply dimensionality reduction (PCA or an autoencoder) to obtain lower-dimensional vectors, since overly high-dimensional features would lead to overfitting
- For each character, we average the reduced (latent) sentence embeddings to obtain a character representation
- We pass this character embedding to a predictor (Ridge regression or an MLP) to predict the moral ratings

In [7]:
import json
import os
import sys
import numpy as np
import pandas as pd

In [8]:
# Load the structured per-movie / per-character data used by every later cell.
# The encoding is pinned to UTF-8 so the result does not depend on the platform's
# locale codec (the default when `encoding` is omitted).
with open("..//data//structured_data_full.json", "r", encoding="utf-8") as f:
    structured_data_full = json.load(f)

In [9]:
import numpy as np
from collections import defaultdict
import re
from tqdm import tqdm

# Load FastText vectors manually (limit can help you load faster)
def load_fasttext_vecs(path, limit=200000):
    """Read word vectors from a text-format FastText ``.vec`` file.

    The first line of the file is treated as a header and skipped. Every
    following line is expected to be ``word v1 v2 ... vn`` separated by single
    spaces.

    Args:
        path: Path of the ``.vec`` file.
        limit: Maximum number of data lines to read (counted from the first
            line after the header). Pass ``None`` to read the whole file.

    Returns:
        dict mapping each word to a 1-D ``numpy`` float array.
    """
    vectors = {}
    with open(path, 'r', encoding='utf-8') as f:
        # Skip the header line; the default keeps an empty file from raising StopIteration.
        next(f, None)
        for i, line in enumerate(f):
            if limit is not None and i >= limit:
                break
            parts = line.rstrip().split(' ')
            if len(parts) < 2:
                # Blank or malformed line: there is no vector to store.
                continue
            vectors[parts[0]] = np.array(parts[1:], dtype=float)
    return vectors

# Load at most the first 200,000 entries of the cc.en.300 vector file into a word -> vector dict.
fasttext_vectors = load_fasttext_vecs("..//model//cc.en.300.vec", limit=200000)

# Helper
def tokenize(text):
    """Lower-case ``text`` and return its word tokens in order of appearance."""
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\b\w+\b", lowered)]

sentence_types = ["moral", "non_moral", "action", "adj"]

# movie -> character -> sentence type -> list of sentence embeddings
fasttext_embedding_dictionary = defaultdict(lambda: defaultdict(dict))

for movie, char_dict in tqdm(structured_data_full.items(), desc="Movies"):
    for character, type_dict in tqdm(char_dict.items(), desc=f"{movie} Characters", leave=False):
        for type_ in sentence_types:
            # A missing or empty entry ends up as an empty list of embeddings.
            sentences = type_dict.get(type_, []) or []

            sentence_embeddings = []
            for sentence in sentences:
                vectors = [
                    fasttext_vectors[token]
                    for token in tokenize(sentence)
                    if token in fasttext_vectors
                ]
                if not vectors:
                    # No known word in this sentence: fall back to an all-zero vector.
                    sentence_embeddings.append(np.zeros(300))
                    continue

                vec = np.mean(vectors, axis=0)
                length = np.linalg.norm(vec)
                if length > 0:
                    vec = vec / length  # optional normalization
                sentence_embeddings.append(vec)

            fasttext_embedding_dictionary[movie][character][type_] = sentence_embeddings


Movies: 100%|██████████| 97/97 [00:01<00:00, 91.29it/s]


In [11]:
# Maps a trait index (a position in a character's "rating" list) to the label that is
# printed and saved for that trait. Entries that are commented out are disabled.
trait_index_to_name = {
    14: "cunningâ€“honorable",
    22: "ferociousâ€“pacifist",
    25: "forgivingâ€“vengeful",
    28: "loyalâ€“traitorous",
    31: "rudeâ€“respectful",
    38: "arrogantâ€“humble",
    39: "heroicâ€“villainous",
    42: "mischievousâ€“well-behaved",
    62: "confidentâ€“insecure",
    64: "debasedâ€“purity",  # we added this trait
    79: "selfishâ€“altruistic",
    81: "angelicâ€“demonic",
    84: "cruelâ€“kind",
    85: "directâ€“roundabout",
    101: "biasedâ€“impartial",
    121: "sarcasticâ€“genuine",
    134: "obedientâ€“rebellious",   # we added this trait
    154: "judgementalâ€“accepting",
    195: "complimentaryâ€“insulting",
    222: "wholesomeâ€“salacious",
    # 224: "zanyâ€“regular",
    227: "racistâ€“egalitarian",
    390: "transparentâ€“machiavellian",
    396: "innocentâ€“jaded",
    # 425: "flawedâ€“perfect",
    434: "resentfulâ€“euphoric",
    441: "buffoonâ€“charmer",   # buffoon is a synonym for clown
    448: "fakeâ€“real",
    450: "cattyâ€“supportive",
    453: "eagerâ€“reluctant",
    464: "forwardâ€“repressed",
    485: "maverickâ€“conformist",
    # 487: "social chameleonâ€“strong identity",   # social chameleon is a person who changes their behavior to fit in with different social groups
    489: "sincereâ€“irreverent",
    # 494: "hopefulâ€“fearful",
    # 495: "likes changeâ€“resists change",
    # 497: "old-fashionedâ€“progressive"
}

In [2]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import r2_score, mean_squared_error
import torch
import torch.nn as nn
from tqdm import tqdm

# Configuration
latent_dim = 20
reduction_method = "pca"   # or "ae"
model_type = "ridge"       # or "mlp"

# selected_indices = [trait_index_dict[t] for t in selected_traits]
moral_trait_indices = [
    14, 22, 25, 28, 31, 38, 39, 42, 62, 64, 79, 81, 84, 85, 101, 121, 134, 154,
    195, 222, 227, 390, 396, 434, 441, 448, 450, 453,
    464, 485, 489
]


sentence_types = ["moral", "non_moral", "action", "adj"]

In [3]:
def run_moral_trait_prediction(
    fasttext_embedding_dictionary,
    structured_data_full,
    trait_index_to_name,
    moral_trait_indices,
    sentence_types=["moral", "non_moral", "action", "adj"],
    reduction_method="pca",
    model_type="ridge",
    latent_dim=20,
    ae_epochs=100
):
    """Fit a reducer and a regressor on character embeddings and report per-trait fit.

    Steps:
      1. Fit the reducer (PCA or a small autoencoder) on every sentence embedding
         of the requested ``sentence_types``.
      2. For each character, reduce its sentence embeddings and average them
         into one ``latent_dim``-sized feature vector.
      3. Fit the regressor on those features against the selected trait ratings.
      4. Print and return R2 / RMSE per trait.

    The metrics are computed on the same rows the model was fitted on
    (``model.fit(X, y)`` followed by ``model.predict(X)``), i.e. they are
    in-sample scores. The autoencoder is not seeded here, so ``"ae"`` results
    can vary between runs.

    Args:
        fasttext_embedding_dictionary: movie -> character -> sentence type ->
            list of sentence embedding vectors.
        structured_data_full: movie -> character -> dict that may contain a
            ``"rating"`` list.
        trait_index_to_name: trait index -> display name; indices without an
            entry are shown as ``"Trait <index>"``.
        moral_trait_indices: positions in the ``"rating"`` list to predict.
        sentence_types: sentence categories whose embeddings are used.
        reduction_method: ``"pca"`` or ``"ae"``.
        model_type: ``"ridge"`` or ``"mlp"``.
        latent_dim: size of the reduced representation.
        ae_epochs: number of full-batch training steps for the autoencoder.

    Returns:
        dict mapping trait name to ``{"R2": ..., "RMSE": ...}``.

    Raises:
        ValueError: on an unknown ``reduction_method`` / ``model_type``, an
            empty ``moral_trait_indices``, or when there is nothing to train on.
    """
    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.linear_model import Ridge
    from sklearn.neural_network import MLPRegressor
    from sklearn.metrics import r2_score, mean_squared_error
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    # Fail fast on bad configuration, before any training work is done.
    if reduction_method not in ("pca", "ae"):
        raise ValueError("Reduction method must be 'pca' or 'ae'.")
    if model_type not in ("ridge", "mlp"):
        raise ValueError("Invalid model type.")
    if len(moral_trait_indices) == 0:
        raise ValueError("moral_trait_indices must not be empty.")

    print("ðŸ“¦ Collecting all sentence embeddings...")
    all_embeddings = [
        embedding
        for chars in fasttext_embedding_dictionary.values()
        for data in chars.values()
        for t in sentence_types
        for embedding in data.get(t, [])
    ]
    if not all_embeddings:
        raise ValueError("No sentence embeddings found for the requested sentence types.")
    all_embeddings = np.array(all_embeddings)
    # Take the embedding width from the data instead of hard-coding it.
    input_dim = all_embeddings.shape[1]

    print(f"ðŸ”§ Training {reduction_method.upper()} reducer...")
    if reduction_method == "pca":
        reducer = PCA(n_components=latent_dim)
        reducer.fit(all_embeddings)
        reduce_fn = reducer.transform
    else:  # "ae" (validated above)
        # torch is only needed for the autoencoder, so import it here.
        import torch
        import torch.nn as nn

        class AE(nn.Module):
            """Two-layer encoder / decoder trained to reconstruct its input."""

            def __init__(self, input_dim, latent_dim):
                super().__init__()
                self.encoder = nn.Sequential(
                    nn.Linear(input_dim, 128),
                    nn.ReLU(),
                    nn.Linear(128, latent_dim)
                )
                self.decoder = nn.Sequential(
                    nn.Linear(latent_dim, 128),
                    nn.ReLU(),
                    nn.Linear(128, input_dim)
                )

            def forward(self, x):
                z = self.encoder(x)
                return z, self.decoder(z)

        ae = AE(input_dim, latent_dim)
        optimizer = torch.optim.Adam(ae.parameters(), lr=1e-3)
        loss_fn = torch.nn.MSELoss()
        X_tensor = torch.tensor(all_embeddings, dtype=torch.float32)

        # Full-batch training: one optimizer step per epoch.
        ae.train()
        for epoch in range(ae_epochs):
            z, recon = ae(X_tensor)
            loss = loss_fn(recon, X_tensor)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        ae.eval()
        reduce_fn = lambda X: ae.encoder(torch.tensor(X, dtype=torch.float32)).detach().cpu().numpy()

    print("Building feature matrix...")
    min_rating_len = max(moral_trait_indices) + 1
    X, y = [], []
    for movie, chars in fasttext_embedding_dictionary.items():
        for char, data in chars.items():
            all_sentences = [emb for t in sentence_types for emb in data.get(t, [])]
            if not all_sentences:
                continue

            # Check the target BEFORE appending the features, so that row k of X
            # always belongs to the same character as row k of y.
            rating = structured_data_full[movie][char].get("rating")
            if rating is None or len(rating) < min_rating_len:
                continue

            reduced = reduce_fn(np.vstack(all_sentences))
            X.append(reduced.mean(axis=0))
            y.append([rating[i] for i in moral_trait_indices])

    if not X:
        raise ValueError(
            "No character has both sentence embeddings and a complete rating vector."
        )

    X = np.array(X)
    y = np.array(y)
    print(f"Final shape: X = {X.shape}, y = {y.shape}")

    print(f"Training {model_type.upper()} model...")
    if model_type == "ridge":
        model = Ridge()
    else:  # "mlp" (validated above)
        mlp_model = MLPRegressor(
            hidden_layer_sizes=(32, ),
            max_iter=2000,
            early_stopping=True,
            learning_rate='adaptive',
            random_state=42,
            alpha=0.01
        )

        model = make_pipeline(StandardScaler(), mlp_model)  # scale features

    model.fit(X, y)
    y_pred = model.predict(X)  # in-sample predictions

    r2 = r2_score(y, y_pred, multioutput='raw_values')
    rmse = np.sqrt(mean_squared_error(y, y_pred, multioutput='raw_values'))

    print("\n Moral Trait Prediction Results:")
    results = {}
    for i, trait_idx in enumerate(moral_trait_indices):
        trait_name = trait_index_to_name.get(trait_idx, f"Trait {trait_idx}")
        print(f"{trait_name:<35} | RÂ² = {r2[i]:.3f} | RMSE = {rmse[i]:.3f}")
        results[trait_name] = {"R2": r2[i], "RMSE": rmse[i]}
    return results


In [4]:
import pandas as pd

def evaluate_all_models_and_save(
    fasttext_embedding_dictionary,
    structured_data_full,
    trait_index_to_name,
    moral_trait_indices,
    filename="..//results//baseline_2_moral_trait_evaluation.xlsx"
):
    """Run every reducer / regressor pairing and write the metrics to an Excel file.

    ``run_moral_trait_prediction`` is called once for each of PCA+RIDGE,
    PCA+MLP, AE+RIDGE and AE+MLP (in that order). The saved table has one row
    per trait and two columns (R2 and RMSE) per pairing.

    Args:
        fasttext_embedding_dictionary, structured_data_full,
        trait_index_to_name, moral_trait_indices: forwarded unchanged to
            ``run_moral_trait_prediction``.
        filename: path of the ``.xlsx`` file to write.
    """
    combinations = [
        (reducer_name, regressor_name)
        for reducer_name in ("pca", "ae")
        for regressor_name in ("ridge", "mlp")
    ]

    results_dict = {}
    for reduction_method, model_type in combinations:
        key = f"{reduction_method.upper()} + {model_type.upper()}"
        print(f"\nRunning: {key}")
        result = run_moral_trait_prediction(
            fasttext_embedding_dictionary,
            structured_data_full,
            trait_index_to_name,
            moral_trait_indices,
            reduction_method=reduction_method,
            model_type=model_type
        )
        for trait_name, scores in result.items():
            row = results_dict.setdefault(trait_name, {})
            row[f"{key} RÂ²"] = scores["R2"]
            row[f"{key} RMSE"] = scores["RMSE"]

    # One row per trait, one column per (pairing, metric).
    df = pd.DataFrame.from_dict(results_dict, orient="index")
    df.to_excel(filename)
    print(f"\n Results saved to {filename}")


In [12]:
# Run all four reducer/regressor combinations and save the per-trait metrics to one Excel file.
evaluate_all_models_and_save(
    fasttext_embedding_dictionary,
    structured_data_full,
    trait_index_to_name,
    moral_trait_indices,
    filename="..//results//baseline_2_moral_trait_evaluation.xlsx"
)


Running: PCA + RIDGE
ðŸ“¦ Collecting all sentence embeddings...
ðŸ”§ Training PCA reducer...
Building feature matrix...
Final shape: X = (201, 20), y = (201, 31)
Training RIDGE model...

 Moral Trait Prediction Results:
cunningâ€“honorable                   | RÂ² = 0.022 | RMSE = 21.206
ferociousâ€“pacifist                  | RÂ² = 0.036 | RMSE = 20.919
forgivingâ€“vengeful                  | RÂ² = 0.050 | RMSE = 16.945
loyalâ€“traitorous                    | RÂ² = 0.010 | RMSE = 18.889
rudeâ€“respectful                     | RÂ² = 0.042 | RMSE = 15.396
arrogantâ€“humble                     | RÂ² = 0.029 | RMSE = 21.854
heroicâ€“villainous                   | RÂ² = 0.034 | RMSE = 17.644
mischievousâ€“well-behaved            | RÂ² = 0.053 | RMSE = 14.039
confidentâ€“insecure                  | RÂ² = 0.056 | RMSE = 17.969
debasedâ€“purity                      | RÂ² = 0.031 | RMSE = 16.191
selfishâ€“altruistic                  | RÂ² = 0.012 | RMSE = 13.006
angelicâ€“demonic              



In [40]:
# Single run: PCA reduction + Ridge regression. Prints the per-trait metrics and returns them as a dict.
run_moral_trait_prediction(
    fasttext_embedding_dictionary=fasttext_embedding_dictionary,
    structured_data_full=structured_data_full,
    trait_index_to_name=trait_index_to_name,
    moral_trait_indices=moral_trait_indices,
    reduction_method="pca",  # or "ae"
    model_type="ridge",      # or "mlp"
    latent_dim=20
)

ðŸ“¦ Collecting all sentence embeddings...
ðŸ”§ Training PCA reducer...
Building feature matrix...
Final shape: X = (201, 20), y = (201, 35)
Training RIDGE model...

 Moral Trait Prediction Results:
cunningâ€“honorable                   | RÂ² = 0.022 | RMSE = 21.206
ferociousâ€“pacifist                  | RÂ² = 0.036 | RMSE = 20.919
forgivingâ€“vengeful                  | RÂ² = 0.050 | RMSE = 16.945
loyalâ€“traitorous                    | RÂ² = 0.010 | RMSE = 18.889
rudeâ€“respectful                     | RÂ² = 0.042 | RMSE = 15.396
arrogantâ€“humble                     | RÂ² = 0.029 | RMSE = 21.854
heroicâ€“villainous                   | RÂ² = 0.034 | RMSE = 17.644
mischievousâ€“well-behaved            | RÂ² = 0.053 | RMSE = 14.039
confidentâ€“insecure                  | RÂ² = 0.056 | RMSE = 17.969
selfishâ€“altruistic                  | RÂ² = 0.012 | RMSE = 13.006
angelicâ€“demonic                     | RÂ² = 0.117 | RMSE = 18.286
cruelâ€“kind                          | RÂ² = 0.065 |

In [41]:
# Single run: PCA reduction + MLP regression.
run_moral_trait_prediction(
    fasttext_embedding_dictionary=fasttext_embedding_dictionary,
    structured_data_full=structured_data_full,
    trait_index_to_name=trait_index_to_name,
    moral_trait_indices=moral_trait_indices,
    reduction_method="pca",  # or "ae"
    model_type="mlp",      # or "ridge"
    latent_dim=20
)

ðŸ“¦ Collecting all sentence embeddings...
ðŸ”§ Training PCA reducer...
Building feature matrix...
Final shape: X = (201, 20), y = (201, 35)
Training MLP model...

 Moral Trait Prediction Results:
cunningâ€“honorable                   | RÂ² = -0.107 | RMSE = 22.559
ferociousâ€“pacifist                  | RÂ² = -0.087 | RMSE = 22.218
forgivingâ€“vengeful                  | RÂ² = -0.217 | RMSE = 19.183
loyalâ€“traitorous                    | RÂ² = -0.159 | RMSE = 20.442
rudeâ€“respectful                     | RÂ² = 0.005 | RMSE = 15.694
arrogantâ€“humble                     | RÂ² = -0.004 | RMSE = 22.233
heroicâ€“villainous                   | RÂ² = 0.034 | RMSE = 17.639
mischievousâ€“well-behaved            | RÂ² = -0.956 | RMSE = 20.173
confidentâ€“insecure                  | RÂ² = -0.210 | RMSE = 20.347
selfishâ€“altruistic                  | RÂ² = -1.155 | RMSE = 19.213
angelicâ€“demonic                     | RÂ² = -0.369 | RMSE = 22.772
cruelâ€“kind                          | RÂ² = 

In [44]:
# Single run: autoencoder reduction + Ridge regression.
run_moral_trait_prediction(
    fasttext_embedding_dictionary=fasttext_embedding_dictionary,
    structured_data_full=structured_data_full,
    trait_index_to_name=trait_index_to_name,
    moral_trait_indices=moral_trait_indices,
    reduction_method="ae",  # or "pca"
    model_type="ridge",      # or "mlp"
    latent_dim=20
)

ðŸ“¦ Collecting all sentence embeddings...
ðŸ”§ Training AE reducer...
Building feature matrix...
Final shape: X = (201, 20), y = (201, 35)
Training RIDGE model...

 Moral Trait Prediction Results:
cunningâ€“honorable                   | RÂ² = 0.008 | RMSE = 21.349
ferociousâ€“pacifist                  | RÂ² = 0.017 | RMSE = 21.134
forgivingâ€“vengeful                  | RÂ² = 0.037 | RMSE = 17.067
loyalâ€“traitorous                    | RÂ² = 0.004 | RMSE = 18.952
rudeâ€“respectful                     | RÂ² = 0.029 | RMSE = 15.501
arrogantâ€“humble                     | RÂ² = 0.011 | RMSE = 22.059
heroicâ€“villainous                   | RÂ² = 0.017 | RMSE = 17.798
mischievousâ€“well-behaved            | RÂ² = 0.033 | RMSE = 14.186
confidentâ€“insecure                  | RÂ² = 0.039 | RMSE = 18.128
selfishâ€“altruistic                  | RÂ² = 0.007 | RMSE = 13.045
angelicâ€“demonic                     | RÂ² = 0.087 | RMSE = 18.593
cruelâ€“kind                          | RÂ² = 0.047 | 

In [45]:
# Single run: autoencoder reduction + MLP regression.
run_moral_trait_prediction(
    fasttext_embedding_dictionary=fasttext_embedding_dictionary,
    structured_data_full=structured_data_full,
    trait_index_to_name=trait_index_to_name,
    moral_trait_indices=moral_trait_indices,
    reduction_method="ae",  # or "pca"
    model_type="mlp",      # or "ridge"
    latent_dim=20
)

ðŸ“¦ Collecting all sentence embeddings...
ðŸ”§ Training AE reducer...
Building feature matrix...
Final shape: X = (201, 20), y = (201, 35)
Training MLP model...

 Moral Trait Prediction Results:
cunningâ€“honorable                   | RÂ² = -0.178 | RMSE = 23.265
ferociousâ€“pacifist                  | RÂ² = -0.046 | RMSE = 21.795
forgivingâ€“vengeful                  | RÂ² = -0.263 | RMSE = 19.544
loyalâ€“traitorous                    | RÂ² = -0.225 | RMSE = 21.019
rudeâ€“respectful                     | RÂ² = -0.037 | RMSE = 16.019
arrogantâ€“humble                     | RÂ² = 0.003 | RMSE = 22.147
heroicâ€“villainous                   | RÂ² = 0.032 | RMSE = 17.661
mischievousâ€“well-behaved            | RÂ² = -0.636 | RMSE = 18.450
confidentâ€“insecure                  | RÂ² = -0.129 | RMSE = 19.651
selfishâ€“altruistic                  | RÂ² = -0.761 | RMSE = 17.366
angelicâ€“demonic                     | RÂ² = 0.020 | RMSE = 19.266
cruelâ€“kind                          | RÂ² = -0



## What does the model predict for iconic characters?

In [13]:
def run_moral_trait_prediction_with_model(
    fasttext_embedding_dictionary,
    structured_data_full,
    trait_index_to_name,
    moral_trait_indices,
    sentence_types=["moral", "non_moral", "action", "adj"],
    reduction_method="pca",
    model_type="ridge",
    latent_dim=20,
    ae_epochs=100
):
    """Fit the reducer and the regressor and return them for later predictions.

    The reducer (PCA or a small autoencoder) is fitted on every sentence
    embedding of the requested ``sentence_types``. Each character is then
    represented by the mean of its reduced sentence embeddings, and the
    regressor is fitted on those vectors against the selected trait ratings.

    The autoencoder is not seeded here, so ``"ae"`` results can vary between
    runs.

    Args:
        fasttext_embedding_dictionary: movie -> character -> sentence type ->
            list of sentence embedding vectors.
        structured_data_full: movie -> character -> dict that may contain a
            ``"rating"`` list.
        trait_index_to_name: accepted for signature parity with
            ``run_moral_trait_prediction``; not used by this function.
        moral_trait_indices: positions in the ``"rating"`` list to predict.
        sentence_types: sentence categories whose embeddings are used.
        reduction_method: ``"pca"`` or ``"ae"``.
        model_type: ``"ridge"`` or ``"mlp"``.
        latent_dim: size of the reduced representation.
        ae_epochs: number of full-batch training steps for the autoencoder.

    Returns:
        ``(model, reduce_fn, movie_char_pairs)`` where ``model`` is the fitted
        regressor, ``reduce_fn`` maps an ``(n, embedding_dim)`` array to an
        ``(n, latent_dim)`` array, and ``movie_char_pairs[k]`` is the
        ``(movie, character)`` that training row ``k`` came from.

    Raises:
        ValueError: on an unknown ``reduction_method`` / ``model_type``, an
            empty ``moral_trait_indices``, or when there is nothing to train on.
    """
    import numpy as np
    from sklearn.decomposition import PCA
    from sklearn.linear_model import Ridge
    from sklearn.neural_network import MLPRegressor
    from sklearn.pipeline import make_pipeline
    from sklearn.preprocessing import StandardScaler

    # Fail fast on bad configuration, before any training work is done.
    if reduction_method not in ("pca", "ae"):
        raise ValueError("Invalid reducer.")
    if model_type not in ("ridge", "mlp"):
        raise ValueError("Invalid model type.")
    if len(moral_trait_indices) == 0:
        raise ValueError("moral_trait_indices must not be empty.")

    print("ðŸ“¦ Collecting all sentence embeddings...")
    all_embeddings = [
        embedding
        for chars in fasttext_embedding_dictionary.values()
        for data in chars.values()
        for t in sentence_types
        for embedding in data.get(t, [])
    ]
    if not all_embeddings:
        raise ValueError("No sentence embeddings found for the requested sentence types.")
    all_embeddings = np.array(all_embeddings)
    # Take the embedding width from the data. A fixed 384 does not match the
    # 300-dimensional vectors this notebook builds and made the "ae" path fail.
    input_dim = all_embeddings.shape[1]

    print(f"ðŸ”§ Training {reduction_method.upper()} reducer...")
    if reduction_method == "pca":
        reducer = PCA(n_components=latent_dim)
        reducer.fit(all_embeddings)
        reduce_fn = reducer.transform
    else:  # "ae" (validated above)
        # torch is only needed for the autoencoder, so import it here.
        import torch
        import torch.nn as nn

        class AE(nn.Module):
            """Two-layer encoder / decoder trained to reconstruct its input."""

            def __init__(self, input_dim, latent_dim):
                super().__init__()
                self.encoder = nn.Sequential(
                    nn.Linear(input_dim, 128),
                    nn.ReLU(),
                    nn.Linear(128, latent_dim)
                )
                self.decoder = nn.Sequential(
                    nn.Linear(latent_dim, 128),
                    nn.ReLU(),
                    nn.Linear(128, input_dim)
                )

            def forward(self, x):
                z = self.encoder(x)
                return z, self.decoder(z)

        ae = AE(input_dim, latent_dim)
        optimizer = torch.optim.Adam(ae.parameters(), lr=1e-3)
        loss_fn = torch.nn.MSELoss()
        X_tensor = torch.tensor(all_embeddings, dtype=torch.float32)

        # Full-batch training: one optimizer step per epoch.
        ae.train()
        for epoch in range(ae_epochs):
            z, recon = ae(X_tensor)
            loss = loss_fn(recon, X_tensor)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        ae.eval()
        reduce_fn = lambda X: ae.encoder(torch.tensor(X, dtype=torch.float32)).detach().cpu().numpy()

    print("Building feature matrix...")
    min_rating_len = max(moral_trait_indices) + 1
    X, y, movie_char_pairs = [], [], []
    for movie, chars in fasttext_embedding_dictionary.items():
        for char, data in chars.items():
            all_sentences = [emb for t in sentence_types for emb in data.get(t, [])]
            if not all_sentences:
                continue

            # Check the target BEFORE appending anything, so that X, y and
            # movie_char_pairs always stay row-aligned.
            rating = structured_data_full[movie][char].get("rating")
            if rating is None or len(rating) < min_rating_len:
                continue

            reduced = reduce_fn(np.vstack(all_sentences))
            X.append(reduced.mean(axis=0))
            y.append([rating[i] for i in moral_trait_indices])
            movie_char_pairs.append((movie, char))

    if not X:
        raise ValueError(
            "No character has both sentence embeddings and a complete rating vector."
        )

    X = np.array(X)
    y = np.array(y)

    print(f"Training {model_type.upper()} model...")
    if model_type == "ridge":
        model = Ridge()
    else:  # "mlp" (validated above)
        mlp_model = MLPRegressor(
            hidden_layer_sizes=(32,),
            max_iter=2000,
            early_stopping=True,
            learning_rate='adaptive',
            random_state=42,
            alpha=0.01
        )
        model = make_pipeline(StandardScaler(), mlp_model)  # scale features

    model.fit(X, y)
    return model, reduce_fn, movie_char_pairs


In [15]:
# Re-defining the function since kernel was reset
def extract_selected_character_predictions(
    fasttext_embedding_dictionary,
    structured_data_full,
    trait_index_to_name,
    moral_trait_indices,
    model,
    reduce_fn,
    sentence_types=["moral", "non_moral", "action", "adj"],
    selected_characters=None
):
    """Tabulate true vs. predicted trait ratings, one row per (character, trait).

    Args:
        fasttext_embedding_dictionary: movie -> character -> sentence type ->
            list of sentence embedding vectors.
        structured_data_full: movie -> character -> dict that may contain a
            ``"rating"`` list.
        trait_index_to_name: trait index -> display name; indices without an
            entry are shown as ``"Trait <index>"``.
        moral_trait_indices: positions in the ``"rating"`` list to report.
        model: object whose ``predict`` takes a list of feature vectors.
        reduce_fn: callable applied to the stacked sentence embeddings.
        sentence_types: sentence categories whose embeddings are used.
        selected_characters: optional movie -> list of character names; when
            given (and non-empty) every other character is skipped.

    Returns:
        ``pandas.DataFrame`` with the columns ``Character``
        (``"<movie>_<character>"``), ``Trait``, ``Ground Truth`` and
        ``Predicted Rating``. Characters without sentences, without a
        ``"rating"`` entry, or with a rating list that is too short are left out.
    """
    result_rows = []

    for movie, chars in fasttext_embedding_dictionary.items():
        for char, data in chars.items():
            if selected_characters:
                is_selected = movie in selected_characters and char in selected_characters[movie]
                if not is_selected:
                    continue

            all_sentences = [emb for t in sentence_types for emb in data.get(t, [])]
            if len(all_sentences) == 0:
                continue

            # Character feature = mean of the reduced sentence embeddings.
            avg_vector = reduce_fn(np.vstack(all_sentences)).mean(axis=0)

            char_entry = structured_data_full[movie][char]
            if "rating" not in char_entry:
                continue

            char_rating = char_entry["rating"]
            if len(char_rating) <= max(moral_trait_indices):
                continue

            true_vector = [char_rating[i] for i in moral_trait_indices]
            pred_vector = model.predict([avg_vector])[0]

            label = f"{movie}_{char}"
            result_rows.extend(
                {
                    "Character": label,
                    "Trait": trait_index_to_name.get(trait_idx, f"Trait {trait_idx}"),
                    "Ground Truth": true_vector[position],
                    "Predicted Rating": pred_vector[position],
                }
                for position, trait_idx in enumerate(moral_trait_indices)
            )

    return pd.DataFrame(result_rows)


In [16]:
# Movie title -> character names to report on. Both are matched exactly against the
# keys of fasttext_embedding_dictionary when the predictions are extracted.
selected_characters = {
    "The Dark Knight": ["THE JOKER"],
    "Toy Story": ["WOODY"],
    "The Shawshank Redemption": ["RED"],
    "Star Wars: Episode IV - A New Hope": ["LUKE"]
}

In [18]:

# Fit the PCA + Ridge baseline once and keep the fitted pieces for prediction.
model, reduce_fn, movie_char_pairs = run_moral_trait_prediction_with_model(
    fasttext_embedding_dictionary,
    structured_data_full,
    trait_index_to_name,
    moral_trait_indices,
    reduction_method="pca",
    model_type="ridge",
)

# Ground truth vs. prediction for the selected characters only.
df_preds = extract_selected_character_predictions(
    fasttext_embedding_dictionary,
    structured_data_full,
    trait_index_to_name,
    moral_trait_indices,
    model=model,
    reduce_fn=reduce_fn,
    sentence_types=["moral", "non_moral", "action", "adj"],
    selected_characters=selected_characters,
)

# One table per character; the "Character" column is dropped once the rows are isolated.
joker_result_df = df_preds[df_preds["Character"] == "The Dark Knight_THE JOKER"].drop(columns=["Character"])
woody_result_df = df_preds[df_preds["Character"] == "Toy Story_WOODY"].drop(columns=["Character"])
red_result_df = df_preds[df_preds["Character"] == "The Shawshank Redemption_RED"].drop(columns=["Character"])
luke_result_df = df_preds[df_preds["Character"] == "Star Wars: Episode IV - A New Hope_LUKE"].drop(columns=["Character"])

# Save each table to its own Excel file.
joker_result_df.to_excel("..//results//baseline_2_joker_result.xlsx", index=False)
woody_result_df.to_excel("..//results//baseline_2_woody_result.xlsx", index=False)
red_result_df.to_excel("..//results//baseline_2_red_result.xlsx", index=False)
luke_result_df.to_excel("..//results//baseline_2_luke_result.xlsx", index=False)

ðŸ“¦ Collecting all sentence embeddings...
ðŸ”§ Training PCA reducer...
Building feature matrix...
Training RIDGE model...
