In [1]:

import torch
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Local checkpoint directory shared by the tokenizer and the model weights.
MODEL_DIR = "./model/mt0-small"
# Every story is truncated / padded to exactly this many tokens.
MAX_LENGTH = 256
# Number of stories pushed through the encoder per forward pass; bounds peak
# memory regardless of how many rows the CSV contains.
BATCH_SIZE = 32

tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
# Only the encoder stack of the seq2seq model is kept, switched to eval mode.
# NOTE: a later cell rebinds the name `model` to the XGBoost classifier.
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_DIR).encoder.eval()

df = pd.read_csv("stories.csv")
# Binary label: True for rows whose affiliation is exactly "hero", else False.
df["affiliation"] = df["affiliation"] == "hero"

stories = df["story"].to_list()
embedding_batches = []
with torch.no_grad():
    for start in range(0, len(stories), BATCH_SIZE):
        encoded = tokenizer(
            stories[start:start + BATCH_SIZE],
            truncation=True,
            return_tensors="pt",
            padding="max_length",
            max_length=MAX_LENGTH,
        )
        hidden = model(**encoded).last_hidden_state
        # Flatten (tokens, hidden) into one feature vector per story. Because
        # of padding="max_length", every vector has the same length and the
        # padded positions are part of it.
        embedding_batches.append(hidden.reshape(hidden.shape[0], -1))

# One 1-D numpy array per row, in the same order as the DataFrame.
df["embedding"] = list(torch.cat(embedding_batches).numpy())

In [2]:
# Display the frame to inspect the boolean `affiliation` label and the
# per-story `embedding` column built in the previous cell.
df

Unnamed: 0,name,affiliation,story,embedding
0,Elena,True,"In a quiet village nestled between mountains, ...","[0.0134226, 0.09059348, 0.24420658, 0.093445, ..."
1,Victor,False,"Once a respected scholar, Victor became obsess...","[-0.13793102, -0.08190563, 0.22876039, 0.07411..."
2,Mira,True,"When the river dried up, leaving the town of G...","[-0.14537127, -0.07793305, 0.27147087, 0.06156..."
3,Lucian,False,"A charismatic nobleman, Lucian was once belove...","[-0.06226392, -0.09527724, 0.18944606, 0.07600..."
4,Sofia,True,"In a bustling port city, a young seamstress na...","[-0.04897955, 0.043331318, 0.25325763, 0.05164..."
...,...,...,...,...
495,Omar,False,"The city was running out of green spaces, but ...","[-0.28018677, -0.16312619, 0.19061351, 0.04516..."
496,Fayza,False,"The rare historical artifact was stolen, and F...","[-0.27232927, -0.16289006, 0.2347683, 0.065352..."
497,Ibrahim,False,"The town was devastated by a flood, but Ibrahi...","[-0.27734616, -0.13257247, 0.22166508, 0.06070..."
498,Yasmin,False,"The city’s air quality was deteriorating, and ...","[-0.2838643, -0.15304893, 0.19749913, 0.073238..."


In [3]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Stack the per-story embedding vectors into a 2-D feature matrix
# (one row per story).
X = np.vstack(df["embedding"].to_numpy())
# Boolean affiliation label as integers: 1 = hero, 0 = anything else.
y = df["affiliation"].astype(int)

# Hold out 20% of the stories for evaluation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Gradient-boosted tree classifier trained on the flattened story embeddings.
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# ignores it and only emits a "Parameters ... are not used" warning.
model = xgb.XGBClassifier(eval_metric="logloss")
model.fit(X_train, y_train)

# Predict on test data
y_pred = model.predict(X_test)

# Evaluate model: fraction of held-out stories whose label was predicted correctly
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")


Parameters: { "use_label_encoder" } are not used.



Model Accuracy: 0.95


In [5]:
# Show the raw predicted labels for the held-out split (1 = hero, 0 = other);
# this is the same call that produced `y_pred` in the previous cell.
model.predict(X_test)

array([0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0])