In [1]:
import torch
import pandas as pd
import numpy as np
from transformers import AutoModel, AutoTokenizer
from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_auc_score
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from src.models.predictor import TrollPredictor

from src.data_tools.czech_data_tools import load_czech_media_data

In [None]:
# Predictor from Checkpoint
# predictor = TrollPredictor(
#     model_path= "checkpoints/best_model.pt",
#     comments_per_user=10,
#     max_length=64
# )

# Predictor from Hugging Face pretrained model
predictor = TrollPredictor(
    model_name = "ufal/robeczech-base",
    comments_per_user=10,
    max_length=96
)
    

In [3]:
from pathlib import Path

DATA_DIR = Path('data')
czech_comments = load_czech_media_data(str(DATA_DIR / 'MediaSource'))
print(f"Loaded {len(czech_comments)} comments from {czech_comments['author'].nunique()} unique authors")

Loading files:   0%|                                                                            | 0/124 [00:00<?, ?it/s]

Loading files: 100%|██████████████████████████████████████████████████████████████████| 124/124 [00:10<00:00, 11.53it/s]


Loaded 845764 comments from 66590 unique authors


In [43]:
czech_comments.columns

Index(['text', 'raw_text', 'author', 'timestamp', 'article_title', 'url',
       'article_id', 'sentiment'],
      dtype='object')

In [None]:
# Prepare Account Embeddings
account_embeddings = []
account_labels = []

authors = list(czech_comments["author"].unique())[:10000]

# Filter authors with at least 10 comments
authors = [author for author, count in czech_comments["author"].value_counts().items() if count >= 10]

# Ensure the model is on the correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
predictor.model.to(device)

for author in authors:
    group = czech_comments[czech_comments["author"] == author]


    comments = group["text"].tolist()

    # Pad or trim to predictor.comments_per_user:
    if len(comments) >= predictor.comments_per_user:
        comments = comments[:predictor.comments_per_user]
    else:
        comments = (comments * ((predictor.comments_per_user // len(comments)) + 1))[:predictor.comments_per_user]

    # Tokenize
    encoded = predictor.tokenizer(comments, padding=True, truncation=True, max_length=predictor.max_length, return_tensors="pt")

    # Move input tensors to the same device as the model
    encoded = {key: tensor.to(device) for key, tensor in encoded.items()}

    with torch.no_grad():
        # Pass the tokenized inputs to the TrollDetector model
        outputs = predictor.model(
            input_ids=encoded["input_ids"],
            attention_mask=encoded["attention_mask"],
            tweets_per_account=predictor.comments_per_user
        )
        account_emb = outputs["account_embedding"].cpu().numpy()
        

    # Aggregate comments into account embedding
    account_embeddings.append(account_emb)

account_embeddings = np.vstack(account_embeddings)
account_labels = np.array(account_labels) if account_labels else None

In [45]:
# Fit isolation forest
clf = IsolationForest(contamination=0.1, random_state=42)
clf.fit(account_embeddings)
scores = -clf.decision_function(account_embeddings)

In [None]:
# Evaluate
if account_labels is not None:
    print("ROC-AUC:", roc_auc_score(account_labels, scores))

# Save Anomaly Scores
pd.DataFrame({
    "author": authors,
    "anomaly_score": scores
}).to_csv("anomaly_scores.csv", index=False)


ValueError: All arrays must be of the same length

In [None]:
# Visualization
tsne = TSNE(n_components=2, random_state=42)
tsne_embs = tsne.fit_transform(account_embeddings)

plt.figure(figsize=(8, 6))
if account_labels is not None:
    plt.scatter(tsne_embs[:, 0], tsne_embs[:, 1], c=account_labels, cmap="coolwarm", alpha=0.7)
    plt.colorbar(label="Troll label")
else:
    plt.scatter(tsne_embs[:, 0], tsne_embs[:, 1], c=scores, cmap="viridis", alpha=0.7)
    plt.colorbar(label="Anomaly Score")
plt.title("t-SNE Visualization of Account Embeddings")
plt.show()

In [None]:
# --- Show Top Anomalous Authors ---
results_df = pd.DataFrame({"author": authors,
    "anomaly_score": scores
}).sort_values("anomaly_score", ascending=False)

# Filter authors with at least 5 comments
author_comment_counts = czech_comments["author"].value_counts()
authors_with_min_comments = author_comment_counts[author_comment_counts >= 5].index


# Filter results_df to include only authors with at least 5 comments
results_df = results_df[results_df["author"].isin(authors_with_min_comments)]

print("\nTop 10 most anomalous authors:")
print(results_df.head(10))

In [None]:
import matplotlib.pyplot as plt

# Plot the distribution of anomaly scores
plt.figure(figsize=(10, 6))
plt.hist(results_df["anomaly_score"], bins=50, color="skyblue", alpha=0.7, edgecolor="black")
plt.title("Distribution of Anomaly Scores", fontsize=16)
plt.xlabel("Anomaly Score", fontsize=14)
plt.ylabel("Frequency", fontsize=14)
plt.grid(axis="y", linestyle="--", alpha=0.7)
plt.show()

In [None]:
# Show Sample Comments from Top Anomalous Authors
top_authors = results_df.head(10)["author"].tolist()

print("\nSample comments from top anomalous authors:\n")
for author in top_authors:
    author_comments = czech_comments[czech_comments["author"] == author]["text"].tolist()[:5]
    print(f"Author: {author}")
    for comment in author_comments:
        print(f" - {comment}")
    print("-" * 40)

In [None]:
# Show Sample Comments from Least Anomalous Authors 
least_anomalous_authors = results_df.tail(10)["author"].tolist()

print("\nSample comments from least anomalous authors:\n")
for author in least_anomalous_authors:
    author_comments = czech_comments[czech_comments["author"] == author]["text"].tolist()[:5]
    print(f"Author: {author}")
    for comment in author_comments:
        print(f" - {comment}")
    print("-" * 40)