In [None]:
# Declare a list of tasks whose products you want to use as inputs.
# Left as None in the notebook source, i.e. no upstream tasks are declared here.
# NOTE(review): looks like a pipeline-runner placeholder that may be overwritten
# at execution time — confirm.
upstream = None


In [9]:
# Parameters cell
# `product` is left as None in the notebook source.
# NOTE(review): presumably replaced by the pipeline runner with this notebook's
# output location when executed — confirm.
product = None

In [1]:
import pandas as pd
import sys
from pathlib import Path
import logging
import json
from tqdm import tqdm

# Add project root to path
# The parent of the current working directory is treated as the project root so
# that the `src` package imported below can be found. The append is guarded so
# that re-running this cell does not keep adding duplicate entries to sys.path.
PROJECT_ROOT = str(Path.cwd().parent)
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.data_tools.czech_data_tools import (
    extract_author_comments_simple,
    load_czech_media_data,
)
from src.models.predictor import TrollPredictor
from src.analysis.benchmark import CzechBenchmark

# Logging setup
# Module-level logger for this notebook, named after the running module.
logger = logging.getLogger(__name__)
# Configure the root logger at INFO level.
logging.basicConfig(level=logging.INFO)

In [None]:
# Checkpoint file holding the trained model weights
model_path = "./checkpoints/best_model.pt"

# Initialize predictor with model.
# The checkpoint path is prefixed with "../" before being handed over
# (presumably because the notebook runs one level below the project root — confirm).
predictor = TrollPredictor(
    max_length=96,
    comments_per_user=5,
    model_path=f"../{model_path}",
)

INFO:src.models.predictor:Loaded model weights from .././checkpoints/best_model.pt


In [8]:
# Data directory, given relative to the notebook's working directory
DATA_DIR = Path('../data')

# Load the comments stored under the MediaSource subfolder
czech_comments = load_czech_media_data(str(DATA_DIR.joinpath('MediaSource')))

# Report the number of loaded comments and of distinct values in the 'author' column
print(
    f"Loaded {len(czech_comments)} comments from {czech_comments['author'].nunique()} unique authors"
)

Loading files: 100%|██████████| 124/124 [00:11<00:00, 10.34it/s]


Loaded 845551 comments from 66578 unique authors


In [4]:
benchmark = CzechBenchmark()

# Create benchmark set if it doesn't exist
# NOTE(review): this path is relative to the current working directory, whereas
# the data directory used elsewhere in this notebook is '../data' — confirm it
# matches the location where CzechBenchmark stores benchmark_cases.json.
if not (Path("data/benchmark") / "benchmark_cases.json").exists():
    benchmark.create_benchmark_set(
        comments_df=czech_comments,
        n_authors=10,
        min_comments=5
    )

# Authors added to the benchmark by hand.
# Each author is listed exactly once: "Štěpán Malák" used to appear both among
# the exploratory picks and in the annotated-troll section; only the annotated
# entry is kept and the earlier note is merged into its comment.
manual_authors = [
    "Pavel Hanzl",  # Very active commenter with strong opinions
    "Jitka Bártová",  # High troll confidence but does not seem too trolly to me
    "Roman Myška",   # Interesting case, don't think he is a troll
    "Roman Dostál",
    "Tomáš Volf",
    "Antonín Sova", # Aggressively against Trump, Fico etc. (good example of 60% confidence)
    "Štěpán Malák", # Manually annotated as troll ---V (also: funny comment about Fiala)
    "Jan Benda",
    "Jindra Macek",
    "Josef Fortelný",
    "Michal Musil",
    "Jan Trejbal",
    "Michal Antonín",
    "Gabi Muller",
    "Ivan Penzes",
    "Martin Ondík", # TROLLS UNTIL HERE ------
    "Radek Palán", # Manually annotated as non troll ---V
    "Michal Žák",
    "Michal Žemla" # NON TROLLS UNTIL HERE ------

]

# Safety net for future edits of the list: drop repeated names while keeping
# first-seen order, so add_authors is never handed the same author twice.
manual_authors = list(dict.fromkeys(manual_authors))

# Add these authors to benchmark
benchmark.add_authors(manual_authors, czech_comments)

INFO:src.analysis.benchmark:Created benchmark set with 50 authors
INFO:src.analysis.benchmark:Added 1659 comments from Pavel Hanzl
INFO:src.analysis.benchmark:Added 20 comments from Jitka Bártová
INFO:src.analysis.benchmark:Added 155 comments from Roman Myška
INFO:src.analysis.benchmark:Added 3 comments from Hosek Miroslav
INFO:src.analysis.benchmark:Added 8 comments from Roman Dostál
INFO:src.analysis.benchmark:Added 6 comments from Tomáš Volf
INFO:src.analysis.benchmark:Added 39 comments from Štěpán Malák
INFO:src.analysis.benchmark:Added 64 comments from Jan Benda
INFO:src.analysis.benchmark:Added 22 comments from Jindra Macek
INFO:src.analysis.benchmark:Added 40 comments from Josef Fortelný
INFO:src.analysis.benchmark:Added 37 comments from Michal Musil
INFO:src.analysis.benchmark:Added 64 comments from Jan Trejbal
INFO:src.analysis.benchmark:Added 99 comments from Michal Antonín
INFO:src.analysis.benchmark:Added 33 comments from Gabi Muller
INFO:src.analysis.benchmark:Added 25 com

In [5]:
# Run benchmark tests
# Hands the predictor to the benchmark and keeps the return value in `results`;
# the display cell that follows reads results['summary'] and results['predictions'].
results = benchmark.run_benchmark(predictor)

In [9]:
# Display Results
# Summary section: header, 40-dash rule, then the three counters from results['summary'].
print(
    "\nBenchmark Results Summary:",
    "-" * 40,
    f"Total authors tested: {results['summary']['total_authors']}",
    f"Total comments analyzed: {results['summary']['total_comments']}",
    f"Troll predictions: {results['summary']['troll_predictions']}",
    sep="\n",
)

# Detailed section: one three-line entry per author in results['predictions'],
# each preceded by a blank line; confidence is rendered as a percentage with two decimals.
print("\nDetailed Results:", "-" * 40, sep="\n")
for author, data in results['predictions'].items():
    print(
        f"\nAuthor: {author}",
        f"Prediction: {data['prediction']}",
        f"Confidence: {data['confidence']:.2%}",
        sep="\n",
    )


Benchmark Results Summary:
----------------------------------------
Total authors tested: 68
Total comments analyzed: 37162
Troll predictions: 41

Detailed Results:
----------------------------------------

Author: Jan Novák
Prediction: troll
Confidence: 99.35%

Author: Pavel Hanzl
Prediction: not_troll
Confidence: 99.84%

Author: Petr Novák
Prediction: not_troll
Confidence: 71.87%

Author: Pavel Novák
Prediction: troll
Confidence: 93.46%

Author: Martin Holý
Prediction: troll
Confidence: 97.36%

Author: Pavel Dvořák
Prediction: not_troll
Confidence: 58.29%

Author: Pavel Sladovník
Prediction: troll
Confidence: 61.58%

Author: Vladimír Mrkva
Prediction: troll
Confidence: 97.76%

Author: Lubomir Kvasnicka
Prediction: not_troll
Confidence: 96.59%

Author: Petr Dvořák
Prediction: troll
Confidence: 97.71%

Author: Martin Novák
Prediction: troll
Confidence: 87.75%

Author: Jiří Černohorský
Prediction: not_troll
Confidence: 96.85%

Author: Zdeněk Hübner
Prediction: troll
Confidence: 99.55%


In [14]:
# Extract comments for a specific author
# `extract_author_comments_simple` is imported in the notebook's top import cell
# together with the other project helpers, so this cell carries no import of its own.
author = "Roman Dostál"

# The helper is called with the full comments frame and the author name; its
# return value is kept in `author_comments` for further inspection.
author_comments = extract_author_comments_simple(czech_comments, author)


Extracted 8 comments from Roman Dostál
Comments saved to: extracted_comments/Roman_Dostál_20250327_013903.json

Sample of comments:
--------------------------------------------------------------------------------
Teď daňový poplatník USA dost kouká kam šli jeho a “půjčené” peníze.Další výhra nad socialismem,děkujeme
--------------------------------------------------------------------------------
Máme ale 400 generálů.To musí každou zemi odstrašit.
--------------------------------------------------------------------------------
Robert MroczkowskiZkáza woke v USA.Ale to je přeci skvělá zpráva.Zas si občané USA budou moct vyndat vlajky a komunisté budou jen brblat na Novinkách.
--------------------------------------------------------------------------------
Proč proboha?Nás to všechny bavilo kde je propustnost FB.‍♂️No nic,tak zkusim Seznam ☝️
--------------------------------------------------------------------------------
Spoléhala na hlasy v hlavě-a najednou nic.
----------------------