In [1]:
import pandas as pd
import sys
import os
from collections import Counter

# Add src to path
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

from utils import ResourceTracker, set_global_seed, ExperimentLogger
from nlp_models import load_ner_pipeline, extract_countries_bert
from llm_client import GeminiClient

# Setup
logger = ExperimentLogger()
set_global_seed(42)

Logging experiments to: reports/experiment_20260107_185211.json
Global seed set to: 42


## 1. Load Data
We load the processed speeches.

In [2]:
# Assuming data is in root/data
try:
    df = pd.read_csv('../data/putins_talks_prepared.csv')
    # Convert date for sorting/filtering
    df['date'] = pd.to_datetime(df['date'])
    print(f"Loaded {len(df)} rows.")
except FileNotFoundError:
    print("Data file not found. Please ensure data/putins_talks_prepared.csv exists.")
df.head(2)

Loaded 5079 rows.


Unnamed: 0,date,persons,transcript_unfiltered,kremlin_id,place,title,teaser,tags,transcript_filtered,wordlist,grouped_tages
0,2012-05-07 12:20:00,[],The ceremony opened with the Russian State Fla...,15224.0,"The Kremlin, Moscow",Vladimir Putin inaugurated as President of Russia,The inauguration ceremony took place in the Gr...,[],"Citizens of Russia, friends, The inauguration ...","['citizen', 'of', 'Russia', ',', 'friend', ','...",[]
1,2012-05-08 16:00:00,[],State Duma deputies approved Dmitry Medvedev a...,15266.0,Moscow,State Duma plenary session,Vladimir Putin presented the candidacy of Dmit...,['Civil service'],"Mr Naryshkin, deputies of the Russian parliame...","['Mr', 'Naryshkin', ',', 'deputy', 'of', 'the'...",['State_Governance_Public_Service']


## 2. BERT Extraction (Traditional Pipeline)
We extract all mentions (duplicates included) and normalize them using our helper functions.


In [3]:
# Load model (cached)
ner_pipe = load_ner_pipeline()

# Run on a random sample first
sample_size = 10
df_sample = df.sample(n=min(sample_size, len(df)), random_state=42).copy()

print(f"Processing {len(df_sample)} speeches with BERT...")

with ResourceTracker("BERT Extraction") as tracker:
    # Apply extraction row by row
    df_sample['bert_countries'] = df_sample['transcript_filtered'].apply(
        lambda x: extract_countries_bert(x, ner_pipe)
    )

logger.log_operation("BERT Extraction", tracker.duration, tracker.peak_memory_mb)

# Show an example of what was found
print("Example Output (List of countries found):")
print(df_sample['bert_countries'].iloc[0])


Loading NER model: dslim/bert-base-NER on device 0...


Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Processing 10 speeches with BERT...
[BERT Extraction] Finished.
   Duration: 6.8977 seconds
   Peak Memory: 14.72 MB
Example Output (List of countries found):
['Syria', 'Turkey', 'Iran', 'Syria', 'Russia', 'Turkey', 'Iran', 'Syria', 'Syria', 'Syria', 'Iran', 'Turkey', 'Syria', 'Syria']


## 3. Modern Pipeline (LLM) vs Manual vs BERT

In [4]:
# Prepare file for pasting into Gemini Chat
cleared_df_sample = df_sample[['date', 'transcript_filtered']].copy()
cleared_df_sample.to_csv('../data/samples/ner_gemini_input_sample.csv', index=False)

In [5]:
# Prepare for manual annotation and pasting Gemini results
df_sample['manual_countries'] = "" # Empty column to fill
df_sample = df_sample[['date', 'transcript_filtered', 'bert_countries', 'manual_countries']]

# Save to CSV
output_path = '../data/samples/ner_validation_sample.csv'
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_sample.to_csv(output_path, index=False)

print(f"Saved validation sample to {output_path})")

Saved validation sample to ../data/samples/ner_validation_sample.csv)


## 4. Evaluation Logic

In [5]:
# Reload annotated data
try:
    annotated_df = pd.read_csv('../data/samples/ner_validation_sample_annotated.csv')
    
    def parse_list_like(x):
        """Return a list preserving duplicates for inputs that may be:
           - a list
           - a string like "['A', 'B']"
           - a comma-separated string "A, B, A"
           - NaN/empty -> []
        """
        if pd.isna(x):
            return []
        if isinstance(x, list):
            return x
        s = str(x).strip()
        if not s:
            return []
        # try literal_eval for python-list-like strings
        try:
            val = ast.literal_eval(s)
            if isinstance(val, list):
                return [str(i).strip() for i in val if str(i).strip()]
        except Exception:
            pass
        # fallback: comma-separated
        return [part.strip() for part in s.split(',') if part.strip()]

    def calculate_metrics(pred_list, true_str):
        true_list = parse_list_like(true_str)
        pred_list = parse_list_like(pred_list)
    
        true_counter = Counter(true_list)
        pred_counter = Counter(pred_list)
    
        # true positives = sum of minimum counts per item
        tp = sum((true_counter & pred_counter).values())
        total_pred = sum(pred_counter.values())
        total_true = sum(true_counter.values())
    
        fp = total_pred - tp
        fn = total_true - tp
    
        # If both empty, treat as perfect match
        if total_pred == 0 and total_true == 0:
            return 1.0, 1.0, 1.0
    
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
        f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0.0
        return precision, recall, f1

    # Calculate scores
    llm_columns = ["bert_countries", "gemini_countries_1", "gemini_countries_2", "gemini_countries_3"]
    metrics_rows = []
    for col in llm_columns:
        if col not in annotated_df.columns:
            continue
        metrics = annotated_df.apply(lambda row: calculate_metrics(row[col], row.get('manual_countries', '')), axis=1)
        precision_mean = metrics.apply(lambda x: x[0]).mean()
        recall_mean = metrics.apply(lambda x: x[1]).mean()
        f1_mean = metrics.apply(lambda x: x[2]).mean()
        metrics_rows.append({'method': col, 'precision': precision_mean, 'recall': recall_mean, 'f1': f1_mean})
    
    metrics_df = pd.DataFrame(metrics_rows, columns=['method', 'precision', 'recall', 'f1'])
    print(metrics_df)

except Exception as e:
    print(f"Could not load/parse annotated file: {e}")

               method  precision    recall        f1
0      bert_countries   0.831232  0.781806  0.802073
1  gemini_countries_1   0.979167  0.994845  0.986625
2  gemini_countries_2   0.822648  0.768397  0.770107
3  gemini_countries_3   0.974196  0.923785  0.935087
