In [1]:
import pandas as pd
import sys
import os
import json
from datetime import datetime

# Add src to path to import local modules
sys.path.append(os.path.abspath(os.path.join('..', 'src')))

# Import your custom modules
from utils import ResourceTracker, set_global_seed, ExperimentLogger
from nlp_models import (
    load_zero_shot_pipeline, 
    extract_context_sentences, 
    classify_sentences_batch, 
    get_hf_model_metadata,
    get_accompanying_terms
)
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import numpy as np
from ast import literal_eval

# Experiment bookkeeping: reports are written under REPORTS_DIR and every
# run uses the same SEED.
REPORTS_DIR = "../reports"
SEED = 42

logger = ExperimentLogger(log_dir=REPORTS_DIR)
set_global_seed(SEED)

Logging experiments to: ../reports/experiment_20260107_232326.json
Global seed set to: 42


# 1. Finding top countries

In [2]:
# Load the talks; `extracted_countries` holds stringified Python lists.
df = pd.read_csv("../data/processed/putins_talks_with_countries.csv")

# Parse each list, flatten to one country per row and drop missing entries.
all_countries = (
    df['extracted_countries']
    .apply(literal_eval)
    .explode()
    .dropna()
)

# Count mentions of every country except Russia itself.
country_counts = all_countries.loc[all_countries.ne('Russia')].value_counts()
country_counts

extracted_countries
Ukraine           3838
China             2964
Syria             2384
United States     1646
Turkey            1468
                  ... 
Greenland            1
Virgin Islands       1
Seychelles           1
Samoa                1
Bhutan               1
Name: count, Length: 170, dtype: int64

In [3]:
# Persist the per-country mention counts (Russia already excluded) to CSV.
country_counts.to_csv("../data/processed/country_counts.csv")

In [4]:
# value_counts() orders by descending frequency, so the first five index
# entries are the five most-mentioned countries.
top_countries = country_counts.index[:5].tolist()
top_countries

['Ukraine', 'China', 'Syria', 'United States', 'Turkey']

# 2. Extracting sentences mentioning chosen countries

In [5]:
# Create target terms
# Maps a country name to the list of terms passed as `target_terms` to the
# sentence-extraction and accompanying-terms helpers. The dictionary key is
# also used in the per-country file names (sentences_<key>.csv), and the
# insertion order is the order in which countries are processed.
# NOTE(review): 'Turkey' is in the top-5 list computed above but has no entry
# here, while 'Russia' has an entry but is excluded from the top-5 list —
# confirm which set of countries is intended.
target_terms = {
    'Poland': ['Poland', 'Polish', 'Poles'],
    'Ukraine': ['Ukraine', 'Ukrainian', 'Ukrainians'],
    'United States': ['United States', 'USA', 'American', 'Americans', 'U.S.'],
    'China': ['China', 'Chinese'],
    'Syria': ['Syria', 'Syrian'],
    'Russia': ['Russia', 'Russian', 'Russians', 'Soviets', 'Soviet Union', 'USSR', 'Soviet', 'Russian Federation']
}

In [None]:
# Poland is analysed in addition to the most-mentioned countries.
# Guarded so that re-running this cell does not append it a second time.
if 'Poland' not in top_countries:
    top_countries.append('Poland')

# Loop-invariant inputs: convert the columns to lists once, not per country.
transcripts = df['transcript_filtered'].tolist()
transcript_dates = df['date'].tolist()

# DataFrame.to_csv does not create missing directories.
os.makedirs("../data/sentences", exist_ok=True)

for key, terms in target_terms.items():
    print(f"Processing country: {key}")
    # Extract context sentences that mention any of this country's terms
    with ResourceTracker("Sentence Extraction") as rt:
        extracted_data = extract_context_sentences(
            texts=transcripts,
            dates=transcript_dates,
            target_terms=terms,
            spacy_model_name="en_core_web_sm"
        )
    # Include the country so the per-country log entries can be told apart.
    logger.log_operation(
        "Sentence_Extraction",
        rt.duration,
        rt.peak_memory_mb,
        {"country": key, "count": len(extracted_data)}
    )
    df_sentences = pd.DataFrame(extracted_data)
    print(f"Found {len(df_sentences)} relevant sentences.")
    df_sentences.to_csv(f"../data/sentences/sentences_{key}.csv", index=False)
    # A bare expression inside a loop is never rendered; display it explicitly.
    display(df_sentences.head())

Processing country: Poland


# 3. Finding accompanying terms

In [None]:
# For every country, annotate each extracted sentence with the terms that
# accompany the country's target terms and save the result to a new CSV.
for key, terms in target_terms.items():
    print(f"Processing country: {key}")
    df_sentences = pd.read_csv(f"../data/sentences/sentences_{key}.csv")
    # Build the column in one pass, row by row. The previous
    # `df.at[boolean_mask, col] = ...` form is invalid (`.at` only accepts a
    # single scalar row label) and rescanned the whole frame per sentence.
    # The result is stored as its string representation, as before.
    df_sentences['accompanying_terms'] = [
        str(get_accompanying_terms(sentence, target_terms=terms))
        for sentence in df_sentences['sentence']
    ]
    df_sentences.to_csv(f"../data/sentences/sentences_{key}_with_terms.csv", index=False)

# 4. Classifying sentences using zero-shot classification

In [None]:
# --- Configuration: model checkpoint and the two candidate label sets ---
model_name = "facebook/bart-large-mnli"
LABELS_BASE = ["partner", "enemy", "neutral"]
LABELS_SYNONYMS = ["ally", "adversary", "unbiased"]

# --- Load the zero-shot classification pipeline ---
print(f"Loading model: {model_name}...")
zs_pipeline = load_zero_shot_pipeline(model_name)

# --- Record model provenance for the report ---
meta = get_hf_model_metadata(zs_pipeline)
print(
    f"Model Architecture: {meta['model_architecture']}",
    f"Commit Hash: {meta['commit_hash']}",
    sep="\n",
)

In [None]:
# Per-country label sets that replace the default partner/enemy/neutral
# framing. Kept in a lookup so the global LABELS_BASE / LABELS_SYNONYMS are
# never reassigned (reassigning them inside the loop would leak the override
# into later iterations and into re-runs of this cell).
LABEL_OVERRIDES = {
    'Russia': (
        ["victim", "leader", "defender"],
        ["casualty", "commander", "protector"],
    ),
}

# Iterate over the countries that actually have sentence files
# (sentences_<key>_with_terms.csv is written once per `target_terms` key).
# `top_countries` contains countries without such a file and can never
# contain 'Russia', which made the Russia-specific labels unreachable.
for country in target_terms:
    print(f"Classifying sentences for country: {country}")
    df_sentences = pd.read_csv(f"../data/sentences/sentences_{country}_with_terms.csv")
    sentences = df_sentences['sentence'].tolist()

    labels_base, labels_syn = LABEL_OVERRIDES.get(
        country, (LABELS_BASE, LABELS_SYNONYMS)
    )

    # Classify sentences in batches with the base label set
    print("Running Zero-Shot (Base Labels)...")
    with ResourceTracker("ZeroShot_BaseLabels") as rt:
        results_base = classify_sentences_batch(zs_pipeline, sentences, labels_base)

    logger.log_operation(
        name="ZeroShot_Base",
        duration=rt.duration,
        memory_mb=rt.peak_memory_mb,
        metrics={"model": model_name, "labels": labels_base, "hash": meta['commit_hash']}
    )

    df_sentences['zs_base_label'] = [r['top_label'] for r in results_base]
    df_sentences['zs_base_score'] = [r['top_score'] for r in results_base]

    # Repeat with the synonym label set (previously referenced an undefined
    # `sentences` variable and raised NameError)
    print("Running Zero-Shot (Synonym Labels)...")
    with ResourceTracker("ZeroShot_SynonymLabels") as rt:
        results_syn = classify_sentences_batch(zs_pipeline, sentences, labels_syn)

    logger.log_operation(
        name="ZeroShot_Synonyms",
        duration=rt.duration,
        memory_mb=rt.peak_memory_mb,
        metrics={"model": model_name, "labels": labels_syn, "hash": meta['commit_hash']}
    )

    df_sentences['zs_syn_label'] = [r['top_label'] for r in results_syn]
    df_sentences['zs_syn_score'] = [r['top_score'] for r in results_syn]

    # Save classified results
    df_sentences.to_csv(f"../data/sentences/sentences_{country}_classified.csv", index=False)
    print(f"Saved classified sentences for {country}.")

# 5. Classifying sentences using Gemini and human annotation

# 6. Classification evaluation

# 7. Aggregating and saving classification counts

In [None]:
def aggregate_counts(df, method_col, method_name):
    """Tally how often each label occurs per `found_term` for one method.

    Args:
        df: DataFrame containing a 'found_term' column and `method_col`.
        method_col: Name of the column holding that method's labels.
        method_name: Value written to the 'method' column of the result.

    Returns:
        DataFrame with columns 'found_term', 'label', 'count', 'method',
        one row per observed (found_term, label) pair.
    """
    per_group = df.groupby(['found_term', method_col]).size()
    tally = per_group.reset_index(name='count')
    # Tag the rows with the method first, then normalise the label column name.
    return tally.assign(method=method_name).rename(columns={method_col: 'label'})

In [None]:
# List of method columns to aggregate: (column name, method display name)
methods_to_count = [
    ('zs_base_label', 'ZeroShot_Base'),
    ('zs_syn_label', 'ZeroShot_Synonym'),
    ('ai_prompt_a', 'AI_Prompt_A'),
    ('ai_prompt_b', 'AI_Prompt_B')
]

# Load the classified sentences of every country from disk. Relying on the
# `df_sentences` variable left over from an earlier loop would aggregate only
# the last country that happened to be processed.
countries_to_aggregate = list(dict.fromkeys([*top_countries, *target_terms]))
classified_frames = []
for country in countries_to_aggregate:
    classified_path = f"../data/sentences/sentences_{country}_classified.csv"
    if os.path.exists(classified_path):
        classified_frames.append(pd.read_csv(classified_path))
    else:
        print(f"No classified sentences found for {country}; skipping.")

if classified_frames:
    df_classified = pd.concat(classified_frames, ignore_index=True)
else:
    df_classified = pd.DataFrame()

# Aggregate only the methods whose label column is actually present
all_counts = [
    aggregate_counts(df_classified, col, name)
    for col, name in methods_to_count
    if col in df_classified.columns
]

# Combine all into one DataFrame
if all_counts:
    df_counts = pd.concat(all_counts, ignore_index=True)

    # Reorder for clarity: Country, Method, Label, Count
    df_counts = df_counts[['found_term', 'method', 'label', 'count']]

    print("Summary of Classification Counts:")
    display(df_counts.head(15))

    # Save
    df_counts.to_csv("../data/processed/country_label_counts.csv", index=False)
else:
    print("No classification columns found to aggregate.")