In [3]:
!pip install -q anthropic openai pandas numpy tqdm datasets scikit-learn
!pip install google-generativeai together -q
!pip install cohere tenacity -q


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m357.5/357.5 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m83.5/83.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.7/46.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.3/303.3 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [6]:
import os
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
from google.colab import drive
from datetime import datetime
import anthropic
import openai
from openai import OpenAI
import google.generativeai as genai
import cohere
from together import Together
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tenacity import retry, wait_exponential, stop_after_attempt
from collections import Counter

In [5]:
# Mount Google Drive so the project folder is reachable from this runtime.
drive.mount('/content/drive')

# Project layout on Drive: training data is read from DATA_DIR, and every
# phase-1 artefact (checkpoints, final predictions) is written to PHASE1_OUTPUT.
PROJECT_ROOT = "/content/drive/MyDrive/PhD/Courses/year_2/text_analytics/POLAR_SemEval2026"
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "tier1_output")
PHASE1_OUTPUT = os.path.join(OUTPUT_DIR, "phase1_multitemp")

# Create the output directory (and any missing parents); a no-op if it already exists.
os.makedirs(PHASE1_OUTPUT, exist_ok=True)


Mounted at /content/drive


In [None]:
# API keys
# Keys that are already present in the environment are kept as they are.
# Anything missing is requested through a hidden prompt, so no secret has to be
# written into the notebook source. Leaving a prompt blank stores an empty
# string for that provider, which keeps later os.environ[...] lookups from
# raising KeyError.
from getpass import getpass

for _key_name in (
    "ANTHROPIC_API_KEY",
    "OPENAI_API_KEY",
    "DEEPSEEK_API_KEY",
    "COHERE_API_KEY",
    "TOGETHER_API_KEY",
):
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass(f"{_key_name} (leave blank to skip): ")
# os.environ["GEMINI_API_KEY"] = xxxx

In [8]:
# Initialize clients
# The Anthropic, OpenAI and Together clients are constructed without explicit
# arguments; DeepSeek and Cohere receive their keys from os.environ explicitly.
claude_client = anthropic.Anthropic()
openai_client = OpenAI()
# DeepSeek is reached through the OpenAI client class pointed at DeepSeek's base URL.
deepseek_client = OpenAI(api_key=os.environ["DEEPSEEK_API_KEY"], base_url="https://api.deepseek.com")
# NOTE(review): genai.configure is commented out, so no Gemini key is configured
# in this cell -- confirm before relying on gemini_client.
# genai.configure(api_key=os.environ["GEMINI_API_KEY"])
gemini_client = genai.GenerativeModel('gemini-1.5-pro')
cohere_client = cohere.Client(os.environ["COHERE_API_KEY"])
together_client = Together()

In [9]:
# Three-letter language code -> display name passed to the models in the user message.
LANGUAGE_NAMES = dict(
    zho='Chinese',
    tur='Turkish',
    eng='English',
    arb='Arabic',
    hin='Hindi',
)

# Processing order of the languages (same order as the mapping above).
LANGUAGES = list(LANGUAGE_NAMES)

# Sampling temperatures each model is queried at for every sample.
TEMPERATURES = [0.0, 0.5, 1.0]

# Instruction sent as the system message of every request. It defines the
# labelling criteria and asks for a JSON object with is_polarized / confidence /
# reasoning fields.
SYSTEM_PROMPT = """You are an expert at detecting online polarization.

POLARIZATION is content that creates sharp division into opposing groups with these characteristics:

POLARIZED content has:
- Us vs. Them framing: "our people" vs "those people", in-group vs out-group language
- Hostility or contempt toward a group (political, religious, racial, gender, etc.)
- Moral condemnation: portraying one side as evil/corrupt/dangerous
- Stereotyping or vilification of entire groups
- Dehumanizing language
- Zero-sum framing: if they win, we lose

NOT polarized:
- Factual reporting or neutral information
- Policy disagreement without hostility
- Criticism of specific actions/individuals without group-level attacks
- Personal opinions without divisive us-vs-them framing

Respond ONLY with JSON:
{
  "is_polarized": true/false,
  "confidence": 0.0-1.0,
  "reasoning": "brief explanation"
}"""

In [None]:
def _strip_code_fence(raw_text):
    """Return raw_text without a surrounding Markdown code fence, if it has one."""
    cleaned = raw_text.strip()
    if not cleaned.startswith('```'):
        return cleaned
    # Drop the opening fence line (``` or ```json) and, when present, the closing one.
    body_lines = cleaned.split('\n')[1:]
    if body_lines and body_lines[-1].strip() == '```':
        body_lines = body_lines[:-1]
    return '\n'.join(body_lines).strip()


def call_model_at_temp(text, language_name, model_name, temperature):
    """Query one model once at the given temperature and parse its JSON verdict.

    Args:
        text: The text to classify.
        language_name: Display name of the text's language, included in the user message.
        model_name: One of 'DeepSeek', 'GPT-4o' or 'Qwen2.5'.
        temperature: Sampling temperature forwarded to the chat-completions call.

    Returns:
        dict with keys:
            'prediction': 1/0 when the model answered with a boolean (or the
                strings "true"/"false"); any other value is passed through
                unchanged; None on failure or when the field is missing.
            'confidence': the model's 'confidence' field, or None.
            'error': None on success, otherwise the exception text.

    Never raises: every exception (API failure, invalid JSON, unknown model
    name) is caught and reported through the 'error' field.
    """
    try:
        # Resolve the client and its model-specific arguments first so that an
        # unsupported name fails with a clear message before any request is built.
        if model_name == 'DeepSeek':
            client = deepseek_client
            request_kwargs = {"model": "deepseek-chat"}
        elif model_name == 'GPT-4o':
            client = openai_client
            request_kwargs = {
                "model": "gpt-4o",
                "response_format": {"type": "json_object"},
                "timeout": 30.0,
            }
        elif model_name == 'Qwen2.5':
            client = together_client
            request_kwargs = {"model": "Qwen/Qwen2.5-72B-Instruct-Turbo"}
        else:
            raise ValueError(f"Unknown model: {model_name!r}")

        completion = client.chat.completions.create(
            temperature=temperature,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": f"Language: {language_name}\n\nText: {text}\n\nAnalyze:"}
            ],
            **request_kwargs,
        )

        # Models without an enforced JSON response format may wrap their answer
        # in a Markdown fence, so the same unwrapping is applied to every model.
        parsed = json.loads(_strip_code_fence(completion.choices[0].message.content))

        # Extract prediction and normalise it to 1/0 where the intent is unambiguous.
        predicted_val = parsed.get('is_polarized', None)
        if isinstance(predicted_val, bool):
            predicted_val = 1 if predicted_val else 0
        elif isinstance(predicted_val, str) and predicted_val.strip().lower() in ('true', 'false'):
            predicted_val = 1 if predicted_val.strip().lower() == 'true' else 0

        return {
            'prediction': predicted_val,
            'confidence': parsed.get('confidence', None),
            'error': None
        }

    except Exception as e:
        return {'prediction': None, 'confidence': None, 'error': str(e)}



In [None]:
def calculate_variance_confidence(predictions):
    """Summarise repeated predictions with a majority vote.

    None entries are ignored. Returns a tuple
    (majority_prediction, agreement, is_hard) where agreement is the share of
    the remaining votes won by the majority and is_hard is True when that share
    is below 0.67. With no usable votes the result is (None, 0.0, True).
    """
    votes = Counter(p for p in predictions if p is not None)
    total_votes = sum(votes.values())
    if total_votes == 0:
        return None, 0.0, True

    # The agreement ratio doubles as the variance-based confidence score.
    winner, winner_votes = votes.most_common(1)[0]
    share = winner_votes / total_votes

    # 2-out-of-3 (0.666...) falls under the 0.67 cut-off and is therefore flagged.
    return winner, share, share < 0.67

def process_sample(row, models_to_use):
    """Run one data row through every requested model at every temperature.

    Args:
        row: Mapping-like record providing 'id', 'text', 'language',
            'language_name' and 'polarization'.
        models_to_use: Model names accepted by call_model_at_temp.

    Returns:
        A flat dict holding the row's metadata, each raw per-temperature
        outcome (prediction, confidence, error), per-model aggregates and,
        only when exactly three models were requested, ensemble-level
        aggregates over all predictions.
    """
    record = {
        'id': row['id'],
        'text': row['text'],
        'language': row['language'],
        'language_name': row['language_name'],
        'true_label': int(row['polarization'])
    }

    pooled_preds = []
    pooled_confs = []

    for model_name in models_to_use:
        outcomes = []
        for temp in TEMPERATURES:
            outcome = call_model_at_temp(record['text'], record['language_name'], model_name, temp)
            outcomes.append(outcome)

            # Keep the raw outcome of every individual call.
            prefix = f'{model_name}_t{temp}'
            record[f'{prefix}_pred'] = outcome['prediction']
            record[f'{prefix}_conf'] = outcome['confidence']
            record[f'{prefix}_error'] = outcome['error']

        # Only usable values take part in the aggregates.
        usable_preds = [o['prediction'] for o in outcomes if o['prediction'] is not None]
        usable_confs = [o['confidence'] for o in outcomes if o['confidence'] is not None]
        pooled_preds.extend(usable_preds)
        pooled_confs.extend(usable_confs)

        # Per-model vote across temperatures plus the mean self-reported confidence.
        vote, vote_share, shaky = calculate_variance_confidence(usable_preds)
        record[f'{model_name}_majority'] = vote
        record[f'{model_name}_var_confidence'] = vote_share
        record[f'{model_name}_uncertain'] = shaky
        record[f'{model_name}_avg_confidence'] = np.mean(usable_confs) if usable_confs else None

    # Ensemble fields are produced only for a full three-model run.
    if len(models_to_use) == 3:
        vote, vote_share, shaky = calculate_variance_confidence(pooled_preds)
        record['ensemble_prediction'] = vote
        record['ensemble_confidence'] = vote_share
        record['is_hard_negative'] = shaky
        record['avg_model_confidence'] = np.mean(pooled_confs) if pooled_confs else None

    return record

def load_full_train_data():
    """Read every language's subtask-1 training CSV and stack them into one frame.

    Each per-language frame gets a 'language' column (the code from LANGUAGES)
    and a 'language_name' column (looked up in LANGUAGE_NAMES). The combined
    frame has a fresh 0..n-1 index.
    """
    def _read_language(lang_code):
        frame = pd.read_csv(f"{DATA_DIR}/subtask1/train/{lang_code}.csv")
        frame['language'] = lang_code
        frame['language_name'] = LANGUAGE_NAMES[lang_code]
        return frame

    return pd.concat([_read_language(code) for code in LANGUAGES], ignore_index=True)

def load_existing_results(model_name):
    """Load previously saved results for a model, if there are any.

    The final predictions file takes precedence. Otherwise the checkpoint with
    the highest numeric suffix is used. Files that match the checkpoint prefix
    but do not end in a plain number (for example stray copies) are ignored
    instead of aborting the lookup.

    Args:
        model_name: Model name used as the file-name prefix.

    Returns:
        (results_df, processed_count): the loaded DataFrame and its row count,
        or (None, 0) when nothing has been saved yet.
    """
    output_path = f"{PHASE1_OUTPUT}/{model_name}_predictions.csv"

    # Check for final output first
    if os.path.exists(output_path):
        print(f"Found existing results file: {output_path}")
        df = pd.read_csv(output_path)
        return df, len(df)

    # Collect (checkpoint number, file name) for every well-formed checkpoint.
    prefix = f"{model_name}_checkpoint_"
    extension = '.csv'
    candidates = []
    for file_name in os.listdir(PHASE1_OUTPUT):
        if not (file_name.startswith(prefix) and file_name.endswith(extension)):
            continue
        suffix = file_name[len(prefix):-len(extension)]
        if suffix.isascii() and suffix.isdigit():
            candidates.append((int(suffix), file_name))

    if candidates:
        # Highest number = most samples processed. The real file name is kept so
        # the path is never rebuilt from the parsed number.
        _, latest_name = max(candidates)
        latest_checkpoint = f"{PHASE1_OUTPUT}/{latest_name}"

        print(f"Found checkpoint: {latest_checkpoint}")
        df = pd.read_csv(latest_checkpoint)
        return df, len(df)

    return None, 0


In [None]:
# NOTE(review): this re-defines load_full_train_data, which an earlier cell
# already defines with the same behavior; this later definition is the one in effect.
def load_full_train_data():
    """Load the subtask-1 training CSV of every language into a single DataFrame."""
    def _load_one(lang):
        frame = pd.read_csv(f"{DATA_DIR}/subtask1/train/{lang}.csv")
        # Tag every row with its language code and display name.
        frame['language'] = lang
        frame['language_name'] = LANGUAGE_NAMES[lang]
        return frame

    frames = list(map(_load_one, LANGUAGES))
    return pd.concat(frames, ignore_index=True)

In [None]:

def _print_model_summary(results_df, model_name, error_count):
    """Print overall, per-temperature, confidence and per-language metrics for one model."""
    majority_col = f'{model_name}_majority'

    # Rows without a majority vote (every call failed) are excluded from scoring.
    if majority_col in results_df.columns:
        valid_results = results_df[results_df[majority_col].notna()]
    else:
        valid_results = results_df.iloc[0:0]

    if len(valid_results) == 0:
        print("WARNING: No valid predictions obtained!")
        print(f"Total samples: {len(results_df)}")
        print(f"Errors: {error_count}")
        return

    y_true = valid_results['true_label']
    y_pred = valid_results[majority_col]

    accuracy = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)

    print(f"Total samples: {len(results_df)}")
    print(f"Valid predictions: {len(valid_results)} ({len(valid_results)/len(results_df)*100:.1f}%)")
    print(f"Errors: {error_count}")
    print("\nPerformance Metrics:")
    print(f"  Accuracy:  {accuracy:.4f}")
    print(f"  F1 Score:  {f1:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")

    # Per-temperature metrics, each scored on the rows where that call succeeded.
    print("\nPer-Temperature Performance:")
    for temp in TEMPERATURES:
        temp_col = f'{model_name}_t{temp}_pred'
        if temp_col not in results_df.columns:
            # e.g. results resumed from a file produced with other temperatures
            continue
        temp_valid = results_df[temp_col].notna()
        if temp_valid.sum() > 0:
            temp_results = results_df[temp_valid]
            temp_acc = accuracy_score(temp_results['true_label'], temp_results[temp_col])
            temp_f1 = f1_score(temp_results['true_label'], temp_results[temp_col])
            print(f"  T={temp}: Acc={temp_acc:.4f}, F1={temp_f1:.4f}")

    # Confidence analysis
    avg_var_conf = valid_results[f'{model_name}_var_confidence'].mean()
    avg_model_conf = valid_results[f'{model_name}_avg_confidence'].mean()
    uncertain_pct = valid_results[f'{model_name}_uncertain'].sum() / len(valid_results) * 100

    print("\nConfidence Analysis:")
    print(f"  Avg variance confidence: {avg_var_conf:.4f}")
    print(f"  Avg model confidence: {avg_model_conf:.4f}")
    print(f"  Uncertain samples: {uncertain_pct:.1f}%")

    # Per-language breakdown
    print("\nPer-Language Performance:")
    for lang in valid_results['language'].unique():
        lang_data = valid_results[valid_results['language'] == lang]
        if len(lang_data) > 0:
            lang_acc = accuracy_score(lang_data['true_label'], lang_data[majority_col])
            lang_f1 = f1_score(lang_data['true_label'], lang_data[majority_col])
            print(f"  {lang}: N={len(lang_data)}, Acc={lang_acc:.4f}, F1={lang_f1:.4f}")


def run_model(model_name, sample_size=None, start_idx=0, checkpoint_every=50, force_restart=False):
    """
    Run predictions for a single model across all temperatures

    Loads the full training set, optionally resumes from saved results (asking
    for confirmation via input()), processes the remaining samples, writes
    periodic checkpoints and a final predictions CSV, removes the checkpoints,
    and prints a metrics summary.

    Args:
        model_name: 'DeepSeek', 'GPT-4o', or 'Qwen2.5'
        sample_size: Number of samples to process (None = all)
        start_idx: Starting index for resuming runs (overridden by existing results)
        checkpoint_every: Save checkpoint every N samples
        force_restart: If True, ignore existing results and start from scratch

    Returns:
        DataFrame with one row per processed sample (including rows carried
        over from a resumed run).
    """
    print(f"\n{'='*60}")
    print(f"Starting {model_name}")
    print(f"{'='*60}")

    # Load full data
    df = load_full_train_data()
    total_samples = len(df)

    # Check for existing results unless force_restart
    existing_results = None
    resume_from_idx = start_idx

    if not force_restart:
        existing_results, processed_count = load_existing_results(model_name)
        if existing_results is not None and processed_count > 0:
            resume_from_idx = processed_count
            print(f"Resuming from index {resume_from_idx} ({processed_count} samples already processed)")
            print(f"Progress: {processed_count}/{total_samples} ({processed_count/total_samples*100:.1f}%)")

            # Ask user if they want to continue
            user_input = input(f"\nContinue from index {resume_from_idx}? (y/n): ").strip().lower()
            if user_input != 'y':
                print("Starting fresh...")
                existing_results = None
                resume_from_idx = start_idx

    # Apply sampling if requested
    if sample_size is not None:
        end_idx = min(resume_from_idx + sample_size, total_samples)
        df_to_process = df.iloc[resume_from_idx:end_idx].copy()
    else:
        df_to_process = df.iloc[resume_from_idx:].copy()

    # Reset index for clean processing
    df_to_process = df_to_process.reset_index(drop=True)

    # Initialize results storage (previous rows first when resuming)
    if existing_results is not None:
        results = existing_results.to_dict('records')
    else:
        results = []

    # Counts only the samples handled in this call, not rows carried over.
    error_count = 0

    print(f"\nProcessing {len(df_to_process)} samples...")
    print(f"Checkpoints will be saved every {checkpoint_every} samples")

    # Process each sample
    for idx in tqdm(range(len(df_to_process)), desc=f"{model_name}"):
        row = df_to_process.iloc[idx]
        actual_idx = resume_from_idx + idx  # Track actual position in full dataset

        try:
            # Process sample with this model only
            result = process_sample(row, models_to_use=[model_name])
            results.append(result)

            # Track errors: a sample counts once if any temperature call failed
            has_error = any(
                result.get(f'{model_name}_t{temp}_error') is not None
                for temp in TEMPERATURES
            )
            if has_error:
                error_count += 1

        except Exception as e:
            error_count += 1
            print(f"\nError at index {actual_idx}: {str(e)}")
            # The placeholder record is built defensively: a malformed row (e.g. a
            # missing label) may be the very reason process_sample failed, and
            # repeating the same conversion here would abort the whole run.
            try:
                fallback_label = int(row['polarization'])
            except (KeyError, TypeError, ValueError):
                fallback_label = None
            results.append({
                'id': row.get('id'),
                'text': row.get('text'),
                'language': row.get('language'),
                'language_name': row.get('language_name'),
                'true_label': fallback_label,
                f'{model_name}_majority': None,
                'error': str(e)
            })

        # Save checkpoint (file name carries the total number of rows so far)
        if (idx + 1) % checkpoint_every == 0:
            checkpoint_df = pd.DataFrame(results)
            checkpoint_path = f"{PHASE1_OUTPUT}/{model_name}_checkpoint_{len(results)}.csv"
            checkpoint_df.to_csv(checkpoint_path, index=False)
            print(f"\nCheckpoint saved: {checkpoint_path} ({len(results)}/{total_samples} samples)")

    # Convert to DataFrame
    results_df = pd.DataFrame(results)

    # Save final results
    output_path = f"{PHASE1_OUTPUT}/{model_name}_predictions.csv"
    results_df.to_csv(output_path, index=False)

    # Clean up old checkpoints now that the final file exists (best effort)
    checkpoint_files = [f for f in os.listdir(PHASE1_OUTPUT)
                       if f.startswith(f"{model_name}_checkpoint_") and f.endswith('.csv')]
    for checkpoint_file in checkpoint_files:
        try:
            os.remove(f"{PHASE1_OUTPUT}/{checkpoint_file}")
            print(f"Removed old checkpoint: {checkpoint_file}")
        except OSError as exc:
            # A checkpoint that cannot be deleted is not fatal, but say so.
            print(f"Could not remove checkpoint {checkpoint_file}: {exc}")

    # Calculate metrics
    print(f"\n{'='*60}")
    print(f"{model_name} - Results Summary")
    print(f"{'='*60}")

    _print_model_summary(results_df, model_name, error_count)

    print(f"\nResults saved to: {output_path}")
    print(f"{'='*60}\n")

    return results_df

In [2]:
# Run one model at a time. The returned DataFrame is bound to a name so the
# results of a long run stay available to later cells, instead of being echoed
# as the cell's last expression and then lost.
deepseek_results = run_model('DeepSeek')
# run_model('GPT-4o')
# run_model('Qwen2.5')
# NOTE(review): combine_results is not defined in the cells shown here.
# combine_results()

NameError: name 'run_model' is not defined

In [None]:
# Leftover scratch cell: emits a single blank line.
print("")


