In [None]:
!pip install -q anthropic openai pandas numpy tqdm datasets scikit-learn
!pip install google-generativeai together -q
!pip install cohere tenacity -q


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m357.5/357.5 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.0/111.0 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.7/46.7 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.3/303.3 kB[0m [31m6.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.5/3.5 MB[0m [31m53.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import getpass
import json
import os
import time
from datetime import datetime

import anthropic
import cohere
import google.generativeai as genai
import numpy as np
import openai
import pandas as pd
from google.colab import drive
from openai import OpenAI
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tenacity import retry, wait_exponential, stop_after_attempt
from together import Together
from tqdm import tqdm

In [None]:
# Mount Google Drive at /content/drive; every path below lives under it.
drive.mount('/content/drive')

# Directory layout, all derived from the project root on Drive.
PROJECT_ROOT = "/content/drive/MyDrive/PhD/Courses/year_2/text_analytics/POLAR_SemEval2026"
DATA_DIR = PROJECT_ROOT + "/data"
OUTPUT_DIR = PROJECT_ROOT + "/tier1_output"
PHASE05_OUTPUT = OUTPUT_DIR + "/phase05_model_selection"

# Create the Phase 0.5 output folder (and any missing parents); a no-op if it exists.
os.makedirs(PHASE05_OUTPUT, exist_ok=True)

Mounted at /content/drive


In [None]:
# API keys
# Secrets are never written into the notebook. Each key is taken from the
# existing environment when already set; otherwise it is prompted for with
# getpass, which does not echo the value into the cell output.
_REQUIRED_API_KEYS = (
    "ANTHROPIC_API_KEY",
    "OPENAI_API_KEY",
    "DEEPSEEK_API_KEY",
    "COHERE_API_KEY",
    "TOGETHER_API_KEY",
    # "GEMINI_API_KEY",  # Gemini is currently disabled in this notebook
)

for _key_name in _REQUIRED_API_KEYS:
    if not os.environ.get(_key_name):
        os.environ[_key_name] = getpass.getpass(f"{_key_name}: ")

In [None]:
# Initialize clients
# One module-level client per provider; the call_* helpers below use these names.
# The clients built without arguments presumably read their keys from the
# environment variables set in the previous cell (verify against each SDK).
claude_client = anthropic.Anthropic()
openai_client = OpenAI()

# DeepSeek is reached through the OpenAI SDK pointed at DeepSeek's base URL.
deepseek_client = OpenAI(
    api_key=os.environ["DEEPSEEK_API_KEY"],
    base_url="https://api.deepseek.com",
)

# genai.configure(api_key=os.environ["GEMINI_API_KEY"])
gemini_client = genai.GenerativeModel('gemini-1.5-pro')

cohere_client = cohere.Client(os.environ["COHERE_API_KEY"])
together_client = Together()

In [None]:
# Languages
# Three-letter language code -> display name used inside the prompt.
# The dict's insertion order is also the order in which languages are evaluated.
LANGUAGE_NAMES = {
    'zho': 'Chinese',
    'urd': 'Urdu',
    'eng': 'English',
    'spa': 'Spanish',
    'deu': 'German',
}
LANGUAGES = list(LANGUAGE_NAMES)

# Prompt
# Shared system prompt: defines polarization and asks for a JSON-only reply
# with the keys "is_polarized", "confidence" and "reasoning".
SYSTEM_PROMPT = """You are an expert at detecting online polarization.

Polarization means: sharp division into opposing groups with hostility, us-vs-them framing, or moral condemnation.

Respond ONLY with JSON:
{
  "is_polarized": true/false,
  "confidence": 0.0-1.0,
  "reasoning": "brief explanation"
}"""


In [None]:
# Model functions
def call_claude(text, language_name):
    """Classify ``text`` with Claude and return the parsed JSON verdict.

    Sends SYSTEM_PROMPT plus a user message containing the language name and
    the text to the ``claude-sonnet-4-20250514`` model (temperature 0,
    at most 200 output tokens).

    Args:
        text: The text to classify.
        language_name: Language name inserted into the user message.

    Returns:
        dict: The JSON object found in the reply, or ``{"error": <message>}``
        when the request fails, the reply is not valid JSON, or the JSON is
        not an object.
    """
    try:
        message = claude_client.messages.create(
            model="claude-sonnet-4-20250514",
            max_tokens=200,
            temperature=0,
            system=SYSTEM_PROMPT,
            messages=[{"role": "user", "content": f"Language: {language_name}\n\nText: {text}\n\nAnalyze:"}]
        )
        reply = message.content[0].text.strip()

        # The reply may be wrapped in a markdown fence or surrounded by prose;
        # keep only the outermost {...} span before parsing.
        start, end = reply.find('{'), reply.rfind('}')
        if start != -1 and end > start:
            reply = reply[start:end + 1]

        parsed = json.loads(reply)
        # Callers use dict.get() on the result, so a bare list/string is an error.
        if not isinstance(parsed, dict):
            return {"error": f"Unexpected JSON type: {type(parsed).__name__}"}
        return parsed

    except json.JSONDecodeError as e:
        return {"error": f"JSON parse error: {str(e)}"}
    except Exception as e:
        return {"error": str(e)}




def call_gpt4o(text, language_name):
    """Classify ``text`` with GPT-4o in JSON mode.

    Args:
        text: The text to classify.
        language_name: Language name inserted into the user message.

    Returns:
        dict: The parsed JSON reply; ``{"error": "Empty response, ..."}`` when
        the reply is missing or blank; ``{"error": "Exception: ..."}`` when the
        request or the JSON parsing raises.
    """
    try:
        conversation = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": f"Language: {language_name}\n\nText: {text}\n\nAnalyze:"},
        ]
        completion = openai_client.chat.completions.create(
            model="gpt-4o",
            temperature=0,
            response_format={"type": "json_object"},
            timeout=30.0,  # explicit per-request timeout
            messages=conversation,
        )

        raw = completion.choices[0].message.content

        # Happy path first: a non-blank reply is parsed as JSON.
        if raw and raw.strip():
            return json.loads(raw.strip())

        # None, empty or whitespace-only reply: report why generation stopped.
        return {"error": f"Empty response, finish_reason: {completion.choices[0].finish_reason}"}

    except Exception as e:
        return {"error": f"Exception: {str(e)}"}


# GPT-5 nano function
def call_gpt5_nano(text, language_name, temperature=0):
    """Classify ``text`` with the ``gpt-5-nano`` model and return the parsed JSON.

    NOTE(review): this function is defined a second time later in the
    notebook; the later definition silently replaces this one, so keep the two
    in sync or delete one.
    NOTE(review): GPT-5-family models may reject non-default ``temperature``
    values; if every call returns an error, confirm against the OpenAI API
    reference before trusting any score built on this helper.

    Args:
        text: The text to classify.
        language_name: Language name inserted into the user message.
        temperature: Sampling temperature forwarded to the API (default 0).

    Returns:
        dict: The JSON object found in the reply, or ``{"error": <message>}``
        when the request fails, the reply is empty, the reply is not valid
        JSON, or the JSON is not an object.
    """
    try:
        response = openai_client.chat.completions.create(
            model="gpt-5-nano",
            temperature=temperature,
            timeout=30.0,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": f"Language: {language_name}\n\nText: {text}\n\nAnalyze:"}
            ]
        )
        # content can be None; treat that like an empty reply.
        response_text = (response.choices[0].message.content or "").strip()
        if not response_text:
            return {"error": f"Empty response, finish_reason: {response.choices[0].finish_reason}"}

        # Keep only the outermost {...} span: this covers markdown fences
        # (single- or multi-line) as well as prose before or after the JSON.
        start, end = response_text.find('{'), response_text.rfind('}')
        if start != -1 and end > start:
            response_text = response_text[start:end + 1]

        parsed = json.loads(response_text)
        # Callers use dict.get() on the result, so a bare list/string is an error.
        if not isinstance(parsed, dict):
            return {"error": f"Unexpected JSON type: {type(parsed).__name__}"}
        return parsed

    except json.JSONDecodeError as e:
        return {"error": f"JSON parse error: {str(e)}"}
    except Exception as e:
        return {"error": str(e)}


def call_deepseek(text, language_name):
    """Classify ``text`` with the ``deepseek-chat`` model and return the parsed JSON.

    Args:
        text: The text to classify.
        language_name: Language name inserted into the user message.

    Returns:
        dict: The JSON object found in the reply, or ``{"error": <message>}``
        when the request fails, the reply is empty, the reply is not valid
        JSON, or the JSON is not an object.
    """
    try:
        response = deepseek_client.chat.completions.create(
            model="deepseek-chat",
            temperature=0,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": f"Language: {language_name}\n\nText: {text}\n\nAnalyze:"}
            ]
        )

        # content can be None; treat that like an empty reply.
        response_text = (response.choices[0].message.content or "").strip()
        if not response_text:
            return {"error": "Empty response"}

        # Keep only the outermost {...} span. This handles markdown code
        # blocks (including a fence written on a single line, which the old
        # line-based stripping turned into an IndexError) and prose around
        # the JSON.
        start, end = response_text.find('{'), response_text.rfind('}')
        if start != -1 and end > start:
            response_text = response_text[start:end + 1]

        parsed = json.loads(response_text)
        # Callers use dict.get() on the result, so a bare list/string is an error.
        if not isinstance(parsed, dict):
            return {"error": f"Unexpected JSON type: {type(parsed).__name__}"}
        return parsed

    except json.JSONDecodeError as e:
        return {"error": f"JSON parse error: {str(e)}"}
    except Exception as e:
        return {"error": str(e)}

# def call_gemini(text, language_name):
#     try:
#         prompt = f"{SYSTEM_PROMPT}\n\nLanguage: {language_name}\n\nText: {text}\n\nAnalyze:"
#         response = gemini_client.generate_content(prompt, generation_config={'temperature': 0})

#         response_text = response.text.strip()

#         # Handle markdown code blocks (same as DeepSeek)
#         if response_text.startswith('```'):
#             lines = response_text.split('\n')
#             # Remove first line (```json or ```)
#             lines = lines[1:]
#             # Remove last line (```)
#             if lines[-1].strip() == '```':
#                 lines = lines[:-1]
#             response_text = '\n'.join(lines).strip()

#         return json.loads(response_text)

#     except json.JSONDecodeError as e:
#         return {"error": f"JSON parse error: {str(e)}"}
#     except Exception as e:
#         return {"error": str(e)}

def call_cohere(text, language_name):
    """Classify ``text`` with Cohere's ``command-r-plus-08-2024`` model.

    Args:
        text: The text to classify.
        language_name: Language name inserted into the user message.

    Returns:
        dict: The parsed JSON reply, or ``{"error": <message>}`` when the
        request fails or the reply cannot be parsed. A JSON parse failure
        also prints the error and the first 200 characters of the reply.
    """
    try:
        response = cohere_client.chat(
            message=f"Language: {language_name}\n\nText: {text}\n\nAnalyze:",
            model="command-r-plus-08-2024",
            temperature=0,
            preamble=SYSTEM_PROMPT
        )

        content = response.text

        # Unwrap a markdown code block: take what follows the first matching
        # opening fence, up to the next fence. "```json" is tried before the
        # bare "```" so the language tag is not left inside the payload.
        for opening_fence in ("```json", "```"):
            if opening_fence in content:
                content = content.split(opening_fence)[1].split("```")[0]
                break

        return json.loads(content.strip())

    except json.JSONDecodeError as e:
        print(f"JSON parsing error: {e}")
        print(f"Response text: {response.text[:200]}")
        return {"error": f"JSON parse error: {str(e)}"}
    except Exception as e:
        return {"error": str(e)}


def call_qwen(text, language_name):
    """Classify ``text`` with ``Qwen/Qwen2.5-72B-Instruct-Turbo`` via the Together client.

    Args:
        text: The text to classify.
        language_name: Language name inserted into the user message.

    Returns:
        dict: The JSON object found in the reply, or ``{"error": <message>}``
        when the request fails, the reply is empty, the reply is not valid
        JSON, or the JSON is not an object.
    """
    try:
        response = together_client.chat.completions.create(
            model="Qwen/Qwen2.5-72B-Instruct-Turbo",
            temperature=0,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": f"Language: {language_name}\n\nText: {text}\n\nAnalyze:"}
            ]
        )

        # content can be None; treat that like an empty reply.
        reply = (response.choices[0].message.content or "").strip()
        if not reply:
            return {"error": "Empty response"}

        # The reply may be wrapped in a markdown fence or surrounded by prose;
        # keep only the outermost {...} span before parsing.
        start, end = reply.find('{'), reply.rfind('}')
        if start != -1 and end > start:
            reply = reply[start:end + 1]

        parsed = json.loads(reply)
        # Callers use dict.get() on the result, so a bare list/string is an error.
        if not isinstance(parsed, dict):
            return {"error": f"Unexpected JSON type: {type(parsed).__name__}"}
        return parsed

    except json.JSONDecodeError as e:
        return {"error": f"JSON parse error: {str(e)}"}
    except Exception as e:
        return {"error": str(e)}


In [None]:
# Load data
def load_train_sample(lang, n=100):
    """Load one language's subtask-1 training CSV and return a seeded sample.

    Args:
        lang: Language code; selects ``{DATA_DIR}/subtask1/train/{lang}.csv``
            and the display name in LANGUAGE_NAMES.
        n: Maximum number of rows to return (default 100).

    Returns:
        pandas.DataFrame: ``min(n, number of rows)`` rows sampled with
        ``random_state=42``, with added ``language`` and ``language_name``
        columns.
    """
    tagged = pd.read_csv(f"{DATA_DIR}/subtask1/train/{lang}.csv").assign(
        language=lang,
        language_name=LANGUAGE_NAMES[lang],
    )
    # Never request more rows than the file has.
    sample_size = n if n < len(tagged) else len(tagged)
    return tagged.sample(n=sample_size, random_state=42)

# Evaluation
def _label_to_int(value):
    """Map a model's ``is_polarized`` value to 1/0, or None if it is not recognisable."""
    # bool is checked first because bool is a subclass of int.
    if isinstance(value, bool):
        return int(value)
    if isinstance(value, (int, float)) and value in (0, 1):
        return int(value)
    if isinstance(value, str):
        token = value.strip().lower()
        if token in ("true", "1", "yes"):
            return 1
        if token in ("false", "0", "no"):
            return 0
    return None


def evaluate_model(df, model_func, model_name, delay=0.05):
    """Run ``model_func`` over every row of ``df`` and collect its predictions.

    Args:
        df: DataFrame with the columns ``id``, ``text``, ``language``,
            ``language_name`` and ``polarization``.
        model_func: Callable ``(text, language_name) -> dict`` such as the
            ``call_*`` helpers.
        model_name: Label shown on the progress bar.
        delay: Seconds to sleep after each call (default 0.05, the previously
            hard-coded value); pass 0 to disable.

    Returns:
        pandas.DataFrame: One row per input row with ``id``, ``language``,
        ``language_name``, ``true_label``, ``predicted`` (1, 0 or missing),
        ``confidence`` and ``error``.
    """
    results = []
    for _, row in tqdm(df.iterrows(), total=len(df), desc=model_name):
        pred = model_func(row['text'], row['language_name'])

        # A model may return valid JSON that is not an object (list, string...).
        # Record that as an error instead of crashing the run on .get().
        if not isinstance(pred, dict):
            pred = {"error": f"Unexpected response type: {type(pred).__name__}"}

        raw_label = pred.get('is_polarized', None)
        # Normalise to 1/0 so string labels such as "true" cannot end up mixed
        # with integers in the scoring step.
        predicted_val = _label_to_int(raw_label)

        error = pred.get('error', None)
        if predicted_val is None and error is None:
            # No usable label and no reported error: make the failure visible.
            error = f"Missing or invalid is_polarized: {raw_label!r}"

        results.append({
            'id': row['id'],
            'language': row['language'],
            'language_name': row['language_name'],
            'true_label': int(row['polarization']),
            'predicted': predicted_val,
            'confidence': pred.get('confidence', None),
            'error': error
        })
        if delay:
            time.sleep(delay)

    return pd.DataFrame(results)

## Phase 0.5

In [None]:
def calculate_f1(results_df):
    """Return the macro-averaged F1 over the usable rows of ``results_df``.

    A row is usable when its ``error`` is missing and its ``predicted`` value
    is present. Returns 0.0 when there are no usable rows.
    """
    has_no_error = results_df['error'].isna()
    has_prediction = results_df['predicted'].notna()
    usable = results_df[has_no_error & has_prediction]

    # f1_score is only called when there is something to score.
    if usable.shape[0] == 0:
        return 0.0

    return f1_score(usable['true_label'], usable['predicted'], average='macro')

# Run Phase 0.5
# For every language: draw a 100-row sample, score each candidate model on it,
# save the per-row results to CSV, and collect one summary record per
# (language, model) pair in all_results.
print(" Phase 0.5: Model Selection")
print("=" * 60)

# Candidate models; the same set, in this order, is used for every language.
models = {
    #'Gemini': call_gemini,
    'Cohere': call_cohere,
    'Qwen2.5': call_qwen,
    'GPT-4o': call_gpt4o,
    'DeepSeek': call_deepseek,
    'Claude': call_claude
}

all_results = []

for lang in LANGUAGES:
    print(f"\n Testing {LANGUAGE_NAMES[lang]}...")
    sample = load_train_sample(lang, 100)

    lang_results = []
    for model_name, model_func in models.items():
        results = evaluate_model(sample, model_func, f"{lang}-{model_name}")
        f1 = calculate_f1(results)

        lang_results.append(dict(
            language=lang,
            language_name=LANGUAGE_NAMES[lang],
            model=model_name,
            f1_score=f1,
            n_samples=len(results),
            errors=results['error'].notna().sum(),
        ))

        # Per-row predictions are written as soon as a model finishes.
        results.to_csv(f"{PHASE05_OUTPUT}/{lang}_{model_name}_results.csv", index=False)
        print(f"  {model_name}: F1={f1:.3f}")

    # Records are added only after every model has run for this language.
    all_results.extend(lang_results)

In [None]:
# Summary
# One row per (language, model) record collected in all_results.
summary_df = pd.DataFrame(all_results)
summary_df.to_csv(
    f"{PHASE05_OUTPUT}/model_selection_summary.csv",
    index=False,
)

# Bare last expression so the notebook renders the table.
summary_df

Unnamed: 0,language,language_name,model,f1_score,n_samples,errors
0,zho,Chinese,Cohere,0.86936,100,0
1,zho,Chinese,Qwen2.5,0.855312,100,0
2,zho,Chinese,GPT-4o,0.907928,100,0
3,zho,Chinese,DeepSeek,0.899356,100,0
4,zho,Chinese,Claude,0.866131,100,0
5,urd,Urdu,Cohere,0.679117,100,0
6,urd,Urdu,Qwen2.5,0.655331,100,0
7,urd,Urdu,GPT-4o,0.668475,100,0
8,urd,Urdu,DeepSeek,0.707968,100,0
9,urd,Urdu,Claude,0.751984,100,0


In [None]:
# Top 3 per language
# Rank each language's models by F1 (highest first) and keep the best three.
print("TOP 3 MODELS PER LANGUAGE:")

top3_per_lang = {}
for lang in LANGUAGES:
    is_this_lang = summary_df['language'] == lang
    lang_data = summary_df[is_this_lang].sort_values('f1_score', ascending=False)
    top3 = lang_data.head(3)
    top3_per_lang[lang] = top3['model'].tolist()

    print(f"\n{LANGUAGE_NAMES[lang]}:")
    for model, score in zip(top3['model'], top3['f1_score']):
        print(f"  {model}: {score:.3f}")

# Save top 3
# with open(f"{PHASE05_OUTPUT}/top3_models_per_language.json", 'w') as f:
#     json.dump(top3_per_lang, f, indent=2)

TOP 3 MODELS PER LANGUAGE:

Chinese:
  GPT-4o: 0.908
  DeepSeek: 0.899
  Cohere: 0.869

Urdu:
  Claude: 0.752
  DeepSeek: 0.708
  Cohere: 0.679

English:
  Claude: 0.856
  Qwen2.5: 0.776
  DeepSeek: 0.754

Spanish:
  Claude: 0.770
  Qwen2.5: 0.750
  GPT-4o: 0.749

German:
  Claude: 0.727
  Qwen2.5: 0.694
  DeepSeek: 0.690


In [None]:
# GPT-5 nano function
def call_gpt5_nano(text, language_name, temperature=0):
    """Classify ``text`` with the ``gpt-5-nano`` model and return the parsed JSON.

    NOTE(review): a function with this name is also defined earlier in the
    notebook, next to the other model helpers; this definition replaces it
    when the cell runs. Keep the two in sync or delete one.
    NOTE(review): GPT-5-family models may reject non-default ``temperature``
    values; if every call returns an error, confirm against the OpenAI API
    reference before trusting any score built on this helper.

    Args:
        text: The text to classify.
        language_name: Language name inserted into the user message.
        temperature: Sampling temperature forwarded to the API (default 0).

    Returns:
        dict: The JSON object found in the reply, or ``{"error": <message>}``
        when the request fails, the reply is empty, the reply is not valid
        JSON, or the JSON is not an object.
    """
    try:
        response = openai_client.chat.completions.create(
            model="gpt-5-nano",
            temperature=temperature,
            timeout=30.0,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": f"Language: {language_name}\n\nText: {text}\n\nAnalyze:"}
            ]
        )
        # content can be None; treat that like an empty reply.
        response_text = (response.choices[0].message.content or "").strip()
        if not response_text:
            return {"error": f"Empty response, finish_reason: {response.choices[0].finish_reason}"}

        # Keep only the outermost {...} span: this covers markdown fences
        # (single- or multi-line) as well as prose before or after the JSON.
        start, end = response_text.find('{'), response_text.rfind('}')
        if start != -1 and end > start:
            response_text = response_text[start:end + 1]

        parsed = json.loads(response_text)
        # Callers use dict.get() on the result, so a bare list/string is an error.
        if not isinstance(parsed, dict):
            return {"error": f"Unexpected JSON type: {type(parsed).__name__}"}
        return parsed

    except json.JSONDecodeError as e:
        return {"error": f"JSON parse error: {str(e)}"}
    except Exception as e:
        return {"error": str(e)}

# Test GPT-5 nano
# The previous version read from `train_df`, which is never defined in this
# notebook (NameError). Samples now come from load_train_sample, i.e. the same
# seeded 100-row samples used in Phase 0.5, and scoring reuses evaluate_model /
# calculate_f1 so the numbers are computed the same way as for the other models.
gpt5_results = {}
for lang in LANGUAGES:
    lang_df = load_train_sample(lang, 100)
    gpt5_eval = evaluate_model(lang_df, call_gpt5_nano, f"GPT-5 nano {lang}")

    # calculate_f1 returns 0.0 when no row is usable, so an all-error run
    # cannot crash the scoring step; the error count makes such a run visible.
    f1 = calculate_f1(gpt5_eval)
    n_errors = int(gpt5_eval['error'].notna().sum())
    gpt5_results[lang] = f1
    print(f"{lang}: {f1:.3f} (errors: {n_errors}/{len(gpt5_eval)})")

avg_f1 = np.mean(list(gpt5_results.values()))

# Baseline taken from the Phase 0.5 summary instead of a hard-coded number.
qwen_avg_f1 = summary_df.loc[summary_df['model'] == 'Qwen2.5', 'f1_score'].mean()

print(f"\n{'='*50}")
print(f"GPT-5 nano Average F1: {avg_f1:.3f}")
print(f"Qwen2.5 Average F1:    {qwen_avg_f1:.3f}")
print(f"{'='*50}")
if avg_f1 > qwen_avg_f1:
    print("✅ REPLACE Qwen2.5 with GPT-5 nano")
else:
    print("❌ KEEP Qwen2.5")

NameError: name 'train_df' is not defined

### Phase 1