In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install openai pandas matplotlib seaborn tqdm



In [None]:
import os
from getpass import getpass

# Root folder (on the mounted Drive) that holds prompts, task texts and test answers.
MATERIALS_DIR = "/content/drive/MyDrive/ielts/ielts materials"
# Spreadsheets produced by this notebook are written here.
RESULTS_DIR = os.path.join(MATERIALS_DIR, "results")
# Evaluation prompt template containing the task/response placeholders.
PROMPT_PATH = os.path.join(MATERIALS_DIR, "prompts", "writing", "evaluation.txt")

os.makedirs(RESULTS_DIR, exist_ok=True)

# SECURITY: never store the API key in the notebook itself. Reuse a key that is
# already present in the environment, otherwise ask for it interactively (the
# input is not echoed and is not saved with the notebook).
# Any key that was previously committed in this cell must be considered leaked
# and should be revoked/rotated in the OpenRouter dashboard.
if not os.environ.get("OPENROUTER_API_KEY"):
    os.environ["OPENROUTER_API_KEY"] = getpass("OpenRouter API key: ")

In [None]:
import openai

# Async OpenAI-SDK client routed through the OpenRouter endpoint. The key is
# read from the environment; the two extra headers are sent with every request.
client = openai.AsyncOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=os.environ["OPENROUTER_API_KEY"],
    default_headers=dict([
        ("HTTP-Referer", "https://github.com/yourusername/ielts_bot"),
        ("X-Title", "IELTS Evaluation Bot"),
    ]),
)

In [None]:
# Read the evaluation prompt template once; evaluate_answer() substitutes the
# "[TASK TEXT HERE]" and "[STUDENT RESPONSE HERE]" placeholders in it.
with open(PROMPT_PATH, "r", encoding="utf-8") as f:
    BASE_PROMPT = f.read()

In [None]:
import re

def parse_ielts_file(file_path):
    """Extract the reference band score and the student's answer from a test file.

    The file is searched for an ``Evaluation: <score>`` marker and an
    ``Answer text:`` marker; everything after the latter is taken as the answer.

    Args:
        file_path: Path to a UTF-8 encoded text file.

    Returns:
        ``(score, answer)`` where ``score`` is a float and ``answer`` is the
        stripped answer text, or ``(None, None)`` when either marker is missing.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()

    # The fractional part is optional so that whole-number bands written as
    # "Evaluation: 7" are parsed too instead of being silently dropped.
    eval_match = re.search(r"Evaluation:\s*(\d+(?:\.\d+)?)", content)
    answer_match = re.search(r"Answer text:\s*(.*)", content, re.DOTALL)

    if eval_match is None or answer_match is None:
        return None, None
    return float(eval_match.group(1)), answer_match.group(1).strip()

In [None]:
async def evaluate_answer(task_text, answer_text, model):
    """Ask one model to grade a student's answer and extract the overall band.

    The module-level ``BASE_PROMPT`` template is filled with the task and the
    answer and sent through the module-level ``client``.

    Args:
        task_text: Text substituted for the ``[TASK TEXT HERE]`` placeholder.
        answer_text: Text substituted for the ``[STUDENT RESPONSE HERE]`` placeholder.
        model: Model identifier passed to the chat-completions call.

    Returns:
        ``(score, content)``: ``score`` is the parsed ``overall_band_score`` as
        a float (``None`` when no score is found) and ``content`` is the raw
        reply text (``""`` when the reply carries no text).

    Raises:
        Whatever the underlying client raises for a failed request.
    """
    prompt = BASE_PROMPT.replace("[TASK TEXT HERE]", task_text).replace("[STUDENT RESPONSE HERE]", answer_text)

    response = await client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are an IELTS writing examiner."},
            {"role": "user", "content": prompt}
        ],
        temperature=0.7
    )

    # The message content may be None; normalise so the regex search cannot
    # raise TypeError.
    content = response.choices[0].message.content or ""
    # Tolerate whole-number bands ("7") and a few non-word characters between
    # the key and the number, e.g. '"overall_band_score": 6.5' or
    # '**overall_band_score:** 6.5'.
    score_match = re.search(r"overall_band_score\W{0,8}?(\d+(?:\.\d+)?)", content)
    score = float(score_match.group(1)) if score_match else None
    return score, content

In [None]:
import glob
import pandas as pd
from tqdm.asyncio import tqdm as async_tqdm

# Every .txt answer file under <MATERIALS_DIR>/test, searched recursively.
# NOTE: a later cell runs `from google.colab import files`, which rebinds this
# name; process_all() must be run before that cell.
files = glob.glob(os.path.join(MATERIALS_DIR, "test", "**", "*.txt"), recursive=True)

# Model identifiers passed to the chat-completions call, one request per model
# and answer file.
MODELS = [
    "openai/gpt-4o-mini",
    "google/gemini-2.0-flash-001",
    "microsoft/mai-ds-r1:free",
    "openai/gpt-4o"
]

# One dict per (file, model) pair is appended here by process_all().
results = []

def load_task_text(book_folder: str, filename: str):
    match = re.search(r"writing-test(\d+)-task(\d+)", filename)
    if not match:
        return "Unknown task text"

    test_number, task_number = match.group(1), int(match.group(2))

    task_path = os.path.join(
        MATERIALS_DIR,
        f"IELTS {book_folder}",
        f"{test_number} test",
        "Writing",
        f"writing-test{test_number}-task{task_number}.txt"
    )

    try:
        with open(task_path, 'r', encoding='utf-8') as f:
            return f.read().strip()
    except Exception as e:
        print(f"❌ Ошибка при загрузке задания {task_path}: {e}")
        return "Task text not found"

async def process_all():
    """Grade every answer file with every model and append rows to ``results``.

    For each path in the module-level ``files`` list the reference score and the
    answer are parsed, the matching task text is loaded, and each model in
    ``MODELS`` is queried in turn. One dict per (file, model) is appended with
    the keys ``file``, ``book``, ``true_score``, ``model``, ``score``, ``delta``.

    A failed request for one model is reported and recorded with ``score`` and
    ``delta`` set to ``None`` instead of aborting the whole (long) run.
    """
    for path in async_tqdm(files):
        filename = os.path.basename(path)
        book_folder = "15" if "ielts 15" in path else "14"

        true_score, student_response = parse_ielts_file(path)
        # Skip files without a parsable reference score. An explicit None check
        # keeps a legitimate 0.0 score from being treated as "missing".
        if true_score is None:
            continue

        task_text = load_task_text(book_folder, filename)

        for model in MODELS:
            try:
                score, _raw = await evaluate_answer(task_text, student_response, model)
            except Exception as exc:
                # Same policy as the later experiment cells: keep going and
                # store an empty score for the failed call.
                print(f"[warn] {model} failed on {filename}: {exc}")
                score = None

            results.append({
                "file": filename,
                "book": f"IELTS {book_folder}",
                "true_score": true_score,
                "model": model,
                "score": score,
                "delta": abs(score - true_score) if score is not None else None
            })

import asyncio
# Top-level await in a notebook cell: runs the whole benchmark and fills `results`.
await process_all()

100%|██████████| 16/16 [09:05<00:00, 34.08s/it]


In [None]:
# Turn the collected rows into a table and write it to the results folder.
df = pd.DataFrame(results)
excel_path = os.path.join(RESULTS_DIR, "writing_eval_results.xlsx")
df.to_excel(excel_path, index=False)
# Last expression of the cell: preview the first rows.
df.head()

Unnamed: 0,file,book,true_score,model,score,delta
0,writing-test1-task1-1.txt,IELTS 15,6.0,openai/gpt-4o-mini,5.0,1.0
1,writing-test1-task1-1.txt,IELTS 15,6.0,google/gemini-2.0-flash-001,,
2,writing-test1-task1-1.txt,IELTS 15,6.0,microsoft/mai-ds-r1:free,5.0,1.0
3,writing-test1-task1-1.txt,IELTS 15,6.0,openai/gpt-4o,5.0,1.0
4,writing-test1-task2-1.txt,IELTS 15,7.0,openai/gpt-4o-mini,5.5,1.5


In [None]:
# NOTE: this import rebinds the module-level name `files`, which an earlier cell
# bound to the list of answer paths iterated by process_all(). Later cells call
# `files.download(...)` and therefore rely on this binding.
from google.colab import files
files.download(excel_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import os
import re
import pandas as pd
import openai
from tqdm.notebook import tqdm
import glob

# Prompt variants: each one is substituted for the "[TASK TEXT HERE]"
# placeholder of the evaluation template by evaluate_prompt_variants().
prompt_variants = [
    "Evaluate this IELTS Writing Task response:",
    "Please assess the following IELTS essay:",
    "Grade the following IELTS Writing Task answer:",
    "Review and score the candidate's writing:",
    "How would you rate this IELTS essay?"
]

# Evaluation prompt template (same file that PROMPT_PATH points to).
with open(os.path.join(MATERIALS_DIR, "prompts", "writing", "evaluation.txt"), "r", encoding="utf-8") as f:
    base_prompt = f.read()

# Answer files of both books; only names matching "writing-test<N>-task<M>-1.txt"
# are kept (a trailing ".txt.txt" also matches because re.search is unanchored).
files_14 = glob.glob(os.path.join(MATERIALS_DIR, "test", "ielts 14", "writing", "*.txt"))
files_15 = glob.glob(os.path.join(MATERIALS_DIR, "test", "ielts 15", "writing", "*.txt"))
all_files = files_14 + files_15
filtered_files = [f for f in all_files if re.search(r"writing-test\d+-task\d+-1\.txt", f)]

# One record per answer file. "file" is the base name with the ".txt" /
# ".txt.txt" extension removed; "book" is derived from the folder in the path.
file_metadata = []
for path in filtered_files:
    book = "IELTS 14" if "ielts 14" in path else "IELTS 15"
    file = os.path.basename(path).replace(".txt.txt", "").replace(".txt", "")
    file_metadata.append({"full_path": path, "book": book, "file": file})

def parse_ielts_file(path):
    """Extract the reference band score and the student's answer from a test file.

    Note: this redefinition replaces the ``parse_ielts_file`` declared earlier
    in the notebook; unlike that one it returns ``""`` for a missing answer.

    Args:
        path: Path to a UTF-8 encoded text file containing ``Evaluation: <score>``
            and ``Answer text: <answer>``.

    Returns:
        ``(score, answer)``. ``score`` is a float, or ``None`` when no
        ``Evaluation:`` value is found (previously this raised AttributeError).
        ``answer`` is the stripped text after ``Answer text:``, or ``""``.
    """
    with open(path, "r", encoding="utf-8") as f:
        content = f.read()
    # Optional fractional part: "Evaluation: 7" is as valid as "Evaluation: 7.0".
    match_eval = re.search(r"Evaluation:\s*(\d+(?:\.\d+)?)", content)
    match_answer = re.search(r"Answer text:\s*(.*)", content, re.DOTALL)
    score = float(match_eval.group(1)) if match_eval else None
    answer = match_answer.group(1).strip() if match_answer else ""
    return score, answer

# Rows of the prompt-robustness experiment: one per (file, prompt variant).
results_robust = []

async def evaluate_prompt_variants():
    """Score every answer in ``file_metadata`` once per entry of ``prompt_variants``.

    Each variant is substituted for the ``[TASK TEXT HERE]`` placeholder of
    ``base_prompt`` and the request is sent to ``openai/gpt-4o-mini``. A row with
    the keys ``file``, ``book``, ``prompt_variant`` (1-based), ``prompt_text``,
    ``score``, ``true_score`` and ``raw_response`` is appended to
    ``results_robust`` for every call. When a call fails, ``score`` is ``None``
    and ``raw_response`` holds the error message.
    """
    for item in tqdm(file_metadata, desc="Evaluating files"):
        true_score, student_response = parse_ielts_file(item["full_path"])
        for i, variant in enumerate(prompt_variants):
            full_prompt = base_prompt.replace("[TASK TEXT HERE]", variant).replace("[STUDENT RESPONSE HERE]", student_response)
            try:
                completion = await client.chat.completions.create(
                    model="openai/gpt-4o-mini",
                    messages=[
                        {"role": "system", "content": "You are an IELTS writing examiner."},
                        {"role": "user", "content": full_prompt}
                    ],
                    temperature=0.7
                )
                # A reply without text is stored as "" rather than raising.
                response_text = completion.choices[0].message.content or ""
                # Accept whole-number bands and a few non-word characters
                # (quotes, '*', ':') between the key and the number.
                score_match = re.search(r"overall_band_score\W{0,8}?(\d+(?:\.\d+)?)", response_text)
                score = float(score_match.group(1)) if score_match else None
            except Exception as e:
                score, response_text = None, str(e)

            results_robust.append({
                "file": item["file"],
                "book": item["book"],
                "prompt_variant": i + 1,
                "prompt_text": variant,
                "score": score,
                "true_score": true_score,
                "raw_response": response_text
            })

# Top-level await in a notebook cell: runs the experiment and fills `results_robust`.
await evaluate_prompt_variants()

Evaluating files:   0%|          | 0/16 [00:00<?, ?it/s]

In [None]:
# All robustness rows as a table.
df_robust = pd.DataFrame(results_robust)

# Per-prompt statistics over every score produced with that prompt:
#   mean_score   - mean of the scores
#   mean_std_dev - standard deviation of the scores within the prompt group
#   range_avg    - max minus min score within the prompt group
#   cv           - std / mean (None when the mean is 0)
# Prompts are ordered from the smallest to the largest standard deviation.
prompt_summary = df_robust.groupby("prompt_text").agg(
    mean_score=("score", "mean"),
    mean_std_dev=("score", "std"),
    range_avg=("score", lambda x: x.max() - x.min()),
    cv=("score", lambda x: x.std() / x.mean() if x.mean() else None)
).sort_values("mean_std_dev")

In [None]:
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Font

# Two-sheet workbook: every individual score, then the per-prompt summary.
wb = Workbook()
ws1 = wb.active
ws1.title = "Все оценки"

for r in dataframe_to_rows(df_robust, index=False, header=True):
    ws1.append(r)
# Bold header row.
for cell in ws1[1]:
    cell.font = Font(bold=True)

ws2 = wb.create_sheet("Сводка по промптам")
for r in dataframe_to_rows(prompt_summary.reset_index(), index=False, header=True):
    ws2.append(r)
for cell in ws2[1]:
    cell.font = Font(bold=True)

final_path = "/mnt/data/robustness_gpt4o_mini.xlsx"
# wb.save() does not create missing parent folders; make sure the target
# directory exists so the cell also works on a fresh runtime.
os.makedirs(os.path.dirname(final_path), exist_ok=True)
wb.save(final_path)
final_path

'/mnt/data/robustness_gpt4o_mini.xlsx'

In [None]:
# `files` is the google.colab helper imported in an earlier cell.
files.download(final_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
sixth_prompt_text = "REAL_TASK_TEXT"
sixth_prompt_label = "Full task prompt (prod style)"

with open(os.path.join(MATERIALS_DIR, "prompts", "writing", "evaluation.txt"), "r", encoding="utf-8") as f:
    base_prompt = f.read()

# Прогон шестого промпта с реальными заданиями
for item in tqdm(file_metadata, desc="Evaluating 6th prompt"):
    true_score, student_response = parse_ielts_file(item["full_path"])

    task_path = item["full_path"].replace("-1.txt", ".txt").replace("test/", "").replace("writing/", "")
    task_file = os.path.join(MATERIALS_DIR, item["book"], task_path)

    try:
        with open(task_file, "r", encoding="utf-8") as f:
            real_task_text = f.read().strip()
    except:
        real_task_text = "Please evaluate this IELTS Writing Task."

    full_prompt = base_prompt.replace("[TASK TEXT HERE]", real_task_text).replace("[STUDENT RESPONSE HERE]", student_response)

    try:
        completion = await client.chat.completions.create(
            model="openai/gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an IELTS writing examiner."},
                {"role": "user", "content": full_prompt}
            ],
            temperature=0.7
        )
        response_text = completion.choices[0].message.content
        score_match = re.search(r"overall_band_score:\s*(\d+\.\d+)", response_text)
        score = float(score_match.group(1)) if score_match else None
    except Exception as e:
        score, response_text = None, str(e)

    results_robust.append({
        "file": item["file"],
        "book": item["book"],
        "prompt_variant": 6,
        "prompt_text": sixth_prompt_label,
        "score": score,
        "true_score": true_score,
        "raw_response": response_text
    })

Evaluating 6th prompt:   0%|          | 0/16 [00:00<?, ?it/s]

In [None]:
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Font

# Rebuild the table now that the sixth-prompt rows have been appended.
df_robust = pd.DataFrame(results_robust)

# Per-prompt statistics: mean, standard deviation, max-min range and
# coefficient of variation (std / mean) of the scores in each prompt group.
prompt_summary = df_robust.groupby("prompt_text").agg(
    mean_score=("score", "mean"),
    mean_std_dev=("score", "std"),
    range_avg=("score", lambda x: x.max() - x.min()),
    cv=("score", lambda x: x.std() / x.mean() if x.mean() else None)
).sort_values("mean_std_dev")

# Two-sheet workbook: every individual score, then the per-prompt summary.
wb = Workbook()
ws1 = wb.active
ws1.title = "Все оценки"
for r in dataframe_to_rows(df_robust, index=False, header=True):
    ws1.append(r)
# Bold header row.
for cell in ws1[1]:
    cell.font = Font(bold=True)

ws2 = wb.create_sheet("Сводка по промптам")
for r in dataframe_to_rows(prompt_summary.reset_index(), index=False, header=True):
    ws2.append(r)
for cell in ws2[1]:
    cell.font = Font(bold=True)

final_path = "/mnt/data/robustness_gpt4o_mini_all6.xlsx"
# wb.save() does not create missing parent folders; make sure the target
# directory exists so the cell also works on a fresh runtime.
os.makedirs(os.path.dirname(final_path), exist_ok=True)
wb.save(final_path)
final_path


'/mnt/data/robustness_gpt4o_mini_all6.xlsx'

In [None]:
# `files` is the google.colab helper imported in an earlier cell.
files.download(final_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Answer files used as the two few-shot examples. Built from MATERIALS_DIR so
# that moving the materials folder only requires changing the config cell
# (the resulting paths are unchanged for the current MATERIALS_DIR).
_examples_dir = os.path.join(MATERIALS_DIR, "test", "ielts 14", "writing")
# The doubled ".txt.txt" extension is the actual file name of this example.
good_example_path = os.path.join(_examples_dir, "writing-test2-task2-1.txt.txt")
bad_example_path = os.path.join(_examples_dir, "writing-test3-task2-1.txt")

# Evaluation prompt template with the task/response placeholders.
with open(PROMPT_PATH, "r", encoding="utf-8") as f:
    base_prompt_template = f.read()

def parse_ielts_file(path):
    """Extract the reference band score and the student's answer from a test file.

    Note: this is a redefinition of the ``parse_ielts_file`` declared in
    earlier cells and replaces them once this cell has run.

    Args:
        path: Path to a UTF-8 encoded text file containing ``Evaluation: <score>``
            and ``Answer text: <answer>``.

    Returns:
        ``(score, answer)``. ``score`` is a float, or ``None`` when no
        ``Evaluation:`` value is found (previously this raised AttributeError).
        ``answer`` is the stripped text after ``Answer text:``, or ``""``.
    """
    with open(path, "r", encoding="utf-8") as f:
        content = f.read()
    # Optional fractional part: "Evaluation: 7" is as valid as "Evaluation: 7.0".
    match_eval = re.search(r"Evaluation:\s*(\d+(?:\.\d+)?)", content)
    match_answer = re.search(r"Answer text:\s*(.*)", content, re.DOTALL)
    score = float(match_eval.group(1)) if match_eval else None
    answer = match_answer.group(1).strip() if match_answer else ""
    return score, answer

# Reference score and answer text of the two example files.
good_score, good_text = parse_ielts_file(good_example_path)
bad_score, bad_text = parse_ielts_file(bad_example_path)

# Few-shot block that is prepended to the prompt when few-shot mode is on.
# Both examples use the same generic task line and show only the overall band.
example_block = f"""
---EXAMPLES START---

TASK: Write an essay on a global topic.
STUDENT RESPONSE:
{good_text}
FEEDBACK:
overall_band_score: {good_score}

TASK: Write an essay on a global topic.
STUDENT RESPONSE:
{bad_text}
FEEDBACK:
overall_band_score: {bad_score}

---EXAMPLES END---
"""

In [None]:
excluded_files = {
    os.path.basename(good_example_path),
    os.path.basename(bad_example_path)
}

results_fewshot = []

# 4 режима
prompt_configs = [
    ("prod", False),
    ("review", False),
    ("prod", True),
    ("review", True),
]

for variant, use_few in prompt_configs:
    for item in tqdm(file_metadata, desc=f"{variant} / few={use_few}"):
        if item["file"] in excluded_files:
            continue

        true_score, student_response = parse_ielts_file(item["full_path"])

        if variant == "prod":
            task_path = item["full_path"].replace("-1.txt", ".txt").replace("test/", "").replace("writing/", "")
            try:
                task_text = open(os.path.join(MATERIALS_DIR, item["book"], task_path), "r", encoding="utf-8").read().strip()
            except:
                task_text = "Write an essay about a chart or topic."
        else:
            task_text = "Review and score the candidate's writing:"

        full_prompt = base_prompt_template.replace("[TASK TEXT HERE]", task_text).replace("[STUDENT RESPONSE HERE]", student_response)
        if use_few:
            full_prompt = example_block + "\n" + full_prompt

        try:
            completion = await client.chat.completions.create(
                model="openai/gpt-4o-mini",
                messages=[
                    {"role": "system", "content": "You are an IELTS writing examiner."},
                    {"role": "user", "content": full_prompt}
                ],
                temperature=0.7
            )
            response_text = completion.choices[0].message.content
            score_match = re.search(r"overall_band_score:\s*(\d+\.\d+)", response_text)
            score = float(score_match.group(1)) if score_match else None
        except Exception as e:
            score, response_text = None, str(e)

        results_fewshot.append({
            "file": item["file"],
            "book": item["book"],
            "prompt_type": variant,
            "few_shot": use_few,
            "score": score,
            "true_score": true_score,
            "raw_response": response_text
        })

prod / few=False:   0%|          | 0/16 [00:00<?, ?it/s]

review / few=False:   0%|          | 0/16 [00:00<?, ?it/s]

prod / few=True:   0%|          | 0/16 [00:00<?, ?it/s]

review / few=True:   0%|          | 0/16 [00:00<?, ?it/s]

In [None]:
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
from openpyxl.styles import Font

# All few-shot experiment rows as a table.
df_fewshot = pd.DataFrame(results_fewshot)

# Statistics per (prompt_type, few_shot) group:
#   mean_score / std_dev - mean and standard deviation of the model scores
#   mae                  - mean absolute difference between score and true_score
#   cv                   - std / mean (None when the mean is 0)
summary = df_fewshot.groupby(["prompt_type", "few_shot"]).agg(
    mean_score=("score", "mean"),
    std_dev=("score", "std"),
    # The lambda only receives the "score" column, so the matching true scores
    # are looked up in the full frame through the group's index.
    mae=("score", lambda x: (x - df_fewshot.loc[x.index, "true_score"]).abs().mean()),
    cv=("score", lambda x: x.std() / x.mean() if x.mean() else None),
).reset_index()

# Two-sheet workbook: every individual score, then the summary.
wb = Workbook()
ws1 = wb.active
ws1.title = "Все оценки"
for r in dataframe_to_rows(df_fewshot, index=False, header=True):
    ws1.append(r)
# Bold header row.
for cell in ws1[1]:
    cell.font = Font(bold=True)

ws2 = wb.create_sheet("Сводка")
for r in dataframe_to_rows(summary, index=False, header=True):
    ws2.append(r)
for cell in ws2[1]:
    cell.font = Font(bold=True)

few_path = "/mnt/data/few_shot_results.xlsx"
# wb.save() does not create missing parent folders; make sure the target
# directory exists so the cell also works on a fresh runtime.
os.makedirs(os.path.dirname(few_path), exist_ok=True)
wb.save(few_path)
few_path

'/mnt/data/few_shot_results.xlsx'

In [None]:
# `files` is the google.colab helper imported in an earlier cell.
files.download(few_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>