# Import gemini

Link to the documentation: https://ai.google.dev/gemini-api/docs

In [1]:
!pip install google-generativeai --quiet

[33m  DEPRECATION: Building 'google-api-python-client' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'google-api-python-client'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m

In [None]:
import os

import google.generativeai as genai

# Smoke test: send one short prompt to check that the API key and the model work.

# The key is read from the environment so that it is never stored in the notebook.
api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable before running this cell."
    )

genai.configure(api_key=api_key)
model = genai.GenerativeModel("gemini-2.0-flash-lite")  # or "gemini-1.5-pro"

response = model.generate_content("Explain how AI works in a few words")
print(response.text)

# Criteria

Here are the criteria used to assign a score to corrected OCR texts:

| **Score**         | **Criteria**                                                                                                                                                                    |
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **1 - Very Poor** | The text remains largely unreadable. Numerous errors persist (spelling, grammar, punctuation). The overall meaning is lost or extremely unclear. Little to no improvement made. |
| **2 - Poor**      | Slightly more readable than the original, but many errors remain. Several sentences are incorrect or ambiguous. Words are still distorted or missing.                           |
| **3 - Fair**      | Most obvious errors have been corrected. The text is generally understandable, but noticeable mistakes and awkward phrasing remain. The flow may be choppy.                     |
| **4 - Good**      | The text is readable and coherent. Only minor errors remain. There’s overall consistency, though a few syntax or word choice issues may still be present.                       |
| **5 - Excellent** | The text is fully corrected: no detectable mistakes, perfect grammar, punctuation, and syntax. The result is fluent, natural, and faithful to the original content.             |


# Import data

## Clean data

In [2]:
import json
import os

# Folder containing the English data files ("~" is expanded to the current user's home directory).
extract_dir = os.path.expanduser("~/work/MNLP-project-2/data/eng")

# JSON file with the clean version of the text.
clean_path = os.path.join(extract_dir, "the_vampyre_clean.json")

# Read the whole file, then decode it as JSON.
with open(clean_path, mode="r", encoding="utf-8") as clean_file:
    clean_data = json.loads(clean_file.read())

In [3]:
def concat_values_dict(d, n_items=48):
    """
    Concatenate the values of a dict whose keys are the strings "0", "1", ...

    Values are read in increasing index order and joined with a newline
    character. A missing key contributes an empty string, so the result
    always contains ``n_items`` newline-separated segments.

    Args:
        d (dict): Dictionary mapping stringified indices to text.
        n_items (int): Number of consecutive indices, starting at 0, to read.
            Defaults to 48, the value that used to be hard-coded here.

    Returns:
        str: Concatenated text.
    """
    return "\n".join(d.get(str(index), "") for index in range(n_items))

# Join the values stored under keys "0".."47" of clean_data into one newline-separated string.
clean_data_text = concat_values_dict(clean_data)

## Corrected data

In [4]:
import pandas as pd

# Load the back-translation results; the first CSV column becomes the row index.
back_translation_correction = pd.read_csv(
    "back_translation_correction.csv",
    index_col=0,
)
back_translation_correction.head()

Unnamed: 0_level_0,original_text,translated_fr_text,back_translated_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,THE VAMPYRE;\nA Tale.\nBy John William Polidor...,LE VAMPYRE;\nUn conte.\nPar John William Polid...,VAMPYRE;\nA tale.\nBy John William Polidori\nT...
1,THE VAMPYRE;\nA Tale.\nBy Johri William Polido...,LE VAMPYRE;\nUn conte.\nPar Johri William Poli...,VAMPYRE;\nA tale.\nBy Johri William Polidori\n...
2,THE VAMPYRE;\nA Tale.\nBy John William Polidor...,LE VAMPYRE;\nUn conte.\nPar John William Polid...,VAMPYRE;\nA tale.\nBy John William Polidori\nT...
3,THE VAMPYRE;\nA Tale.\nBy John Wjlliam Polidor...,LE VAMPYRE;\nUn conte.\nPar John Wjlliam Polid...,VAMPYRE;\nA tale.\nBy John Wjlliam Polidori\nT...
4,THE VAMPYRE;\nA Tale.\nBy Jahn William PoIjdor...,LE VAMPYRE;\nUn conte.\nPar Jahn William PoIjd...,VAMPYRE;\nA tale.\nBy Jahn William PoIjdori\nT...


We cannot feed all the translations to the LLM judge at once, so we split them into batches:

In [4]:
# Number of rows per batch
batch_size = 1

# Split the DataFrame into consecutive slices of `batch_size` rows
batch_starts = range(0, len(back_translation_correction), batch_size)
batches = [
    back_translation_correction[start:start + batch_size]
    for start in batch_starts
]

In [5]:
# Display the first batch (a slice of `batch_size` rows of the DataFrame).
batches[0]

Unnamed: 0_level_0,original_text,translated_fr_text,back_translated_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,THE VAMPYRE;\nA Tale.\nBy John William Polidor...,LE VAMPYRE;\nUn conte.\nPar John William Polid...,VAMPYRE;\nA tale.\nBy John William Polidori\nT...


In [6]:
# Number of characters in the back-translated text of the first batch.
len(batches[0]["back_translated_text"].iloc[0])

61452

# Evaluation with an LLM as a judge

In [None]:
import pandas as pd
import time
from google.generativeai import GenerativeModel
import google.generativeai as genai

import os

# Build the Gemini model (a small model is used to stay within the API limits).
# The key is read from the environment so that it is never stored in the notebook.
api_key = os.environ.get("GEMINI_API_KEY") or os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the GEMINI_API_KEY (or GOOGLE_API_KEY) environment variable before running this cell."
    )
genai.configure(api_key=api_key)
model = GenerativeModel("gemini-2.0-flash-lite")

def chunk_text(text, max_chunk_size=5000):
    """
    Split ``text`` into consecutive chunks of at most ``max_chunk_size`` characters.

    The whole text is too large to be sent as a single input, hence the split.
    A chunk boundary is moved back to the last space inside the current window
    when there is one, so that words are not cut in half; a run longer than
    ``max_chunk_size`` without any space is cut at the size limit. Each chunk is
    stripped of surrounding whitespace, and chunks that end up empty are dropped.

    Args:
        text (str): Text to split.
        max_chunk_size (int): Maximum number of characters per chunk; must be positive.

    Returns:
        list[str]: Non-empty chunks, in order of appearance.

    Raises:
        ValueError: If ``max_chunk_size`` is not positive.
    """
    # A non-positive size would never advance `start` and loop forever.
    if max_chunk_size <= 0:
        raise ValueError(f"max_chunk_size must be positive, got {max_chunk_size}")

    chunks = []
    start = 0
    text_length = len(text)
    while start < text_length:
        end = start + max_chunk_size
        if end < text_length:
            # Prefer cutting on the last space of the window; `> start` guarantees progress.
            last_space = text.rfind(" ", start, end)
            if last_space > start:
                end = last_space
        chunk = text[start:end].strip()
        # Whitespace-only slices carry nothing to evaluate, so they are not kept.
        if chunk:
            chunks.append(chunk)
        start = end
    return chunks

def build_prompt_score_only(chunk):
    """
    Build the judge prompt for one chunk of corrected OCR text.

    The prompt contains the instruction, the five-level scoring rubric, the
    request to answer with the numeric score only, and finally the chunk
    itself wrapped in triple double quotes.

    Args:
        chunk: Text to evaluate; it is inserted into the prompt as is.

    Returns:
        str: The complete prompt.
    """
    rubric = (
        "1 - Very Poor: The text remains largely unreadable. Numerous errors persist (spelling, grammar, punctuation). The overall meaning is lost or extremely unclear. Little to no improvement made.",
        "2 - Poor: Slightly more readable than the original, but many errors remain. Several sentences are incorrect or ambiguous. Words are still distorted or missing.",
        "3 - Fair: Most obvious errors have been corrected. The text is generally understandable, but noticeable mistakes and awkward phrasing remain. The flow may be choppy.",
        "4 - Good: The text is readable and coherent. Only minor errors remain. There’s overall consistency, though a few syntax or word choice issues may still be present.",
        "5 - Excellent: The text is fully corrected: no detectable mistakes, perfect grammar, punctuation, and syntax. The result is fluent, natural, and faithful to the original content.",
    )
    # Sections are separated by one blank line; rubric levels sit on consecutive lines.
    sections = [
        "You are a text quality evaluator. Evaluate the following corrected OCR text based on the detailed criteria below:",
        "\n".join(rubric),
        "Give only the numeric score (1 to 5). No explanation.",
        f'Corrected Text:\n"""{chunk}"""',
    ]
    return "\n\n".join(sections)

def _parse_score(raw_text):
    """Return the 1-5 score found in a model reply, or -1 when the reply is not a lone score.

    Accepts the bare digit as well as common decorations around it
    (whitespace, punctuation, markdown emphasis, a leading "Score:").
    Anything that contains other words or digits (e.g. "4/5") is rejected.
    """
    import re

    match = re.fullmatch(r"\W*(?:score\W*)?([1-5])\W*", raw_text, flags=re.IGNORECASE)
    return int(match.group(1)) if match else -1


def evaluate_text(text):
    """
    Return the evaluation score of every chunk of a text.

    The text is split with ``chunk_text`` (chunks of at most 5000 characters),
    each chunk is turned into a prompt with ``build_prompt_score_only`` and sent
    to the module-level ``model``.

    Args:
        text (str): Text to evaluate.

    Returns:
        list[int]: One entry per chunk, in order: the score (1 to 5), or -1 when
        the request failed or the reply could not be read as a score.
    """
    scores_for_text = []

    for i, chunk in enumerate(chunk_text(text, max_chunk_size=5000)):
        prompt = build_prompt_score_only(chunk)
        try:
            response = model.generate_content(prompt)
            raw_text = response.candidates[0].content.parts[0].text.strip()
            score = _parse_score(raw_text)
            if score == -1:
                # Keep a trace of what the model answered so -1 entries can be diagnosed.
                print(f"Unparseable reply on chunk {i}: {raw_text[:80]!r}")
        except Exception as e:
            # Best effort: one failed request must not abort the whole text.
            print(f"Error on chunk {i}: {e}")
            score = -1

        scores_for_text.append(score)
    return scores_for_text

def main(input_csv="back_translation_correction.csv",
         output_csv="Back_translation_gemini_scores_all_texts.csv",
         pause_seconds=60):
    """
    Score every back-translated text of a CSV file and save the scores as CSV.

    Each value of the ``back_translated_text`` column is passed to
    ``evaluate_text``; the output file has one row per (text, chunk) with the
    columns ``text_index``, ``chunk_index`` and ``score``. ``text_index`` is the
    0-based position of the text in the input file.

    Args:
        input_csv (str): Path of the CSV file to read.
        output_csv (str): Path of the CSV file to write.
        pause_seconds (int | float): Pause between two texts, because of the
            API; no pause is made when it is 0 or less.
    """
    df = pd.read_csv(input_csv)
    texts = df["back_translated_text"]
    n_texts = len(texts)
    columns = ["text_index", "chunk_index", "score"]
    rows = []

    try:
        for text_index, text in enumerate(texts):
            print(f"Processing text {text_index + 1} / {n_texts}...")

            # Empty CSV cells are read as NaN (a float), which cannot be chunked.
            if not isinstance(text, str):
                print(f"Skipping text {text_index + 1}: no text to evaluate.")
                continue

            for chunk_index, score in enumerate(evaluate_text(text)):
                rows.append({
                    "text_index": text_index,
                    "chunk_index": chunk_index,
                    "score": score
                })

            # Break between texts because of the API (not needed after the last one)
            if pause_seconds > 0 and text_index < n_texts - 1:
                print(f"Waiting {pause_seconds} seconds before next text...")
                time.sleep(pause_seconds)
    finally:
        # Written even if the run is interrupted, so already-scored texts are not lost.
        # Explicit columns keep the header in the file when no row was produced.
        df_scores = pd.DataFrame(rows, columns=columns)
        df_scores.to_csv(output_csv, index=False)

    print("Over : Look at your new CSV files :) ")

# Start the evaluation only when this code runs as the main program (not when it is imported).
if __name__ == "__main__":
    main()

Processing text 1 / 48...
Waiting 60 seconds before next text...
