# Import Gemini

Link to the documentation: https://ai.google.dev/gemini-api/docs

In [1]:
!pip install google-generativeai --quiet

[33m  DEPRECATION: Building 'google-api-python-client' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'google-api-python-client'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m

In [25]:
import os

import google.generativeai as genai

# Configure the API key from the environment instead of a literal in the notebook,
# so the secret is never saved together with the file.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")
genai.configure(api_key=api_key)

# Use the Gemini model
model = genai.GenerativeModel("gemini-2.0-flash-lite")  # or "gemini-1.5-pro"

# Smoke test: one short request to check that the key and the model name work
response = model.generate_content("Explain how AI works in a few words")
print(response.text)

AI learns and reasons like humans, but using computers.



# Criteria

The following criteria are used to assign a score to the corrected OCR texts:

| **Score**         | **Criteria**                                                                                                                                                                    |
| ----------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| **1 - Very Poor** | The text remains largely unreadable. Numerous errors persist (spelling, grammar, punctuation). The overall meaning is lost or extremely unclear. Little to no improvement made. |
| **2 - Poor**      | Slightly more readable than the original, but many errors remain. Several sentences are incorrect or ambiguous. Words are still distorted or missing.                           |
| **3 - Fair**      | Most obvious errors have been corrected. The text is generally understandable, but noticeable mistakes and awkward phrasing remain. The flow may be choppy.                     |
| **4 - Good**      | The text is readable and coherent. Only minor errors remain. There’s overall consistency, though a few syntax or word choice issues may still be present.                       |
| **5 - Excellent** | The text is fully corrected: no detectable mistakes, perfect grammar, punctuation, and syntax. The result is fluent, natural, and faithful to the original content.             |


# Import data

## Data

In [4]:
import json
import os

# Folder where the data files were extracted
extract_dir = os.path.expanduser("~/work/MNLP-project-2/data/eng")

# Full paths to the two JSON files
clean_path = os.path.join(extract_dir, "the_vampyre_clean.json")
ocr_path = os.path.join(extract_dir, "the_vampyre_ocr.json")

# Load the JSON content of each file
with open(clean_path, encoding="utf-8") as clean_file:
    clean_data = json.load(clean_file)

with open(ocr_path, encoding="utf-8") as ocr_file:
    ocr_data = json.load(ocr_file)

In [3]:
def concat_values_dict(d, n_items=None):
    """
    Concatenate the values of a dict keyed by "0", "1", "2", ... into one string.

    Values are joined in numeric key order with a newline between consecutive
    elements. An index that has no entry in the dict contributes an empty string.

    Args:
        d (dict): Dictionary whose keys are the string form of consecutive integers.
        n_items (int, optional): Number of indices to read, starting at 0.
            Defaults to one more than the largest numeric key found in ``d``
            (0 for a dict without numeric keys), so the whole dict is covered
            whatever its size.

    Returns:
        str: Concatenated text.
    """
    if n_items is None:
        # Derive the range from the data instead of assuming a fixed number of entries.
        numeric_keys = [int(k) for k in d if isinstance(k, str) and k.isdecimal()]
        n_items = max(numeric_keys) + 1 if numeric_keys else 0
    return '\n'.join(d.get(str(i), "") for i in range(n_items))

# Join the entries of the clean data into a single newline-separated string
clean_data_text = concat_values_dict(clean_data)

In [14]:
# Number of characters in the concatenated clean text
len(clean_data_text)

62205

## Corrected data

In [9]:
import pandas as pd

# Load 'T5_correction.csv'; the first CSV column is used as the row index
T5_correction = pd.read_csv('T5_correction.csv', index_col=0)
# Preview the first rows
T5_correction.head()

Unnamed: 0_level_0,original_text,corrected_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,THE VAMPYRE;\nA Tale.\nBy John William Polidor...,THE VAMPYRE;\nct: A Tale.\nBy John William Pol...
1,THE VAMPYRE;\nA Tale.\nBy Johri William Polido...,THE VAMPYRE;\nct: A Tale.\nBy John William Pol...
2,THE VAMPYRE;\nA Tale.\nBy John William Polidor...,THE VAMPYRE;\nct: A Tale.\nBy John William Pol...
3,THE VAMPYRE;\nA Tale.\nBy John Wjlliam Polidor...,THE VAMPYRE;\nct: A Tale.\nBy John William Pol...
4,THE VAMPYRE;\nA Tale.\nBy Jahn William PoIjdor...,THE VAMPYRE;\nct: A Tale.\nBy John William Poj...


We cannot send all the corrected texts to the LLM judge at once, so we split them into batches:

In [20]:
# Batch size (number of rows per batch)
batch_size = 1

# Split the DataFrame into consecutive batches of `batch_size` rows
batches = [T5_correction[i:i + batch_size] for i in range(0, len(T5_correction), batch_size)]

In [21]:
# Inspect the first batch
batches[0]

Unnamed: 0_level_0,original_text,corrected_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,THE VAMPYRE;\nA Tale.\nBy John William Polidor...,THE VAMPYRE;\nct: A Tale.\nBy John William Pol...


In [33]:
# Length of the first 'corrected_text' entry of the first batch
len(batches[0]['corrected_text'].to_list()[0])

60430

# Evaluation using an LLM as a judge

In [None]:
import time
import pandas as pd
from google.generativeai import GenerativeModel
import google.generativeai as genai

# Configure the API key.
# The key is read from the environment so that it is never stored in the notebook
# or in version control. Export GOOGLE_API_KEY before starting the kernel.
# NOTE(review): a key literal was previously committed on this line; treat that
# key as compromised and rotate it.
import os

api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise RuntimeError("Set the GOOGLE_API_KEY environment variable before running this cell.")
genai.configure(api_key=api_key)

# Initialise Gemini model
model = GenerativeModel("gemini-2.0-flash-lite")

# Split the text into chunks so that a single request stays within the input limits
def chunk_text(text, max_chunk_size=5000):
    """
    Split ``text`` into consecutive pieces of at most ``max_chunk_size`` characters.

    When a space exists inside the current window (after its first character),
    the cut is moved back to the last such space instead of falling at the hard
    limit. Each piece is stripped of surrounding whitespace, and pieces that end
    up empty are dropped.

    Args:
        text (str): Text to split.
        max_chunk_size (int): Maximum number of characters per chunk (>= 1).

    Returns:
        list[str]: Non-empty chunks, in reading order.

    Raises:
        ValueError: If ``max_chunk_size`` is smaller than 1.
    """
    if max_chunk_size < 1:
        # A non-positive size would never advance `start` and loop forever.
        raise ValueError("max_chunk_size must be at least 1")

    chunks = []
    start = 0
    while start < len(text):
        end = start + max_chunk_size
        if end < len(text):
            # Only cut at a space located after `start`, otherwise no progress is made.
            last_space = text.rfind(" ", start, end)
            if last_space > start:
                end = last_space
        piece = text[start:end].strip()
        if piece:
            # Whitespace-only windows carry nothing to evaluate.
            chunks.append(piece)
        start = end
    return chunks

# Evaluation prompt that asks for a score only
def build_prompt_score_only(chunk):
    """
    Build the judge prompt for one chunk of text.

    The prompt lists the five scoring levels, asks for the numeric score alone
    and ends with the chunk wrapped in triple double quotes.

    Args:
        chunk (str): Text to insert at the end of the prompt.

    Returns:
        str: The complete prompt.
    """
    prompt_lines = (
        "You are a text quality evaluator. Evaluate the following corrected OCR text based on the detailed criteria below:",
        "",
        "1 - Very Poor: The text remains largely unreadable. Numerous errors persist (spelling, grammar, punctuation). The overall meaning is lost or extremely unclear. Little to no improvement made.",
        "2 - Poor: Slightly more readable than the original, but many errors remain. Several sentences are incorrect or ambiguous. Words are still distorted or missing.",
        "3 - Fair: Most obvious errors have been corrected. The text is generally understandable, but noticeable mistakes and awkward phrasing remain. The flow may be choppy.",
        "4 - Good: The text is readable and coherent. Only minor errors remain. There’s overall consistency, though a few syntax or word choice issues may still be present.",
        "5 - Excellent: The text is fully corrected: no detectable mistakes, perfect grammar, punctuation, and syntax. The result is fluent, natural, and faithful to the original content.",
        "",
        "Give only the numeric score (1 to 5). No explanation.",
        "",
        "Corrected Text:",
        '"""' + f"{chunk}" + '"""',
    )
    return "\n".join(prompt_lines)

# Score every chunk of a text
def evaluate_text(text):
    """
    Ask the Gemini model for a 1-5 quality score for each chunk of ``text``.

    The text is split with ``chunk_text``; each chunk is wrapped in the prompt
    built by ``build_prompt_score_only`` and sent to the module-level ``model``.

    Args:
        text (str): Text to evaluate.

    Returns:
        list[int]: One score per chunk, in order. ``-1`` marks a chunk whose
        request raised an exception or whose reply did not contain exactly one
        digit between 1 and 5.
    """
    import re  # local import so this block does not depend on another cell

    # Accept replies such as "4", "4.", "**4**" or "Score: 4", but reject any
    # reply containing a second digit ("4/5", "45"), which would be ambiguous.
    score_pattern = re.compile(r"\D*([1-5])\D*")

    scores_for_text = []
    for i, chunk in enumerate(chunk_text(text)):
        prompt = build_prompt_score_only(chunk)
        score = -1
        try:
            response = model.generate_content(prompt)
            raw_text = response.candidates[0].content.parts[0].text.strip()
            match = score_pattern.fullmatch(raw_text)
            if match:
                score = int(match.group(1))
            else:
                # Make rejected replies visible instead of silently storing -1.
                print(f"Unparseable score on chunk {i}: {raw_text!r}")
        except Exception as e:
            # Best effort: a failed request must not abort the remaining chunks.
            print(f"Error on chunk {i}: {e}")

        scores_for_text.append(score)
    return scores_for_text

# Main entry point
def main():
    """
    Score every text of ``ocr_data`` and save the scores to 'ocr_gemini_scores.csv'.

    Each text is passed to ``evaluate_text``; the function sleeps 60 seconds
    between two texts. The CSV holds one row per (text, chunk) pair with the
    columns ``text_index``, ``chunk_index`` and ``score``.
    """
    # NOTE(review): this scores the raw OCR texts in `ocr_data`, while the prompt
    # speaks of "corrected" text and the notebook loads T5 corrections earlier —
    # confirm this is the intended input.
    total_texts = len(ocr_data)
    records = []

    for position, (key, text) in enumerate(ocr_data.items()):
        print(f"Processing text {key} / {total_texts}...")
        chunk_scores = evaluate_text(text)
        records.extend(
            {"text_index": key, "chunk_index": chunk_index, "score": score}
            for chunk_index, score in enumerate(chunk_scores)
        )

        # Pause between texts to stay under the API rate limit (not after the last one)
        if position < total_texts - 1:
            print("Waiting 60 seconds before next text...")
            time.sleep(60)

    # Save the collected rows to a CSV file
    pd.DataFrame(records).to_csv("ocr_gemini_scores.csv", index=False)
    print("Finished! CSV saved as 'ocr_gemini_scores.csv'.")

# Run the evaluation when this code is executed as the main module
# (the captured output below shows that the notebook cell does call main()).
if __name__ == "__main__":
    main()

Processing text 0 / 48...
Waiting 60 seconds before next text...
