In [5]:
from dataclasses import dataclass, asdict
from typing import List, Dict, Optional, Tuple
import re
import math
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [8]:
# -----------------------------
# Helpers: basic text utilities
# -----------------------------

import re  # make sure this is imported somewhere

SENTENCE_SPLIT_REGEX = re.compile(r'[.!?]+')
WORD_REGEX = re.compile(r"\b[\w'-]+\b", re.UNICODE)

# Simple list of cohesion connectors
CONNECTORS = [
    # Common general connectors
    "however",
    "therefore",
    "indeed",
    "for this reason",
    "for these reasons",
    "to this extent",
    "to this end",
    "in addition",
    "moreover",
    "furthermore",
    "on the other hand",
    "for example",
    "for instance",
    "as a result",
    "consequently",

    # Simple ones already covered in the base list (kept for completeness)
    "and",
    "but",
    "because",
    "so",
    "although",
]

# Ground-truth reference metadata for your manual-search / web-scraping layer.
"""# Use a single, consistent definition of reference_db
reference_db = [
    ReferenceMetadata(
        id="mono-lingual",        # matches \cite{mono-lingual}
        title="(Fill real title for mono-lingual paper here)",
        source_type="conference",  # or "journal" / "arxiv" / "other"
        year=2023,
    ),
    ReferenceMetadata(
        id="Kang_2025",           # matches \cite{Kang_2025}
        title="(Fill real title for Kang 2025 paper here)",
        source_type="journal",
        year=2025,
    ),
    ReferenceMetadata(
        id="[1]",
        title="Attention Is All You Need",
        source_type="conference",
        year=2017,
    ),
    ReferenceMetadata(
        id="Smith2020",
        title="A Journal Paper on Transformers",
        source_type="journal",
        year=2020,
    ),
]"""

# Simple list of pattern phrases indicating "quasi-definitions"
QUASI_DEF_PATTERNS = [
    r"\bis defined as\b",
    r"\bcan be defined as\b",
    r"\bis called\b",
    r"\brefers to\b",
    r"\bis the process of\b",
    r"\bis a type of\b",
]

# Simple syllable counter (heuristic)
VOWELS = "aeiouy"

JARGON_LIST = [
    # General field
    "natural language processing",
    "nlp",
    "large language model",
    "large language models",
    "language model",
    "language models",
    "llm",
    "llms",
    "ai systems",
    "artificial intelligence",
    "corpora",
    "corpus",
    "pre-training",
    "pretrained",
    "pre-training corpora",
    "training corpora",

    # Evaluation / benchmarking
    "benchmark",
    "benchmarks",
    "testing dataset",
    "testing datasets",
    "questionnaire",
    "questionnaires",
    "reasoning capabilities",
    "reasoning tasks",
    "chain-of-thought",
    "cot",
    "token",
    "tokens",
    "efficiency",

    # Multilingual / monolingual
    "multilingual",
    "multi-lingual",
    "monolingual",
    "language choice",
    "language-specific properties",
    "lexical gap",
    "lexical gaps",
    "semantic distinctions",

    # Research / methodology
    "research questions",
    "research question",
    "reliability",
    "model performance",
    "model behaviour",
    "reasoning abilities",
]

  """# Use a single, consistent definition of reference_db


In [9]:
import json
import os


def count_syllables_in_word(word: str) -> int:
    word = word.lower()
    word = re.sub(r'[^a-z]', '', word)
    if not word:
        return 0
    syllables = 0
    prev_is_vowel = False
    for ch in word:
        is_vowel = ch in VOWELS
        if is_vowel and not prev_is_vowel:
            syllables += 1
        prev_is_vowel = is_vowel
    # Remove silent 'e' at end if there is more than one syllable
    if word.endswith("e") and syllables > 1:
        syllables -= 1
    return max(syllables, 1)

def split_sentences(text: str) -> List[str]:
    sentences = [s.strip() for s in SENTENCE_SPLIT_REGEX.split(text) if s.strip()]
    return sentences

def tokenize_words(text: str) -> List[str]:
    return WORD_REGEX.findall(text)


# -----------------------------
# Data models
# -----------------------------

@dataclass
class LinguisticClarityScores:
    words_per_sentence: float
    jargon_per_sentence: float
    flesch_reading_ease: float
    connectors_per_sentence: float

@dataclass
class ScientificAccuracyScores:
    quasi_definitions_per_answer: int
    bias_markers_per_answer: int
    cosine_similarity_q_a: float

@dataclass
class ReferenceMetadata:
    """Ground-truth reference info from manual search / web-scraping."""
    id: str                 # e.g. "[1]" or "Smith2020"
    title: str
    source_type: str        # "journal", "conference", "arxiv", "other"
    year: Optional[int]     # publication year

@dataclass
class ReferenceScores:
    hallucinated_citations_per_answer: int
    source_quality_score: float
    recency_score: float

@dataclass
class RubricResult:
    linguistic_clarity: LinguisticClarityScores
    scientific_accuracy: ScientificAccuracyScores
    references: ReferenceScores

    def as_dict(self):
        return {
            "linguistic_clarity": asdict(self.linguistic_clarity),
            "scientific_accuracy": asdict(self.scientific_accuracy),
            "references": asdict(self.references),
        }


# -----------------------------
# Rubric evaluator
# -----------------------------

class RubricEvaluator:
    def __init__(
        self,
        jargon_list: Optional[List[str]] = None,
        connectors: Optional[List[str]] = None,
        reference_db: Optional[List[ReferenceMetadata]] = None,
    ):
        """
        jargon_list: list of domain-specific technical terms (lowercase).
        connectors: list of cohesion connectors (lowercase).
        reference_db: ground-truth references obtained from manual search / web-scraping.
        """
        self.jargon_list = set((j.lower() for j in (jargon_list or [])))
        self.connectors = [c.lower() for c in (connectors or CONNECTORS)]
        self.reference_db = reference_db or []
        self.reference_ids = {ref.id for ref in self.reference_db}

    # -------------------------
    # Linguistic clarity
    # -------------------------

    def evaluate_linguistic_clarity(self, answer: str) -> LinguisticClarityScores:
        sentences = split_sentences(answer)
        num_sentences = max(len(sentences), 1)
        words = tokenize_words(answer)
        num_words = len(words)

        # Complexity: words per sentence
        words_per_sentence = num_words / num_sentences

        # Jargon words per sentence
        jargon_count = 0
        if self.jargon_list:
            for w in words:
                if w.lower() in self.jargon_list:
                    jargon_count += 1
        jargon_per_sentence = jargon_count / num_sentences

        # Flesch Reading Ease
        syllables = sum(count_syllables_in_word(w) for w in words) or 1
        words_per_sentence_for_flesch = num_words / num_sentences
        syllables_per_word = syllables / max(num_words, 1)
        flesch = 206.835 - 1.015 * words_per_sentence_for_flesch - 84.6 * syllables_per_word

        # Cohesion: average connectors per sentence
        connectors_count = 0
        lower_answer = answer.lower()
        for c in self.connectors:
            # count occurrences of connector as a word / phrase
            connectors_count += len(re.findall(r"\b" + re.escape(c) + r"\b", lower_answer))
        connectors_per_sentence = connectors_count / num_sentences

        return LinguisticClarityScores(
            words_per_sentence=words_per_sentence,
            jargon_per_sentence=jargon_per_sentence,
            flesch_reading_ease=flesch,
            connectors_per_sentence=connectors_per_sentence,
        )

    # -------------------------
    # Scientific accuracy
    # -------------------------

    def _count_quasi_definitions(self, answer: str) -> int:
        count = 0
        lower = answer.lower()
        for pattern in QUASI_DEF_PATTERNS:
            count += len(re.findall(pattern, lower))
        return count

    def _count_bias_markers(self, question: str, answer: str) -> int:
        """
        Heuristic: we treat 'bias' as copying loaded / subjective language
        from the question into the answer. This is just a proxy!
        """
        # Subjective / loaded words (extend as needed)
        loaded_terms = [
            "obviously", "clearly", "of course", "everyone knows",
            "always", "never", "terrible", "disaster", "perfect",
        ]

        q_words = set(w.lower() for w in tokenize_words(question))
        a_words = [w.lower() for w in tokenize_words(answer)]

        # Count loaded terms that appear in both question and answer
        count = 0
        for w in a_words:
            if w in loaded_terms and w in q_words:
                count += 1
        return count

    def _cosine_similarity_q_a(self, question: str, answer: str) -> float:
        """
        Accuracy via cosine similarity between TF-IDF representations of question and answer.
        This is *semantic-ish* but not using any LLM.
        """
        texts = [question, answer]
        vectorizer = TfidfVectorizer()
        tfidf = vectorizer.fit_transform(texts)
        sim = cosine_similarity(tfidf[0:1], tfidf[1:2])[0, 0]
        return float(sim)

    def evaluate_scientific_accuracy(self, question: str, answer: str) -> ScientificAccuracyScores:
        quasi_defs = self._count_quasi_definitions(answer)
        bias_markers = self._count_bias_markers(question, answer)
        cos_sim = self._cosine_similarity_q_a(question, answer)
        return ScientificAccuracyScores(
            quasi_definitions_per_answer=quasi_defs,
            bias_markers_per_answer=bias_markers,
            cosine_similarity_q_a=cos_sim,
        )

    # -------------------------
    # References
    # -------------------------

    def _extract_citation_ids(self, answer: str) -> List[str]:
        """
        Very simple citation patterns:
        - [1], [2], ...
        - (Smith, 2020) -> we map to "Smith2020"
        You can extend this depending on your citation style.
        """
        citation_ids = []

        # Numeric style: [1], [2]
        for cid in re.findall(r"\[(\d+)\]", answer):
            citation_ids.append(f"[{cid}]")

        # Author-year: (Smith, 2020)
        for auth, year in re.findall(r"\(([A-Z][A-Za-z]+),\s*(\d{4})\)", answer):
            citation_ids.append(f"{auth}{year}")

        return citation_ids

    def _compute_source_quality_score(self, used_ids: List[str]) -> float:
        """
        Journal >= Conference >= Arxiv >= Other
        We compute an average quality score over all matched references.
        """
        quality_map = {
            "journal": 3,
            "conference": 2,
            "arxiv": 1,
            "other": 0,
        }
        scores = []
        for cid in used_ids:
            ref = next((r for r in self.reference_db if r.id == cid), None)
            if ref:
                scores.append(quality_map.get(ref.source_type.lower(), 0))
        if not scores:
            return 0.0
        return sum(scores) / len(scores)

    def _compute_recency_score(self, used_ids: List[str]) -> float:
        """
        Higher is 'more recent on average'.
        We convert publication year into a normalized score [0,1] over the observed range in reference_db.
        """
        years_all = [r.year for r in self.reference_db if r.year is not None]
        if not years_all:
            return 0.0
        min_year, max_year = min(years_all), max(years_all)
        if min_year == max_year:
            return 1.0

        years_used = []
        for cid in used_ids:
            ref = next((r for r in self.reference_db if r.id == cid and r.year is not None), None)
            if ref:
                years_used.append(ref.year)

        if not years_used:
            return 0.0

        # Normalize: (year - min_year) / (max_year - min_year)
        norm_scores = [(y - min_year) / (max_year - min_year) for y in years_used]
        return sum(norm_scores) / len(norm_scores)

    def evaluate_references(self, answer: str) -> ReferenceScores:
        used_cids = self._extract_citation_ids(answer)

        hallucinated = 0
        for cid in used_cids:
            if cid not in self.reference_ids:
                hallucinated += 1

        source_quality = self._compute_source_quality_score(used_cids)
        recency_score = self._compute_recency_score(used_cids)

        return ReferenceScores(
            hallucinated_citations_per_answer=hallucinated,
            source_quality_score=source_quality,
            recency_score=recency_score,
        )

    # -------------------------
    # Full rubric evaluation
    # -------------------------

    def evaluate(
        self,
        question: str,
        answer: str,
    ) -> RubricResult:
        """
        Main entry point.
        """
        ling = self.evaluate_linguistic_clarity(answer)
        sci = self.evaluate_scientific_accuracy(question, answer)
        refs = self.evaluate_references(answer)
        return RubricResult(
            linguistic_clarity=ling,
            scientific_accuracy=sci,
            references=refs,
        )



In [10]:
from pathlib import Path

RQ1_QUESTION = (
    "How does lexical gap between languages impact the reasoning abilities of models?"
)

RQ2_QUESTION = (
    "Do monolingual models perform differently on reasoning tasks depending on the "
    "language in which they are trained? And does it affect their efficiency?"
)

# --------------------------------------------------
# 2. File mapping: which model answer is in which file
# --------------------------------------------------

BASE_DIR = Path.cwd()  # change this to Path.cwd() / "MONOLINGUAL_LLMS" if needed

print("Using BASE_DIR:", BASE_DIR)

ANSWER_FILES = {
    "RQ1": {
        "gpt-5.1": "gpt-5.1(RQ1).txt",
        "deepseek-R1": "deepseek-R1(RQ1).txt",
    },
    "RQ2": {
        "gpt-5.1": "gpt-5.1(RQ2).txt",
        "deepseek-R1": "deepseek-R1(RQ2).txt",
    },
}

RQ_TEXT = {
    "RQ1": RQ1_QUESTION,
    "RQ2": RQ2_QUESTION,
}

# --------------------------------------------------
# 3. Instantiate evaluator for this project context
#    (assumes JARGON_LIST, CONNECTORS, reference_db, RubricEvaluator already defined)
# --------------------------------------------------

evaluator = RubricEvaluator(
    jargon_list=JARGON_LIST,
    connectors=CONNECTORS,
    #reference_db=reference_db,
)

# --------------------------------------------------
# 4. Helper to safely read text files
# --------------------------------------------------

def read_answer_file(relative_path: str) -> str:
    full_path = BASE_DIR / relative_path
    print(f"Reading: {full_path}")  # debug print
    if not full_path.exists():
        raise FileNotFoundError(f"Answer file not found: {full_path}")
    with open(full_path, "r", encoding="utf-8") as f:
        return f.read()

# --------------------------------------------------
# 5. Run evaluation
# --------------------------------------------------

def evaluate_all_answers():
    results = {}

    for rq_id, models in ANSWER_FILES.items():
        question_text = RQ_TEXT[rq_id]
        results[rq_id] = {}

        for model_name, rel_path in models.items():
            answer_text = read_answer_file(rel_path)

            rubric_result = evaluator.evaluate(
                question=question_text,
                answer=answer_text,
            )

            results[rq_id][model_name] = rubric_result.as_dict()

            print("=" * 70)
            print(f"{rq_id} – {model_name}")
            print(f"Question: {question_text}")
            print(f"File: {rel_path}")
            print("- Rubric scores:")
            print(json.dumps(rubric_result.as_dict(), indent=2))

    return results

Using BASE_DIR: /Users/swetangkrishna/Desktop/monolingual_llms


In [12]:
if __name__ == "__main__":
    all_results = evaluate_all_answers()
    with open(BASE_DIR / "rubric_results_RQ1_RQ2.json", "w", encoding="utf-8") as f:
        json.dump(all_results, f, indent=2, ensure_ascii=False)
    print("\nSaved all rubric results to", BASE_DIR / "rubric_results_RQ1_RQ2.json")

Reading: /Users/swetangkrishna/Desktop/monolingual_llms/gpt-5.1(RQ1).txt
RQ1 – gpt-5.1
Question: How does lexical gap between languages impact the reasoning abilities of models?
File: gpt-5.1(RQ1).txt
- Rubric scores:
{
  "linguistic_clarity": {
    "words_per_sentence": 9.987804878048781,
    "jargon_per_sentence": 0.2926829268292683,
    "flesch_reading_ease": 28.478696730099188,
    "connectors_per_sentence": 0.39634146341463417
  },
  "scientific_accuracy": {
    "quasi_definitions_per_answer": 0,
    "bias_markers_per_answer": 0,
    "cosine_similarity_q_a": 0.2951676132706148
  },
  "references": {
    "hallucinated_citations_per_answer": 0,
    "source_quality_score": 0.0,
    "recency_score": 0.0
  }
}
Reading: /Users/swetangkrishna/Desktop/monolingual_llms/deepseek-R1(RQ1).txt
RQ1 – deepseek-R1
Question: How does lexical gap between languages impact the reasoning abilities of models?
File: deepseek-R1(RQ1).txt
- Rubric scores:
{
  "linguistic_clarity": {
    "words_per_sentenc