In [None]:
import os
# Paste your OpenAI API key between the quotes. The key is exported under
# OPENAI_API_KEY (the original spelling "OPENAL_API_KEY" was a typo, so the key
# never reached the variable the rest of this notebook uses). While the
# placeholder is still empty, nothing is written, so a key already present in
# the environment is not overwritten with an empty string.
_api_key = ""
if _api_key:
    os.environ["OPENAI_API_KEY"] = _api_key

In [None]:
import json
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema import SystemMessage, HumanMessage
from langchain.agents import AgentExecutor, create_openai_functions_agent

# 1. Trait persona generation agent
def _parse_json_reply(text):
    """Parse a model reply as JSON, tolerating a surrounding Markdown code fence."""
    cleaned = text.strip()
    if cleaned.startswith("```"):
        # Drop the opening fence line (``` or ```json) ...
        first_newline = cleaned.find("\n")
        cleaned = cleaned[first_newline + 1:] if first_newline != -1 else ""
        # ... and the closing fence, if present.
        cleaned = cleaned.rstrip()
        if cleaned.endswith("```"):
            cleaned = cleaned[:-3]
    return json.loads(cleaned)


def create_trait_personas(essay, prompt, max_score, min_score, traits=None):
    """Ask the model for evaluation traits and build one persona per trait group.

    The model is asked to propose traits for the given essay, group related
    traits together, and answer in JSON. Each entry of the reply's
    ``grouped_traits`` mapping becomes one evaluator persona description.

    Args:
        essay: Text of the essay to be evaluated.
        prompt: The writing prompt the essay responds to.
        max_score: Upper end of the scoring range shown to the model.
        min_score: Lower end of the scoring range shown to the model.
        traits: Accepted for interface compatibility; not used by this function.

    Returns:
        dict mapping each group name to its persona description string.

    Raises:
        json.JSONDecodeError: If the model reply is not valid JSON.
        KeyError: If the reply has no ``grouped_traits`` key.
    """
    llm = ChatOpenAI(model="gpt-4o")
    # The f-string is fully rendered here; the message is passed to the
    # template as a ready-made message, so no further variables are needed.
    auto_trait_prompt = ChatPromptTemplate.from_messages([
        HumanMessage(content=f"""
### Task
Given the following essay and prompt, suggest all possible evaluation traits that would be appropriate for assessing this essay.

Then, **group** the suggested traits such that:
- Traits that are closely related and should be evaluated together are grouped under a single agent.
- Traits that are independent are assigned to separate agents.

**Reason for Grouping:**
Grouping related traits together ensures consistency, avoids redundant evaluations, and maintains contextual coherence, leading to more efficient and accurate assessments.

### For Each Trait
For each suggested trait, describe the detailed evaluation criteria that the evaluator must consider. The evaluator should look for specific features or aspects of the writing based on the provided rubric.

You will also be provided with the score range (e.g., 1-6). When scoring, evaluators must match the writing characteristics against the provided rubric levels.

### Score Range
{min_score}-{max_score}

### Output Format (in JSON)
{{
  "reasoning": "Provide a reasoning why these traits were suggested, considering the essay's purpose, genre, characteristics, etc.",
  "traits": ["Trait1", "Trait2", "Trait3", ...],
  "trait_guidelines": {{
    "Trait1": ["Evaluation point 1", "Evaluation point 2", ...],
    "Trait2": ["Evaluation point 1", ...],
    ...
  }},
  "grouped_traits": {{
    "GroupName1": ["Trait1", "Trait2", ...],
    "GroupName2": ["Trait3"],
    ...
  }}
}}

### Essay Prompt
{prompt}

### Essay
{essay}
""")
    ])
    response = llm.invoke(auto_trait_prompt.format_messages())
    # Chat models often wrap JSON in ```json fences; strip them before parsing.
    traits_json = _parse_json_reply(response.content)
    grouped_traits = traits_json["grouped_traits"]

    # Build one persona per trait group
    return {
        group_name: (
            f"You are an expert evaluator for the trait group: {group_name}. "
            f"Your job is to evaluate essays based on the following traits: {', '.join(trait_list)}."
        )
        for group_name, trait_list in grouped_traits.items()
    }

# 2. Per-trait-group scoring agent
class TraitScoringAgent:
    """Evaluator that judges an essay while acting as a given persona."""

    def __init__(self, persona_description):
        """Store the persona text and create the chat model client."""
        self.llm = ChatOpenAI(model="gpt-4o")
        self.persona_description = persona_description

    def score(self, essay, prompt):
        """Return the model's free-text evaluation of ``essay`` for ``prompt``.

        The persona description is sent as the system message and the essay
        together with its prompt as the human message.
        """
        system_message = SystemMessage(content=self.persona_description)
        user_message = HumanMessage(
            content=f"Essay Prompt: {prompt}\nEssay: {essay}\nPlease evaluate the essay according to your assigned traits."
        )
        template = ChatPromptTemplate.from_messages([system_message, user_message])
        reply = self.llm.invoke(template.format_messages())
        return reply.content

# 3. Holistic scorer agent
class HolisticScorer:
    """Combines the per-group evaluations into one overall judgement."""

    def __init__(self):
        """Create the chat model client used for aggregation."""
        self.llm = ChatOpenAI(model="gpt-4o")

    def aggregate_scores(self, trait_scores):
        """Return the model's free-text holistic score and reasoning.

        ``trait_scores`` is interpolated into the request as-is, i.e. via its
        string representation.
        """
        request_text = f"""
            Based on the following trait group scores:
            {trait_scores}
            Provide a holistic overall score (0-100) and a short reasoning.
            """
        messages = [
            SystemMessage(content="You are a holistic essay scorer."),
            HumanMessage(content=request_text),
        ]
        template = ChatPromptTemplate.from_messages(messages)
        reply = self.llm.invoke(template.format_messages())
        return reply.content

# End-to-end pipeline
class EssayEvaluationSystem:
    """Runs persona creation, per-group scoring and holistic aggregation."""

    def __init__(self, traits=None, max_score=6, min_score=1):
        """Store the evaluation settings.

        Args:
            traits: Optional traits forwarded to ``create_trait_personas``.
            max_score: Upper end of the trait score range forwarded to
                ``create_trait_personas``.
            min_score: Lower end of the trait score range forwarded to
                ``create_trait_personas``.
        """
        self.traits = traits
        self.max_score = max_score
        self.min_score = min_score

    def evaluate(self, essay, prompt):
        """Evaluate ``essay`` written for ``prompt``.

        Returns:
            A ``(trait_scores, holistic_result)`` tuple: a dict mapping each
            trait group name to that group's evaluation text, and the holistic
            scorer's reply.
        """
        # Step 1: Create trait personas. The score range is passed explicitly
        # and ``traits`` by keyword: the previous three-argument call bound
        # ``traits`` to ``max_score`` and omitted ``min_score`` (TypeError).
        trait_personas = create_trait_personas(
            essay, prompt, self.max_score, self.min_score, traits=self.traits
        )

        # Step 2: Each group agent scores
        trait_scores = {
            group_name: TraitScoringAgent(persona).score(essay, prompt)
            for group_name, persona in trait_personas.items()
        }

        # Step 3: Holistic scoring
        holistic_result = HolisticScorer().aggregate_scores(trait_scores)

        return trait_scores, holistic_result

In [None]:
import os
import json
from typing import List, Dict, Any, Optional
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.output_parsers import JsonOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# Configuration
class EssayEvaluationConfig:
    """Plain settings holder; every constructor argument is stored as an
    attribute of the same name.

    Attributes:
        n_agents: Number of evaluator personas.
        max_score / min_score: Bounds of the per-trait score range.
        max_holistic_score / min_holistic_score: Bounds of the final score range.
        feedback: Whether feedback is wanted in the final assessment.
        model_name: Name of the chat model.
    """

    def __init__(
        self,
        n_agents: int = 4,
        max_score: int = 10,
        min_score: int = 1,
        max_holistic_score: int = 100,
        min_holistic_score: int = 0,
        feedback: bool = True,
        model_name: str = "gpt-4o"
    ):
        # Model and panel size
        self.model_name, self.n_agents = model_name, n_agents
        # Per-trait score range
        self.min_score, self.max_score = min_score, max_score
        # Holistic score range
        self.min_holistic_score, self.max_holistic_score = (
            min_holistic_score,
            max_holistic_score,
        )
        self.feedback = feedback


class PersonaAgent:
    """Agent that generates a specialized evaluator persona based on essay and prompt"""
    
    def __init__(self, config: EssayEvaluationConfig):
        """Keep the shared config and create the chat model (temperature 0.7)."""
        self.config = config
        self.llm = ChatOpenAI(model=config.model_name, temperature=0.7)
        
    def generate_personas(self, essay: str, essay_prompt: str) -> List[Dict[str, str]]:
        """Ask the model for ``config.n_agents`` evaluator personas.

        Args:
            essay: Text of the essay to be evaluated.
            essay_prompt: The writing prompt the essay responds to.

        Returns:
            The parsed JSON reply; the model is instructed to produce an array
            of objects with keys "name", "background", "expertise", "focus".
        """
        persona_template = """
        You are an expert at creating specialized personas for evaluating essays.
        
        Given the following essay and its prompt, create {n_agents} distinct evaluator personas.
        
        These personas MUST include experts focused on:
        1. Essay structure and grammar/mechanics
        2. Content and subject matter
        3. Alignment with the original prompt requirements
        4. The purpose/style of the essay (e.g., persuasive, narrative, etc.)
        
        Essay Prompt:
        {essay_prompt}
        
        Essay:
        {essay}
        
        For each persona, provide:
        1. A name
        2. Professional background
        3. Specific area of expertise
        4. Evaluation focus
        
        Format your response as a JSON array of persona objects with keys: "name", "background", "expertise", "focus"
        """
        
        persona_chain = (
            ChatPromptTemplate.from_template(persona_template)
            | self.llm
            | JsonOutputParser()
        )
        
        # Feed the template variables directly. The previous mapping step used
        # RunnablePassthrough() per key, which forwards the *entire* input dict,
        # so {essay_prompt} and {essay} were each rendered as the dict's repr.
        return persona_chain.invoke({
            "essay_prompt": essay_prompt,
            "essay": essay,
            "n_agents": self.config.n_agents,
        })


class RubricAgent:
    """Agent that generates evaluation rubrics based on personas"""
    
    def __init__(self, config: EssayEvaluationConfig):
        """Keep the shared config and create the chat model (temperature 0.5)."""
        self.config = config
        self.llm = ChatOpenAI(model=config.model_name, temperature=0.5)
        
    def generate_rubrics(self, personas: List[Dict[str, str]], essay: str, essay_prompt: str) -> List[Dict[str, Any]]:
        """Create one rubric per persona, in the order the personas are given.

        Args:
            personas: Persona dicts; each must provide the keys "name",
                "background", "expertise" and "focus".
            essay: Text of the essay to be evaluated.
            essay_prompt: The writing prompt the essay responds to.

        Returns:
            List of parsed JSON replies, one per persona. The model is
            instructed to return an object with "persona" and "traits" keys.

        Raises:
            KeyError: If a persona lacks one of the required keys.
        """
        rubric_template = """
        You are {name}, {background} with expertise in {expertise}.
        
        Your task is to create a detailed evaluation rubric for assessing an essay. The rubric should focus on your specific area of expertise: {focus}.
        
        Essay Prompt:
        {essay_prompt}
        
        Essay to Evaluate:
        {essay}
        
        Create a rubric with 3-5 specific traits that evaluate aspects of the essay within your area of expertise.
        
        For each trait:
        - Provide a clear name
        - Give a detailed description of what this trait measures
        - Include specific criteria for different score levels within the range of {min_score} (lowest) to {max_score} (highest)
        
        Format your response as a JSON object with the following structure:
        {{
            "persona": {{
                "name": "Your persona name",
                "focus": "Your area of focus"
            }},
            "traits": [
                {{
                    "name": "Name of trait",
                    "description": "Description of trait",
                    "criteria": [
                        {{
                            "score": score_value,
                            "description": "What this score means"
                        }},
                        ...more score criteria...
                    ]
                }},
                ...more traits...
            ]
        }}
        """
        
        # The prompt and chain do not depend on the persona, so build them once.
        rubric_chain = (
            ChatPromptTemplate.from_template(rubric_template)
            | self.llm
            | JsonOutputParser()
        )
        
        rubrics = []
        for persona in personas:
            # Pass every template variable explicitly. The previous mapping
            # step used RunnablePassthrough() for "essay_prompt" and "essay",
            # which forwards the whole input dict instead of the single field.
            rubric = rubric_chain.invoke({
                "name": persona["name"],
                "background": persona["background"],
                "expertise": persona["expertise"],
                "focus": persona["focus"],
                "essay_prompt": essay_prompt,
                "essay": essay,
                "min_score": self.config.min_score,
                "max_score": self.config.max_score,
            })
            rubrics.append(rubric)
            
        return rubrics


class ScoringAgent:
    """Agent that scores essays based on persona-specific rubrics"""
    
    def __init__(self, config: EssayEvaluationConfig):
        """Keep the shared config and create the chat model (temperature 0.2)."""
        self.config = config
        self.llm = ChatOpenAI(model=config.model_name, temperature=0.2)
        
    def generate_scores(self, rubrics: List[Dict[str, Any]], essay: str, essay_prompt: str) -> List[Dict[str, Any]]:
        """Score the essay once per rubric, in the order the rubrics are given.

        Args:
            rubrics: Rubric dicts; each must provide ``["persona"]["name"]``,
                ``["persona"]["focus"]`` and a JSON-serializable ``["traits"]``.
            essay: Text of the essay to be evaluated.
            essay_prompt: The writing prompt the essay responds to.

        Returns:
            List of parsed JSON replies, one per rubric. The model is
            instructed to return an object with "persona" and "trait_scores".

        Raises:
            KeyError: If a rubric lacks one of the required keys.
        """
        scoring_template = """
        You are {persona_name}, focusing on evaluating essays from the perspective of {persona_focus}.
        
        Your task is to evaluate the following essay according to your specialized rubric.
        
        Essay Prompt:
        {essay_prompt}
        
        Essay to Evaluate:
        {essay}
        
        Your evaluation rubric has the following traits:
        {traits_json}
        
        For each trait in your rubric:
        1. First, provide detailed reasoning for your evaluation
        2. Then, assign a score within the range {min_score} to {max_score}
        
        Format your response as a JSON object with the following structure:
        {{
            "persona": {{
                "name": "Your persona name",
                "focus": "Your area of focus"
            }},
            "trait_scores": [
                {{
                    "trait_name": "Name of trait",
                    "rationale": "Detailed explanation of your reasoning",
                    "score": assigned_score
                }},
                ...more trait scores...
            ]
        }}
        """
        
        # The prompt and chain do not depend on the rubric, so build them once.
        scoring_chain = (
            ChatPromptTemplate.from_template(scoring_template)
            | self.llm
            | JsonOutputParser()
        )
        
        all_scores = []
        for rubric in rubrics:
            # Pass every template variable explicitly. The previous mapping
            # step used RunnablePassthrough() for "essay_prompt" and "essay",
            # which forwards the whole input dict instead of the single field.
            scores = scoring_chain.invoke({
                "persona_name": rubric["persona"]["name"],
                "persona_focus": rubric["persona"]["focus"],
                "essay_prompt": essay_prompt,
                "essay": essay,
                "traits_json": json.dumps(rubric["traits"]),
                "min_score": self.config.min_score,
                "max_score": self.config.max_score,
            })
            all_scores.append(scores)
            
        return all_scores


class MetaScoreAgent:
    """Agent that aggregates scores from multiple evaluators into a final assessment"""
    
    def __init__(self, config: EssayEvaluationConfig):
        """Keep the shared config and create the chat model (temperature 0.3)."""
        self.config = config
        self.llm = ChatOpenAI(model=config.model_name, temperature=0.3)
        
    def generate_meta_score(self, all_scores: List[Dict[str, Any]], essay: str, essay_prompt: str) -> Dict[str, Any]:
        """Synthesize the per-persona evaluations into one final assessment.

        Args:
            all_scores: JSON-serializable evaluations, one per persona.
            essay: Text of the evaluated essay.
            essay_prompt: The writing prompt the essay responds to.

        Returns:
            The parsed JSON reply. The model is instructed to return an object
            with "trait_summary", "holistic_score" and, when feedback is
            requested, "feedback".
        """
        meta_template = """
        You are a Meta Evaluator responsible for synthesizing multiple expert evaluations into a coherent final assessment.
        
        Your task is to review evaluations from {n_agents} expert personas and produce a comprehensive final score and assessment.
        
        Essay Prompt:
        {essay_prompt}
        
        Essay:
        {essay}
        
        Expert Evaluations:
        {evaluations_json}
        
        Please analyze these evaluations to:
        1. Identify unique evaluation traits across all personas
        2. Determine appropriate weight for each trait based on its importance
        3. Calculate a final holistic score within the range of {min_holistic_score} to {max_holistic_score}
        4. {feedback_instruction}
        
        Format your response as a JSON object with the following structure:
        {{
            "trait_summary": [
                {{
                    "trait": "Name of trait",
                    "focus": "Related focus area",
                    "score": normalized_score,
                    "weight": assigned_weight
                }},
                ...more traits...
            ],
            "holistic_score": final_score,
            "feedback": "Comprehensive feedback" // Only if feedback is required
        }}
        """
        
        # Step 4 of the instructions depends on whether feedback was requested.
        feedback_instruction = "Provide comprehensive feedback with strengths and areas for improvement" if self.config.feedback else "No feedback needed"
        
        meta_chain = (
            ChatPromptTemplate.from_template(meta_template)
            | self.llm
            | JsonOutputParser()
        )
        
        # Pass every template variable explicitly. The previous mapping step
        # used RunnablePassthrough() for "essay_prompt" and "essay", which
        # forwards the whole input dict instead of the single field.
        return meta_chain.invoke({
            "n_agents": self.config.n_agents,
            "essay_prompt": essay_prompt,
            "essay": essay,
            "evaluations_json": json.dumps(all_scores),
            "min_holistic_score": self.config.min_holistic_score,
            "max_holistic_score": self.config.max_holistic_score,
            "feedback_instruction": feedback_instruction,
        })


class EssayEvaluationSystem:
    """Facade that wires the four agents together and runs them in sequence."""

    def __init__(self, api_key: str = None, config: Optional[EssayEvaluationConfig] = None):
        """Set up the agents.

        A truthy ``api_key`` is exported as the OPENAI_API_KEY environment
        variable before any agent is created. When ``config`` is falsy, a
        default ``EssayEvaluationConfig`` is used.
        """
        if api_key:
            os.environ["OPENAI_API_KEY"] = api_key

        settings = config if config else EssayEvaluationConfig()
        self.config = settings
        self.persona_agent = PersonaAgent(settings)
        self.rubric_agent = RubricAgent(settings)
        self.scoring_agent = ScoringAgent(settings)
        self.meta_agent = MetaScoreAgent(settings)

    def evaluate_essay(self, essay: str, essay_prompt: str) -> Dict[str, Any]:
        """Run personas -> rubrics -> scores -> meta-assessment for one essay.

        Returns:
            Dict with the keys "personas", "rubrics", "detailed_scores" and
            "final_assessment", holding the output of each stage.
        """
        # Stage 1: specialized evaluator personas
        persona_list = self.persona_agent.generate_personas(essay, essay_prompt)

        # Stage 2: one rubric per persona
        rubric_list = self.rubric_agent.generate_rubrics(persona_list, essay, essay_prompt)

        # Stage 3: essay scored against each rubric
        score_list = self.scoring_agent.generate_scores(rubric_list, essay, essay_prompt)

        # Stage 4: final synthesized assessment
        summary = self.meta_agent.generate_meta_score(score_list, essay, essay_prompt)

        # Bundle every stage's output for the caller
        return {
            "personas": persona_list,
            "rubrics": rubric_list,
            "detailed_scores": score_list,
            "final_assessment": summary,
        }


In [None]:


# Example usage: evaluate one sample essay end to end and print the result.
# Running this performs real model calls through the agents created below.
if __name__ == "__main__":
    # Configuration: four personas, trait scores on a 1-5 scale, holistic
    # score on a 0-100 scale, with written feedback requested.
    config = EssayEvaluationConfig(
        n_agents=4,
        max_score=5,
        min_score=1,
        max_holistic_score=100,
        min_holistic_score=0,
        feedback=True,
        model_name="gpt-4o"
    )
    
    # Initialize system with OpenAI API key.
    # NOTE: the constructor writes a truthy api_key into the OPENAI_API_KEY
    # environment variable, so the placeholder below replaces any key that is
    # already set until it is swapped for a real one.
    system = EssayEvaluationSystem(
        api_key="your_openai_api_key_here",  # Replace with your API key
        config=config
    )
    
    # Sample essay and prompt
    essay_prompt = "Discuss the impact of artificial intelligence on modern education."
    
    essay = """
    Artificial Intelligence: Reshaping Education
    
    The integration of artificial intelligence in education represents one of the most significant technological shifts in modern pedagogy. As AI systems become more sophisticated, they are transforming how students learn and how educators teach.
    
    Personalized learning stands as perhaps the most promising application of AI in education. Traditional classroom models often struggle to address the diverse needs of many students simultaneously. AI-powered platforms can analyze individual student performance, identify knowledge gaps, and adapt content delivery to match each student's learning pace and style. This personalization helps struggling students receive the additional support they need while allowing advanced learners to progress at an accelerated rate.
    
    Assessment is another area where AI demonstrates valuable potential. Automated grading systems can evaluate objective assignments instantaneously, freeing educators from time-consuming tasks and providing students with immediate feedback. More sophisticated AI tools are beginning to assess complex work like essays, analyzing factors including structure, argumentation, and coherence. While not replacing human judgment, these systems offer preliminary evaluations that help teachers manage large class loads more effectively.
    
    Administrative efficiency also improves with AI implementation. Institutions are deploying AI to streamline enrollment processes, schedule classes, and manage resources. These administrative applications allow educational institutions to operate more efficiently, potentially redirecting resources toward improving educational quality.
    
    However, the integration of AI in education raises important concerns. The digital divide may widen as schools with greater resources adopt advanced AI tools while underfunded institutions fall further behind. Questions about data privacy emerge as AI systems collect extensive information about students' learning behaviors and personal characteristics. Additionally, overreliance on technology might diminish crucial human elements of education—particularly the mentorship, inspiration, and emotional intelligence that skilled teachers provide.
    
    In conclusion, AI holds transformative potential for education while presenting significant challenges. The most successful educational futures will likely be those that thoughtfully integrate AI capabilities with irreplaceable human guidance, creating balanced learning environments that leverage technological advantages while preserving the essential human connections that give education its deepest value.
    """
    
    # Evaluate essay: returns a dict with "personas", "rubrics",
    # "detailed_scores" and "final_assessment".
    evaluation_result = system.evaluate_essay(essay, essay_prompt)
    
    # Print formatted results as indented JSON
    print(json.dumps(evaluation_result, indent=2))