In [None]:
############################## gemini  ##################################################

In [None]:
!pip install evaluate
!pip uninstall pyarrow -y
!pip install pyarrow --upgrade
!conda install -c conda-forge pyarrow


Found existing installation: pyarrow 21.0.0
Uninstalling pyarrow-21.0.0:
  Successfully uninstalled pyarrow-21.0.0
Collecting pyarrow
  Using cached pyarrow-21.0.0-cp312-cp312-win_amd64.whl.metadata (3.4 kB)
Using cached pyarrow-21.0.0-cp312-cp312-win_amd64.whl (26.2 MB)
Installing collected packages: pyarrow
Successfully installed pyarrow-21.0.0


In [21]:
import google.generativeai as genai
import os
from bs4 import BeautifulSoup
import evaluate # For ROUGE and Perplexity evaluation
from typing import List, Dict, Tuple
from evaluate import load as evaluate_load # For loading metrics

# --- 1. CONFIGURATION ---
# ⚠️ REPLACE WITH YOUR ACTUAL API KEY ⚠️
API_KEY = "AIzaSyDSlwTLHc2FDJUZ5w5-S_ZltEFWBmvAUTQ"  # Replace with your actual key
try:
    genai.configure(api_key=API_KEY)
except Exception as e:
    # A simple exit is better than proceeding with a bad key
    print(f"Error configuring API: {e}. Please check your API key.")
    # You might need to exit the script here in a real application

MODEL_NAME = "gemini-2.5-flash"
model = genai.GenerativeModel(MODEL_NAME)

OUTPUT_HTML_FILE = "combined_output.html"
OUTPUT_TEXT_FILE = "combined_ticket_data.txt"

# --- Dummy Data Setup (for a runnable example) ---
html_doc = """
<html><body>
<p>Ticket 1: My laptop's screen is flickering after the new software update. It is critical!</p>
<p>Ticket 2: I need access to the Marketing share drive please. This is low priority.</p>
<p>Ticket 3: The VPN connection keeps dropping every 10 minutes. I can't work.</p>
</body></html>
"""
combined_soup = BeautifulSoup(html_doc, 'html.parser')

# --- 2. HTML COMBINATION & TEXT EXTRACTION ---

print("--- Step 1: Combining HTML Files and Extracting Text ---")

# Save the combined HTML (from your original request)
with open(OUTPUT_HTML_FILE, "w", encoding="utf-8") as f:
    f.write(combined_soup.prettify())

# Extract all relevant text from the combined document
all_ticket_text = combined_soup.get_text(separator="\n", strip=True)

# Save the extracted text to a file (optional)
with open(OUTPUT_TEXT_FILE, "w", encoding="utf-8") as f:
    f.write(all_ticket_text)
print(f"Extracted text data saved to: {OUTPUT_TEXT_FILE}")

# --- 3. PROMPT GENERATION AND EXECUTION ---

print("\n--- Step 2: Generating and Executing Advanced Prompts ---")

# The specific ticket we will classify for the Few-Shot and COT examples
TICKET_TO_CLASSIFY = "The VPN connection keeps dropping every 10 minutes. I can't work."

# --------------------------------------------------------------------------
# GOLD STANDARD REFERENCE (Ground Truth) for ROUGE Evaluation
# --------------------------------------------------------------------------
REFERENCE_ANSWER: List[str] = ["Network"]

# --------------------------------------------------------------------------
# PROMPT TYPE A: ZERO-SHOT PROMPTING 
# --------------------------------------------------------------------------
zero_shot_prompt = f"""
Classify the following ticket into one of these categories: [Hardware, Software, Access Request, Network].
TICKET: "{TICKET_TO_CLASSIFY}"
Classification:
"""

# --------------------------------------------------------------------------
# PROMPT TYPE B: FEW-SHOT PROMPTING 
# --------------------------------------------------------------------------
few_shot_prompt = f"""
Here are some examples of ticket classification:

Example 1:
TICKET: "My monitor has a thin vertical green line on the right side."
Classification: Hardware

Example 2:
TICKET: "Please add me to the 'External Partners' distribution list."
Classification: Access Request

Now, classify this new ticket:
TICKET: "{TICKET_TO_CLASSIFY}"
Classification:
"""

# --------------------------------------------------------------------------
# PROMPT TYPE C: CHAIN-OF-THOUGHT (COT) PROMPTING 
# --------------------------------------------------------------------------
cot_prompt = f"""
**Instructions:** First, analyze the TICKET and describe the core issue and its impact. Second, determine the correct classification from the list [Hardware, Software, Access Request, Network]. Third, output ONLY the final Classification.

TICKET: "{TICKET_TO_CLASSIFY}"

**Reasoning Process:**
"""

# --- Initialize Metrics ---
rouge = evaluate_load("rouge")
# Load the Perplexity metric using 'gpt2' as the external evaluation model
# We set keep_in_memory=True to potentially avoid re-loading the model in a loop,
# though the warnings suggest the issue is primarily environmental.
perplexity_metric = evaluate_load("perplexity", module_type="metric", keep_in_memory=True) 

# --- Define Evaluation Functions ---
def calculate_rouge_l(candidate_text: str, reference_texts: List[str]) -> float:
    """Calculates the ROUGE-L F1 score."""
    try:
        # Check if candidate_text is effectively empty after stripping
        if not candidate_text.strip():
             return 0.0
             
        results = rouge.compute(
            predictions=[candidate_text],
            references=reference_texts,
            use_stemmer=True
        )
        return results["rougeL"].mid.fmeasure
    except Exception as e:
        print(f"Error during ROUGE calculation: {e}")
        return 0.0

def calculate_perplexity(candidate_text: str) -> float:
    """Calculates the Perplexity score using a pre-trained model (gpt2)."""
    # Only calculate if the text is not empty
    if not candidate_text.strip():
        return float('inf')
        
    try:
        # PPL expects a list of strings
        # Setting batch_size=1 helps with simple, single-sequence inputs
        results = perplexity_metric.compute(
            model_id='gpt2', 
            predictions=[candidate_text],
            batch_size=1
        )
        # The result is a list of PPL values; we take the first one.
        return results['perplexities'][0]
    except Exception as e:
        # This may occur if the text is empty or the model fails to load
        print(f"Error during Perplexity calculation (returning inf): {e}")
        return float('inf') # Return infinity for a failed calculation

# --- Execute Prompts and Evaluate ---
prompts_list: List[Tuple[str, str]] = [
    ("A. Zero-Shot", zero_shot_prompt),
    ("B. Few-Shot", few_shot_prompt),
    ("C. Chain-of-Thought (COT)", cot_prompt)
]

for name, prompt in prompts_list:
    print(f"\n{'='*70}")
    print(f"EXECUTING PROMPT TYPE: {name}")
    print(f"{'-'*70}")
    
    candidate_response = ""
    clean_response = ""
    try:
        # 1. Generate Content
        response = model.generate_content(prompt)
        candidate_response = response.text.strip()
        
        print("PROMPT SENT (Snippet):")
        print(prompt.strip().replace('\n', ' ')[:100] + "...") 
        print("\nGEMINI RESPONSE (Full):")
        print(candidate_response)
        
        # 2. Clean Response for ROUGE (Classification Only)
        # This logic is designed to extract the final category string
        if "Chain-of-Thought" in name:
            # For COT, search for the last non-empty line as the final classification
            lines = [line.strip() for line in candidate_response.split('\n') if line.strip()]
            clean_response = lines[-1].replace("Classification:", "").replace("Classification", "").strip() if lines else candidate_response
        else:
            # For Zero/Few-Shot, clean the whole response
            clean_response = candidate_response.replace("Classification:", "").strip()
        
        # Strip potential markdown bolding characters
        clean_response = clean_response.strip('*')

        # 3. EVALUATION
        rouge_l_f1 = calculate_rouge_l(clean_response, REFERENCE_ANSWER)
        # Use the FULL response for PPL to evaluate overall fluency/coherence
        perplexity_score = calculate_perplexity(candidate_response) 
        
        # 4. Print Results
        print(f"\n{'*'*70}")
        print(f"EVALUATION RESULTS for {name}:")
        print(f"  -> Reference Answer (Classification): {REFERENCE_ANSWER[0]}")
        print(f"  -> Model's Cleaned Classification:    {clean_response}")
        print(f"  -> ROUGE-L F1 Score:                {rouge_l_f1:.4f} (Measures Content Overlap, Max 1.0000)")
        print(f"  -> Perplexity (PPL) Score:          {perplexity_score:.2f} (Measures Fluency/Confidence, Lower is Better)")
        print(f"{'*'*70}")

    except Exception as e:
        print(f"An error occurred (Error Type: {type(e).__name__}): {e}")
        print("Skipping evaluation for this prompt.")

--- Step 1: Combining HTML Files and Extracting Text ---
Extracted text data saved to: combined_ticket_data.txt

--- Step 2: Generating and Executing Advanced Prompts ---

EXECUTING PROMPT TYPE: A. Zero-Shot
----------------------------------------------------------------------
PROMPT SENT (Snippet):
Classify the following ticket into one of these categories: [Hardware, Software, Access Request, Net...

GEMINI RESPONSE (Full):
Classification: **Network**
Error during ROUGE calculation: 'numpy.float64' object has no attribute 'mid'
Error during Perplexity calculation (returning inf): Asking to pad but the tokenizer does not have a padding token. Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`.

**********************************************************************
EVALUATION RESULTS for A. Zero-Shot:
  -> Reference Answer (Classification): Network
  -> Model's

In [None]:



##################################OpenAIcode#################################################

In [23]:
import os
from openai import OpenAI # Import the OpenAI library
from bs4 import BeautifulSoup
import evaluate # For ROUGE and Perplexity evaluation
from typing import List, Dict, Tuple
from evaluate import load as evaluate_load

# --- 1. CONFIGURATION ---
# IMPORTANT: It is best practice to load the API key from an environment variable.
# Replace the placeholder below with your actual key if not using an env var, or
# set the OPENAI_API_KEY environment variable.
# For this script to run successfully, you MUST replace the placeholder.
API_KEY = "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"
if API_KEY == "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":
    # Attempt to load from environment variable as a fallback
    API_KEY = os.environ.get("OPENAI_API_KEY", API_KEY) 

try:
    # Initialize the OpenAI Client
    client = OpenAI(api_key=API_KEY)
except Exception as e:
    print(f"Error initializing OpenAI client: {e}")
    # Exit or raise error if client initialization fails in a real application

# Using a comparable OpenAI model for the task
MODEL_NAME = "gpt-3.5-turbo" 

OUTPUT_HTML_FILE = "combined_output.html"
OUTPUT_TEXT_FILE = "combined_ticket_data.txt"

# --- Dummy Data Setup (for a runnable example) ---
html_doc = """
<html><body>
<p>Ticket 1: My laptop's screen is flickering after the new software update. It is critical!</p>
<p>Ticket 2: I need access to the Marketing share drive please. This is low priority.</p>
<p>Ticket 3: The VPN connection keeps dropping every 10 minutes. I can't work.</p>
</body></html>
"""
combined_soup = BeautifulSoup(html_doc, 'html.parser')

# --- 2. HTML COMBINATION & TEXT EXTRACTION (Same as previous script) ---

print("--- Step 1: Combining HTML Files and Extracting Text ---")

with open(OUTPUT_HTML_FILE, "w", encoding="utf-8") as f:
    f.write(combined_soup.prettify())

all_ticket_text = combined_soup.get_text(separator="\n", strip=True)

with open(OUTPUT_TEXT_FILE, "w", encoding="utf-8") as f:
    f.write(all_ticket_text)
print(f"Extracted text data saved to: {OUTPUT_TEXT_FILE}")

# --- 3. PROMPT GENERATION AND EXECUTION ---

print("\n--- Step 2: Generating and Executing Advanced Prompts ---")

TICKET_TO_CLASSIFY = "The VPN connection keeps dropping every 10 minutes. I can't work."

# --------------------------------------------------------------------------
# GOLD STANDARD REFERENCE (Ground Truth) for ROUGE Evaluation
# --------------------------------------------------------------------------
REFERENCE_ANSWER: List[str] = ["Network"]

# --------------------------------------------------------------------------
# PROMPT TYPE A: ZERO-SHOT PROMPTING 🎯
# --------------------------------------------------------------------------
zero_shot_prompt = f"""
Classify the following ticket into one of these categories: [Hardware, Software, Access Request, Network].
TICKET: "{TICKET_TO_CLASSIFY}"
Classification:
"""

# --------------------------------------------------------------------------
# PROMPT TYPE B: FEW-SHOT PROMPTING 📝
# --------------------------------------------------------------------------
few_shot_prompt = f"""
Here are some examples of ticket classification:

Example 1:
TICKET: "My monitor has a thin vertical green line on the right side."
Classification: Hardware

Example 2:
TICKET: "Please add me to the 'External Partners' distribution list."
Classification: Access Request

Now, classify this new ticket:
TICKET: "{TICKET_TO_CLASSIFY}"
Classification:
"""

# --------------------------------------------------------------------------
# PROMPT TYPE C: CHAIN-OF-THOUGHT (COT) PROMPTING 🤔
# --------------------------------------------------------------------------
cot_prompt = f"""
**Instructions:** First, analyze the TICKET and describe the core issue and its impact. Second, determine the correct classification from the list [Hardware, Software, Access Request, Network]. Third, output ONLY the final Classification.

TICKET: "{TICKET_TO_CLASSIFY}"

**Reasoning Process:**
"""

# --- Initialize Metrics (ROUGE and Perplexity) ---
# We still use the 'evaluate' library for metrics, as it is language model agnostic
rouge = evaluate_load("rouge")
perplexity_metric = evaluate_load("perplexity", module_type="metric", keep_in_memory=True) 

# --- Define Evaluation Functions (Identical to previous script) ---
def calculate_rouge_l(candidate_text: str, reference_texts: List[str]) -> float:
    """Calculates the ROUGE-L F1 score."""
    try:
        if not candidate_text.strip():
             return 0.0
             
        results = rouge.compute(
            predictions=[candidate_text],
            references=reference_texts,
            use_stemmer=True
        )
        # ROUGE-L F1 score is typically found under 'rougeL' mid fmeasure
        return results["rougeL"].mid.fmeasure
    except Exception as e:
        print(f"Error during ROUGE calculation: {e}")
        return 0.0

def calculate_perplexity(candidate_text: str) -> float:
    """Calculates the Perplexity score using a pre-trained model (gpt2)."""
    if not candidate_text.strip():
        return float('inf')
        
    try:
        # PPL calculation uses GPT-2 for evaluation, not the main classification model (GPT-3.5-turbo)
        results = perplexity_metric.compute(
            model_id='gpt2', 
            predictions=[candidate_text],
            batch_size=1
        )
        return results['perplexities'][0]
    except Exception as e:
        # The warnings seen previously might lead here if the model fails to load
        print(f"Error during Perplexity calculation (returning inf): {e}")
        return float('inf')

# --- Execute Prompts and Evaluate ---
prompts_list: List[Tuple[str, str]] = [
    ("A. Zero-Shot", zero_shot_prompt),
    ("B. Few-Shot", few_shot_prompt),
    ("C. Chain-of-Thought (COT)", cot_prompt)
]

for name, prompt in prompts_list:
    print(f"\n{'='*70}")
    print(f"EXECUTING PROMPT TYPE: {name}")
    print(f"{'-'*70}")
    
    candidate_response = ""
    clean_response = ""
    
    if API_KEY == "YOUR_OPENAI_API_KEY_HERE":
        print("!! WARNING: OpenAI API Key not configured. Skipping API call. !!")
        print("Please set your OPENAI_API_KEY environment variable or replace the placeholder.")
        continue
        
    try:
        # 1. Generate Content using OpenAI API
        # Using a low temperature for deterministic classification tasks
        response = client.chat.completions.create(
            model=MODEL_NAME,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.0
        )
        candidate_response = response.choices[0].message.content.strip()
        
        print("PROMPT SENT (Snippet):")
        print(prompt.strip().replace('\n', ' ')[:100] + "...") 
        print("\nOPENAI RESPONSE (Full):")
        print(candidate_response)
        
        # 2. Clean Response for ROUGE (Classification Only)
        if "Chain-of-Thought" in name:
            # For COT, look for the final classification on the last non-empty line
            lines = [line.strip() for line in candidate_response.split('\n') if line.strip()]
            clean_response = lines[-1].replace("Classification:", "").replace("Classification", "").strip() if lines else candidate_response
        else:
            # For Zero/Few-Shot, clean the whole response
            clean_response = candidate_response.replace("Classification:", "").strip()
        
        # Strip potential markdown bolding characters
        clean_response = clean_response.strip('*')

        # 3. EVALUATION
        rouge_l_f1 = calculate_rouge_l(clean_response, REFERENCE_ANSWER)
        # Use the FULL response for PPL to evaluate overall fluency/coherence
        perplexity_score = calculate_perplexity(candidate_response) 
        
        # 4. Print Results
        print(f"\n{'*'*70}")
        print(f"EVALUATION RESULTS for {name} (Model: {MODEL_NAME}):")
        print(f"  -> Reference Answer (Classification): {REFERENCE_ANSWER[0]}")
        print(f"  -> Model's Cleaned Classification:    {clean_response}")
        print(f"  -> ROUGE-L F1 Score:                {rouge_l_f1:.4f} (Measures Content Overlap, Max 1.0000)")
        print(f"  -> Perplexity (PPL) Score:          {perplexity_score:.2f} (Measures Fluency/Confidence, Lower is Better)")
        print(f"{'*'*70}")

    except Exception as e:
        # Catch API errors (e.g., authentication, rate limits)
        print(f"An API or execution error occurred (Error Type: {type(e).__name__}): {e}")
        print("Skipping evaluation for this prompt.")
        


--- Step 1: Combining HTML Files and Extracting Text ---
Extracted text data saved to: combined_ticket_data.txt

--- Step 2: Generating and Executing Advanced Prompts ---

EXECUTING PROMPT TYPE: A. Zero-Shot
----------------------------------------------------------------------
An API or execution error occurred (Error Type: RateLimitError): Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.', 'type': 'insufficient_quota', 'param': None, 'code': 'insufficient_quota'}}
Skipping evaluation for this prompt.

EXECUTING PROMPT TYPE: B. Few-Shot
----------------------------------------------------------------------
An API or execution error occurred (Error Type: RateLimitError): Error code: 429 - {'error': {'message': 'You exceeded your current quota, please check your plan and billing details. For more informati