In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import json
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
# import scispacy
import re
import tqdm
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.util import ngrams
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from bertopic import BERTopic
from openai import OpenAI
import hdbscan
import os
from dotenv import load_dotenv


  from .autonotebook import tqdm as notebook_tqdm


In [13]:
# Load variables from a local .env file into the process environment so the
# API key never has to be written into the notebook itself.
load_dotenv()

# OpenAI client shared by the cells further down; the key comes from the environment.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

### Summarization using LLMs

In [6]:
# Load the messages dataframe (path is relative to the working directory).
messages = pd.read_csv(filepath_or_buffer="messages.csv")


In [7]:
# Take the text of the row at position 1 as a sample message for prompt testing,
# then echo it so the raw input is visible next to the model output.
msg = messages["content"].iloc[1]
msg

"1. What is your job title/department? Animal Care Specialist at Riverside County Animal Services.\n2. Who is the department supervisor? Robert Brown\n3. Brief description of accident: As I was exiting the dog's kennel he got excited and and started jumping up and biting at me.\n4. Size of the dog and location on body that you were bitten: Large sized dog and under my right upper arm.\n5. Prior injury to the affected body part? No\n6. Any medical conditions that require maintenance medication? No\n7. Have you missed any time from work as a result of this accident? Just 2 hours on the date of the accident (6/3/24) because I left earlier to see a doctor.\n\nThanks!\n-Sarah Cooper"

In [8]:
# System instruction: short subject plus a one-sentence summary, returned as JSON.
SYSTEM = (
    "You extract a short SUBJECT (≤5 words) and a 1 sentence SUMMARY "
    "for insurance claim messages. Return JSON."
)
# User turn: the sample message fenced in triple quotes, followed by the expected JSON shape.
user = (
    'Message:\n"""\n'
    f"{msg}"
    '\n"""\nRespond as: {"subject":"...","summary":"..."}'
)


In [14]:
# Ask the model for the subject/summary of the sample message.
# JSON mode (response_format) makes the API return a syntactically valid JSON
# object, so the json.loads below no longer depends on the model choosing to
# omit Markdown fences or extra prose. JSON mode requires the word "JSON" to
# appear in the prompt; the SYSTEM instruction's "Return JSON" satisfies that.
# NOTE: if you swap in a model without JSON-mode support, drop response_format.
resp = client.chat.completions.create(
    model="gpt-4o-mini",  # or your available model
    messages=[
        {"role": "system", "content": SYSTEM},
        {"role": "user", "content": user},
    ],
    response_format={"type": "json_object"},
    # avoid temperature if your model complains; otherwise use 0–0.2 for stability
    max_tokens=120,
)
# Parse the reply text into a dict with "subject" and "summary" keys.
data = json.loads(resp.choices[0].message.content)

In [15]:
# Display the parsed reply (the dict produced by json.loads in the previous cell).
data

{'subject': 'Dog Bite Incident',
 'summary': 'Sarah Cooper, an Animal Care Specialist, was bitten by a large dog under her right upper arm while exiting the kennel.'}

In [25]:
def _parse_subject_summary(response_text):
    """Turn raw model output into a ``{"subject": ..., "summary": ...}`` dict."""
    text = (response_text or "").strip()

    # Models often wrap JSON in Markdown code fences or add prose around it,
    # so parse the outermost {...} span instead of the whole reply.
    candidate = re.search(r"\{.*\}", text, flags=re.DOTALL)
    if candidate:
        try:
            parsed = json.loads(candidate.group(0))
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return {
                "subject": parsed.get("subject", ""),
                "summary": parsed.get("summary", ""),
            }

    # Plain-text fallback: first non-empty line is the subject, the remaining
    # lines form the summary. Optional "Subject:" / "Summary:" labels are
    # removed case-insensitively.
    lines = [line.strip() for line in text.splitlines() if line.strip()]
    if not lines:
        return {"subject": "", "summary": ""}
    subject = re.sub(r"^subject\s*:\s*", "", lines[0], flags=re.IGNORECASE)
    summary = re.sub(
        r"^summary\s*:\s*", "", " ".join(lines[1:]), flags=re.IGNORECASE
    )
    return {"subject": subject, "summary": summary}


def summarize_and_extract_subject_and_summary(conversation, model="gpt-4o-mini"):
    """
    Summarize a conversation and extract a short subject line.

    Sends the conversation to the chat-completions endpoint of the
    module-level ``client`` and parses the reply. The reply is expected to be
    JSON (optionally wrapped in Markdown fences); if no JSON object can be
    parsed, the first non-empty line is used as the subject and the remaining
    lines as the summary.

    Args:
        conversation (str): Full text of the conversation or message thread.
            Non-string values (e.g. NaN from a dataframe) and blank strings
            are not sent to the API; an empty result is returned instead.
        model (str): OpenAI model to use.

    Returns:
        dict: { "subject": str, "summary": str }
    """
    # Skip the API call when there is nothing to summarize.
    if not isinstance(conversation, str) or not conversation.strip():
        return {"subject": "", "summary": ""}

    prompt = f"""
    You are given a conversation between a claimant and adjuster.
    1. Provide a short subject line (max 8 words) summarizing the main topic. 
    2. Provide a concise summary (1 sentence) covering the key points.

    Provide in JSON format:
    {{
        "subject": "<short subject line>",
        "summary": "<concise summary>"
    }}

    Conversation:
    {conversation}
    """

    completion = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3
    )

    # message.content can be None (e.g. a refusal); the parser treats that as empty.
    return _parse_subject_summary(completion.choices[0].message.content)

In [26]:
# Smoke test: run the helper on a short, hand-written claimant/adjuster exchange.
conversation_text = (
    "\n"
    "Claimant: I sent the reimbursement form last week. Have you received it?\n"
    "Adjuster: Yes, but it’s missing the mileage section.\n"
    "Claimant: Okay, I’ll resend with the missing info.\n"
)
result = summarize_and_extract_subject_and_summary(conversation=conversation_text)
print(result)

{'subject': 'Reimbursement Form Missing Information', 'summary': 'The claimant needs to resend the reimbursement form due to a missing mileage section.'}


In [None]:
# Summarize the first 50 messages.
# Each call returns a {"subject", "summary"} dict, so `apply` yields a Series of
# dicts. Unpacking that Series directly into two columns raises
# "ValueError: too many values to unpack", so expand the dicts into a frame
# first and assign each column explicitly. The original row index is kept so
# values stay aligned with their messages.
messages_subset = messages.head(50).copy()
subset_results = messages_subset['content'].apply(
    lambda x: summarize_and_extract_subject_and_summary(x, model="gpt-4o-mini")
)
subset_results_df = pd.DataFrame(
    subset_results.tolist(),
    index=messages_subset.index,
    columns=['subject', 'summary'],  # also keeps both columns present for an empty subset
)
messages_subset['subject'] = subset_results_df['subject']
messages_subset['summary'] = subset_results_df['summary']
