# Groq Assignment — Conversation Management & JSON Extraction
This notebook implements **Task 1 (conversation + summarization)** and **Task 2 (JSON schema extraction via function-calling)** as required.
Run cells in order. If you want real model calls, paste your GROQ API key when prompted in the Setup cell.

Setup: install & imports + API key prompt

In [1]:
# Install required packages
!pip install --quiet requests jsonschema

# Imports
import os, json, re, getpass, requests, textwrap
from datetime import datetime
from jsonschema import validate, ValidationError

#Helper: set Groq API key safely
def set_groq_key_interactive():
    """
    Interactive prompt in Colab to set GROQ_API_KEY into os.environ for the current session.
    If you press Enter with no input, notebook runs in MOCK mode (no network calls).
    """
    existing = os.getenv("GROQ_API_KEY")
    if existing:
        print("GROQ_API_KEY already set in environment (session).")
        return True
    key = getpass.getpass("Paste your GROQ API key here (press Enter to run in MOCK mode): ").strip()
    if key:
        os.environ["GROQ_API_KEY"] = key
        print("GROQ_API_KEY set for this session.")
        return True
    print("No key provided. Running in MOCK mode.")
    return False

def load_key_from_file(path="/content/groq_key.txt"):
    """
    Optional: save your key into /content/groq_key.txt (local Colab FS) and load it.
    Make sure you add that file to .gitignore before pushing anywhere.
    """
    if os.path.exists(path):
        with open(path, "r") as f:
            k = f.read().strip()
        if k:
            os.environ["GROQ_API_KEY"] = k
            print(f"Loaded GROQ_API_KEY from {path}")
            return True
    return False

# Call interactive setter
set_groq_key_interactive()

# we can use MOCK mode if no key
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
USE_MOCK = not bool(GROQ_API_KEY)
print("USE_MOCK =", USE_MOCK)


Paste your GROQ API key here (press Enter to run in MOCK mode): ··········
GROQ_API_KEY set for this session.
USE_MOCK = False


In [2]:
# Groq OpenAI-compatible endpoint & model
GROQ_CHAT_COMPLETIONS = "https://api.groq.com/openai/v1/chat/completions"
DEFAULT_MODEL = "openai/gpt-oss-20b"  # We can use different different models which are vaialable

HEADERS = {"Authorization": f"Bearer {GROQ_API_KEY}", "Content-Type": "application/json"} if GROQ_API_KEY else {}

ConversationStore class + summarization (supports mock mode)

In [3]:
class ConversationStore:
    """
    Stores conversation messages and supports:
    - truncation by turns / chars / words
    - periodic summarization after every k-th add_message call
    - mock summarizer when GROQ key not provided (safe demo)
    """
    def __init__(self):
        self.messages = []   # list of dicts: {"role","content","meta","ts"}
        self.archive = []    # append-only archive if needed
        self.counter = 0     # number of messages added (used for k-th summarization)

    def add_message(self, role, content, meta=None):
        meta = meta or {}
        msg = {
            "role": role,
            "content": content,
            "meta": meta,
            "ts": datetime.utcnow().isoformat()
        }
        self.messages.append(msg)
        self.archive.append(msg.copy())
        self.counter += 1
        return msg

    def pretty_print(self):
        print(f"\n--- Conversation ({len(self.messages)} messages) ---")
        for i, m in enumerate(self.messages, 1):
            mark = " [SUMMARY]" if m.get("meta", {}).get("is_summary") else ""
            print(f"{i:02d}. {m['role'].upper():9}{mark}: {m['content']}")
        print("-----------------------------------------------\n")

    # Truncation helpers
    def get_history_by_turns(self, n):
        return self.messages[-n:] if n > 0 else []

    def get_history_by_chars(self, max_chars):
        out = []
        total = 0
        for m in reversed(self.messages):
            l = len(m["content"])
            if total + l > max_chars and out:
                break
            out.insert(0, m)
            total += l
        return out

    def get_history_by_words(self, max_words):
        out = []
        total = 0
        for m in reversed(self.messages):
            wc = len(m["content"].split())
            if total + wc > max_words and out:
                break
            out.insert(0, m)
            total += wc
        return out

    # Summarization using Groq
    def summarize_via_groq(self, msgs, summary_words=100, max_tokens=256):
        """
        If GROQ key present, call Groq (OpenAI-compatible) chat completion to get summary.
        Otherwise, return a simple mock summary (first summary_words words).
        """
        full_text = "\n".join([f"{m['role'].upper()}: {m['content']}" for m in msgs])
        if USE_MOCK:
            tokens = full_text.split()
            s = " ".join(tokens[:summary_words])
            return (s + ("..." if len(tokens) > summary_words else "")).strip()

        prompt = (
            f"Summarize the conversation below into concise bullets: key facts, decisions, and action items.\n"
            f"Keep it under {summary_words} words.\n\nConversation:\n{full_text}"
        )
        body = {
            "model": DEFAULT_MODEL,
            "messages": [
                {"role": "system", "content": "You are a concise summarizer."},
                {"role": "user", "content": prompt}
            ],
            "max_tokens": max_tokens,
            "temperature": 0.0
        }
        resp = requests.post(GROQ_CHAT_COMPLETIONS, headers=HEADERS, json=body, timeout=30)
        resp.raise_for_status()
        data = resp.json()
        # robust extraction of assistant text
        try:
            return data["choices"][0]["message"]["content"].strip()
        except Exception:
            return data.get("choices", [{}])[0].get("text", "").strip()

    def maybe_summarize(self, k=3, summary_words=100, keep_tail=3):
        """
        If counter % k == 0 => summarizes the *earlier* portion of history (everything except last keep_tail)
        and replaces it with a single system summary message + the last keep_tail messages.
        Returns the summary string if created, else None.
        """
        if k <= 0 or (self.counter % k != 0):
            return None
        if len(self.messages) <= keep_tail:
            return None

        to_summarize = self.messages[:-keep_tail]
        summary_text = self.summarize_via_groq(to_summarize, summary_words=summary_words)
        summary_msg = {
            "role": "system",
            "content": "[SUMMARY] " + summary_text,
            "meta": {"is_summary": True},
            "ts": datetime.utcnow().isoformat()
        }
        tail = self.messages[-keep_tail:]
        self.messages = [summary_msg] + tail
        return summary_text


## Task 1 demo: feed sample conversations, show truncation & k-th summarization

In [4]:
# Demo for Task 1
conv = ConversationStore()

sample_msgs = [
    ("user", "Hi, I'm Vijay. I want to build a conversation summarizer."),
    ("assistant", "Great — what language and runtime are you using?"),
    ("user", "Python on Colab for prototyping, then a VPS later."),
    ("assistant", "We can summarize periodically and extract structured fields from chats."),
    ("user", "Please ensure we preserve names, emails and action items."),
    ("assistant", "Sure — we'll keep entities intact and keep short bullets."),
    ("user", "I want summarization to run every 3rd message.")
]

k = 3   # summarization frequency
for role, txt in sample_msgs:
    conv.add_message(role, txt)
    print(f"Added: {role} -> {txt}")
    summary = conv.maybe_summarize(k=k, summary_words=60, keep_tail=2)
    if summary:
        print("\n>>> SUMMARY generated and older history replaced:\n", summary, "\n")
    conv.pretty_print()

# Demonstrate truncation by turns, chars, and words
print("Last 4 turns (truncation by turns):")
for m in conv.get_history_by_turns(4):
    print(m["role"].upper(), ":", m["content"])

print("\nHistory truncated to max 200 chars (from the end):")
for m in conv.get_history_by_chars(200):
    print(m["role"].upper(), ":", m["content"])

print("\nHistory truncated to max 50 words:")
for m in conv.get_history_by_words(50):
    print(m["role"].upper(), ":", m["content"])


  "ts": datetime.utcnow().isoformat()


Added: user -> Hi, I'm Vijay. I want to build a conversation summarizer.

--- Conversation (1 messages) ---
01. USER     : Hi, I'm Vijay. I want to build a conversation summarizer.
-----------------------------------------------

Added: assistant -> Great — what language and runtime are you using?

--- Conversation (2 messages) ---
01. USER     : Hi, I'm Vijay. I want to build a conversation summarizer.
02. ASSISTANT: Great — what language and runtime are you using?
-----------------------------------------------

Added: user -> Python on Colab for prototyping, then a VPS later.


  "ts": datetime.utcnow().isoformat()



>>> SUMMARY generated and older history replaced:
 - Vijay wants to build a conversation summarizer. 


--- Conversation (3 messages) ---
01. SYSTEM    [SUMMARY]: [SUMMARY] - Vijay wants to build a conversation summarizer.
02. ASSISTANT: Great — what language and runtime are you using?
03. USER     : Python on Colab for prototyping, then a VPS later.
-----------------------------------------------

Added: assistant -> We can summarize periodically and extract structured fields from chats.

--- Conversation (4 messages) ---
01. SYSTEM    [SUMMARY]: [SUMMARY] - Vijay wants to build a conversation summarizer.
02. ASSISTANT: Great — what language and runtime are you using?
03. USER     : Python on Colab for prototyping, then a VPS later.
04. ASSISTANT: We can summarize periodically and extract structured fields from chats.
-----------------------------------------------

Added: user -> Please ensure we preserve names, emails and action items.

--- Conversation (5 messages) ---
01. SYSTEM 

## Task 2: function schema + extraction method (supports mock fallback)

In [6]:
# JSON schema for extraction (using 5 fields)
function_schema = {
    "name": "extract_info",
    "description": "Extract name, email, phone, location and age from the message.",
    "parameters": {
        "type": "object",
        "properties": {
            "name": {"type": "string"},
            "email": {"type": "string", "format": "email"},
            "phone": {"type": "string"},
            "location": {"type": "string"},
            "age": {"type": "integer", "minimum": 0, "maximum": 120}
        },
        "required": []
    }
}

def extract_info_via_groq(text, function_schema, max_tokens=200):
    """
    Uses Groq (OpenAI-compatible) chat completion with a single function definition.
    If USE_MOCK True, returns heuristic extractions without network.
    Returns (parsed_dict, raw_response_or_mock_metadata).
    """
    if USE_MOCK:
        parsed = {}
        # email
        em = re.search(r'[\w\.-]+@[\w\.-]+\.\w+', text)
        if em: parsed["email"] = em.group(0)
        # phone (very permissive)
        ph = re.search(r'(\+?\d[\d\-\s]{6,}\d)', text)
        if ph: parsed["phone"] = ph.group(0)
        # age
        m_age = re.search(r'age\s*(?:is|:)?\s*(\d{1,3})', text, flags=re.IGNORECASE)
        if m_age:
            try:
                parsed["age"] = int(m_age.group(1))
            except: pass
        # name
        m_name = re.search(r"(?:I am|I'm|This is|My name is)\s+([A-Z][a-zA-Z]+(?:\s[A-Z][a-zA-Z]+)?)", text)
        if m_name: parsed["name"] = m_name.group(1)
        # location
        m_loc = re.search(r'(?:based in|live in|located in|from)\s+([A-Za-z\s]+)', text, flags=re.IGNORECASE)
        if m_loc: parsed["location"] = m_loc.group(1).strip().strip('.')
        return parsed, {"mock": True, "input": text}

    # Build chat + function calling body
    body = {
        "model": DEFAULT_MODEL,
        "messages": [
            {"role": "system", "content": "You are a strict extractor. Use the function to emit valid JSON matching the schema."},
            {"role": "user", "content": text}
        ],
        "functions": [function_schema],
        "function_call": {"name": function_schema["name"]},
        "max_tokens": max_tokens,
        "temperature": 0.0
    }
    resp = requests.post(GROQ_CHAT_COMPLETIONS, headers=HEADERS, json=body, timeout=30)
    resp.raise_for_status()
    data = resp.json()
    # Parse function_call arguments
    parsed = {}
    try:
        fc = data["choices"][0]["message"].get("function_call", {})
        args_str = fc.get("arguments")
        if args_str:
            parsed = json.loads(args_str)
        else:
            # fallback: maybe assistant content is JSON
            content = data["choices"][0]["message"].get("content", "")
            parsed = json.loads(content) if content.strip().startswith("{") else {}
    except Exception:
        parsed = {}
    return parsed, data

def validate_against_schema(obj, schema):
    try:
        validate(instance=obj, schema=schema)
        return True, None
    except ValidationError as e:
        return False, str(e)


##Task 2 demo: parse with3 sample chats and validate

In [7]:
sample_chats = [
    "Hi, I'm Rahul Sharma. My email is rahul.sharma@example.com. Phone: +91-9876543XXX live in Pune. I'm 23 years old.",
    "This is Neha — reach me at neha.work@gmail.com. Based in Bengaluru. Age: 28. Call me at 98765123XX.",
    "Hey! Name: Arjun Patel. Contact: arjun.patel@domain.co. Location - Mumbai. Age: 25. Mobile +91 91234 567XX."
]

for i, s in enumerate(sample_chats, 1):
    print(f"\n--- Sample {i} ---\n{s}\n")
    parsed, raw = extract_info_via_groq(s, function_schema)
    print("Parsed:", parsed)
    valid, err = validate_against_schema(parsed, function_schema["parameters"])
    if valid:
        print("Validation: PASSED ✅")
    else:
        print("Validation: FAILED ❌")
        print("Validation error:", err)
    if not USE_MOCK:
        # show minimal raw keys for debugging
        print("Raw response keys:", list(raw.keys()))



--- Sample 1 ---
Hi, I'm Rahul Sharma. My email is rahul.sharma@example.com. Phone: +91-9876543XXX live in Pune. I'm 23 years old.

Parsed: {'age': 23, 'email': 'rahul.sharma@example.com', 'location': 'Pune', 'name': 'Rahul Sharma', 'phone': '+91-9876543XXX'}
Validation: PASSED ✅
Raw response keys: ['id', 'object', 'created', 'model', 'choices', 'usage', 'usage_breakdown', 'system_fingerprint', 'x_groq', 'service_tier']

--- Sample 2 ---
This is Neha — reach me at neha.work@gmail.com. Based in Bengaluru. Age: 28. Call me at 98765123XX.

Parsed: {'age': 28, 'email': 'neha.work@gmail.com', 'location': 'Bengaluru', 'name': 'Neha', 'phone': '98765123XX'}
Validation: PASSED ✅
Raw response keys: ['id', 'object', 'created', 'model', 'choices', 'usage', 'usage_breakdown', 'system_fingerprint', 'x_groq', 'service_tier']

--- Sample 3 ---
Hey! Name: Arjun Patel. Contact: arjun.patel@domain.co. Location - Mumbai. Age: 25. Mobile +91 91234 567XX.

Parsed: {'age': 25, 'email': 'arjun.patel@domain.

# GitHub Info

This notebook implements both tasks of the Groq Internship Assignment:


*   Task 1: Conversation management with summarization  
*   Task 2: JSON schema classification & structured extraction



### Repo Setup
1. Created GitHub repo: https://github.com/vijay-sonavane9/groq-chat-summarizer
2. Added `.gitignore` to prevent committing API keys:
   ```text
   *.env
   api_key.txt
   .ipynb_checkpoints/
