In [2]:
#  To scrape content from a page
import os
import json
import requests
from pydantic import BaseModel
from typing import List
from groq import Groq

import os
import dotenv
# Load variables from a local .env file into the process environment.
# ANCHOR_API_KEY and GROQ_API_KEY are read later via os.getenv().
dotenv.load_dotenv()

# Define our Pydantic models to structure the parsed output.
class TweetObject(BaseModel):
    # One post as returned by the LLM parser. The capitalised field names are
    # the exact JSON keys requested in the parsing prompt, so renaming a field
    # here requires changing the prompt as well.
    # All counters are declared as float; the prompt asks the LLM to expand
    # shorthand such as "109K" into plain numbers before validation.
    Author: str         # e.g. @elonmusk
    Time: str           # e.g. 2h ago (free-form text, not parsed into a date)
    Content: str        # e.g. Tweet's textual content
    Likes: float        # e.g. 109000 (converted from "109K")
    Comments: float     # e.g. 3400 (converted from "3.4K")
    Reposts: float      # e.g. 19000 (converted from "19K")
    Views: float        # e.g. 22000000 (converted from "22M")

class TweetList(BaseModel):
    # Top-level wrapper for the parsed result: an object with a single
    # "tweets" key holding the list of TweetObject entries.
    tweets: List[TweetObject]


def get_web_task_result(*, session_timeout: float = 30.0, task_timeout: float = 600.0) -> str:
    """
    Run the community-scraping web task and return its raw text result.

    Performs two requests:
    1. Creates a browser session and reads its id.
    2. Uses the session id to perform the web task.

    Args:
        session_timeout: Seconds to wait for the session-creation request.
        task_timeout: Seconds to wait for the web task request. The default is
            generous because the remote browser has to navigate and read the
            page before it answers.

    Returns:
        The 'result' field of the web task response, or "" when the response
        has no such field.

    Raises:
        RuntimeError: If the ANCHOR_API_KEY environment variable is not set.
        requests.HTTPError: If either request returns a non-2xx status.
        requests.Timeout: If either request exceeds its timeout.
        KeyError: If the session response carries no "id".
    """
    # Read the key once and fail early with a clear message instead of sending
    # an unauthenticated request and failing later on the response body.
    api_key = os.getenv("ANCHOR_API_KEY")
    if not api_key:
        raise RuntimeError("ANCHOR_API_KEY environment variable is not set")

    # Step 1: POST to obtain the session ID.
    url_session = "https://api.anchorbrowser.io/api/sessions"
    payload_session = {"profile": {"name": "YOUR_SESSION_NAME"}}
    headers_session = {
        "anchor-api-key": api_key,
        "Content-Type": "application/json"
    }

    response_session = requests.post(
        url_session,
        json=payload_session,
        headers=headers_session,
        timeout=session_timeout,
    )
    # Surface auth/server errors here rather than as a KeyError below.
    response_session.raise_for_status()
    session_id = response_session.json()["id"]

    # Step 2: POST to perform the web task using the obtained session ID.
    url_task = "https://connect.anchorbrowser.io/tools/perform-web-task"
    querystring = {
        "apiKey": api_key,
        "sessionId": session_id
    }
    # NOTE(review): the quote after 'Top is unbalanced in the task text; it is
    # left untouched here because the text is sent verbatim to the service.
    payload_task = {
        "task": ("Click on the 'Top tab, and return the full content of the last 10 posts that you can see on the page. Return the content of the post, along with their number of likes, views and comments, and when they were published. Always expand the post to identify if there are subposts. If a post contains multiple subposts, you need to consolidate all of their content into one post. IMPORTANT: Ensure you follow the chronological order of the posts."),
        "url": "https://x.com/i/communities/1493446837214187523"
    }
    headers_task = {"Content-Type": "application/json"}
    response_task = requests.post(
        url_task,
        json=payload_task,
        headers=headers_task,
        params=querystring,
        timeout=task_timeout,
    )
    response_task.raise_for_status()

    # The web task response is expected to have a "result" field with the text.
    return response_task.json().get("result", "")

def _extract_json_object(text: str) -> str:
    """Return the part of *text* from the first '{' through the last '}'.

    This drops markdown code fences or commentary the model may wrap around
    the JSON. Raises ValueError when no such span exists.
    """
    start = text.find("{")
    end = text.rfind("}")
    # Check the raw rfind() result: adding 1 first would turn the -1
    # "not found" marker into 0 and hide the failure.
    if start == -1 or end < start:
        raise ValueError("LLM output does not contain a JSON object")
    return text[start:end + 1]


def parse_tweets_with_llm(raw_text: str) -> TweetList:
    """
    Sends the raw result text to the GroqCloud LLM (llama-3.1-8b-instant)
    to parse the tweets into JSON, following a strict schema.

    Args:
        raw_text: Unstructured text describing the posts.

    Returns:
        A TweetList validated from the JSON object the model returned.

    Raises:
        ValueError: If the model returns no content or no JSON object
            (json.JSONDecodeError, a ValueError subclass, if the extracted
            span is not valid JSON).
        pydantic.ValidationError: If the JSON does not match the TweetList
            schema.
    """
    prompt = (
        "You are a helpful assistant that extracts tweet information from the given text. The text below contains "
        "multiple posts with details (such as the author's name, time, content, likes, comments, reposts, and views). "
        "Extract the data and output it as valid JSON following the schema exactly. The JSON output must be an object with a "
        "single key 'tweets', which is a list of tweet objects. Each tweet object must have the following keys: "
        "'Author' (string), 'Time' (string), 'Content' (string), 'Likes' (number), 'Comments' (number), 'Reposts' (number), "
        "and 'Views' (number). Convert shorthand numbers (e.g. '109K' to 109000, '22M' to 22000000) when applicable. "
        "Do not output any explanations or additional text—only valid JSON exactly as specified. Use a low temperature (0.1) "
        "to ensure deterministic output.\n\n"
        "Text to parse:\n"
        f"{raw_text}"
    )
    client = Groq(api_key=os.getenv("GROQ_API_KEY"))
    chat_completion = client.chat.completions.create(
        messages=[{"role": "user", "content": prompt}],
        model="llama-3.1-8b-instant",
        temperature=0.1
    )
    llm_output = chat_completion.choices[0].message.content

    # Debug: Print the raw output so you can inspect the JSON.
    print("DEBUG: Raw LLM output:")
    print(llm_output)

    # The message content can be missing or empty; report that explicitly.
    if not llm_output:
        raise ValueError("LLM returned an empty response")

    # Slicing from the first '{' to the last '}' also removes any markdown
    # code fences, so no separate fence-stripping step is needed.
    parsed_json = json.loads(_extract_json_object(llm_output))

    # model_validate reports a schema error (rather than a bare TypeError)
    # when the model returns something other than a JSON object.
    return TweetList.model_validate(parsed_json)

if __name__ == '__main__':
    # Fetch the unstructured text produced by the browser web task and echo it.
    raw_text = get_web_task_result()
    print("Raw response text from web task:", raw_text, sep="\n")

    # Hand the text to the LLM parser; any failure in parsing or rendering is
    # reported on stdout instead of being raised.
    try:
        tweets = parse_tweets_with_llm(raw_text)
        print("\nParsed Tweet List (JSON):")
        # model_dump_json is the Pydantic v2 serialisation call.
        print(tweets.model_dump_json(indent=2))
    except Exception as exc:
        print("An error occurred while parsing the tweets:", exc)

Raw response text from web task:
1. **Post by Eric Djavid**
   - Content: Are you proud of your SAAS? If yes, I'll try to roast it. Showcase it here ↓
   - Likes: 38
   - Views: 11,828
   - Comments: 57
   - Published: 12 hours ago

2. **Post by Hugo**
   - Content: What would you rather focus on? Build 5 products for a year or Build 1 product for 5 years
   - Likes: 59
   - Views: 7,400
   - Comments: 46
   - Published: 18 hours ago

3. **Post by Prajwal Tomar**
   - Content: Tried Windsurf and Cursor on the same project today, here’s what I found: Windsurf has better agentic capabilities for large-scale refactoring. It updated my entire codebase with a new design system effortlessly, while Cursor messed it up badly. Cursor is much better for UI
   - Likes: 427
   - Views: 46,235
   - Comments: 34
   - Published: Feb 4

4. **Post by Olga**
   - Content: Hustle culture is overrated. True progress comes from balance, not burnout. What’s your opinion?
   - Likes: 75
   - Views: 4,702
   