# Let's try the omnivoreql Python module

In [None]:
import openai
import json
import pandas as pd
import logging
import re
from omnivoreql import OmnivoreQL
from dotenv import load_dotenv
import os
from datetime import datetime

# Module-level setup: read configuration from the environment, configure
# logging, and create the API clients used by the functions below.

# Load environment variables from a .env file (if present) into os.environ
load_dotenv()

# Configure root logging: INFO and above, "timestamp - level - message"
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Omnivore and OpenAI credentials; each is None when the variable is unset
OMNIVORE_API_KEY = os.getenv("OMNIVORE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OMNIVORE_USERNAME = os.getenv("OMNIVORE_USERNAME")

# Abort early when any of the three settings is missing or empty
if not OMNIVORE_API_KEY or not OPENAI_API_KEY or not OMNIVORE_USERNAME:
    logging.error("Missing API key(s). Ensure OMNIVORE_API_KEY, OPENAI_API_KEY, and OMNIVORE_USERNAME are set.")
    # NOTE(review): `exit` is the interactive helper installed by the `site`
    # module; `sys.exit(1)` would be the more portable choice - confirm.
    exit(1)

# Set the API key on the openai module so later calls are authenticated
openai.api_key = OPENAI_API_KEY

# Omnivore client shared by the fetch helpers below
omnivore_client = OmnivoreQL(OMNIVORE_API_KEY)

def get_my_articles_with_label(label, filename="articles.json"):
    """Return inbox articles carrying ``label``, using a JSON file as a cache.

    When ``filename`` exists and can be parsed as JSON, its content is
    returned unchanged and Omnivore is not queried. The cache file is not
    keyed by ``label``: whatever it holds is returned for any label.

    Otherwise the articles are requested from Omnivore (``limit=100``, without
    content), the ``node`` of every search edge is collected, and the list is
    written to ``filename`` on a best-effort basis.

    Args:
        label: Omnivore label name used in the search query.
        filename: Path of the JSON cache file.

    Returns:
        The list of article nodes, or ``[]`` when the Omnivore request fails.
    """
    cache_present = os.path.exists(filename)
    if cache_present:
        try:
            with open(filename, 'r') as cache_file:
                cached_articles = json.load(cache_file)
            return cached_articles
        except Exception as e:
            # An unreadable cache is logged and we fall through to a fetch.
            logging.error(f"Error loading articles from file: {e}")

    try:
        search_result = omnivore_client.get_articles(
            format="markdown",
            limit=100,
            query=f"in:inbox AND label:{label}",
            include_content=False
        )
        nodes = []
        for edge in search_result['search']['edges']:
            nodes.append(edge['node'])
    except Exception as e:
        logging.error(f"Error fetching articles: {e}")
        return []

    # Failing to persist the cache must not lose the freshly fetched result.
    try:
        with open(filename, 'w') as cache_file:
            json.dump(nodes, cache_file)
    except Exception as e:
        logging.error(f"Error writing articles to file: {e}")

    return nodes
    
def openai_request(prompt, max_tokens=150):
    """Run a single-turn chat completion and return the reply text.

    Args:
        prompt: Text sent as the user message.
        max_tokens: Upper bound on the completion length passed to the API.

    Returns:
        The stripped content of the first choice, or the literal string
        "Error during OpenAI request" when the library raises an
        ``openai.OpenAIError``.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    try:
        completion = openai.ChatCompletion.create(
            model="gpt-4",
            messages=conversation,
            max_tokens=max_tokens
        )
        first_choice = completion.choices[0]
        reply_text = first_choice.message['content']
        return reply_text.strip()
    except openai.OpenAIError as e:
        logging.error(f"OpenAI error: {e}")
        return "Error during OpenAI request"
    
def summarize_article(content):
    """Generate a three-sentence summary of an article via ``openai_request``.

    Args:
        content: Article text to summarize.

    Returns:
        The text returned by ``openai_request``. If that call raises, the
        error is logged and an empty string is returned, so callers can
        always treat the result as a string.
    """
    prompt = f"Summarize the following article in three sentences:\n\n{content}"
    try:
        return openai_request(prompt, max_tokens=150)
    except openai.OpenAIError as e:
        # `openai.OpenAIError` is the library's base exception (the same
        # class `openai_request` handles); the module has no `Error` attribute.
        logging.error(f"OpenAI API error: {e}")
    except Exception as e:
        # Anything else (e.g. an unexpected response shape) is logged too.
        logging.error(f"General error when calling OpenAI API: {e}")
    return ""

def generate_tags(content, labels):
    """Ask the model for five tags describing an article.

    The prompt tells the model to keep the article's existing ``labels``, top
    them up to five from a fixed list of preferred tags, and invent new
    lowercase, dash-separated tags only when the list has no match.

    Args:
        content: Article text the tags should describe.
        labels: Labels already attached to the article; interpolated into the
            prompt as-is.

    Returns:
        The model's reply (expected to be a comma-separated tag list) as
        returned by ``openai_request``. If that call raises, the error is
        logged and an empty string is returned, so callers can always treat
        the result as a string.
    """
    available_tags = "ai, ai-governance, ai-optimization, ai-reliability, ai-security, agi, anthropic, apple, automation, aws, azure, claude, cloud, cloud-computing, cyber-security, data-science, deep-learning, developer-experience, ethical-ai, enterprise-architecture, fin-ops, gcp, generative-ai, github, github-actions, github-repo, gpt-4, jupyter-notebooks, langchain, llm, machine-learning, nlp, openai, personal-development, projects, python, rag, responsible-ai, security"
    prompt = (
        f"Extract and process the tags from the following list: {labels}. "
        "Remove all single quotes and brackets, resulting in a comma-separated list of tags that only contains letters, numbers, and dashes. "
        "From this list, determine how many tags are present and subtract that number from five to determine how many additional tags are needed. "
        f"From the following list of available tags: {available_tags}, select additional tags that match the content of the article below, ensuring the total number of tags is five. "
        "If there are not enough matching tags from the provided list, create new tags following these rules: "
        "1. Use no more than three words, "
        "2. Use only lowercase letters, "
        "3. Replace spaces with dashes. "
        "Return exactly five tags as a comma-separated list with no additional text or explanation. "
        f"Here is the article content:\n\n{content}"
    )
    try:
        return openai_request(prompt, max_tokens=150)
    except openai.OpenAIError as e:
        # `openai.OpenAIError` is the library's base exception (the same
        # class `openai_request` handles); the module has no `Error` attribute.
        logging.error(f"OpenAI API error: {e}")
    except Exception as e:
        # Anything else (e.g. an unexpected response shape) is logged too.
        logging.error(f"General error when calling OpenAI API: {e}")
    return ""


def get_web_page_content(slug):
    """Fetch the markdown body of the article identified by ``slug``.

    Args:
        slug: Omnivore slug of the article to load.

    Returns:
        The ``content`` field of the fetched article, or an empty string when
        the request or the lookup in the response fails (the error is logged).
    """
    try:
        payload = omnivore_client.get_article(
            username=OMNIVORE_USERNAME,
            slug=slug,
            format="markdown"
        )
        article_node = payload['article']['article']
        return article_node['content']
    except Exception as e:
        logging.error(f"Error fetching article content: {e}")
        return ""
    
def process_articles(articles, exclude_label):
    """Process articles to generate summaries and tags."""
    processed_articles = []
    if os.path.exists('processed_articles.csv'):
        processed_articles = pd.read_csv('processed_articles.csv').to_dict('records')
    else:
        for article in articles:
            title = article.get("title", "Untitled")
            site_name = article.get("siteName", "Unknown")
            description = article.get("description", "")
            link = article.get("url", "")
            slug = article.get("slug", "")
            labels = [label['name'] for label in article['labels'] if label['name'] != exclude_label]
            content = get_web_page_content(slug)
            if content:
                logging.info(f"Processing article: {title}")
                summary = summarize_article(content)
                if "no content" in summary.lower():
                    tags = ['None']
                else:
                    tags = generate_tags(content, labels).split(",")
                    tags = [tag.strip() for tag in tags]
                processed_articles.append({
                    "title": title,
                    "tags": tags,
                    "site_name": site_name,
                    "description": description,
                    "link": link,
                    "slug": slug,
                    "content": content,
                    "summary": summary
                })
            else:
                logging.warning(f"Skipping article {title} due to missing content")
        pd.DataFrame(processed_articles).to_csv('processed_articles.csv', index=False)
    return processed_articles

def create_markdown_content(processed_articles):
    """Render processed articles as one flat markdown document.

    The document starts with an "Articles Summary" heading, followed by one
    section per article: linked title, hash-prefixed tags, site name,
    Omnivore description and ChatGPT summary.

    Args:
        processed_articles: Dicts with the keys ``title``, ``link``, ``tags``,
            ``site_name``, ``description`` and ``summary``. A ``tags`` value
            that is not a list is treated as a single tag.

    Returns:
        The markdown text.
    """
    sections = ["# Articles Summary\n\n"]
    for entry in processed_articles:
        raw_tags = entry['tags']
        # Normalize to a list so a lone value is rendered as one tag.
        tag_list = raw_tags if isinstance(raw_tags, list) else [raw_tags]
        hashtags = ', '.join(f"#{tag}" for tag in tag_list)

        sections.append(f"### [{entry['title']}]({entry['link']})\n\n")
        sections.append(f"**Tags:** *{hashtags}*\n\n")
        sections.append(f"**Site Name:** {entry['site_name']}\n\n")
        sections.append(f"**Omnivore Description:** {entry['description']}\n\n")
        sections.append(f"**ChatGPT Summary:** {entry['summary']}\n\n")
    return "".join(sections)

def write_to_markdown_with_grouping(processed_articles, blog_title, filename):
    """Build the front matter and the full markdown document for a post.

    Despite its name, this function writes nothing; ``filename`` is accepted
    only to keep the existing call signature and is not used.

    Args:
        processed_articles: Dicts as accepted by ``create_markdown_content``.
            A ``tags`` value that is not a list is treated as a single tag,
            matching ``create_markdown_content``.
        blog_title: Title placed in the front matter.
        filename: Unused.

    Returns:
        A ``(front_matter, blog_article_md)`` tuple, where ``blog_article_md``
        is the front matter followed by the rendered article list.
    """
    unique_tags = set()
    for article in processed_articles:
        raw_tags = article['tags']
        # A bare string would otherwise be iterated character by character.
        tags = raw_tags if isinstance(raw_tags, list) else [raw_tags]
        for tag in tags:
            # Strip before de-duplicating so " ai" and "ai" collapse to one.
            cleaned = str(tag).strip()
            if cleaned and cleaned != 'None':
                unique_tags.add(cleaned)
    formatted_tags = ", ".join(f'"{tag}"' for tag in sorted(unique_tags))

    # %z renders as an empty string for naive datetimes; attach the local
    # timezone so the UTC offset actually appears in the front matter.
    current_datetime = datetime.now().astimezone().strftime("%Y-%m-%dT%H:%M:%S%z")

    front_matter = f"---\ntitle: {blog_title}\ndate: {current_datetime}\ntags: [{formatted_tags}]\n---\n\n"
    blog_article_md = front_matter + create_markdown_content(processed_articles)

    return front_matter, blog_article_md

# Matches one article section as rendered by create_markdown_content.
_ARTICLE_SECTION_RE = re.compile(
    r'### \[(.*?)\]\((.*?)\)\n\n'
    r'\*\*Tags:\*\* \*(.*?)\*\n\n'
    r'\*\*Site Name:\*\* (.*?)\n\n'
    r'\*\*Omnivore Description:\*\* (.*?)\n\n'
    r'\*\*ChatGPT Summary:\*\* (.*?)\n\n',
    re.DOTALL,
)
_ARTICLE_FIELDS = ('title', 'link', 'tags', 'site_name', 'description', 'summary')


def extract_articles(content):
    """Parse article sections back out of rendered markdown.

    Each section must have the layout produced by ``create_markdown_content``:
    linked title, tags, site name, Omnivore description, ChatGPT summary. The
    summary capture ends at the first blank line after it.

    Args:
        content: Markdown text to scan.

    Returns:
        A list of dicts with the keys ``title``, ``link``, ``tags`` (the raw
        tag string), ``site_name``, ``description`` and ``summary``. An empty
        list is returned when nothing matches or ``content`` is not a string.
    """
    if not isinstance(content, str):
        # No OpenAI call happens here; report the actual problem instead.
        logging.error(f"Cannot extract articles from non-string content: {type(content).__name__}")
        return []
    return [
        dict(zip(_ARTICLE_FIELDS, fields))
        for fields in _ARTICLE_SECTION_RE.findall(content)
    ]


def generate_categories(articles):
    """Ask the model to group articles into 2-5 broad categories.

    Args:
        articles: Dicts with at least ``title`` and ``tags``; both are
            interpolated into the prompt.

    Returns:
        The model's reply, requested in the form::

            - Category Name
              - Article Title

        An empty string is returned when ``articles`` is empty or the request
        fails (the error is logged), so callers always receive a string.
    """
    if not articles:
        # Nothing to categorize; skip the API call.
        return ""

    article_listing = "\n".join(
        f"Title: {article['title']}, Tags: {article['tags']}" for article in articles
    )
    prompts = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Group the following articles into 2-5 broad categories based on their tags and content. Provide category names and associate each article with a single category. Format the output as follows:\n\n- Category Name\n  - Article Title\n  - Article Title\n\nThe articles are: " + article_listing}
    ]
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o",
            messages=prompts,
            max_tokens=500
        )
        return response.choices[0].message['content'].strip()
    except openai.OpenAIError as e:
        # `openai.OpenAIError` is the library's base exception (the same
        # class `openai_request` handles); the module has no `Error` attribute.
        logging.error(f"OpenAI API error: {e}")
    except Exception as e:
        # Anything else (e.g. an unexpected response shape) is logged too.
        logging.error(f"General error when calling OpenAI API: {e}")
    return ""


def parse_categories_response(response):
    """Parse the bulleted category listing returned by the model.

    Lines starting with ``"- "`` open a category; lines starting with
    ``"  - "`` (two-space indent) add an article title to the most recently
    opened category. Every other line is ignored, as are titles that appear
    before any category.

    Args:
        response: The model's reply text. ``None`` or an empty string yields
            an empty result.

    Returns:
        A dict mapping category name to the list of article titles, in the
        order they appear.
    """
    categories = {}
    if not response:
        return categories

    current_category = None
    for line in response.split("\n"):
        if line.startswith("- "):
            current_category = line[2:].strip()
            # setdefault keeps titles already collected if the same category
            # header shows up more than once.
            categories.setdefault(current_category, [])
        elif line.startswith("  - ") and current_category:
            categories[current_category].append(line[4:].strip())
    return categories

def map_articles_to_categories(articles, categories):
    """Group articles under the first category that lists their title.

    Args:
        articles: Dicts with a ``title`` and a comma-separated ``tags`` string.
        categories: Mapping of category name to a list of article titles.

    Returns:
        A dict with one entry per category, each holding ``articles`` (the
        matching article dicts, in input order) and ``tags`` (the set of
        stripped tags of those articles). Articles whose title appears in no
        category are left out.
    """
    grouped = {name: {'articles': [], 'tags': set()} for name in categories}
    unmatched = object()

    for entry in articles:
        # First category, in dict order, whose title list contains this title.
        target = next(
            (name for name, titles in categories.items() if entry['title'] in titles),
            unmatched,
        )
        if target is unmatched:
            continue
        bucket = grouped[target]
        bucket['articles'].append(entry)
        bucket['tags'].update(piece.strip() for piece in entry['tags'].split(','))
    return grouped

def create_table_of_contents(categorized_articles):
    """Render a two-level markdown table of contents.

    Each category becomes a bullet linking to an anchor derived from its name
    (lowercased, spaces replaced by dashes); each of its articles becomes a
    nested bullet linking to the article URL.

    Args:
        categorized_articles: Mapping of category name to a dict whose
            ``articles`` list holds dicts with ``title`` and ``link``.

    Returns:
        The table of contents as markdown text.
    """
    lines = ["# Table of Contents\n\n"]
    for category, bucket in categorized_articles.items():
        anchor = category.lower().replace(' ', '-')
        lines.append(f"- [{category}](#{anchor})\n")
        lines.extend(
            f"  - [{entry['title']}]({entry['link']})\n"
            for entry in bucket['articles']
        )
    return "".join(lines)

def create_categorized_markdown_content(categorized_articles):
    """Render the article sections grouped under their category headings.

    Every category starts with a horizontal rule, a heading and the sorted
    list of its tags, followed by one section per article and a trailing
    blank line.

    Args:
        categorized_articles: Mapping of category name to a dict with ``tags``
            (an iterable of tag strings) and ``articles`` (dicts with the keys
            ``title``, ``link``, ``tags``, ``site_name``, ``description`` and
            ``summary``).

    Returns:
        The markdown text; an empty string when there are no categories.
    """
    chunks = []
    for category, bucket in categorized_articles.items():
        category_tags = ', '.join(sorted(bucket['tags']))
        chunks.append(f"-----\n\n# {category}\n\n**Category Tags:** {category_tags}\n\n")
        for entry in bucket['articles']:
            chunks.append(f"### [{entry['title']}]({entry['link']})\n\n")
            chunks.append(f"**Tags:** *{entry['tags']}*\n\n")
            chunks.append(f"**Site Name:** {entry['site_name']}\n\n")
            chunks.append(f"**Omnivore Description:** {entry['description']}\n\n")
            chunks.append(f"**ChatGPT Summary:** {entry['summary']}\n\n")
        chunks.append("\n")
    return "".join(chunks)

def main(blog_title, label, filename):
    """Fetch, summarize, categorize and write the labelled articles.

    Steps: load the articles carrying ``label``, generate summaries and tags,
    render them to markdown, ask the model to group them into categories, and
    write front matter + table of contents + categorized sections to
    ``filename``. When no categories can be derived, the uncategorized article
    list is written instead so the summaries are not lost.

    Args:
        blog_title: Title placed in the front matter.
        label: Omnivore label selecting the articles.
        filename: Path of the markdown file to write.

    Returns:
        None. Nothing is written when no articles are found.
    """
    articles = get_my_articles_with_label(label)
    if not articles:
        logging.info("No articles found with the specified label.")
        return

    processed_articles = process_articles(articles, label)
    front_matter, blog_article_md = write_to_markdown_with_grouping(processed_articles, blog_title, filename)

    # Round-trip through the rendered markdown to get the flat string fields
    # (e.g. the formatted tag string) the categorization helpers work with.
    extracted_articles = extract_articles(blog_article_md) or []

    # Generate categories and categorize articles; tolerate a failed request.
    categories_response = generate_categories(extracted_articles)
    categories = parse_categories_response(categories_response) if categories_response else {}

    if categories:
        categorized_articles = map_articles_to_categories(extracted_articles, categories)
        toc = create_table_of_contents(categorized_articles)
        articles_content = create_categorized_markdown_content(categorized_articles)
        final_markdown_content = front_matter + toc + articles_content
    else:
        logging.warning("No categories could be derived; writing the uncategorized article list.")
        final_markdown_content = blog_article_md

    # Explicit UTF-8 so non-ASCII article text does not depend on the
    # platform's default encoding.
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(final_markdown_content)

# Script entry point: run the full pipeline and write the result to index.md.
# NOTE(review): "My Blog Title" and "label" look like placeholder values -
# confirm the intended blog title and Omnivore label before running.
if __name__ == "__main__":
    main(blog_title="My Blog Title", label="label", filename="index.md")
