In [3]:
import anthropic
import datetime
import requests
import json
import os
import time
import csv


from datetime import datetime
from dotenv import load_dotenv
from requests.exceptions import RequestException
from bs4 import BeautifulSoup
from IPython.display import display, HTML
from IPython.display import Markdown
from anthropic import APIConnectionError, APIStatusError, RateLimitError
# Load variables from a local .env file (if one exists) into the process environment.
load_dotenv()
# NOTE(review): my_api_key is read here but is not passed to the client below nor used in
# any other visible cell. The client is built without explicit credentials - presumably
# the SDK reads ANTHROPIC_API_KEY from the environment by itself; confirm.
my_api_key=os.getenv("ANTHROPIC_API_KEY") 

# Shared API client; generate_summary() sends its requests through it.
client = anthropic.Client()
# Model identifier passed with every summarization request.
MODEL_NAME = "claude-3-sonnet-20240229" # as an alternative, try claude-3-haiku-20240307 or claude-3-opus-20240229

In [4]:
def fetch_article_content(url, timeout=30):
    """
    Fetch a web page and return its visible text, one non-empty fragment per line.

    Steps:
    1. Send a GET request to ``url`` (bounded by ``timeout``).
    2. Parse the HTML with BeautifulSoup.
    3. Remove ``<script>`` and ``<style>`` elements.
    4. Strip every line, split it on double spaces and drop empty fragments.
    5. Print how long the fetch and the scraping steps took (in seconds).

    Args:
        url (str): The URL of the web page to fetch.
        timeout (float | tuple, optional): Seconds to wait for the server; handed
            unchanged to ``requests.get``. Defaults to 30.

    Returns:
        str: The cleaned text content of the page.
        None: If the request or the parsing fails. Every error is reported with
            ``print`` and swallowed, so nothing propagates to the caller.
    """
    try:
        fetch_start = time.time()
        # requests does not time out unless asked to; without this argument a stalled
        # server blocks the notebook forever and the Timeout handler below never fires.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # turn 4xx/5xx responses into HTTPError
        fetch_duration = time.time() - fetch_start
        print(f"Fetch duration {fetch_duration:.2f}")  # seconds

        # The scrape timer covers HTML parsing as well as the text clean-up.
        scrape_start = time.time()
        soup = BeautifulSoup(response.content, 'html.parser')

        # Script and style bodies are not human-readable content.
        for element in soup(["script", "style"]):
            element.decompose()

        raw_text = soup.get_text()

        # Strip each line, break runs separated by two spaces into their own
        # fragments, and keep only the non-empty ones.
        stripped_lines = (line.strip() for line in raw_text.splitlines())
        fragments = (
            phrase.strip()
            for line in stripped_lines
            for phrase in line.split("  ")
        )
        text = '\n'.join(fragment for fragment in fragments if fragment)

        scrap_duration = time.time() - scrape_start
        print(f"Scrap duration {scrap_duration:.2f}")

        return text
    # Specific request errors first; RequestException is their common base class.
    except requests.exceptions.HTTPError as http_err:
        print(f"HTTP error occurred: {http_err}")
    except requests.exceptions.ConnectionError as conn_err:
        print(f"Connection error occurred: {conn_err}")
    except requests.exceptions.Timeout as timeout_err:
        print(f"Timeout error occurred: {timeout_err}")
    except requests.exceptions.RequestException as req_err:
        print(f"An error occurred while fetching the web page: {req_err}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")
    return None

In [5]:
# Call the LLM to generate a summary of the post or text content
def generate_summary(web_page, url):
    """
    Ask Claude for a structured summary of a web page's text.

    The page text and its URL are embedded in a prompt that requests the author
    name, job title, key points and any resources mentioned, formatted after a
    JSON-like template. Because the prompt also asks the model to reason inside
    ``<thinking>`` tags, the returned text may contain that block ahead of the
    summary itself; the response is returned as received, without post-processing.

    Args:
        web_page (str): The text content of the web page to be summarized.
        url (str): The URL of the web page being summarized.

    Returns:
        str: The text of the first content block of the model's response.
        None: If ``web_page`` is empty/None, the API call fails, or the response
            is empty. Errors are reported with ``print`` and swallowed, so nothing
            propagates to the caller.
    """
    # A failed fetch yields None; without this guard the literal text "None"
    # would be sent to the model and a request would be wasted.
    if not web_page or not web_page.strip():
        print(f"No content to summarize for {url}")
        return None

    try:
        summary_template = f"""summary:{{
        'url': {url!r},
        'author_name': {{author}},
        'job_title': {{job}},
        'key_points': {{points}},
        'resources': {{res}}
        }}"""

        system_prompt = "You are a highly skilled content summarizer with expertise in extracting key information from various types of text. \
                         Your summaries are concise, accurate, and capture the essence of the original content. \
                         Focus on the most important points and maintain objectivity in your summaries."

        prompt = f"""<content>{web_page}</content>
                Please produce a concise summary in bullet points of the above content.
                <Instructions>
                    Read carefully through the content and highlight key points. Before generating the output, please analyze the content in <thinking> tags.
                    This is the expected output:
                    <author_name></author_name>
                    <job_title>If available, else insert N/A</job_title>
                    <key_points>
                      - Point 1
                      - Point 2
                      - ...
                    </key_points>
                     <resources>
                     Any pdf document, video link, book reference or website link extracted from content
                    </resources>
                </Instructions>
                <SummaryOutput>

                    Provide the summary output in JSON format, drop the preamble, exclude the "```json" declaration and adhere to the following example:
                      {summary_template}
                </SummaryOutput>
              """

        messages = [
            {"role": "user", "content": prompt}
        ]

        start_time = time.time()
        started_at = datetime.fromtimestamp(start_time).strftime("%d-%m-%Y %H:%M:%S")
        print(f"Start generating summary with Claude at {started_at}")

        response = client.messages.create(
            model=MODEL_NAME,
            max_tokens=1024,
            system=system_prompt,
            messages=messages
        )

        end_time = time.time()
        duration = end_time - start_time
        ended_at = datetime.fromtimestamp(end_time).strftime("%d-%m-%Y %H:%M:%S")
        print(f"End summarizing content at {ended_at}. It took {duration:.2f} seconds\n\n")

        if not (response and response.content):
            raise ValueError("Empty response received from Claude")

        summary = response.content[0].text
        print("---------------------")
        return summary

    except APIConnectionError as e:
        print(f"Error connecting to the API: {e}")
    # RateLimitError is a subclass of APIStatusError, so it has to be matched
    # first or its handler can never run.
    except RateLimitError as e:
        print(f"Rate limit exceeded: {e}")
    except APIStatusError as e:
        print(f"API returned an error status: {e}")
    except ValueError as e:
        print(f"Value error occurred: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    return None  # Return None if any error occurs

## Command-line interface

In [19]:
def main():
    """
    Run the interactive content summarization loop.

    The user is prompted for the URL of a web page to summarize, repeatedly,
    until 'exit' (any letter case) is entered.

    For every entry the function:
    1. Terminates with a goodbye message if the entry is 'exit'.
    2. Ignores blank entries and prompts again.
    3. Fetches the page content with ``fetch_article_content``.
    4. If content was fetched, generates a summary with ``generate_summary``.
    5. Prints the summary when one was produced, then prompts again.

    Fetch and summarization failures are reported by the helpers themselves;
    this function simply moves on to the next prompt.
    """
    prompt = "Enter the URL of the web page to summarize. Type 'exit' to quit:"

    while True:
        user_input = input(prompt).strip()

        if user_input.lower() == "exit":
            print("Goodbye!")
            return

        if not user_input:
            continue  # nothing typed; ask again

        web_page = fetch_article_content(user_input)
        if not web_page:
            continue  # the fetch helper already printed the reason

        summary = generate_summary(web_page, user_input)
        if summary is not None:
            print(summary)
  

In [21]:
# Launch the interactive summarizer; this cell blocks while waiting for user input.
main()

Enter the URL of the web page to summarize. Type 'exit' to quit: EXIT


Goodbye!


In [22]:
import os
def read_csv(file):
    """
    Read a CSV file and return its rows as a list of dictionaries.

    The first line is treated as the header. The file is decoded as UTF-8 and a
    leading byte-order mark, if present, is discarded so that it does not end up
    inside the first column name.

    Args:
        file (str): The path to the CSV file to be read.

    Returns:
        list of dict: One dictionary per data row, keyed by column name. The list
            is empty (and a warning is printed) when the file holds only a header.
        None: If the file is missing, empty, unreadable, not valid UTF-8, has no
            header, or any other error occurs. Errors are reported with ``print``
            and swallowed, so nothing propagates to the caller.
    """
    print(f"reading csv named {file}")
    try:
        # Check if the file exists
        if not os.path.exists(file):
            raise FileNotFoundError(f"The file {file} does not exist.")

        # Check if the file is empty
        if os.stat(file).st_size == 0:
            raise ValueError(f"The file {file} is empty.")

        # 'utf-8-sig' decodes plain UTF-8 as well, but strips a leading BOM; with
        # plain 'utf-8' the first header would become '\ufeff<name>' and later
        # lookups by column name would fail.
        with open(file, "r", newline='', encoding='utf-8-sig') as f:
            csv_reader = csv.DictReader(f)

            # Check if the CSV file has any columns
            if not csv_reader.fieldnames:
                raise ValueError(f"The CSV file {file} does not contain any columns.")

            data = list(csv_reader)

        # Check if any data was read
        if not data:
            print(f"Warning: No data was read from {file}. The file might be empty after the header.")

        return data

    except FileNotFoundError as e:
        print(f"File not found error: {e}")
    except PermissionError:
        print(f"Permission denied: Unable to read the file {file}")
    except csv.Error as e:
        print(f"CSV reading error: {e}")
    # UnicodeDecodeError is a ValueError subclass, so it must be matched first.
    except UnicodeDecodeError:
        print(f"Encoding error: The file {file} is not in UTF-8 encoding.")
    except ValueError as e:
        print(f"Value error: {e}")
    except Exception as e:
        print(f"An unexpected error occurred: {e}")

    return None  # Return None if any error occurs

In [23]:
def classify_saved_items(file_path, delay_seconds=5):
    """
    Summarize every URL listed in the 'savedItem' column of a CSV file.

    The function is meant to automate the processing of a CSV export of LinkedIn
    saved items: each URL is scraped with ``fetch_article_content`` and the text
    is summarized with ``generate_summary``.

    Args:
        file_path (str): The path to the CSV file containing saved items.
        delay_seconds (float, optional): Pause inserted between consecutive
            summarization requests to stay under the API rate limit.
            Defaults to 5; pass 0 to disable.

    Returns:
        list: One entry per URL, in CSV order. An entry is the summary text, or
            None when that page could not be fetched or summarized, so a single
            failing URL does not discard the rest of the batch.
        None: If ``file_path`` is empty, the CSV could not be read, or the
            'savedItem' column is missing.

    Note:
        The helpers report their own errors with ``print`` and return None
        instead of raising; this function relies on that contract.
    """
    if not file_path:
        print("File path cannot be empty.")
        return None

    csv_data = read_csv(file_path)
    if csv_data is None:
        print(f"Failed to read CSV data from {file_path}")
        return None

    print(f"Successfully read {len(csv_data)} rows from {file_path}")

    try:
        links = [row['savedItem'] for row in csv_data]
    except KeyError:
        print("Error: 'savedItem' column not found in CSV.")
        return None

    for saved_item in links:
        print(f"Processing saved item: {saved_item}")
    print(f"Extracted {len(links)} links from the CSV.")

    summaries = []
    api_calls_made = 0
    for url in links:
        # Web scraping
        web_page = fetch_article_content(url)
        if not web_page:
            # Slicing None would raise and abort the whole batch; record the miss
            # and carry on with the remaining URLs.
            print(f"Skipping {url}: no content could be fetched.")
            summaries.append(None)
            continue

        print(web_page[:100])

        # Throttle only between real API calls: not before the first one, and
        # not after the last.
        if api_calls_made and delay_seconds > 0:
            time.sleep(delay_seconds)

        # API call to Claude
        summaries.append(generate_summary(web_page, url))
        api_calls_made += 1

    return summaries

In [24]:
# Start the batch run: summarize every URL listed in the CSV export.
# Despite its name, myurls receives whatever classify_saved_items returns
# (a list of summaries, or None on failure), not URLs.
# NOTE(review): the recorded output of this cell shows the path '../5_Saved_Items.csv',
# which differs from the argument below - the saved output looks stale; re-run to refresh.
myurls=classify_saved_items('5_Saved_Items.csv') # assuming the file is in the current folder

reading csv named ../5_Saved_Items.csv
Successfully read 5 rows from ../5_Saved_Items.csv
Processing saved item: https://www.linkedin.com/feed/update/urn:li:activity:6953785620804558848
Processing saved item: https://www.linkedin.com/feed/update/urn:li:activity:6950242812156137472
Processing saved item: https://www.linkedin.com/feed/update/urn:li:activity:6947579641096261632
Processing saved item: https://www.linkedin.com/feed/update/urn:li:activity:6947916350719725568
Processing saved item: https://www.linkedin.com/feed/update/urn:li:activity:6947208676998844417
Extracted 5 links from the CSV.
Fetch duration 2.17
Scrap duration 0.02
Maurizio Savioli on LinkedIn: #leadership #leadership #management #hr #humanresources #changemanagem
Start generating summary with Claude at 06-09-2024 14:07:07
End summarizing content at 06-09-2024 14:07:16. It took 8.91 seconds


---------------------
Fetch duration 1.09
Scrap duration 0.02
Elaine A. on LinkedIn: #cxd #conversationdesign #voicefirst #des

In [25]:
# Print results.
# classify_saved_items returns None when the batch could not run at all, so fall
# back to an empty list instead of failing with "NoneType is not iterable".
for summary in myurls or []:
    print(f"Sommario: {summary} \n")
    print("----------------")

Sommario: <thinking>
The content appears to be a LinkedIn post by Maurizio Savioli discussing leadership models and styles. Some key points I've identified:

- Outdated leadership models are not keeping pace with rapid changes in organizations
- The author references the book "The Leader Ship" by L. David Marquet, which advocates a "leader-leader" distributed leadership style rather than traditional top-down authoritarian leadership
- The author shares a quote about leadership being about communicating people's value and potential to inspire them, from the book's preface by Stephen M.R. Covey
- The author finds value in learning from the book's lessons, even outside of a naval submarine context where it is based

Overall, the post seems to promote a more collaborative, empowering leadership approach versus command-and-control styles.
</thinking>

<SummaryOutput>
summary:{
  'url': 'https://www.linkedin.com/feed/update/urn:li:activity:6953785620804558848',
  'author_name': 'Maurizio Sav