In [1]:
# Date: Dec 24
# Note: Using Gem (in place of OpenAI), followed plan...
# so can scape websites and produce a brochure based on that site. 
# Note: The notebook can be split in 1/2 (via a header)
# The 1st 1/2 is Ed's set up, etc.
# The 2nd half is just the code need to get the result.
# Pretty cool

# ----- (My project)
# Date: 09.01.25
# Plan: Make a Gradio UI, that lets you pick a job on seek.com, then scape key words and come up with a 
# plan on how to land jobs of the type selected.

# Date: 11.01.25
# Note: I created a pull request by adding a cleaned up version of my project to Ed's repo.
# That cleaned up file is in Week2/community-contributions with file name: 'day5_Careerhelper'

# A full business solution

## Now we will take our project from Day 1 to the next level

### BUSINESS CHALLENGE:

Create a product that builds a Brochure for a company to be used for prospective clients, investors and potential recruits.

We will be provided a company name and their primary website.

See the end of this notebook for examples of real-world business applications.

And remember: I'm always available if you have problems or ideas! Please do reach out.

In [169]:
# imports
# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt

import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
#from openai import OpenAI

In [None]:
# Initialize and constants

load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
    
# MODEL = 'gpt-4o-mini'
# openai = OpenAI()

# ----- Replacing with Gem code -----
import os
import google.generativeai as genai

genai.configure(api_key= api_key)

# Create the model
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 40,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",
  #"response_mime_type": "application/json",

}

model = genai.GenerativeModel(model_name="gemini-1.5-flash",
  generation_config=generation_config,)

chat_session = model.start_chat(
  history=[  ])

message = "this is a test only"
response = chat_session.send_message(message)
print(response.text)

In [171]:
# # Experimenting to see if can get markdown output
# import os
# import google.generativeai as genai

# genai.configure(api_key=api_key)

# # Create the model
# generation_config = {
#   "temperature": 1,
#   "top_p": 0.95,
#   "top_k": 40,
#   "max_output_tokens": 8192,
#   "response_mime_type": "text/plain"
#   #"response_mime_type": "application/json",
# }

# model = genai.GenerativeModel(model_name="gemini-1.5-flash", generation_config=generation_config)

# chat_session = model.start_chat(history=[])

# message = "how do i make a carrot cake - output as markdown"
# response = chat_session.send_message(message)

# # Print the response as Markdown
# markdown_output = f"```\n{response.text}\n```"  # Wrap in code block for Markdown formatting
# print(markdown_output)


In [172]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [None]:
ed = Website("https://edwarddonner.com")
ed.links

## First step: Have GPT-4o-mini figure out which links are relevant

### Use a call to gpt-4o-mini to read the links on a webpage, and respond in structured JSON.  
It should decide which links are relevant, and replace relative links such as "/about" with "https://company.com/about".  
We will use "one shot prompting" in which we provide an example of how it should respond in the prompt.

This is an excellent use case for an LLM, because it requires nuanced understanding. Imagine trying to code this without LLMs by parsing and analyzing the webpage - it would be very hard!

Sidenote: there is a more advanced technique called "Structured Outputs" in which we require the model to respond according to a spec. We cover this technique in Week 8 during our autonomous Agentic AI project.

In [174]:
link_system_prompt = """You are provided with a list of links found on a webpage. 
You are able to decide which of the links would be most relevant to include in a brochure about the company,
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"""

link_system_prompt += """The quality of your answer will be judged by the following criteria;
                        - Your output will be valid dictionary, with the expected structure.
                        - Your output will NOT be 'Type of links: <class 'str'>'
                        - Your output will NOT include any leading content (i.e. "```json")
                        - Your output will NOT include any trailing content (i.e. "```")
                        - If the your output can not be parsed as valid json without error then that is a failure on your part \
                        - Your output will appear in the below format, any addtional content (as described above) is
                          strictly forbidden 
                            {
                                "links": [
                                    {"type": "about page", "url": "https://full.url/goes/here/about"},
                                    {"type": "careers page": "url": "https://another.full.url/careers"}
                                ]
                            }
                        """


In [None]:
print(link_system_prompt)

In [176]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [None]:
print(get_links_user_prompt(ed))

In [178]:
# def get_links(url):
#     website = Website(url)
#     response = openai.chat.completions.create(
#         model=MODEL,
#         messages=[
#             {"role": "system", "content": link_system_prompt},
#             {"role": "user", "content": get_links_user_prompt(website)}
#       ],
#         response_format={"type": "json_object"}
#     )
#     result = response.choices[0].message.content
#     return json.loads(result)


# ----- trying to replace above with Gem code -----
def get_links(url):
    website = Website(url)
    # response = openai.chat.completions.create(
    #     model=MODEL,
    #     messages=[
    #         {"role": "system", "content": link_system_prompt},
    #         {"role": "user", "content": get_links_user_prompt(website)}
    #   ],
    #     response_format={"type": "json_object"}

    response = chat_session.send_message(link_system_prompt + get_links_user_prompt(website))
    #print(response.text)
    response_format={"type": "json_object"}


    #)
    #result = response.choices[0].message.content
    result = response.text
    #return json.loads(result)
    # return result
    # print(result)

    # # ------ Adding the clean up -----
    # # -- This is the functtion to clean up
    import ast
    import re

    def extract_python_code(code_str):
        # Remove the markdown code block markers (```)
        code_str = re.sub(r'```json|```', '', code_str)

        # Remove comments from the string
        code_str = re.sub(r'#.*', '', code_str)

        # Attempt to parse the remaining code to ensure it's valid Python
        try:
            # Use the ast.parse function to check if the code is valid Python syntax
            ast.parse(code_str)
            return code_str.strip()  # Return the code if it's valid
        except SyntaxError:
            return ""  # Return an empty string if the code is invalid
        
    #print(extract_python_code(result))  
    result = extract_python_code(result)
    return result
    print(result)

#get_links("https://huggingface.co")
get_links("https://edwarddonner.com")
qa = get_links("https://edwarddonner.com")

# -----
# response = chat_session.send_message(message)
# print(response.text)


In [None]:
# Check if stgring or dict, and convert to dict if poss
import json

# Example variable that could be a string or dictionary
variable = qa

# Check if the variable is a string
if isinstance(variable, str):
    try:
        # Attempt to convert the string to a dictionary
        variable = json.loads(variable)
        print("Successfully converted string to dictionary:", variable)
    except json.JSONDecodeError:
        print("The string is not in a valid JSON format.")
elif isinstance(variable, dict):
    print("The variable is already a dictionary:", variable)
else:
    print("The variable is neither a string nor a dictionary.")
    


In [180]:
# Anthropic has made their site harder to scrape, so I'm using HuggingFace..

#huggingface = Website("https://huggingface.co")
ed = Website("https://edwarddonner.com")
#huggingface.links

In [181]:
# get_links("https://huggingface.co")
# tst = get_links("https://huggingface.co")

get_links("https://edwarddonner.com")
tst = get_links("https://edwarddonner.com")

In [None]:
print(tst)

In [183]:
# # -- This is the functtion to clean up
# import ast
# import re

# def extract_python_code(code_str):
#     # Remove the markdown code block markers (```)
#     code_str = re.sub(r'```json|```', '', code_str)

#     # Remove comments from the string
#     code_str = re.sub(r'#.*', '', code_str)

#     # Attempt to parse the remaining code to ensure it's valid Python
#     try:
#         # Use the ast.parse function to check if the code is valid Python syntax
#         ast.parse(code_str)
#         return code_str.strip()  # Return the code if it's valid
#     except SyntaxError:
#         return ""  # Return an empty string if the code is invalid
    
# print(extract_python_code(tst))  
# d = extract_python_code(tst)
# print(d)



## Second step: make the brochure!

Assemble all the details into another prompt to GPT4-o

In [184]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)


# ---------------- Make sure link is dict ----------------
    # Check if stgring or dict, and convert to dict if poss
    import json

    # Example variable that could be a string or dictionary
    variable = qa

    # Check if the variable is a string
    if isinstance(variable, str):
        try:
            # Attempt to convert the string to a dictionary
            variable = json.loads(variable)
            print("Successfully converted string to dictionary:", variable)
        except json.JSONDecodeError:
            print("The string is not in a valid JSON format.")
    elif isinstance(variable, dict):
        print("The variable is already a dictionary:", variable)
    else:
        print("The variable is neither a string nor a dictionary.")
    links = variable

# ---------------- Make sure link is dict ----------------


    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [None]:
#print(get_all_details("https://huggingface.co"))
print(get_all_details("https://edwarddonner.com"))


In [187]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. \
Output is to be in markdown (i.e. a professional format, with bold headders, proper spacing between different sections, etc.)\
Include details of company culture, customers and careers/jobs if you have the information."

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
# Include details of company culture, customers and careers/jobs if you have the information."


In [188]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company."
    user_prompt += f"Please provide output as if it were a professional brochure with bold text for headings, content nicely layed out under headings, different content split out into sections, etc.)\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [None]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

In [190]:
# -- default --
# def create_brochure(company_name, url):
#     response = openai.chat.completions.create(
#         model=MODEL,
#         messages=[
#             {"role": "system", "content": system_prompt},
#             {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
#           ],
#     )
#     result = response.choices[0].message.content
#     display(Markdown(result))



# -- replacing with gemini -- 
def create_brochure(company_name, url):
    response = chat_session.send_message(system_prompt + get_brochure_user_prompt(company_name, url))
    result = response.text
    display(Markdown(result))
   

In [None]:
create_brochure("HuggingFace", "https://huggingface.com")

## Finally - a minor improvement

With a small adjustment, we can change this so that the results stream back from OpenAI,
with the familiar typewriter animation

In [None]:
def stream_brochure(company_name, url):
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )
    
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        response += chunk.choices[0].delta.content or ''
        response = response.replace("```","").replace("markdown", "")
        update_display(Markdown(response), display_id=display_handle.display_id)

In [None]:
stream_brochure("HuggingFace", "https://huggingface.co")

In [None]:
# Try changing the system prompt to the humorous version when you make the Brochure for Hugging Face:

stream_brochure("HuggingFace", "https://huggingface.co")

<table style="margin: 0; text-align: left;">
    <tr>
        <td style="width: 150px; height: 150px; vertical-align: middle;">
            <img src="../business.jpg" width="150" height="150" style="display: block;" />
        </td>
        <td>
            <h2 style="color:#181;">Business applications</h2>
            <span style="color:#181;">In this exercise we extended the Day 1 code to make multiple LLM calls, and generate a document.

This is perhaps the first example of Agentic AI design patterns, as we combined multiple calls to LLMs. This will feature more in Week 2, and then we will return to Agentic AI in a big way in Week 8 when we build a fully autonomous Agent solution.

Generating content in this way is one of the very most common Use Cases. As with summarization, this can be applied to any business vertical. Write marketing content, generate a product tutorial from a spec, create personalized email content, and so much more. Explore how you can apply content generation to your business, and try making yourself a proof-of-concept prototype.</span>
        </td>
    </tr>
</table>

<table style="margin: 0; text-align: left;">
    <tr>
        <td style="width: 150px; height: 150px; vertical-align: middle;">
            <img src="../important.jpg" width="150" height="150" style="display: block;" />
        </td>
        <td>
            <h2 style="color:#900;">Before you move to Week 2 (which is tons of fun)</h2>
            <span style="color:#900;">Please see the week1 EXERCISE notebook for your challenge for the end of week 1. This will give you some essential practice working with Frontier APIs, and prepare you well for Week 2.</span>
        </td>
    </tr>
</table>

<table style="margin: 0; text-align: left;">
    <tr>
        <td style="width: 150px; height: 150px; vertical-align: middle;">
            <img src="../resources.jpg" width="150" height="150" style="display: block;" />
        </td>
        <td>
            <h2 style="color:#f71;">A reminder on 2 useful resources</h2>
            <span style="color:#f71;">1. The resources for the course are available <a href="https://edwarddonner.com/2024/11/13/llm-engineering-resources/">here.</a><br/>
            2. I'm on LinkedIn <a href="https://www.linkedin.com/in/eddonner/">here</a> and I love connecting with people taking the course!
            </span>
        </td>
    </tr>
</table>

<table style="margin: 0; text-align: left;">
    <tr>
        <td style="width: 150px; height: 150px; vertical-align: middle;">
            <img src="../thankyou.jpg" width="150" height="150" style="display: block;" />
        </td>
        <td>
            <h2 style="color:#090;">Finally! I have a special request for you</h2>
            <span style="color:#090;">
                My editor tells me that it makes a MASSIVE difference when students rate this course on Udemy - it's one of the main ways that Udemy decides whether to show it to others. If you're able to take a minute to rate this, I'd be so very grateful! And regardless - always please reach out to me at ed@edwarddonner.com if I can help at any point.
            </span>
        </td>
    </tr>
</table>

# t1s: just the key parts below for ease of reference:

In [None]:
# 

In [None]:
# Code works, but could refine, esp looking at the 'qa' part 

In [1]:
# imports
# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt

import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
#from openai import OpenAI

In [None]:
# Initialize and constants

load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
    
# MODEL = 'gpt-4o-mini'
# openai = OpenAI()

# ----- Replacing with Gem code -----
import os
import google.generativeai as genai

genai.configure(api_key= api_key)

# Create the model
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 40,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",
  #"response_mime_type": "application/json",

}

model = genai.GenerativeModel(model_name="gemini-1.5-flash",
  generation_config=generation_config,)

chat_session = model.start_chat(
  history=[  ])

message = "this is a test only"
response = chat_session.send_message(message)
print(response.text)

In [3]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [4]:
link_system_prompt = """You are provided with a list of links found on a webpage. 
You are able to decide which of the links would be most relevant to include in a brochure about the company,
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"""

link_system_prompt += """The quality of your answer will be judged by the following criteria;
                        - Your output will be valid dictionary, with the expected structure.
                        - Your output will NOT be 'Type of links: <class 'str'>'
                        - Your output will NOT include any leading content (i.e. "```json")
                        - Your output will NOT include any trailing content (i.e. "```")
                        - If the your output can not be parsed as valid json without error then that is a failure on your part \
                        - Your output will appear in the below format, any addtional content (as described above) is
                          strictly forbidden 
                            {
                                "links": [
                                    {"type": "about page", "url": "https://full.url/goes/here/about"},
                                    {"type": "careers page": "url": "https://another.full.url/careers"}
                                ]
                            }
                        """

In [5]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [6]:
# def get_links(url):
#     website = Website(url)
#     response = openai.chat.completions.create(
#         model=MODEL,
#         messages=[
#             {"role": "system", "content": link_system_prompt},
#             {"role": "user", "content": get_links_user_prompt(website)}
#       ],
#         response_format={"type": "json_object"}
#     )
#     result = response.choices[0].message.content
#     return json.loads(result)


# ----- trying to replace above with Gem code -----
def get_links(url):
    website = Website(url)
    response = chat_session.send_message(link_system_prompt + get_links_user_prompt(website))
    response_format={"type": "json_object"}
    result = response.text

    # # ------ Adding the clean up -----
    # # -- This is the functtion to clean up
    import ast
    import re
    def extract_python_code(code_str):
        # Remove the markdown code block markers (```)
        code_str = re.sub(r'```json|```', '', code_str)

        # Remove comments from the string
        code_str = re.sub(r'#.*', '', code_str)

        # Attempt to parse the remaining code to ensure it's valid Python
        try:
            # Use the ast.parse function to check if the code is valid Python syntax
            ast.parse(code_str)
            return code_str.strip()  # Return the code if it's valid
        except SyntaxError:
            return ""  # Return an empty string if the code is invalid
        
    #print(extract_python_code(result))  
    result = extract_python_code(result)
    return result
    #print(result)

# #get_links("https://huggingface.co")
# get_links("https://edwarddonner.com")
qa = get_links("https://edwarddonner.com")

# -----
# response = chat_session.send_message(message)
# print(response.text)


In [7]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)


# ---------------- Make sure link is dict ----------------
    # Check if stgring or dict, and convert to dict if poss
    import json

    # Example variable that could be a string or dictionary
    variable = qa

    # Check if the variable is a string
    if isinstance(variable, str):
        try:
            # Attempt to convert the string to a dictionary
            variable = json.loads(variable)
            print("Successfully converted string to dictionary:", variable)
        except json.JSONDecodeError:
            print("The string is not in a valid JSON format.")
    elif isinstance(variable, dict):
        print("The variable is already a dictionary:", variable)
    else:
        print("The variable is neither a string nor a dictionary.")
    links = variable

# ---------------- Make sure link is dict ----------------


    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [8]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. \
Output is to be in markdown (i.e. a professional format, with bold headders, proper spacing between different sections, etc.)\
Include details of company culture, customers and careers/jobs if you have the information."

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
# Include details of company culture, customers and careers/jobs if you have the information."


In [9]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company."
    user_prompt += f"Please provide output as if it were a professional brochure with bold text for headings, content nicely layed out under headings, different content split out into sections, etc.)\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [10]:
# -- replacing with gemini -- 
def create_brochure(company_name, url):
    response = chat_session.send_message(system_prompt + get_brochure_user_prompt(company_name, url))
    result = response.text
    display(Markdown(result))
   

In [None]:
#create_brochure("HuggingFace", "https://huggingface.com")
#create_brochure("Nebula", "https://edwarddonner.com")
#create_brochure("News", "https://www.news.com.au/")
create_brochure("News", "https://www.bendigobank.com.au/")


# My project

## WIP:

In [25]:
# imports
# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt

import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display

import gradio as gr
#from openai import OpenAI

In [26]:
# Initialize and constants

load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')

if api_key and api_key.startswith('sk-proj-') and len(api_key)>10:
    print("API key looks good so far")
else:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
    
# MODEL = 'gpt-4o-mini'
# openai = OpenAI()

# ----- Replacing with Gem code -----
import os
import google.generativeai as genai

genai.configure(api_key= api_key)

# Create the model
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 40,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",
  #"response_mime_type": "application/json",

}

model = genai.GenerativeModel(model_name="gemini-1.5-flash",
  generation_config=generation_config,)

chat_session = model.start_chat(
  history=[  ])

message = "this is a test only"
response = chat_session.send_message(message)
print(response.text)

There might be a problem with your API key? Please visit the troubleshooting notebook!
Understood.  I'm ready for your test. Please proceed.



In [27]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [28]:
link_system_prompt = """You are provided with a list of links found on a webpage. 
You are able to decide which of the links would be most relevant to include in a brochure about the company,
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"""

link_system_prompt += """The quality of your answer will be judged by the following criteria;
                        - Your output will be valid dictionary, with the expected structure.
                        - Your output will NOT be 'Type of links: <class 'str'>'
                        - Your output will NOT include any leading content (i.e. "```json")
                        - Your output will NOT include any trailing content (i.e. "```")
                        - If the your output can not be parsed as valid json without error then that is a failure on your part \
                        - Your output will appear in the below format, any addtional content (as described above) is
                          strictly forbidden 
                            {
                                "links": [
                                    {"type": "about page", "url": "https://full.url/goes/here/about"},
                                    {"type": "careers page": "url": "https://another.full.url/careers"}
                                ]
                            }
                        """

In [29]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

In [30]:
def get_links(url):
    website = Website(url)
    response = chat_session.send_message(link_system_prompt + get_links_user_prompt(website))
    response_format={"type": "json_object"}
    result = response.text

    # # ------ Adding the clean up -----
    # # -- This is the functtion to clean up
    import ast
    import re
    def extract_python_code(code_str):
        # Remove the markdown code block markers (```)
        code_str = re.sub(r'```json|```', '', code_str)

        # Remove comments from the string
        code_str = re.sub(r'#.*', '', code_str)

        # Attempt to parse the remaining code to ensure it's valid Python
        try:
            # Use the ast.parse function to check if the code is valid Python syntax
            ast.parse(code_str)
            return code_str.strip()  # Return the code if it's valid
        except SyntaxError:
            return ""  # Return an empty string if the code is invalid
        
    #print(extract_python_code(result))  
    result = extract_python_code(result)
    return result
    #print(result)

# #get_links("https://huggingface.co")
# get_links("https://edwarddonner.com")
#qa = get_links("https://edwarddonner.com")


In [31]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)


# ---------------- Make sure link is dict ----------------
    # Check if stgring or dict, and convert to dict if poss
    import json

    # Example variable that could be a string or dictionary
    variable = get_links(url)

    # Check if the variable is a string
    if isinstance(variable, str):
        try:
            # Attempt to convert the string to a dictionary
            variable = json.loads(variable)
            print("Successfully converted string to dictionary:", variable)
        except json.JSONDecodeError:
            print("The string is not in a valid JSON format.")
    elif isinstance(variable, dict):
        print("The variable is already a dictionary:", variable)
    else:
        print("The variable is neither a string nor a dictionary.")
    links = variable

# ---------------- Make sure link is dict ----------------


    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [32]:
# system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
# and creates a short brochure about the company for prospective customers, investors and recruits. \
# Output is to be in markdown (i.e. a professional format, with bold headders, proper spacing between different sections, etc.)\
# Include details of company culture, customers and careers/jobs if you have the information."

system_prompt = "You are an experience recrutiment and talent management assistant, who will be provided a list of roles on offer.\
You will display those roles along with a high level summary of the key steps you suggest to land those roles. \
Output is to be in markdown (i.e. a professional format, with bold headders, proper spacing between different sections, etc.)\
Include suggested next steps on how to successfully apply for and land each of these jobs."


# def get_brochure_user_prompt(company_name, url):
#     user_prompt = f"You are looking at a company called: {company_name}\n"
#     user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company."
#     user_prompt += f"Please provide output as if it were a professional brochure with bold text for headings, content nicely layed out under headings, different content split out into sections, etc.)\n"
#     user_prompt += get_all_details(url)
#     user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
#     return user_prompt

# # -- replacing with gemini -- 
# def create_brochure(company_name, url):
#     response = chat_session.send_message(system_prompt + get_brochure_user_prompt(company_name, url))
#     result = response.text
#     display(Markdown(result))   

# ----- Copy of above, but seeing if can remove company name

def get_brochure_user_prompt(url):
    user_prompt = f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company."
    user_prompt += f"Please provide output as if it were a professional brochure with bold text for headings, content nicely layed out under headings, different content split out into sections, etc.)\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

# # -- replacing with gemini -- 
# def create_brochure(url):
#     response = chat_session.send_message(system_prompt + get_brochure_user_prompt(url))
#     result = response.text
#     display(Markdown(result))      

    # Checking if can get going in gradio...
def create_brochure(url):
    response = chat_session.send_message(system_prompt + get_brochure_user_prompt(url))
    result = response.text
    return result 


In [11]:
#create_brochure("Jobs", "https://www.seek.com.au/")
#create_brochure("Jobs", "https://www.seek.com.au/data-scientist-jobs")
#create_brochure("https://www.seek.com.au/data-scientist-jobs/full-time?salaryrange=120000-&salarytype=annual")


In [12]:
# Checking if can get going in gradio...

In [13]:
# -- Gradio works, but the text in the output box doesn't all fit on screen and there is no scroll bar.
gr.Interface(fn=create_brochure, inputs="textbox", outputs="textbox").launch()
#gr.Interface(fn=create_brochure, inputs="text", outputs="markdown").launch()

* Running on local URL:  http://127.0.0.1:7930

To create a public link, set `share=True` in `launch()`.




In [None]:
# -- Trying to add scroll bar...

In [None]:
# Trials (and error)
# css = """
# .scrollable-textbox {
#     max-height: 300px;
#     overflow-y: auto;
#     padding: 10px;
#     border: 1px solid #ccc;
# }
# """

# with gr.Blocks(css=css) as demo:
#     input_text = gr.Textbox(label="Input")
#     output_text = gr.Textbox(label="Output", elem_classes=["scrollable-textbox"])
#     submit_btn = gr.Button("Create Brochure")
    
#     submit_btn.click(fn=create_brochure, inputs=input_text, outputs=output_text)

# demo.launch()

# ---------------
# css = """
# .visible-scrollbar {
#     max-height: 300px;
#     overflow-y: scroll;
#     padding: 10px;
#     border: 1px solid #ccc;
# }
# """

# with gr.Blocks(css=css) as demo:
#     input_text = gr.Textbox(label="Input")
#     output_text = gr.Textbox(label="Output", elem_classes=["visible-scrollbar"])
#     submit_btn = gr.Button("Submit")
    
#     submit_btn.click(fn=create_brochure, inputs=input_text, outputs=output_text)

# demo.launch()

# ----------------

# # Create the Gradio interface
# iface = gr.Interface(fn=create_brochure, 
#                      inputs = gr.Textbox(label="Input"), 
#                      outputs=gr.Textbox(elem_id="output_box", 
#                                         label="Scroll through text", 
#                                         lines=10, 
#                                         max_lines=30))

# # Add custom CSS for scrolling
# iface.css = """
# #output_box {
#     max-height: 300px;
#     overflow-y: scroll;
# }
# """

# iface.launch()


# -----------------------
iface = gr.Interface(
    fn=create_brochure,
    inputs=gr.Textbox(label="Input"),
    outputs=gr.HTML(label="Scroll through text"),
    css="""
    #output-html {
        max-height: 150px;
        overflow-y: auto;
        white-space: pre-wrap;
        font-family: monospace;
        border: 1px solid #ccc;
        padding: 10px;
    }
    """
)

iface.launch()


In [None]:
# Trials (and error)
import gradio as gr
import textwrap

def create_brochure(input_text):
    # This function creates a long text to demonstrate scrolling
    long_text = input_text * 5000  # Repeat the input text 20 times
    return textwrap.fill(long_text, width=80)  # Wrap text for better formatting

iface = gr.Interface(
    fn=create_brochure,
    inputs=gr.Textbox(label="Input"),
    outputs=gr.HTML(label="Scroll through text"),
    css="""
    #output-html {
        max-height: 300px;
        overflow-y: auto;
        white-space: pre-wrap;
        font-family: monospace;
        border: 1px solid #ccc;
        padding: 10px;
    }
    """
)

iface.launch()


In [None]:
# Trials (and error)
import gradio as gr
import textwrap

def create_brochure(input_text):
    # This function creates a long text to demonstrate scrolling
    long_text = input_text * 5000  # Repeat the input text 5000 times
    return textwrap.fill(long_text, width=80)  # Wrap text for better formatting

with gr.Blocks(css="""
    #input-container { position: sticky; top: 0; background: white; z-index: 100; padding: 10px 0; }
    #output-container { margin-top: 10px; }
    #output-html { max-height: calc(100vh - 200px); overflow-y: auto; white-space: pre-wrap; font-family: monospace; border: 1px solid #ccc; padding: 10px; }
""") as iface:
    with gr.Column():
        with gr.Column(elem_id="input-container"):
            input_text = gr.Textbox(label="Input")
        with gr.Column(elem_id="output-container"):
            output_text = gr.HTML(label="Scroll through text")
        submit_btn = gr.Button("Generate")
    
    submit_btn.click(fn=create_brochure, inputs=input_text, outputs=output_text)

iface.launch()


In [15]:
# do not edit
# 1st scroll bar, although not quite perfect

import gradio as gr
import textwrap

def create_brochure(input_text):
    long_text = input_text * 5000
    return textwrap.fill(long_text, width=80)

with gr.Blocks(css="""
    #input-container { position: fixed; top: 0; left: 0; right: 0; background: white; z-index: 100; padding: 10px; }
    #output-container { margin-top: 100px; height: calc(100vh - 120px); overflow-y: auto; }
    #output-html { white-space: pre-wrap; font-family: monospace; border: 1px solid #ccc; padding: 10px; }
""") as iface:
    with gr.Column():
        with gr.Column(elem_id="input-container"):
            input_text = gr.Textbox(label="Input")
            submit_btn = gr.Button("Generate")
        with gr.Column(elem_id="output-container"):
            output_text = gr.HTML(elem_id="output-html", label="Scroll through text")
    
    submit_btn.click(fn=create_brochure, inputs=input_text, outputs=output_text)

iface.launch()



* Running on local URL:  http://127.0.0.1:7932

To create a public link, set `share=True` in `launch()`.




In [18]:
# Output box to high (overlapped by input box)
import gradio as gr
import textwrap

def create_brochure(input_text):
    long_text = input_text * 5000
    return textwrap.fill(long_text, width=80)

with gr.Blocks(css="""
    #input-container { position: fixed; top: 0; left: 0; right: 0; background: white; z-index: 100; padding: 10px; }
    #output-container { margin-top: 80px; height: calc(100vh - 200px); overflow-y: auto; }
    #output-html { white-space: pre-wrap; font-family: monospace; border: 1px solid #ccc; padding: 10px; }
    .button-container { margin-top: 10px; } /* Space above the button */
""") as iface:
    with gr.Column():
        with gr.Row(elem_id="input-container"):
            input_text = gr.Textbox(label="Input", elem_id="input-box")
        
        with gr.Column(elem_id="output-container"):
            output_text = gr.HTML(elem_id="output-html", label="Scroll through text")
        
        # Move the button below the output box
        submit_btn = gr.Button("Generate", elem_id="generate-button", elem_classes="button-container")
    
    submit_btn.click(fn=create_brochure, inputs=input_text, outputs=output_text)

iface.launch()


* Running on local URL:  http://127.0.0.1:7935

To create a public link, set `share=True` in `launch()`.




In [24]:
# Pretty good template - next cell to try this with actual function
import gradio as gr
import textwrap

def create_brochure(input_text):
    long_text = input_text * 5000
    return textwrap.fill(long_text, width=80)

with gr.Blocks(css="""
    #input-container { position: fixed; top: 0; left: 0; right: 0; background: white; z-index: 100; padding: 10px; }
    #output-container { margin-top: 60px; height: calc(80vh - 200px); overflow-y: auto; }
    #output-html { white-space: pre-wrap; font-family: monospace; border: 1px solid #ccc; padding: 10px; }
    .button-container { margin-top: 10px; } /* Space above the button */
    .output-label { margin-top: 20px; font-weight: bold; } /* Style for output label */
""") as iface:
    with gr.Column():
        with gr.Row(elem_id="input-container"):
            input_text = gr.Textbox(label="Input", elem_id="input-box")
        
        # Add space between input and output boxes
        gr.Markdown("<div style='height: 20px;'></div>")  # Spacer
        
        with gr.Column(elem_id="output-container"):
            output_label = gr.Markdown("<div class='output-label'>Output:</div>")
            output_text = gr.HTML(elem_id="output-html")
        
        # Move the button below the output box
        submit_btn = gr.Button("Generate", elem_id="generate-button", elem_classes="button-container")
    
    submit_btn.click(fn=create_brochure, inputs=input_text, outputs=output_text)

iface.launch()

#output-container { margin-top: 100px; height: calc(100vh - 250px); overflow-y: auto; }

* Running on local URL:  http://127.0.0.1:7941

To create a public link, set `share=True` in `launch()`.




In [33]:
import gradio as gr
import textwrap


with gr.Blocks(css="""
    #input-container { position: fixed; top: 0; left: 0; right: 0; background: white; z-index: 100; padding: 10px; }
    #output-container { margin-top: 60px; height: calc(80vh - 200px); overflow-y: auto; }
    #output-html { white-space: pre-wrap; font-family: monospace; border: 1px solid #ccc; padding: 10px; }
    .button-container { margin-top: 10px; } /* Space above the button */
    .output-label { margin-top: 20px; font-weight: bold; } /* Style for output label */
""") as iface:
    with gr.Column():
        with gr.Row(elem_id="input-container"):
            input_text = gr.Textbox(label="Input", elem_id="input-box")
        
        # Add space between input and output boxes
        gr.Markdown("<div style='height: 20px;'></div>")  # Spacer
        
        with gr.Column(elem_id="output-container"):
            output_label = gr.Markdown("<div class='output-label'>Output:</div>")
            output_text = gr.HTML(elem_id="output-html")
        
        # Move the button below the output box
        submit_btn = gr.Button("Generate", elem_id="generate-button", elem_classes="button-container")
    
    submit_btn.click(fn=create_brochure, inputs=input_text, outputs=output_text)

iface.launch()

* Running on local URL:  http://127.0.0.1:7942

To create a public link, set `share=True` in `launch()`.




Successfully converted string to dictionary: {'links': [{'type': 'about page', 'url': 'https://www.seek.com.au/about'}, {'type': 'careers page', 'url': 'https://www.seek.com.au/careers'}, {'type': 'company page', 'url': 'https://www.seek.com.au/companies'}]}
Found links: {'links': [{'type': 'about page', 'url': 'https://www.seek.com.au/about'}, {'type': 'careers page', 'url': 'https://www.seek.com.au/careers'}, {'type': 'company page', 'url': 'https://www.seek.com.au/companies'}]}


In [36]:
import gradio as gr

# Function that returns markdown formatted text
def markdown_function():
    return "This is **bold text** and this is **another bold part**."

# Create the Gradio interface
iface = gr.Interface(fn=markdown_function, 
                     inputs=[], 
                     outputs=gr.Markdown())

iface.launch()


* Running on local URL:  http://127.0.0.1:7944

To create a public link, set `share=True` in `launch()`.




In [37]:
pip install markdown

Collecting markdown
  Downloading Markdown-3.7-py3-none-any.whl.metadata (7.0 kB)
Downloading Markdown-3.7-py3-none-any.whl (106 kB)
Installing collected packages: markdown
Successfully installed markdown-3.7
Note: you may need to restart the kernel to use updated packages.


In [38]:
# system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
# and creates a short brochure about the company for prospective customers, investors and recruits. \
# Output is to be in markdown (i.e. a professional format, with bold headders, proper spacing between different sections, etc.)\
# Include details of company culture, customers and careers/jobs if you have the information."

system_prompt = "You are an experience recrutiment and talent management assistant, who will be provided a list of roles on offer.\
You will display those roles along with a high level summary of the key steps you suggest to land those roles. \
Output is to be in markdown (i.e. a professional format, with bold headders, proper spacing between different sections, etc.)\
Include suggested next steps on how to successfully apply for and land each of these jobs."


# def get_brochure_user_prompt(company_name, url):
#     user_prompt = f"You are looking at a company called: {company_name}\n"
#     user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company."
#     user_prompt += f"Please provide output as if it were a professional brochure with bold text for headings, content nicely layed out under headings, different content split out into sections, etc.)\n"
#     user_prompt += get_all_details(url)
#     user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
#     return user_prompt

# # -- replacing with gemini -- 
# def create_brochure(company_name, url):
#     response = chat_session.send_message(system_prompt + get_brochure_user_prompt(company_name, url))
#     result = response.text
#     display(Markdown(result))   

# ----- Copy of above, but seeing if can remove company name

def get_brochure_user_prompt(url):
    user_prompt = f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company."
    user_prompt += f"Please provide output as if it were a professional brochure with bold text for headings, content nicely layed out under headings, different content split out into sections, etc.)\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

# # -- replacing with gemini -- 
# def create_brochure(url):
#     response = chat_session.send_message(system_prompt + get_brochure_user_prompt(url))
#     result = response.text
#     display(Markdown(result))      


import markdown
    # Checking if can get going in gradio...
def create_brochure(url):
    response = chat_session.send_message(system_prompt + get_brochure_user_prompt(url))
    result = response.text
    #return result 
    html_output = markdown.markdown(result)
    return html_output



In [39]:
import gradio as gr

with gr.Blocks(css="""
    #input-container { position: fixed; top: 0; left: 0; right: 0; background: white; z-index: 100; padding: 10px; }
    #output-container { margin-top: 60px; height: calc(80vh - 200px); overflow-y: auto; }
    #output-html { white-space: pre-wrap; font-family: monospace; border: 1px solid #ccc; padding: 10px; }
    .button-container { margin-top: 10px; } /* Space above the button */
    .output-label { margin-top: 20px; font-weight: bold; } /* Style for output label */
""") as iface:
    with gr.Column():
        with gr.Row(elem_id="input-container"):
            input_text = gr.Textbox(label="Input", elem_id="input-box")
        
        # Add space between input and output boxes
        gr.Markdown("<div style='height: 20px;'></div>")  # Spacer
        
        with gr.Column(elem_id="output-container"):
            output_label = gr.Markdown("<div class='output-label'>Output:</div>")
            output_text = gr.HTML(elem_id="output-html")
        
        # Move the button below the output box
        submit_btn = gr.Button("Generate", elem_id="generate-button", elem_classes="button-container")
    
    submit_btn.click(fn=create_brochure, inputs=input_text, outputs=output_text)

iface.launch()


* Running on local URL:  http://127.0.0.1:7945

To create a public link, set `share=True` in `launch()`.




Successfully converted string to dictionary: {'links': [{'type': 'about page', 'url': 'https://www.seek.com.au/about'}, {'type': 'careers page', 'url': 'https://www.seek.com.au/careers'}, {'type': 'company page', 'url': 'https://www.seek.com.au/companies'}]}
Found links: {'links': [{'type': 'about page', 'url': 'https://www.seek.com.au/about'}, {'type': 'careers page', 'url': 'https://www.seek.com.au/careers'}, {'type': 'company page', 'url': 'https://www.seek.com.au/companies'}]}


## WIP Version 1

In [1]:
#pip install markdown

import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
import gradio as gr
import markdown



# ---- 2
# Initialize and constants
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')


# ----- Replacing with Gem code -----
import os
import google.generativeai as genai
genai.configure(api_key= api_key)
# Create the model
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 40,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",}

model = genai.GenerativeModel(model_name="gemini-1.5-flash",
  generation_config=generation_config,)

chat_session = model.start_chat(history=[  ])
#response = chat_session.send_message(message)


# ---- 3
# A class to represent a Webpage
# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

# ---- 4
link_system_prompt = """You are provided with a list of links found on a webpage. 
You are able to decide which of the links would be most relevant to include in a brochure about the company,
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"""

link_system_prompt += """The quality of your answer will be judged by the following criteria;
                        - Your output will be valid dictionary, with the expected structure.
                        - Your output will NOT be 'Type of links: <class 'str'>'
                        - Your output will NOT include any leading content (i.e. "```json")
                        - Your output will NOT include any trailing content (i.e. "```")
                        - If the your output can not be parsed as valid json without error then that is a failure on your part \
                        - Your output will appear in the below format, any addtional content (as described above) is
                          strictly forbidden 
                            {
                                "links": [
                                    {"type": "about page", "url": "https://full.url/goes/here/about"},
                                    {"type": "careers page": "url": "https://another.full.url/careers"}
                                ]
                            }
                        """

# ---- 5
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

# ---- 6 
def get_links(url):
    website = Website(url)
    response = chat_session.send_message(link_system_prompt + get_links_user_prompt(website))
    response_format={"type": "json_object"}
    result = response.text

    # # ------ Adding the clean up -----
    # # -- This is the functtion to clean up
    import ast
    import re
    def extract_python_code(code_str):
        # Remove the markdown code block markers (```)
        code_str = re.sub(r'```json|```', '', code_str)

        # Remove comments from the string
        code_str = re.sub(r'#.*', '', code_str)

        # Attempt to parse the remaining code to ensure it's valid Python
        try:
            # Use the ast.parse function to check if the code is valid Python syntax
            ast.parse(code_str)
            return code_str.strip()  # Return the code if it's valid
        except SyntaxError:
            return ""  # Return an empty string if the code is invalid
        
    #print(extract_python_code(result))  
    result = extract_python_code(result)
    return result

# ---- 7
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)


# ---------------- Make sure link is dict ----------------
    # Check if stgring or dict, and convert to dict if poss
    import json

    # Example variable that could be a string or dictionary
    variable = get_links(url)

    # Check if the variable is a string
    if isinstance(variable, str):
        try:
            # Attempt to convert the string to a dictionary
            variable = json.loads(variable)
            print("Successfully converted string to dictionary:", variable)
        except json.JSONDecodeError:
            print("The string is not in a valid JSON format.")
    elif isinstance(variable, dict):
        print("The variable is already a dictionary:", variable)
    else:
        print("The variable is neither a string nor a dictionary.")
    links = variable

# ---------------- Make sure link is dict ----------------


    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

# ---- 8
system_prompt = "You are an experience recrutiment and talent management assistant, who will be provided a list of roles on offer.\
You will display those roles along with a high level summary of the key steps you suggest to land those roles. \
Output is to be in markdown (i.e. a professional format, with bold headders, proper spacing between different sections, etc.)\
Include suggested next steps on how to successfully apply for and land each of these jobs."

def get_brochure_user_prompt(url):
    user_prompt = f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company."
    user_prompt += f"Please provide output as if it were a professional brochure with bold text for headings, content nicely layed out under headings, different content split out into sections, etc.)\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt


# Checking if can get going in gradio...
def create_brochure(url):
    response = chat_session.send_message(system_prompt + get_brochure_user_prompt(url))
    result = response.text
    #return result 
    html_output = markdown.markdown(result)
    return html_output

# ---- 9
# Gradio output
with gr.Blocks(css="""
    #input-container { position: fixed; top: 0; left: 0; right: 0; background: white; z-index: 100; padding: 10px; }
    #output-container { margin-top: 60px; height: calc(80vh - 200px); overflow-y: auto; }
    #output-html { white-space: pre-wrap; font-family: monospace; border: 1px solid #ccc; padding: 10px; }
    .button-container { margin-top: 10px; } /* Space above the button */
    .output-label { margin-top: 20px; font-weight: bold; } /* Style for output label */
""") as iface:
    with gr.Column():
        with gr.Row(elem_id="input-container"):
            input_text = gr.Textbox(label="Input", elem_id="input-box")
        
        # Add space between input and output boxes
        gr.Markdown("<div style='height: 20px;'></div>")  # Spacer
        
        with gr.Column(elem_id="output-container"):
            output_label = gr.Markdown("<div class='output-label'>Output:</div>")
            output_text = gr.HTML(elem_id="output-html")
        
        # Move the button below the output box
        submit_btn = gr.Button("Generate", elem_id="generate-button", elem_classes="button-container")
    
    submit_btn.click(fn=create_brochure, inputs=input_text, outputs=output_text)

iface.launch()




* Running on local URL:  http://127.0.0.1:7930

To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "c:\Users\Tim_S\Desktop\bt\AIEng\llm_engineering\.venv\Lib\site-packages\gradio\queueing.py", line 625, in process_events
    response = await route_utils.call_process_api(
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Tim_S\Desktop\bt\AIEng\llm_engineering\.venv\Lib\site-packages\gradio\route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Tim_S\Desktop\bt\AIEng\llm_engineering\.venv\Lib\site-packages\gradio\blocks.py", line 2047, in process_api
    result = await self.call_function(
             ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Tim_S\Desktop\bt\AIEng\llm_engineering\.venv\Lib\site-packages\gradio\blocks.py", line 1594, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Tim_S\Desktop\bt\AIEng\ll

In [7]:
# Can the above be siplified, by removing lots of the url and json stuff and just keeping the below...
# ---- 3
# A class to represent a Webpage
# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
    
# Example URL to scrape
#url = 'https://www.seek.com.au/data-scientist-jobs/full-time?salaryrange=120000-&salarytype=annual'  # Replace this with the actual URL you want to scrape
url = 'https://au.indeed.com/jobs?q=data%20science&l=Sydney%20NSW&from=searchOnHP%2Cwhatautocomplete'


# Create an instance of the Website class
website = Website(url)

# Get and print the webpage content (title and text)
webpage_content = website.get_contents()
print(webpage_content)    

Webpage Title:
Security Check - Indeed.com
Webpage Contents:
Find jobs
Company reviews
Find salaries
Sign in
Upload your resume
Sign in
Employers / Post Job
Find jobs
Company reviews
Find salaries
Additional Verification Required
Enable JavaScript and cookies to continue
Your Ray ID for this request is
8ff8ed435ee1689c
Need more help?
Contact us




In [6]:
# Trying above, but removing things i don' think are required...

#pip install markdown

import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
import gradio as gr
import markdown



# ---- 2
# Initialize and constants
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')


# ----- Replacing with Gem code -----
import os
import google.generativeai as genai
genai.configure(api_key= api_key)
# Create the model
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 40,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",}
model = genai.GenerativeModel(model_name="gemini-1.5-flash",
  generation_config=generation_config,)
chat_session = model.start_chat(history=[  ])


# ---- 3
# A class to represent a Webpage
# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

# ---- 4
# link_system_prompt = """You are provided with a list of links found on a webpage. 

# ---- 6 
# def get_links(url):

# ---- 7
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    return result

# ---- 8
system_prompt = "You are an experience recrutiment and talent management assistant, who will be provided a list of roles on offer.\
You will display those roles along with a high level summary of the key steps you suggest to land those roles. \
Output is to be in markdown (i.e. a professional format, with bold headders, proper spacing between different sections, etc.)\
Include suggested next steps on how to successfully apply for and land each of these jobs."

# def get_brochure_user_prompt(url):
#     user_prompt = f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company."
#     user_prompt += f"Please provide output as if it were a professional brochure with bold text for headings, content nicely layed out under headings, different content split out into sections, etc.)\n"
#     user_prompt += get_all_details(url)
#     user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
#     return user_prompt

def get_brochure_user_prompt(url):
    user_prompt = f"Here are the contents of your recruitment search. Please list out individual roles and your best advise on landing those roles."
    user_prompt += f"Please provide output in a professional style with bold text for headings, content nicely layed out under headings, different content split out into sections, etc.)\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt


# Checking if can get going in gradio...
def create_brochure(url):
    response = chat_session.send_message(system_prompt + get_brochure_user_prompt(url))
    result = response.text
    #return result 
    html_output = markdown.markdown(result)
    return html_output

# ---- 9
# Gradio output
with gr.Blocks(css="""
    #input-container { position: fixed; top: 0; left: 0; right: 0; background: white; z-index: 100; padding: 10px; }
    #output-container { margin-top: 60px; height: calc(80vh - 200px); overflow-y: auto; }
    #output-html { white-space: pre-wrap; font-family: monospace; border: 1px solid #ccc; padding: 5px; line-height: 1.2;}
    .button-container { margin-top: 10px; } /* Space above the button */
    .output-label { margin-top: 20px; font-weight: bold; } /* Style for output label */
""") as iface:
    with gr.Column():
        with gr.Row(elem_id="input-container"):
            input_text = gr.Textbox(label="Input", elem_id="input-box")
        
        # Add space between input and output boxes
        gr.Markdown("<div style='height: 20px;'></div>")  # Spacer
        
        with gr.Column(elem_id="output-container"):
            output_label = gr.Markdown("<div class='output-label'>Output:</div>")
            output_text = gr.HTML(elem_id="output-html")
        
        # Move the button below the output box
        submit_btn = gr.Button("Generate", elem_id="generate-button", elem_classes="button-container")
    
    submit_btn.click(fn=create_brochure, inputs=input_text, outputs=output_text)

iface.launch()




* Running on local URL:  http://127.0.0.1:7935

To create a public link, set `share=True` in `launch()`.




## Final

In [None]:
# Adding header and description + removed all redundant code

In [9]:
# -- This is here to check what data comes back from websites
# So far looks like 
# - works for: Seek
# - sort of works for: Careerone
# - doesn't work for: Indeed

# Can the above be siplified, by removing lots of the url and json stuff and just keeping the below...
# ---- 3
# A class to represent a Webpage
# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
    
# Example URL to scrape
#url = 'https://www.seek.com.au/data-scientist-jobs/full-time?salaryrange=120000-&salarytype=annual'  # Replace this with the actual URL you want to scrape
#url = 'https://au.indeed.com/jobs?q=data%20science&l=Sydney%20NSW&from=searchOnHP%2Cwhatautocomplete'
url = 'https://www.careerone.com.au/data-scientist-jobs-in-information-technology/in-australia'

# Create an instance of the Website class
website = Website(url)

# Get and print the webpage content (title and text)
webpage_content = website.get_contents()
print(webpage_content)    


Webpage Title:
Data Scientist Information Technology (IT) Jobs In Australia - Jan 2025 | CareerOne
Webpage Contents:
Discover Jobs
Hiring site
Resume writing
New
Career advice
Login
Sign up
Discover Jobs
Hiring site
Career advice
Resume writing
New
Login
Search jobs
in all categories
in
all categories
Done
DONE
Search
All filters
Work type
Contract type
Pay range
Posted within
Save search
Data Scientist Information Technology (IT) jobs in Australia
• The average pay for Data Scientist Information Technology (IT) jobs in Australia is
$118K per year.
• Entry-level positions start at
$62K per year
, while the most experienced workers can earn up to
$284K per year.
•
25 companies
are currently advertising Data Scientist Information Technology (IT) jobs in Australia including
Macquarie Group
,
McKinsey & Company
,
SKL Actuarial
,
Coles
and
Deloitte
.
• Top skills include MACHINE LEARNING, DATA SCIENCE, SQL, TABLEAU SOFTWARE and AWS.
Showing
Jobs
Search jobs
Resume writing
Career advice
Post

In [14]:

#pip install markdown

import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
import gradio as gr
import markdown

# ---- 2
# Initialize and constants
load_dotenv()
api_key = os.getenv('OPENAI_API_KEY')


# ----- Replacing with Gem code -----
import os
import google.generativeai as genai
genai.configure(api_key= api_key)
# Create the model
generation_config = {
  "temperature": 1,
  "top_p": 0.95,
  "top_k": 40,
  "max_output_tokens": 8192,
  "response_mime_type": "text/plain",}
model = genai.GenerativeModel(model_name="gemini-1.5-flash",
  generation_config=generation_config,)
chat_session = model.start_chat(history=[  ])


# ---- 3
# A class to represent a Webpage
# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"


# ---- 7
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    return result

# ---- 8
system_prompt = "You are an experience recrutiment and talent management assistant, who will be provided a list of roles on offer.\
You will display those roles along with a high level summary of the key steps you suggest to land those roles. \
Output is to be in markdown (i.e. a professional format, with bold headders, proper spacing between different sections, etc.)\
Include suggested next steps on how to successfully apply for and land each of these jobs."

def get_brochure_user_prompt(url):
    user_prompt = f"Here are the contents of your recruitment search. Please list out individual roles and your best advise on landing those roles."
    user_prompt += f"Please provide output in a professional style with bold text for headings, content nicely layed out under headings, different content split out into sections, etc.)\n"
    user_prompt += get_all_details(url)
    #user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    user_prompt = user_prompt[:7_500] # Truncate if more than 5,000 characters
    return user_prompt


# Checking if can get going in gradio...
def create_brochure(url):
    response = chat_session.send_message(system_prompt + get_brochure_user_prompt(url))
    result = response.text
    #return result 
    html_output = markdown.markdown(result)
    return html_output

# ---- 9
# Gradio output
with gr.Blocks(css="""
    #header-container { text-align: left; position: fixed; top: 10px; left: 0; padding: 10px; background-color: #f0f0f0; }
    #input-container { text-align: left; position: fixed; top: 100px; left: 0; right: 0; background: white; z-index: 100; padding: 8px; line-height: 0.5;}
    #output-container { margin-top: 160px; height: calc(100vh - 280px); overflow-y: auto; }
    #output-html { white-space: pre-wrap; font-family: monospace; border: 1px solid #ccc; padding: 5px; line-height: 1.2;}
    .button-container { margin-top: 10px; } /* Space above the button */
    .output-label { margin-top: 10px; font-weight: bold; } /* Style for output label */
""") as iface:
    with gr.Column(elem_id="main-container"):
        # Add header and description
        with gr.Row(elem_id="header-container"):
            gr.Markdown("# Job seeker guide")
            gr.Markdown("1.0 Works best with recruitment site https://www.seek.com.au/ (but can try others).")
            gr.Markdown("2.0 Search for jobs of your choice, copy URL from that search & paste in input field below to get helpful advice on how to land those roles.")


        
        with gr.Row(elem_id="input-container"):
            input_text = gr.Textbox(label="Input", elem_id="input-box")
        
        with gr.Column(elem_id="output-container"):
            output_label = gr.Markdown("<div class='output-label'>Output:</div>")
            output_text = gr.HTML(elem_id="output-html")
        
        # Move the button below the output box
        submit_btn = gr.Button("Generate", elem_id="generate-button", elem_classes="button-container")
    
    submit_btn.click(fn=create_brochure, inputs=input_text, outputs=output_text)

iface.launch()



* Running on local URL:  http://127.0.0.1:7941

To create a public link, set `share=True` in `launch()`.


