# Conversation starters

In [None]:
import requests
import json
import glob
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import os
import requests
import markdownify
import re
import json
import shutil
from bs4 import BeautifulSoup
from openai import AzureOpenAI
from dotenv import load_dotenv

load_dotenv()

## Define variables

Here we define the course code (for example DP-203), and define a collection of learning paths.

In [None]:
# Course code; the later cells use it as the sub-folder name under ./temp and ./output.
code = "SC-100"

# Microsoft Learn learning paths to scrape, built from the shared URL prefix
# plus one slug per learning path.
list_of_learn_paths = [
    f"https://learn.microsoft.com/en-us/training/paths/{slug}/"
    for slug in (
        "sc-100-design-solutions-best-practices-priorities",
        "sc-100-design-operations-identity-compliance-capabilities",
        "sc-100-design-security-solutions-applications-data",
        "sc-100-design-security-solutions-infrastructure",
    )
]

## Retrieve learning path modules

The code below is a Python script designed to scrape data from a list of URLs, parse the HTML content, and save the extracted information into a JSON file. The script uses the `requests` library to fetch HTML content from the URLs and `BeautifulSoup` from the `bs4` library to parse the HTML.

In [None]:
# Scrape every learning path: collect its title, its modules, and the unit
# URLs of each module, then persist the result as ./temp/LearningPaths.json.

REQUEST_TIMEOUT = 30  # seconds; keeps a stalled connection from hanging the cell
SKIPPED_UNIT_KEYWORDS = ("exercise", "knowledge-check", "summary")


def _page_title(soup, page_url):
    """Return the text of the page's <h1 class="title">; raise if it is missing."""
    heading = soup.find("h1", class_="title")
    if heading is None:
        raise ValueError(f"No <h1 class='title'> found on {page_url}")
    return heading.text


data = []

for url in list_of_learn_paths:
    learn_path = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    learn_path.raise_for_status()
    soup_learnpath = BeautifulSoup(learn_path.text, "html.parser")
    # href=True skips matching elements that carry no link, which would
    # otherwise raise KeyError on link["href"].
    links_learnpath = soup_learnpath.find_all(class_="font-size-h6", href=True)
    title_learnpath = _page_title(soup_learnpath, url)
    absolute_urls = [urljoin(url, link["href"]) for link in links_learnpath]

    print(title_learnpath)

    jsondata_learnpath = {
        "learning_path": title_learnpath,
        "url": url,
        "learning_modules": [],
    }

    for module in absolute_urls:
        learn_module = requests.get(module, timeout=REQUEST_TIMEOUT)
        learn_module.raise_for_status()
        soup_learnmodule = BeautifulSoup(learn_module.text, "html.parser")
        # Keep only units whose link does not contain one of the skipped keywords.
        links_units = [
            link
            for link in soup_learnmodule.find_all(class_="unit-title", href=True)
            if not any(keyword in link["href"] for keyword in SKIPPED_UNIT_KEYWORDS)
        ]
        title_module = _page_title(soup_learnmodule, module)
        absolute_urls_units = [urljoin(module, link["href"]) for link in links_units]

        print("- " + title_module)

        jsondata_learnpath["learning_modules"].append(
            {
                "learning_module": title_module,
                "url": module,
                "learning_units": absolute_urls_units,
            }
        )

    data.append(jsondata_learnpath)

# Create the directory if it doesn't exist
os.makedirs("./temp", exist_ok=True)

# Write the data to the JSON file
with open("./temp/LearningPaths.json", "w", encoding="utf-8") as json_file:
    json.dump(data, json_file, indent=2)

## Retrieve content

The code below is a Python function named `get_markdown` that retrieves and processes HTML content from a given URL, converting it into `Markdown` format and saving it to a specified location. This function relies on several external libraries, including `requests`, `BeautifulSoup`, and `markdownify`.

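While scraping a learning unit, a few HTML elements are removed before conversion: the metadata lists, the XP tags, the "next section" block, and inline code elements.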

The code will process learning paths from a JSON file, create corresponding directories, and convert HTML content from specified URLs into Markdown format. The code begins by opening and reading a JSON file named LearningPaths.json located in the ./temp/ directory. This file contains a list of learning paths, each with associated learning modules and units.

In [None]:
def get_markdown(url, savelocation, timeout=30):
    """Download a unit page, convert its main section to Markdown, and save it.

    The element with id ``unit-inner-section`` is extracted, page chrome
    (metadata lists, XP tags, the next-section block) and ``<code>`` elements
    are removed, and the remainder is converted with ``markdownify``.

    Args:
        url: Address of the unit page to fetch.
        savelocation: Path of the Markdown file to write (UTF-8).
        timeout: Seconds to wait for the HTTP response.

    Returns:
        The generated Markdown text.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page has no ``unit-inner-section`` element.
    """
    print("- Retrieving markdown from " + url)

    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of converting an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # might need to adapt this when working with other web pages (not Microsoft Learn)
    div = soup.find(id="unit-inner-section")
    if div is None:
        raise ValueError(f"No element with id 'unit-inner-section' found on {url}")

    for metadata_list in div.find_all("ul", class_="metadata"):
        metadata_list.decompose()
    for xp_tag in div.find_all("div", class_="xp-tag"):
        xp_tag.decompose()
    for next_section in div.find_all("div", class_="next-section"):
        next_section.decompose()
    # Headings are rewritten before <code> is stripped, so code text inside a
    # heading is kept as part of the heading text.
    for header in div.find_all(["h1", "h2", "h3", "h4", "h5", "h6"]):
        header.string = "\n# " + header.get_text() + "\n"
    for code_tag in div.find_all("code"):
        code_tag.decompose()

    markdown = markdownify.markdownify(str(div), heading_style="ATX", bullets="-")
    # Collapse runs of three or more newlines into a single blank line.
    markdown = re.sub(r'\n{3,}', '\n\n', markdown)
    markdown = markdown.replace("[Continue](/en-us/)", "")

    with open(savelocation, "w", encoding="utf-8") as file:
        file.write(markdown)

    return markdown

In [None]:
# Load the scraped learning-path structure and download every unit as Markdown
# into temp/<code>/<n>.<module title>/<unit name>.md. Modules are numbered
# continuously across all learning paths.
with open("./temp/LearningPaths.json", "r", encoding="utf-8") as file:
    learning_paths = json.load(file)

os.makedirs(f"./temp/{code}", exist_ok=True)

iModule = 1
for lp in learning_paths:
    for module in lp["learning_modules"]:
        outputFolder_module = f"temp/{code}/{iModule}.{module['learning_module']}"
        iModule += 1

        os.makedirs(outputFolder_module, exist_ok=True)

        for url in module["learning_units"]:
            # rstrip("/") keeps a trailing slash from producing an empty unit
            # name, which would make every unit overwrite the same ".md" file.
            unit_name = url.rstrip("/").split("/")[-1]

            outputFile_md = f"{outputFolder_module}/{unit_name}.md"
            get_markdown(url, outputFile_md)

## Combine (append) all files

Once all the learning units of a learning module have been scraped, we combine them into a single file (`0-Content.md`) per module.

In [None]:
def append_md_files(directory):
    """Concatenate the Markdown files in *directory* into ``0-Content.md``.

    Files whose name starts with ``0`` (which covers a previously generated
    ``0-Content.md``) and files ending in ``.transcript.md`` are left out.
    Files with a leading number are appended in numeric order, so ``2-...``
    comes before ``10-...``; any other files follow, ordered by name. Each
    file's content is followed by a blank line.

    Args:
        directory: Folder that holds the per-unit ``.md`` files; the combined
            file is written into the same folder.
    """
    # The folder name is derived from a page title and may contain glob
    # metacharacters such as "[" or "*", so it has to be escaped.
    pattern = os.path.join(glob.escape(directory), "[!0]*.md")
    md_files = [
        path for path in glob.glob(pattern)
        if not path.endswith('.transcript.md')
    ]

    def unit_order(path):
        # Numbered files first, by numeric value; then everything else by name.
        name = os.path.basename(path)
        leading_number = re.match(r"\d+", name)
        if leading_number is None:
            return (1, 0, name)
        return (0, int(leading_number.group()), name)

    md_files.sort(key=unit_order)
    output_file = os.path.join(directory, "0-Content.md")

    print(directory)
    print(output_file)

    # The unit files are written as UTF-8, so read and write with the same
    # encoding instead of the platform default.
    with open(output_file, 'w', encoding="utf-8") as outfile:
        for md_file in md_files:
            with open(md_file, 'r', encoding="utf-8") as infile:
                outfile.write(infile.read())
                # Blank line between the content of consecutive files
                outfile.write("\n\n")


In [None]:
# Build the combined 0-Content.md for every module folder. The counter runs
# across all learning paths, matching the numbering of the module folders.
iModule = 1
for lp in learning_paths:
    for module in lp["learning_modules"]:
        module_title = module["learning_module"]
        outputFolder_module = f"temp/{code}/{iModule}.{module_title}"
        iModule += 1

        append_md_files(outputFolder_module)

## Generate conversation starters

First, we create an instance of the Azure OpenAI client. (Make sure the `AZURE_ENDPOINT` and `OPENAI_API_KEY` environment variables are set in a `.env` file.)

Then, we pass in a prompt (see below) plus the content from the learn module and store the output as a markdown file.

In [None]:
# Azure OpenAI client; the endpoint and the key are read from environment variables.
client = AzureOpenAI(
    api_key=os.getenv("OPENAI_API_KEY"),
    api_version="2024-02-15-preview",
    azure_endpoint=os.getenv("AZURE_ENDPOINT"),
)

In [None]:
def generateConversationStarters(contentLocation, savelocation, model="gpt-4o"):
    """Generate conversation starters for one module and save them to a file.

    Reads ``<contentLocation>/0-Content.md``, embeds it in the prompt, sends
    the prompt to the chat completions API through the notebook-level
    ``client`` and writes the reply to *savelocation* (UTF-8).

    Args:
        contentLocation: Folder that contains the combined ``0-Content.md``.
        savelocation: Path of the file that receives the model output.
        model: Name passed as ``model`` to the chat completions call.

    Raises:
        ValueError: If the completion comes back without any text content.
    """
    mdFile = contentLocation + "/0-Content.md"
    print("- Generating conversation starters for " + mdFile)

    # The scraped Markdown is stored as UTF-8; do not rely on the platform default.
    with open(mdFile, "r", encoding="utf-8") as file:
        contents = file.read()

    prompt = """ 
We're going to learn a new topic today. 

Create a list of 5 open ended questions on this content that help learners become interested in the course.

Create a mindmap of all topics covered. Use a hierachical structure as an ASCII tree diagram

Create a practice assesment of up to 10 questions. It's OK to have challenging questions, do not make it too obvious. Provide the answer as well. Make sure to generate a mix of multiple choice, true/false (provide the choices), fill-in-the-blanks questions (provide suggestions to choose from).

Create one complex problem related to the training content. I will use this to encourage group discussions to solve the problem. 

============
{contents}
    """

    # str.replace rather than str.format: the scraped content may itself contain braces.
    prompt = prompt.replace("{contents}", contents)

    message_text = [
        {"role":"system","content":prompt},
        {"role":"user","content":"Generate the questions, mindmap and practice assesment. Generate also the complex problem"}
    ]

    completion = client.chat.completions.create(
        model=model,
        messages=message_text,
        temperature=0.1,
        top_p=0.95,
        frequency_penalty=0,
        presence_penalty=0,
        stop=None
    )

    choice = completion.choices[0]
    output = choice.message.content
    print(f"- Actual total usage token={completion.usage.total_tokens}")

    # The message content can be None; report that clearly instead of failing
    # with a TypeError inside file.write().
    if output is None:
        raise ValueError(
            f"No content returned for {mdFile} (finish_reason={choice.finish_reason})"
        )

    with open(savelocation, "w", encoding="utf-8") as file:
        file.write(output)

In [None]:
# Generate one conversation-starter file per module under output/<code>/.
# makedirs creates ./output and ./output/<code> in one step and is a no-op
# when they already exist.
os.makedirs(f"./output/{code}", exist_ok=True)

iModule = 1
for lp in learning_paths:
    for module in lp["learning_modules"]:
        module_name = f"{iModule}.{module['learning_module']}"
        contentFolder_module = f"temp/{code}/{module_name}"
        # Only the output path has ":" replaced; the temp folder keeps the raw
        # module title so it matches the folder the content was saved in.
        saveLocation = f"output/{code}/{module_name}.md".replace(":", " -")

        iModule += 1

        generateConversationStarters(contentFolder_module, saveLocation)

In [None]:
# Remove the scratch folder; skip when it is already gone so the cell can be
# re-run without raising FileNotFoundError.
if os.path.isdir("./temp"):
    shutil.rmtree("./temp")