In [10]:
import os
import requests
from bs4 import BeautifulSoup
import pickle
import numpy as np
from mistralai import Mistral
import time

# Read the Mistral API key from the environment. Credentials must never be
# hard-coded in source: export MISTRAL_API_KEY in the shell (or load it from an
# untracked secrets file) before running. Any key that was previously committed
# in this file should be treated as leaked and revoked.
api_key = os.getenv("MISTRAL_API_KEY")
if not api_key:
    # Fail before any scraping or API work starts rather than on the first request.
    raise RuntimeError(
        "MISTRAL_API_KEY is not set. Export it in your environment before running."
    )

# Policy pages to index, listed as (descriptive name, URL) pairs in the order
# they are scraped; `policies` maps each name to its URL.
_POLICY_PAGES = (
    ("Student Conduct Policy", "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/student-conduct-policy"),
    ("Academic Schedule Policy", "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/udst-policies-and-procedures/academic-schedule-policy"),
    ("Sport and Wellness Facilities Policy", "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/sport-and-wellness-facilities-and"),
    ("Graduate Admissions Policy", "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/graduate-admissions-policy"),
    ("Use Library Space Policy", "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/use-library-space-policy"),
    ("International Student Procedure", "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/udst-policies-and-procedures/international-student-procedure"),
    ("Registration Procedure", "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/udst-policies-and-procedures/registration-procedure"),
    ("Scholarship and Financial Assistance", "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/scholarship-and-financial-assistance"),
    ("Library Study Room Booking Procedure", "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/library-study-room-booking-procedure"),
    ("Graduate Final Grade Procedure", "https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/graduate-final-grade-procedure"),
)
policies = dict(_POLICY_PAGES)

def get_policy_text(url, timeout=30):
    """Scrape the policy text from the given URL.

    Args:
        url: Address of the policy page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` can block indefinitely on a stalled server.

    Returns:
        The text of the ``div.policy-content`` element when the page has one,
        otherwise the text of the whole page. Text fragments are stripped and
        joined with single spaces.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx so an error page is never mistaken for policy text.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # Try to locate the main content. You may need to adjust this selector.
    content = soup.find("div", class_="policy-content")
    # Fallback: use the whole page when the expected container is missing.
    target = content if content else soup
    return target.get_text(separator=" ", strip=True)

# Scrape all policies and store their text in a dictionary
policy_texts = {}
for policy_name, url in policies.items():
    print(f"Scraping {policy_name} from {url}...")
    try:
        policy_texts[policy_name] = get_policy_text(url)
    except requests.RequestException as e:
        # One unreachable page should not discard the pages already scraped.
        # Record empty text so the policy is reported here and contributes no
        # chunks (empty texts are skipped when chunking).
        print(f"Failed to scrape {policy_name}: {e}")
        policy_texts[policy_name] = ""

# Cut each policy's text into consecutive, non-overlapping windows of
# `chunk_size` characters; every window remembers which policy it came from.
chunk_size = 512
chunks = []
for policy_name, text in policy_texts.items():
    if text:  # policies with no text contribute no chunks
        chunks.extend(
            {"policy": policy_name, "text": text[start:start + chunk_size]}
            for start in range(0, len(text), chunk_size)
        )

print("Total chunks created:", len(chunks))

# Persist the chunk list so it can be reloaded without scraping again
with open("udst_chunks.pkl", "wb") as chunk_file:
    pickle.dump(chunks, chunk_file)
print("Saved udst_chunks.pkl")

# Embed a single piece of text with Mistral's embedding endpoint
def get_text_embedding(text, client):
    """Return the ``mistral-embed`` embedding of *text* as a NumPy array.

    *text* is sent as a one-item batch through ``client.embeddings.create``
    and the first (only) embedding in the response is converted to an array.
    """
    batch = [text]
    reply = client.embeddings.create(model="mistral-embed", inputs=batch)
    first_item = reply.data[0]
    return np.array(first_item.embedding)

# Initialize the Mistral client once
client = Mistral(api_key=api_key)

# Generate embeddings for each chunk. Each stored vector is recorded together
# with the chunk it came from, so the saved embeddings and the saved chunks
# stay index-aligned even when some requests fail.
request_delay = 2  # seconds to wait between requests to avoid rate limiting
max_attempts = 3   # tries per chunk before giving up on it
embeddings_list = []
embedded_chunks = []
for i, chunk in enumerate(chunks):
    for attempt in range(1, max_attempts + 1):
        try:
            embedding = get_text_embedding(chunk["text"], client)
        except Exception as e:
            # The SDK's specific error types are not visible in this file, so
            # catch broadly, report, and retry after a growing pause.
            print(f"Error processing chunk {i+1}: {e}")
            time.sleep(request_delay * attempt)
        else:
            embeddings_list.append(embedding)
            embedded_chunks.append(chunk)
            print(f"Processed chunk {i+1}/{len(chunks)}")
            time.sleep(request_delay)
            break

# If any chunk was dropped, row k of the embeddings would no longer describe
# chunk k of the previously saved chunk file. Rewrite that file (and rebind
# `chunks`) so both artifacts contain exactly the embedded chunks, in order.
if len(embedded_chunks) != len(chunks):
    skipped = len(chunks) - len(embedded_chunks)
    print(f"Warning: {skipped} chunk(s) could not be embedded; rewriting udst_chunks.pkl to match the embeddings")
    chunks = embedded_chunks
    with open("udst_chunks.pkl", "wb") as f:
        pickle.dump(chunks, f)

# Convert embeddings list to a NumPy array
embeddings_array = np.array(embeddings_list)

# Save the embeddings to a pickle file
with open("udst_embeddings.pkl", "wb") as f:
    pickle.dump(embeddings_array, f)
print("Saved udst_embeddings.pkl")


Scraping Student Conduct Policy from https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/student-conduct-policy...
Scraping Academic Schedule Policy from https://www.udst.edu.qa/about-udst/institutional-excellence-ie/udst-policies-and-procedures/academic-schedule-policy...
Scraping Sport and Wellness Facilities Policy from https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/sport-and-wellness-facilities-and...
Scraping Graduate Admissions Policy from https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/graduate-admissions-policy...
Scraping Use Library Space Policy from https://www.udst.edu.qa/about-udst/institutional-excellence-ie/policies-and-procedures/use-library-space-policy...
Scraping International Student Procedure from https://www.udst.edu.qa/about-udst/institutional-excellence-ie/udst-policies-and-procedures/international-student-procedure...
Scraping Registration Procedure