# Source links for populating the database:
- **VCell Tutorials: (HTML pages)** https://vcell.org/webstart/VCell_Tutorials/VCell_Help/index.html
- **VCell Tutorials: (PDF files)**
  - https://vcell.org/webstart/VCell_Tutorials/
  - https://vcell.org/webstart/VCell_Tutorials/7.7/

---
# Downloading all HTML Files

In [3]:
import requests

In [4]:
!pwd

/home/kacem/projects/VCell-GSoC/backend


In [10]:
# Getting html links
response = requests.get("https://vcell.org/webstart/VCell_Tutorials/VCell_Help/index.html")
if response.status_code == 200:
    html_content = response.text
    print(html_content)

<h2>VCell Desktop Help</h2>
<ul>
<li>
  <a href="topics/ch_1/Introduction/GeneralOverview.html">Introduction</a>
  <ul>
    <li><a href="topics/ch_1/Introduction/General.html">Overview</a></li>
    <li><a href="topics/ch_1/Introduction/GeneralNavAndWinSetup.html">General Navigation and Window Setup</a></li>
    <li><a href="topics/ch_1/Introduction/TopMenu.html">Top Menu</a></li>
    <li><a href="topics/ch_1/Introduction/Login.html">User Login</a></li>
    <li>
      <a href="topics/ch_1/Introduction/File.html">File</a>
      <ul>
        <li><a href="topics/ch_1/Introduction/New.html">File -> New</a></li>
        <li><a href="topics/ch_1/Introduction/Open.html">File -> Open</a></li>
        <li><a href="topics/ch_1/Introduction/SaveAs.html">File -> Save As</a></li>
        <li><a href="topics/ch_1/Introduction/RevertToSaved.html">File -> Revert To Saved</a></li>
        <li><a href="topics/ch_1/Introduction/Permissions.html">File -> Permissions</a></li>
        <li><a href="topics/ch_

In [12]:
# Get all hrefs for determining html file paths
from bs4 import BeautifulSoup

soup = BeautifulSoup(response.text, 'html.parser')

# Extract all <a> tags and get the href attribute
hrefs = [a['href'] for a in soup.find_all('a', href=True)]
hrefs

['topics/ch_1/Introduction/GeneralOverview.html',
 'topics/ch_1/Introduction/General.html',
 'topics/ch_1/Introduction/GeneralNavAndWinSetup.html',
 'topics/ch_1/Introduction/TopMenu.html',
 'topics/ch_1/Introduction/Login.html',
 'topics/ch_1/Introduction/File.html',
 'topics/ch_1/Introduction/New.html',
 'topics/ch_1/Introduction/Open.html',
 'topics/ch_1/Introduction/SaveAs.html',
 'topics/ch_1/Introduction/RevertToSaved.html',
 'topics/ch_1/Introduction/Permissions.html',
 'topics/ch_1/Introduction/FieldData.html',
 'topics/ch_1/Introduction/Import.html',
 'topics/ch_1/Introduction/ImportBNGL.html',
 'topics/ch_1/Introduction/Export.html',
 'topics/ch_1/Introduction/Account.html',
 'topics/Auth0/Auth0Overview.html',
 'topics/Auth0/Auth0ProfilePage.html',
 'topics/Auth0/Auth0ExistingUserAccountLinking.html',
 'topics/Auth0/Auth0ReturningUserAccountLinking.html',
 'topics/Auth0/Auth0NewUserAccountCreation.html',
 'topics/ch_1/Introduction/SetProxy.html',
 'topics/ch_1/Introduction/Vi

In [13]:
# prepend base URL to relative links
base_url = "https://vcell.org/webstart/VCell_Tutorials/VCell_Help/"
full_links = [base_url + href if not href.startswith("http") else href for href in hrefs]

full_links

['https://vcell.org/webstart/VCell_Tutorials/VCell_Help/topics/ch_1/Introduction/GeneralOverview.html',
 'https://vcell.org/webstart/VCell_Tutorials/VCell_Help/topics/ch_1/Introduction/General.html',
 'https://vcell.org/webstart/VCell_Tutorials/VCell_Help/topics/ch_1/Introduction/GeneralNavAndWinSetup.html',
 'https://vcell.org/webstart/VCell_Tutorials/VCell_Help/topics/ch_1/Introduction/TopMenu.html',
 'https://vcell.org/webstart/VCell_Tutorials/VCell_Help/topics/ch_1/Introduction/Login.html',
 'https://vcell.org/webstart/VCell_Tutorials/VCell_Help/topics/ch_1/Introduction/File.html',
 'https://vcell.org/webstart/VCell_Tutorials/VCell_Help/topics/ch_1/Introduction/New.html',
 'https://vcell.org/webstart/VCell_Tutorials/VCell_Help/topics/ch_1/Introduction/Open.html',
 'https://vcell.org/webstart/VCell_Tutorials/VCell_Help/topics/ch_1/Introduction/SaveAs.html',
 'https://vcell.org/webstart/VCell_Tutorials/VCell_Help/topics/ch_1/Introduction/RevertToSaved.html',
 'https://vcell.org/webst

In [16]:
# Create a directory for downloads (saving these html files as cleaned text files)
import os

output_dir = "Downloads"
os.makedirs(output_dir, exist_ok=True)

In [19]:
# Function to sanitize filename
def sanitize_filename(url_path):
    return url_path.strip('/').replace('/', '__') + ".txt"

In [21]:
# Process each HTML page
for url in full_links:
    response = requests.get(url)
    if response.status_code != 200:
        print(f"🟥 🟥 🟥 Failed to fetch: {url}")
        continue

    soup = BeautifulSoup(response.text, 'html.parser')

    # Remove scripts/styles/metadata
    for tag in soup(['script', 'style', 'head', 'title', 'meta', '[document]']):
        tag.extract()

    text = soup.get_text(separator='\n', strip=True)

    filename = sanitize_filename(url)
    filepath = os.path.join(output_dir, filename)
    with open(filepath, 'w', encoding='utf-8') as f:
        f.write(text)

    print(f"Saved text to: {filepath}")

Saved text to: Downloads/https:____vcell.org__webstart__VCell_Tutorials__VCell_Help__topics__ch_1__Introduction__GeneralOverview.html.txt
Saved text to: Downloads/https:____vcell.org__webstart__VCell_Tutorials__VCell_Help__topics__ch_1__Introduction__General.html.txt
Saved text to: Downloads/https:____vcell.org__webstart__VCell_Tutorials__VCell_Help__topics__ch_1__Introduction__GeneralNavAndWinSetup.html.txt
Saved text to: Downloads/https:____vcell.org__webstart__VCell_Tutorials__VCell_Help__topics__ch_1__Introduction__TopMenu.html.txt
Saved text to: Downloads/https:____vcell.org__webstart__VCell_Tutorials__VCell_Help__topics__ch_1__Introduction__Login.html.txt
Saved text to: Downloads/https:____vcell.org__webstart__VCell_Tutorials__VCell_Help__topics__ch_1__Introduction__File.html.txt
Saved text to: Downloads/https:____vcell.org__webstart__VCell_Tutorials__VCell_Help__topics__ch_1__Introduction__New.html.txt
Saved text to: Downloads/https:____vcell.org__webstart__VCell_Tutorials__VCel

In [22]:
print(f"Saved {len(full_links)} files to /Downloads")

Saved 142 files to /Downloads


---
# Downloading all PDF Files
#### **Let's start with the files at : https://vcell.org/webstart/VCell_Tutorials/7.7/** 

In [24]:
# Getting html links
response = requests.get("https://vcell.org/webstart/VCell_Tutorials/7.7/")
if response.status_code == 200:
    html_content = response.text
    print(html_content)

<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
<html>
 <head>
  <title>Index of /webstart/VCell_Tutorials/7.7</title>
 </head>
 <body>
<h1>Index of /webstart/VCell_Tutorials/7.7</h1>
  <table>
   <tr><th valign="top"><img src="/icons/blank.gif" alt="[ICO]"></th><th><a href="?C=N;O=D">Name</a></th><th><a href="?C=M;O=A">Last modified</a></th><th><a href="?C=S;O=A">Size</a></th><th><a href="?C=D;O=A">Description</a></th></tr>
   <tr><th colspan="5"><hr></th></tr>
<tr><td valign="top"><img src="/icons/back.gif" alt="[PARENTDIR]"></td><td><a href="/webstart/VCell_Tutorials/">Parent Directory</a>       </td><td>&nbsp;</td><td align="right">  - </td><td>&nbsp;</td></tr>
<tr><td valign="top"><img src="/icons/layout.gif" alt="[   ]"></td><td><a href="VCell%20Quick%20Guide_%20Image%20Based%20Geometry%207.7.pdf">VCell Quick Guide_ I..&gt;</a></td><td align="right">2025-07-17 15:02  </td><td align="right">1.6M</td><td>&nbsp;</td></tr>
<tr><td valign="top"><img src="/icons/layout.gif" alt

In [25]:
# Get all hrefs for determining html file paths
from bs4 import BeautifulSoup

soup = BeautifulSoup(response.text, 'html.parser')

# Extract all <a> tags and get the href attribute
hrefs = [a['href'] for a in soup.find_all('a', href=True)]
hrefs

['?C=N;O=D',
 '?C=M;O=A',
 '?C=S;O=A',
 '?C=D;O=A',
 '/webstart/VCell_Tutorials/',
 'VCell%20Quick%20Guide_%20Image%20Based%20Geometry%207.7.pdf',
 'VCell%20Tutorial_%20Constructive%20Solid%20Geometry_%20Dendritic%20Spine%207.7.pdf',
 'VCell%20Tutorial_%20Image-Based%20Geometry%207.7.pdf',
 'VCell%20Tutorial_%20Model%20Physiology%207.7.pdf',
 'VCell%20Tutorial_%20Rule-Based%20EGFR%207.7.pdf',
 'VCell%20Tutorial_%20Rule-Based%20Ran%20Transport%207.7.pdf']

In [27]:
pdf_files = [path for path in hrefs if 'pdf' in path]
pdf_files

['VCell%20Quick%20Guide_%20Image%20Based%20Geometry%207.7.pdf',
 'VCell%20Tutorial_%20Constructive%20Solid%20Geometry_%20Dendritic%20Spine%207.7.pdf',
 'VCell%20Tutorial_%20Image-Based%20Geometry%207.7.pdf',
 'VCell%20Tutorial_%20Model%20Physiology%207.7.pdf',
 'VCell%20Tutorial_%20Rule-Based%20EGFR%207.7.pdf',
 'VCell%20Tutorial_%20Rule-Based%20Ran%20Transport%207.7.pdf']

In [29]:
# prepend base URL to relative links
base_url = "https://vcell.org/webstart/VCell_Tutorials/7.7/"
pdf_links = [base_url + href if not href.startswith("http") else href for href in pdf_files]

pdf_links

['https://vcell.org/webstart/VCell_Tutorials/7.7/VCell%20Quick%20Guide_%20Image%20Based%20Geometry%207.7.pdf',
 'https://vcell.org/webstart/VCell_Tutorials/7.7/VCell%20Tutorial_%20Constructive%20Solid%20Geometry_%20Dendritic%20Spine%207.7.pdf',
 'https://vcell.org/webstart/VCell_Tutorials/7.7/VCell%20Tutorial_%20Image-Based%20Geometry%207.7.pdf',
 'https://vcell.org/webstart/VCell_Tutorials/7.7/VCell%20Tutorial_%20Model%20Physiology%207.7.pdf',
 'https://vcell.org/webstart/VCell_Tutorials/7.7/VCell%20Tutorial_%20Rule-Based%20EGFR%207.7.pdf',
 'https://vcell.org/webstart/VCell_Tutorials/7.7/VCell%20Tutorial_%20Rule-Based%20Ran%20Transport%207.7.pdf']

In [33]:
from urllib.parse import urlparse, unquote

# Download each PDF
for url in pdf_links:
    filename = os.path.basename(urlparse(url).path)
    filename = unquote(filename)  # Decode URL-encoded characters (e.g. %20 to space)
    filepath = os.path.join(output_dir, filename)

    print(f"Downloading {filename}...")
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise error for 404, etc.
        with open(filepath, 'wb') as f:
            f.write(response.content)
        print(f"Saved to: {filepath}")
    except Exception as e:
        print(f"Failed to download {url}: {e}")

Downloading VCell Quick Guide_ Image Based Geometry 7.7.pdf...
Saved to: Downloads/VCell Quick Guide_ Image Based Geometry 7.7.pdf
Downloading VCell Tutorial_ Constructive Solid Geometry_ Dendritic Spine 7.7.pdf...
Saved to: Downloads/VCell Tutorial_ Constructive Solid Geometry_ Dendritic Spine 7.7.pdf
Downloading VCell Tutorial_ Image-Based Geometry 7.7.pdf...
Saved to: Downloads/VCell Tutorial_ Image-Based Geometry 7.7.pdf
Downloading VCell Tutorial_ Model Physiology 7.7.pdf...
Saved to: Downloads/VCell Tutorial_ Model Physiology 7.7.pdf
Downloading VCell Tutorial_ Rule-Based EGFR 7.7.pdf...
Saved to: Downloads/VCell Tutorial_ Rule-Based EGFR 7.7.pdf
Downloading VCell Tutorial_ Rule-Based Ran Transport 7.7.pdf...
Saved to: Downloads/VCell Tutorial_ Rule-Based Ran Transport 7.7.pdf


---
# Upload all files to knowledge base (PDF and TXT)

In [40]:
# Path to the local Downloads directory under current working directory
downloads_dir = os.path.join(os.getcwd(), "Downloads")
downloads_dir

'/home/kacem/projects/VCell-GSoC/backend/Downloads'

In [41]:
# Upload endpoints
pdf_url = "http://127.0.0.1:8000/kb/upload-pdf"
text_url = "http://127.0.0.1:8000/kb/upload-text"

In [42]:
# Iterate over files in Downloads
for filename in os.listdir(downloads_dir):
    filepath = os.path.join(downloads_dir, filename)

    if not os.path.isfile(filepath):
        continue  # Skip directories

    file_ext = os.path.splitext(filename)[1].lower()

    try:
        if file_ext == ".pdf":
            with open(filepath, 'rb') as f:
                files = {'file': (filename, f, 'application/pdf')}
                response = requests.post(pdf_url, files=files)
                print(f"[PDF] {filename} → {response.status_code}")
                print(response.json())

        elif file_ext == ".txt" or file_ext == ".html" or file_ext == ".html.txt":
            with open(filepath, 'rb') as f:
                files = {'file': (filename, f, 'text/plain')}
                response = requests.post(text_url, files=files)
                print(f"[TXT] {filename} → {response.status_code}")
                print(response.json())

        else:
            print(f"[SKIP] Unsupported file type: {filename}")

    except Exception as e:
        print(f"[ERROR] {filename} → {e}")

[TXT] https:____vcell.org__webstart__VCell_Tutorials__VCell_Help__topics__ch_3__BioModelApplications__Simulations__simulations.html.txt → 200
{'status': 'success', 'message': 'Text file https:____vcell.org__webstart__VCell_Tutorials__VCell_Help__topics__ch_3__BioModelApplications__Simulations__simulations.html.txt uploaded successfully with 5 chunks.'}
[TXT] https:____vcell.org__webstart__VCell_Tutorials__VCell_Help__topics__ch_1__Introduction__Tools.html.txt → 200
{'status': 'success', 'message': 'Text file https:____vcell.org__webstart__VCell_Tutorials__VCell_Help__topics__ch_1__Introduction__Tools.html.txt uploaded successfully with 1 chunks.'}
[TXT] https:____vcell.org__webstart__VCell_Tutorials__VCell_Help__topics__ch_4__ParametersAndFunctions__PredefinedConstAndMathFns.html.txt → 200
{'status': 'success', 'message': 'Text file https:____vcell.org__webstart__VCell_Tutorials__VCell_Help__topics__ch_4__ParametersAndFunctions__PredefinedConstAndMathFns.html.txt uploaded successfully 