# Install Selenium

In [None]:
# Imports for selenium install
import os
import re
import subprocess
import requests

# Filename prefixes of the .deb packages to install. download() appends the
# version suffix returned by get_latest_version() to each prefix and then
# downloads and installs the packages in exactly this order.
# NOTE(review): presumably the codec packages come first because
# chromium-browser depends on them — confirm before reordering.
deb_files_startstwith = [
    "chromium-codecs-ffmpeg-extra_",
    "chromium-codecs-ffmpeg_",
    "chromium-browser_",
    "chromium-chromedriver_"
]

def get_latest_version() -> str:
    """Return the version suffix of the newest chromium-browser build for Ubuntu 18.04.

    Fetches the chromium-browser pool listing from security.ubuntu.com and
    collects every amd64 ``ubuntu0.18.04.1`` package link, e.g.
    ``"112.0.5615.49-0ubuntu0.18.04.1_amd64.deb"``. When the listing holds
    several builds, the one with the highest version number is returned.

    Returns:
        The part of the .deb filename that follows ``chromium-browser_``.

    Raises:
        Exception: if the listing is not served with HTTP 200 or contains no
            matching package link.
    """
    url = "http://security.ubuntu.com/ubuntu/pool/universe/c/chromium-browser/"
    # A timeout keeps the notebook from hanging forever on a stalled connection.
    r = requests.get(url, timeout=30)
    if r.status_code != 200:
        raise Exception("status_code code not 200!")

    # Raw string: "\s" and "\-" are regex escapes, not Python string escapes.
    pattern = r'<a\shref="chromium\-browser_([^"]+.ubuntu0\.18\.04\.1_amd64\.deb)'
    candidates = re.findall(pattern, r.text)
    if not candidates:
        raise Exception("Can not find latest version!")

    def version_key(version: str):
        # Compare the numeric components so that e.g. 112.x ranks above 99.x.
        return [int(part) for part in re.findall(r'\d+', version)]

    # The listing may contain older builds too; pick the highest, not the first.
    return max(candidates, key=version_key)

def download(latest_version: str, quiet: bool):
    deb_files = []
    for deb_file in deb_files_startstwith:
        deb_files.append(deb_file + latest_version)

    for deb_file in deb_files:
        url = f"http://security.ubuntu.com/ubuntu/pool/universe/c/chromium-browser/{deb_file}"

        # Download deb file
        if quiet:
            command = f"wget -q -O /content/{deb_file} {url}"
        else:
            command = f"wget -O /content/{deb_file} {url}"
        print(f"Downloading: {deb_file}")
        # os.system(command)
        !$command

        # Install deb file
        if quiet:
            command = f"apt-get install /content/{deb_file} >> apt.log"
        else:
            command = f"apt-get install /content/{deb_file}"
        print(f"Installing: {deb_file}\n")
        # os.system(command)
        !$command

        # Delete deb file from disk
        os.remove(f"/content/{deb_file}")

def check_chromium_installation():
    """Print whether the ``chromium-browser`` executable is installed and runnable.

    Runs ``chromium-browser --version`` instead of launching the browser
    itself, so the check finishes quickly and cannot block the notebook.
    Success is reported only when the command exits with status 0; a missing
    executable, another OS-level launch error, a timeout or a non-zero exit
    status are all reported as a failed installation.
    """
    try:
        result = subprocess.run(
            ["chromium-browser", "--version"],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=60,
        )
    except (OSError, subprocess.TimeoutExpired):
        # OSError covers FileNotFoundError (not installed) and permission problems.
        print("Chromium Installation Failed!")
        return

    if result.returncode == 0:
        print("Chromium installation successfull.")
    else:
        print("Chromium Installation Failed!")

def install_selenium_package(quiet: bool):
    if quiet:
        !pip install selenium -qq >> pip.log
    else:
        !pip install selenium

def main(quiet: bool):
    """Install Chromium, chromedriver and the selenium package, in that order.

    Args:
        quiet: forwarded to download() and install_selenium_package() to
            silence or log the output of the shell commands they run.
    """
    # Look up the newest Ubuntu 18.04 chromium-browser build, then download
    # and install its .deb packages (codecs, browser and chromedriver)
    download(get_latest_version(), quiet)
    # Report whether the chromium-browser executable is usable
    check_chromium_installation()
    # Finally install the selenium Python package
    install_selenium_package(quiet)

if __name__ == '__main__':
    quiet = True # True: wget runs with -q, apt-get output is appended to apt.log and pip output to pip.log
    main(quiet)

Downloading: chromium-codecs-ffmpeg-extra_112.0.5615.49-0ubuntu0.18.04.1_amd64.deb
Installing: chromium-codecs-ffmpeg-extra_112.0.5615.49-0ubuntu0.18.04.1_amd64.deb

Downloading: chromium-codecs-ffmpeg_112.0.5615.49-0ubuntu0.18.04.1_amd64.deb
Installing: chromium-codecs-ffmpeg_112.0.5615.49-0ubuntu0.18.04.1_amd64.deb

Downloading: chromium-browser_112.0.5615.49-0ubuntu0.18.04.1_amd64.deb
Installing: chromium-browser_112.0.5615.49-0ubuntu0.18.04.1_amd64.deb

Downloading: chromium-chromedriver_112.0.5615.49-0ubuntu0.18.04.1_amd64.deb
Installing: chromium-chromedriver_112.0.5615.49-0ubuntu0.18.04.1_amd64.deb

Chromium installation successfull.


# Imports

In [None]:
%pip install markdownify

from markdownify import markdownify as md
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re



In [None]:
# Import selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver import ActionChains

# Configure Chromium (custom user agent, headless, no sandbox, no /dev/shm usage)
# and start the webdriver through the chromedriver binary at /usr/bin/chromedriver
webdriver_options = Options()
for chrome_argument in (
    'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36',
    '--headless',
    '--no-sandbox',
    '--disable-dev-shm-usage',
):
    webdriver_options.add_argument(chrome_argument)
service = Service(executable_path=r'/usr/bin/chromedriver')
driver = webdriver.Chrome(service=service, options=webdriver_options)

# Scrape

In [None]:
# Wiki pages to scrape, given as the page name that follows '/w/' in the URL.
# Each page must be listed once: a repeated page would be scraped twice and
# every one of its sections would end up duplicated in the results.
wiki_base_url = 'https://www.reddit.com/r/VeteransBenefits/w/'
wiki_pages = [
    'blood',
    'bloodtubes',
    'heart',
    'NoRate',
    'MouthSystem',
    'DigSystem',
    'EndSystem',
    'FemaleSystem',
    'GentSystem',
    'ID',
    'Mental',
    'Ankle',
    'ElbowForearm',
    'Foot',
    'Hand',
    'HipThigh',
    'KneeLeg',
    'Ribs',
    'Skull',
    'ShoulderArm',
    'Spine',
    'Wrist',
    'MSCancer',
    'MSDiseases',
    'FaceMuscles',
    'FHMuscles',
    'MuscleHernia',
    'LLFootMuscles',
    'BHTMuscles',
    'SHMuscles',
    'TNMuscles',
    'CranialN',
    'CNS',
    'LBLNerves',
    'UPANerves',
    'TBI',
    'ears',
    'eyes',
    'TasteSmell',
    'NervSystem',
    'Skin',
    'Nutritional',
    'AirSystem',
]

# List of URLs to scrape; dict.fromkeys drops any accidental repeat while
# keeping the order in which the pages are listed above
urls_to_scrape = [wiki_base_url + page for page in dict.fromkeys(wiki_pages)]

# requests session that sends a browser-like User-Agent header
# NOTE(review): none of the cells visible here use `session` (pages are
# fetched through the Selenium driver) — confirm it is still needed.
session = requests.Session()
session.headers['User-Agent'] = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.103 Safari/537.36'

In [None]:
# Helper functions
def html_to_markdown(html: str) -> str:
    """Convert an HTML fragment to Markdown with markdownify (imported as ``md``)."""
    return md(html)


def extract_area_from_id(id_url):
    """Return the wiki page name that follows '/w/' in a URL.

    The name runs up to the next '/' or '#' (or the end of the string).
    'Undefined' is returned when the URL has no '/w/<name>' part.
    """
    found = re.search(r'/w/([^/#]+)/?', id_url)
    if found is None:
        return 'Undefined'
    return found.group(1)

In [None]:
def extract_details_selenium(url):
    """Open a wiki page with the module-level ``driver`` and split it into sections.

    Every <h1>/<h2> element that carries an ``id`` attribute starts a section.
    A section's body is made of the header's following siblings, up to (not
    including) the first <hr>.

    Args:
        url: address of the page to load.

    Returns:
        A list with one dict per section, holding the keys 'id'
        (url + "/#" + header id), 'area' (page name taken from the URL),
        'title' (stripped header text) and 'content' (section HTML converted
        to Markdown).
    """
    driver.get(url)
    print("Processing url: ", url)
    time.sleep(5)  # fixed pause so the page can finish loading before it is read

    # Parse the rendered page and look up the page name once for all sections
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    area = extract_area_from_id(url)

    def section_html(header):
        # Join the header's following siblings, stopping at the first <hr>.
        pieces = []
        for node in header.next_siblings:
            if node.name == 'hr':
                break
            pieces.append(str(node))
        return ''.join(pieces)

    return [
        {
            'id': url + "/#" + header['id'],
            'area': area,
            'title': header.text.strip(),
            'content': html_to_markdown(section_html(header)),
        }
        for header in soup.find_all(['h1', 'h2'], id=True)
    ]

In [None]:
# Storage for all sections' data
all_sections_data = []

# Scrape every URL with extract_details_selenium. The webdriver is shut down in
# `finally` so that a page that raises does not leave the browser running.
try:
    for url in urls_to_scrape:
        all_sections_data.extend(extract_details_selenium(url))
finally:
    driver.quit()  # Close the webdriver

# Create a pandas DataFrame from the list of section data
df = pd.DataFrame(all_sections_data)

Processing url:  https://www.reddit.com/r/VeteransBenefits/w/blood
Processing url:  https://www.reddit.com/r/VeteransBenefits/w/bloodtubes
Processing url:  https://www.reddit.com/r/VeteransBenefits/w/heart
Processing url:  https://www.reddit.com/r/VeteransBenefits/w/NoRate
Processing url:  https://www.reddit.com/r/VeteransBenefits/w/MouthSystem
Processing url:  https://www.reddit.com/r/VeteransBenefits/w/DigSystem
Processing url:  https://www.reddit.com/r/VeteransBenefits/w/EndSystem
Processing url:  https://www.reddit.com/r/VeteransBenefits/w/FemaleSystem
Processing url:  https://www.reddit.com/r/VeteransBenefits/w/GentSystem
Processing url:  https://www.reddit.com/r/VeteransBenefits/w/blood
Processing url:  https://www.reddit.com/r/VeteransBenefits/w/ID
Processing url:  https://www.reddit.com/r/VeteransBenefits/w/Mental
Processing url:  https://www.reddit.com/r/VeteransBenefits/w/Ankle
Processing url:  https://www.reddit.com/r/VeteransBenefits/w/ElbowForearm
Processing url:  https://

In [None]:
# Keep only the sections whose 'content' mentions "Rating" (case-insensitive);
# rows with missing content are treated as non-matching
mentions_rating = df['content'].str.contains('Rating', case=False, na=False)
df = df[mentions_rating]

df.head(3)

Unnamed: 0,id,area,title,content
0,https://www.reddit.com/r/VeteransBenefits/w/bl...,blood,Sickle Cell Anemia (Sickle Cell Disease),\nAnemia where the red blood cells become shap...
1,https://www.reddit.com/r/VeteransBenefits/w/bl...,blood,Aplastic Anemia (Bone Marrow Aplasia),\nBone marrow not making enough new blood cell...
2,https://www.reddit.com/r/VeteransBenefits/w/bl...,blood,Iron Deficiency Anemia,\nOccurs when your body doesn't have enough of...


In [None]:
# Save the DataFrame to a CSV file without the index column; the 'utf-8-sig'
# encoding writes a UTF-8 byte-order mark at the start of the file
df.to_csv('va_masterlist.csv', index=False, encoding='utf-8-sig')

In [None]:
# Reload the saved sections from the CSV file
df = pd.read_csv('va_masterlist.csv')

# Regular expression for one markdown table row of the form
# "| <percent>% | <description> |". Group 1 is the (optional) rating
# percentage, group 2 the description cell.
regex_pattern = r'\|\s(\d+%)?\s\|\s([^|]+)\|'

# Function to parse markdown tables within a given text field
def parse_markdown_tables_in_text(text):
    """Map each rating percentage found in ``text`` to its description.

    Args:
        text: markdown that may contain rating table rows. Any non-string
            value (for example the NaN that pandas.read_csv produces for an
            empty cell, or None) is treated as text without tables.

    Returns:
        A dict such as ``{'10%': 'Requires continuous medication.'}``. Rows
        without a percentage are skipped; if a percentage occurs more than
        once, the last description wins.
    """
    if not isinstance(text, str):
        return {}
    return {
        percent: description.strip()
        for percent, description in re.findall(regex_pattern, text)
        if percent
    }

# Parse the rating table rows of every section into a {percentage: description} dict
df['parsed_tables'] = df['content'].apply(parse_markdown_tables_in_text)

# Gather every percentage label that occurs in any section
unique_percentages = set().union(*(table.keys() for table in df['parsed_tables']))

# Order the labels by their numeric value
sorted_percentages = sorted(unique_percentages, key=lambda label: int(label.rstrip('%')))

# One column per percentage; sections without that rating get an empty string
for percentage in sorted_percentages:
    df[percentage] = df['parsed_tables'].apply(
        lambda table, label=percentage: table.get(label, "")
    )

# The helper column has served its purpose
del df['parsed_tables']

# Fixed leading columns, followed by the percentage columns in numeric order
initial_cols = ['id', 'area', 'title', 'content']
ordered_cols = [*initial_cols, *sorted_percentages]
df = df.loc[:, ordered_cols]

# Display the DataFrame to verify the new column order
df.head()

Unnamed: 0,id,area,title,content,0%,10%,20%,30%,40%,50%,60%,70%,80%,90%,100%
0,https://www.reddit.com/r/VeteransBenefits/w/bl...,blood,Sickle Cell Anemia (Sickle Cell Disease),\nAnemia where the red blood cells become shap...,,Symptoms no longer present but an established ...,,With 1 or 2 painful episodes per 12-month period.,,,With 3 painful episodes per 12-month period *o...,,,,With 4 or more painful episodes per 12-month p...
1,https://www.reddit.com/r/VeteransBenefits/w/bl...,blood,Aplastic Anemia (Bone Marrow Aplasia),\nBone marrow not making enough new blood cell...,,Requires continuous medication.,,Requiring transfusion of platelets or red bloo...,,,Requiring transfusion of platelets or red bloo...,,,,Requiring peripheral blood\* or bone marrow st...
2,https://www.reddit.com/r/VeteransBenefits/w/bl...,blood,Iron Deficiency Anemia,\nOccurs when your body doesn't have enough of...,No symptoms or can be treated by changing diet.,Requiring intravenous iron infusions 1-3 times...,,,,,,,,,Requiring intravenous iron infusions 4 or more...
3,https://www.reddit.com/r/VeteransBenefits/w/bl...,blood,"Addison's Anemia (Pernicious Anemia, Vitamin B...",\nWhen the body can't absorb enough vitamin B-...,,Requiring continuous treatment with Vitamin B1...,,,,,,,,,
4,https://www.reddit.com/r/VeteransBenefits/w/bl...,blood,Acquired Hemolytic Anemia (Extrinsic Hemolytic...,\n\n\n| Rating | Description |\n| --- | --- |\...,No symptoms.,Requiring 1 course of immunosuppressive therap...,,Requiring 2-3 courses of immunosuppressive the...,,,Requiring immunosuppressive medication 4 or mo...,,,,Requiring a bone marrow transplant\* or contin...


In [None]:
# Save the DataFrame, now including one column per rating percentage, to a second CSV file
df.to_csv('va_masterlist_extended.csv', index=False, encoding='utf-8-sig')

# Create JSON and JSONL exports (the JSONL file is used for MongoDB)

In [None]:
import pandas as pd

# Load the CSV file
file_path = 'va_masterlist_extended.csv'  # Replace with your CSV file path
data = pd.read_csv(file_path)

# The rating columns are the ones named "<digits>%". They are detected from the
# file rather than assumed to be 0%..100% in steps of 10, so a missing step no
# longer raises a KeyError and any other percentage is folded into "rating" too.
percentage_columns = [
    column for column in data.columns
    if column.endswith('%') and column[:-1].isdigit()
]

# Creating a new column "rating": one dict per row built from the percentage
# columns, leaving out the cells that are empty (NaN)
data['rating'] = [
    row.dropna().to_dict() for _, row in data[percentage_columns].iterrows()
]

# Dropping the original percentage columns
data = data.drop(columns=percentage_columns)

# Convert each row of the dataframe to a JSON line and write to a file
jsonl_file_path = 'va_masterlist_extended.jsonl'  # Replace with your desired JSONL file path
with open(jsonl_file_path, 'w', encoding='utf-8') as file:
    for _, row in data.iterrows():
        file.write(row.to_json() + '\n')

In [None]:
import json

# Input JSONL file and output JSON file
jsonl_file_path = 'va_masterlist_extended.jsonl'
json_file_path = 'va_masterlist.json'

# Parse every line of the JSONL file into one list of records
with open(jsonl_file_path, 'r') as file:
    json_data = [json.loads(line) for line in file]

# Write the records out as a single JSON array
with open(json_file_path, 'w') as file:
    json.dump(json_data, file)

# MongoDB (optional)

In [None]:
%pip install pymongo

In [None]:
from pymongo import MongoClient
import json

import os

# MongoDB setup
# The URI is read from the MONGODB_URI environment variable so that real
# credentials do not have to be typed into (and saved with) the notebook.
# The fallback below is only a template and must be replaced to be usable.
mongodb_uri = os.environ.get(
    "MONGODB_URI",
    "mongodb://mongoadmin:secret@[PUT YOUR MONGODB SERVER HERE]:[PORT]/default_db?authSource=admin",  # Replace with your MongoDB URI
)

# Path to the JSONL file
jsonl_file_path = 'va_masterlist_extended.jsonl'  # Replace with your JSONL file path

# Read the JSONL file before connecting, so a missing or invalid file fails
# without opening a client; blank lines are skipped
with open(jsonl_file_path, 'r', encoding='utf-8') as file:
    documents = [json.loads(line) for line in file if line.strip()]

client = MongoClient(mongodb_uri)
try:
    db = client["va"]  # Replace with your database name
    collection = db["masterlist"]  # Replace with your collection name

    # Insert documents into MongoDB; a failure is reported and the cell goes on
    # to create the indexes, as before
    # NOTE(review): insert_many presumably adds the documents again on every
    # run of this cell — confirm whether re-runs should be prevented.
    try:
        collection.insert_many(documents)
        print("Data inserted successfully!")
    except Exception as e:
        print(f"An error occurred: {e}")

    # Creating indexes on 'area' and 'title'
    collection.create_index("area")
    collection.create_index("title")
    print("Indexes created on 'area' and 'title'.")
finally:
    # Close the MongoDB connection even if one of the steps above raised
    client.close()

Data inserted successfully!
Indexes created on 'area' and 'title'.
