# **indexfungorum_spider** 

<span style="color:blue"> A simple python spider written for extracting information from Index Fungorum automatically </span>

<span style="color:red">红色文字</span>
<span style="color:#00FF00">绿色文字（十六进制）</span>
<span style="color:blue;font-weight:bold">蓝色加粗文字</span>

[Index Fungorum](https://indexfungorum.org/Names/Names.asp) is a dynamic webpage, making it impossible to directly scrape certain information using Python.

[Selenium](https://www.selenium.dev/zh-cn/documentation/) provides an interactive way for humans to interface with Index Fungorum through user input.

```mermaid
flowchart LR
    A[Open Indexfungorum Search Pages] --> B[Input a group name and search]
    B --> C[Get pages, parse records and get each record's page]
    C --> D[Open each record's page]
    D --> E[Extract needed information]
    E --> F{Check if species has\nmolecular data in NCBI?}
    F -->|Yes| G[Include in output]
    F -->|No| H[Exclude from output]
    G --> I[Output to xlsx file]
    H --> I
```

## Load Python packages and define functions

### Packages

In [23]:
import re
import os
import sys
import time
import pandas as pd
import requests

from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from Bio import Entrez
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

from tqdm import tqdm # Providing progress bar,to monitor the program’s progress if the group contains thousands of records.

### Functions

#### Parse result page for genus search

In [9]:
def parse_search_page(html_doc):
    RED = '\033[31m'
    GREEN = '\033[32m'
    YELLOW = '\033[33m'
    RESET = '\033[0m'  # Reset color to default
    
    # Regex pattern to extract content between 'BofP</a>' and '<br><p><b>Pages:'
    pattern = r'BofP</a>(.*?)<br><p><b>Pages:'
    matches = re.findall(pattern, html_doc, re.DOTALL)
    
    if len(matches) == 1:
        record_lst = matches[0].split("<br>")
    else:
        sys.exit(f"Failed to extract the expected content. Check function: parse_search_page")

    # List to hold parsed record information
    records = []

    # Iterate through the extracted record lines
    for line in record_lst:
        # Regex pattern to extract record name and link
        try:
            record_name_link_pattern = r'href="(NamesRecord.asp\?RecordID=\d+)">([^<]+)'  # Match the record name and link
            match0 = re.search(record_name_link_pattern, line, re.DOTALL)
            record_name = match0.group(2)  # Get the record name
            record_link = "https://indexfungorum.org/Names/" + match0.group(1)  # Construct the full URL
        except:
            print(f"{RED}[Warning]{RESET} | {GREEN}{line}{RESET}")
            record_name = "NA"
            record_link = "NA"

        # Regex pattern to extract author and year
        try:
            author_year_pattern = r'</a>(.*?) (\d{4})'  # Match author and year after closing </a>
            match1 = re.search(author_year_pattern, line, re.DOTALL)
            record_author = match1.group(1).replace("&amp;","&").strip()  # Get the author
            year = match1.group(2)  # Get the year
            #print(record_author,"|",year)
        except:
            print(f"{RED}[Warning]{RESET} | {GREEN}{line}{RESET}")
            record_author = "NA"
            year = "NA"

        # Regex pattern to extract current name and link
        try:
            current_name_link_pattern = r'(http://www.speciesfungorum.org/Names/SynSpecies.asp\?RecordID=\d+)">([^<]+)'  # Match current name and link
            match2 = re.search(current_name_link_pattern, line)
            current_name = match2.group(2)  # Get the current name
            current_name_link = match2.group(1)  # Get the current name link
            #print(current_name,"|",current_name_link)
        except:
            print(f"{RED}[Warning]{RESET} | {GREEN}{line}{RESET}")
            current_name = "NA"
            current_name_link = "NA"

        # Append the record information as a list
        records.append([record_name, record_author, year, record_link, current_name, current_name_link])

    return records

#### Search by genus name

In [10]:
def search(group_name):
    '''Input a record name and search for it.'''
    
    # Locate and clear the search box, then enter the search term
    search_box = driver.find_element(By.NAME, 'SearchTerm')
    search_box.clear()
    search_box.send_keys(group_name)  # Use the correct variable (group_name)
    
    # Click the search button to initiate the search
    search_btn = driver.find_element(By.CSS_SELECTOR, '[type="submit"]')
    search_btn.click()
    
    # Wait for the page to load
    # time.sleep(5)

    try:
        # Extract the number of records displayed on the page
        number_of_records = driver.find_element(By.XPATH, '/html/body/table/tbody/tr[2]/td/table/tbody/tr/td/p[1]/b[3]').text.split()[1]
        number_of_records = int(number_of_records)
    except Exception as e:
        print(f"Error: Failed to retrieve the number of records. {e}")
        return []

    # Calculate the number of pages required to display all records
    num_page = (number_of_records + 199) // 200  # This simplifies the page calculation logic
    
    # Output the number of pages and records for debugging
    print(f'Number of pages   : {num_page}', file=sys.stdout, flush=True)
    print(f'Number of records : {number_of_records}', file=sys.stdout, flush=True)
    
    #time.sleep(2)

    # Initialize a list to store all records
    all_records_lst = []

    # Parse the first page
    html = driver.page_source
    records_lst = parse_search_page(html)
    all_records_lst.extend(records_lst)
    
    # Parse additional pages, if any
    if num_page >= 2:
        for i in range(2, num_page + 1):
            try:
                # Locate and click the pagination button for the next page
                css_pattern = f'[href="Names.asp?pg={i}"]'
                page_btn = driver.find_element(By.CSS_SELECTOR, css_pattern)
                page_btn.click()

                # Wait for the next page to load
                #time.sleep(5)

                # Parse the page source for records
                html = driver.page_source
                records_lst = parse_search_page(html)
                all_records_lst.extend(records_lst)
            except:
                sys.exit(1)

    # Check if the number of records matches the expected count
    if len(all_records_lst) != number_of_records:
        print('Warning: Some records did not contain links.')
    
    # Return the list of all records
    return all_records_lst

#### Search by species name

In [29]:
def search_species(search_species):  # Fixed typo in parameter name (seach_species -> search_species)
    '''Input a record name and search for it.'''
    
    try:
        # Wait for and locate the search box
        search_box = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, 'SearchTerm'))
        )
        search_box.clear()
        search_box.send_keys(search_species)  # Using correct variable name now
        
        # Find and click the search button
        search_btn = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable((By.CSS_SELECTOR, '[type="submit"]'))
        )
        search_btn.click()
        
        # Wait for results to load - better than static time.sleep()
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, "LinkColour1"))
        )
        
        links = driver.find_elements(By.CLASS_NAME, "LinkColour1")
        
        for link in links:
            href = link.get_attribute("href")
            species_name = link.text.strip()
            
            if not species_name:
                continue
                
            print(f"Found link: {href} - Species: {species_name}")
            
            if search_species.lower() == species_name.lower():  # Case-insensitive comparison
                print(f"Exact match found: {href}")
                return href  # Return the matching URL
        
        print(f"No exact match found for '{search_species}'")
        return None
        
    except Exception as e:
        print(f"Error during search: {str(e)}")
        return None

In [11]:
def check_nucleotide_records(df):
    Entrez.email = "chenyanpeng1992@outlook.com"
    unique_term_nucleotide_count_dict = {}
    current_name_lst = list(set(df["Current_name"]))
    
    for search_nucleotide_term in tqdm(current_name_lst,desc="query NCBI nucleotide"):
        search_nucleotide_term = " ".join(search_nucleotide_term.split()[0:2])
        if search_nucleotide_term not in unique_term_nucleotide_count_dict:
            handle = Entrez.esearch(db="nucleotide", term=search_nucleotide_term)
            records = Entrez.read(handle)
            unique_term_nucleotide_count_dict[search_nucleotide_term] = records['Count']
    df['Nt_count'] = df['Current_name'].map(unique_term_nucleotide_count_dict)
    return df

In [12]:
def output(search_term, df):
    # Export the DataFrame to Excel
    time_suffix = time.strftime('%Y%m%d')
    output_file = search_term + '_indexfungorum_records_' + str(time_suffix) + '.xlsx'
    df.to_excel(output_file, index=False)

####  Launch Indexfungorum search page

In [28]:
def start_search_page():
    s = Service(r'msedgedriver.exe')
    option = webdriver.EdgeOptions()
    
    #option.add_argument('--headless')  # hide search window
    #driver = webdriver.Edge(service=s, options = option)
    driver = webdriver.Edge(service=s)
    
    #driver.implicitly_wait(5) # active Edge searching engine and 5s for the browser to load the dat
    driver.get('http://www.indexfungorum.org/')
    
    search_index_fungorum_element = driver.find_element(By.CSS_SELECTOR, '[href="./Names/Names.asp"]')
    search_index_fungorum_element.click()
    search_box = driver.find_element(By.NAME, 'SearchTerm')
    search_box.clear()

## Use

### Species records by genus name

In [None]:
search_term = 'Diaporthe'
records_lst = search(search_term)
driver.close()

### Indexfungorum ID

#### Single search

In [30]:
search_species("Peroneutypa leucaenae")

Found link: https://www.indexfungorum.org/Names/NamesRecord.asp?RecordID=558722 - Species: Peroneutypa leucaenae
Exact match found: https://www.indexfungorum.org/Names/NamesRecord.asp?RecordID=558722


'https://www.indexfungorum.org/Names/NamesRecord.asp?RecordID=558722'

#### Batch search

In [33]:
df = pd.read_excel("D:\BaiduSyncdisk\PhD_Thesis\FD_paper\sordariomycetes_source_table_sequence_upload.xlsx", sheet_name="genbank")

In [34]:
df.head()

Unnamed: 0,Sequence_ID,HUEST,UESTCC,Organism,ITS,LSU,SSU,ACT,HIS3,CHS1,CAL,RPB1,GAPDH,RPB2,TEF1
0,CC.HLC65,HUEST 24.0022,UESTCC 24.0021,Coniochaeta fibrosae,PP407752,PP407616,,,,,,,,,
1,CC.LTG23,HUEST 24.0021,UESTCC 24.0020,Coniochaeta acaciae,PP407753,PP407617,,,,,,,,,
2,CC.CBG09,HUEST 24.0020,UESTCC 24.0019,Coniella quercicola,PP407754,PP407618,,,,,,,,,PP555286
3,CC.HLG74.1,HUEST 24.0105,UESTCC 24.0100,Synnemasporella aculeans,PP407755,PP407619,,,,,,,,,
4,CC.HLG95.1,HUEST 24.0106,UESTCC 24.0101,Synnemasporella sichuanensis,PP407756,PP407620,,,,,,,,,


In [None]:
for search_term in df['Organism'].to_list():
    search_species(search_term)

Found link: https://www.indexfungorum.org/Names/NamesRecord.asp?RecordID=839390 - Species: Coniochaeta fibrosae
Exact match found: https://www.indexfungorum.org/Names/NamesRecord.asp?RecordID=839390
Found link: https://www.indexfungorum.org/Names/NamesRecord.asp?RecordID=553912 - Species: Coniochaeta acaciae
Exact match found: https://www.indexfungorum.org/Names/NamesRecord.asp?RecordID=553912
Found link: https://www.indexfungorum.org/Names/NamesRecord.asp?RecordID=817831 - Species: Coniella quercicola
No exact match found for 'Coniella quercicola '
Found link: https://www.indexfungorum.org/Names/NamesRecord.asp?RecordID=823996 - Species: Synnemasporella aculeans
Exact match found: https://www.indexfungorum.org/Names/NamesRecord.asp?RecordID=823996
Error during search: Message: 
Stacktrace:
	GetHandleVerifier [0x005A0E73+39155]
	Microsoft::Applications::Events::time_ticks_t::time_ticks_t [0x0043E386+772070]
	Microsoft::Applications::Events::ILogConfiguration::operator* [0x001F48EE+5182

In [None]:
df = pd.DataFrame(records_lst, columns=['Name', 'Author', 'Year', 'Name_link', 'Current_name', 'Synonym_link'])
df['Current_name'] = df.apply(
    lambda row: " ".join(row['Name'].split()[0:2]) if row['Current_name'] == "NA" else row['Current_name'],
    axis=1
)

In [None]:
df.to_excel("Diaporthe_indexfungorum.2025.3.25.xlsx")

## Query Genbank

In [None]:
df = check_nucleotide_records(df)

## Processing table

In [None]:
count = len(df[df['Current_name'].apply(lambda x: x.split()[0] != search_term)])
print(f"{count} name records were revised.")

In [None]:
current_count = len(df[df['Current_name'].apply(lambda x: x.split()[0] == search_term)])
print(f"{current_count} name records are valid right now.")
df[df['Current_name'].apply(lambda x: x.split()[0] == search_term)]["Current_name"].tolist()