# Using Selenium to get information from Index Fungorum 

In [1]:
# Load the required Python packages
import re
import os
import sys
import time
import pandas as pd

from selenium import webdriver
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from Bio import Entrez

from tqdm import tqdm

# Define functions

In [2]:
def open_indexfungorum():
    '''Load the Index Fungorum home page in the module-level ``driver``.'''
    home_url = 'http://www.indexfungorum.org/'
    driver.get(home_url)

In [3]:
def to_search_page():
    '''Go from the home page to the name-search page.

    Locates the link whose href is "./Names/Names.asp" with the module-level
    ``driver``, pauses for one second, and then clicks it.
    '''
    names_link_selector = '[href="./Names/Names.asp"]'
    names_link = driver.find_element(By.CSS_SELECTOR, names_link_selector)
    time.sleep(1)
    names_link.click()

In [4]:
def _extract_record_lines(html):
    '''Return the '<br>'-separated record snippets found in one result page.

    The record list is taken to be the text between the first
    '</p><a class=' and the next '<br><p>' in the page source. Returns None
    when that pattern is not present.
    '''
    match = re.search(r'</p><a class=(.*?)<br><p>', html, re.S)
    if match is None:
        return None
    return match.group(1).strip().split('<br>')


def search_record(record_name):
    '''Search Index Fungorum for *record_name* and collect the raw result lines.

    Uses the module-level ``driver``, which must already be on the search
    page. The search term is typed into the 'SearchTerm' box and submitted,
    the total hit count is read from the result page, and every result page
    is then visited in turn.

    Parameters:
        record_name: text to type into the search box.

    Returns:
        A list of HTML snippets, one per '<br>'-separated entry of the
        result listing, concatenated over all pages that could be parsed.

    Pages that cannot be parsed are reported on stdout and skipped; a warning
    is printed when the number of collected lines differs from the hit count
    reported by the site.
    '''
    # The pagination logic assumes the site lists 200 records per page.
    records_per_page = 200

    search_box = driver.find_element(By.NAME, 'SearchTerm')
    search_box.clear()
    search_box.send_keys(record_name)
    search_btn = driver.find_element(By.CSS_SELECTOR, '[type="submit"]')
    search_btn.click()
    time.sleep(5)  # wait for the result page to load

    # The second whitespace-separated token of this element's text is read as
    # the total number of hits.
    count_text = driver.find_element(
        By.XPATH,
        '/html/body/table/tbody/tr[2]/td/table/tbody/tr/td/p[1]/b[3]',
    ).text
    number_of_records = int(count_text.split()[1])

    # Ceiling division, with at least one page. Previously num_page was left
    # undefined for searches with fewer than 400 hits, which raised NameError.
    num_page = max(1, (number_of_records + records_per_page - 1) // records_per_page)
    print(f'Number of pages   : {num_page}', file=sys.stdout, flush=True)
    print(f'Number of records : {number_of_records}', file=sys.stdout, flush=True)

    time.sleep(2)

    all_records_lst = []
    for page in range(1, num_page + 1):
        if page > 1:
            # Page 1 is already displayed; later pages are reached through
            # their pagination links.
            page_btn = driver.find_element(By.CSS_SELECTOR, f'[href="Names.asp?pg={page}"]')
            page_btn.click()
            time.sleep(5)  # wait for the next page to load

        line_lst = _extract_record_lines(driver.page_source)
        if line_lst is None:
            print(f'Error: failed in parsing Page {page}')
        else:
            all_records_lst.extend(line_lst)

    if len(all_records_lst) != number_of_records:
        print('Warning: some records no Links')
    return all_records_lst

In [5]:
def _search_group(pattern, text, group=1, default=''):
    '''Return *group* of the first match of *pattern* in *text*, else *default*.'''
    match = re.search(pattern, text)
    return match.group(group) if match else default


def parse_each_record_line(all_records_lst):
    '''Turn raw HTML result lines into structured record rows.

    Parameters:
        all_records_lst: iterable of HTML snippets, one per record.

    Returns:
        A list of rows, each of the form
        [record_label, record_submitter, record_year, record_link,
         record_current_label, label_changes]:

        * record_label: text of the first '>...<' span in the line.
        * record_submitter / record_year: taken from the text between the
          first '</a>' and the next '<a'. The year is the first standalone
          4-digit number; the submitter is that text with 4-digit numbers
          removed. Both are '' when that text is absent.
        * record_link: absolute URL built from the 'NamesRecord.asp?RecordID=N'
          fragment, or '' when the line has none.
        * record_current_label: text of the first '>...<' span after
          'SynSpecies' when the line references 'SynSpecies.asp'; otherwise
          (or when no such span exists) the same as record_label.
        * label_changes: 1 when the first two words of the two labels differ,
          else 0.

    Raises:
        SystemExit: when a line yields no (or an empty) record label.
    '''
    records_lst = []
    for line in all_records_lst:
        record_label = _search_group(r'>(.*?)<', line)
        if not record_label:
            sys.exit(f'not a valid label {line}')

        # Text between the record link and the following link, with the HTML
        # ampersand entity spelled out and any "(also ..." suffix dropped.
        raw_submitter_year = _search_group(r'</a>(.*?)<a', line)
        record_submitter_year = (
            raw_submitter_year.replace('&amp;', 'and').split('(also')[0].strip().strip(',')
        )
        record_submitter = re.sub(r'\b\d{4}\b', '', record_submitter_year).strip(';').strip()
        record_year = _search_group(r'\b\d{4}\b', record_submitter_year, group=0)

        # Stays '' when the fragment is missing or has no numeric ID, instead
        # of raising AttributeError on a failed match.
        record_link = _search_group(r'NamesRecord\.asp\?RecordID=\d+', line, group=0)

        if 'SynSpecies.asp' in line:
            # Fall back to the record's own label when the synonym link has
            # no readable text, matching the no-synonym branch below.
            record_current_label = _search_group(
                r'>(.*?)<', line.split('SynSpecies')[1], default=record_label
            )
        else:
            record_current_label = record_label

        # Only the first two words of each label are compared.
        label_changes = 0
        if record_label.split()[:2] != record_current_label.split()[:2]:
            label_changes = 1

        if record_link:
            record_link = 'http://www.indexfungorum.org/Names/' + record_link
        records_lst.append([
            record_label,
            record_submitter,
            record_year,
            record_link,
            record_current_label,
            label_changes,
        ])
    return records_lst

In [6]:
def add_nucleotide_records(records_lst):
    '''Append an NCBI nucleotide hit count to every parsed record.

    For each distinct value in column 4 of *records_lst* (the current label),
    one ``Entrez.esearch`` query is sent to the "nucleotide" database and the
    'Count' field of the parsed reply is cached, so repeated labels trigger
    only one request.

    Parameters:
        records_lst: list of record rows; element 4 of each row is used as
            the search term.

    Returns:
        A new list in which every row is the original row plus the cached
        'Count' value for its search term. The value is stored exactly as
        returned by ``Entrez.read`` (not converted to int). The input rows
        are not modified.
    '''
    Entrez.email = "chenyanpeng1992@outlook.com"
    unique_term_nucleotide_count_dict = {}
    for record_lst in tqdm(records_lst, desc="query NCBI nucleotide"):
        search_nucleotide_term = record_lst[4]
        if search_nucleotide_term in unique_term_nucleotide_count_dict:
            continue  # already queried for an earlier row
        handle = Entrez.esearch(db="nucleotide", term=search_nucleotide_term)
        try:
            records = Entrez.read(handle)
        finally:
            # Release the HTTP response even if parsing fails.
            handle.close()
        unique_term_nucleotide_count_dict[search_nucleotide_term] = records['Count']

    new_records_lst = [
        record_lst + [unique_term_nucleotide_count_dict[record_lst[4]]]
        for record_lst in records_lst
    ]
    return new_records_lst

# Usage example

In [13]:
# Edge WebDriver service; the executable is given as a relative path
s = Service(r'msedgedriver.exe')
option = webdriver.EdgeOptions()

# Uncomment to run the browser without a visible window
#option.add_argument('--headless')

# Start Edge. Note: `option` is only passed by the commented-out variant below;
# the active call starts the browser without it.
#driver = webdriver.Edge(service=s, options = option)
driver = webdriver.Edge(service=s)
# Implicit wait of 5 seconds for element lookups
driver.implicitly_wait(5)

# Open the Index Fungorum main page
open_indexfungorum()

# Move on to the search page
to_search_page()

In [14]:
# Term to search for; also used as the prefix of the exported Excel file name
search_term = 'Xylaria'

In [15]:
# Scrape all result pages for the search term and parse them into rows.
# quit() (rather than close()) ends the whole WebDriver session, and the
# finally clause makes sure that happens even if scraping or parsing fails.
try:
    all_records_lst = search_record(search_term)
    records_lst = parse_each_record_line(all_records_lst)
finally:
    driver.quit()

Number of pages   : 5
Number of records : 878


In [16]:
# Append to each record the number of NCBI nucleotide hits for its current label
# (one query per distinct label, so this step can take several minutes)
new_records_lst = add_nucleotide_records(records_lst)

query NCBI nucleotide: 100%|█████████████████████████████████████████████████████████| 878/878 [14:03<00:00,  1.04it/s]


In [12]:
# Write the enriched records to an Excel workbook
headers = [
    'record_label',
    'record_submitter',
    'record_year',
    'record_link',
    'record_current_label',
    'label_changes',
    'count_sequence',
]
# One DataFrame row per record, with the column names above
df = pd.DataFrame(new_records_lst, columns=headers)

# File name: <search term>_indexfungorum_records_<YYYYMMDD>.xlsx
time_suffix = time.strftime('%Y%m%d')
output_file = ''.join([search_term, '_indexfungorum_records_', time_suffix, '.xlsx'])
df.to_excel(output_file, index=False)

# Tidy table

In [35]:
# Load a previously exported records workbook from the working directory
df = pd.read_excel('Fusarium_indexfungorum_records_20230807.xlsx')

In [36]:
# Alias of df (same object, not a copy); no filtering is applied here
df_accepted = df

In [40]:
# Load the existing taxa table. Raw string: the Windows path contains
# backslashes that would otherwise be read as (invalid) escape sequences.
df_taxa_table = pd.read_excel(r'D:\PhD_Thesis\chapter2\Sordariomycetes_Species\Fusarium\Fusarium_taxa_table.xlsx')

In [41]:
# Preview the first rows of the taxa table
df_taxa_table.head()

Unnamed: 0,longLabel,Year,Isolate,Type,Country / Location,Host / Habitat,ITS,CAL,TEF,RPB1,RPB2,TUB2,IGS
0,Albonectria rigidiuscula,,LC13606 = F503,,Japan,unidentified plant,MW016388,MW566255,MW580428,MW024420,MW474374,MW533715,
1,Bisifusarium aseptatum,,CGMCC 3.20816 = LC1075 T,1.0,"China, Guangdong Province, Guangzhou city",Orchidaceae sp.,MW016389,MW566256,MW580429,MW024421,MW474375,MW533716,
2,Bisifusarium aseptatum,,LC13607,,"China, Guangdong Province, Guangzhou city",Orchidaceae sp.,MW016390,MW566257,MW580430,MW024422,MW474376,MW533717,
3,Bisifusarium aseptatum,,LC13608,,"China, Guangdong Province, Guangzhou city",Orchidaceae sp.,MW016391,MW566258,MW580431,MW024423,MW474377,MW533718,
4,Fusarium acaciae-mearnsii,,LC13786 = FJWYS2-3,,"China, Fujian Province, Fuzhou city",Musa nana,MW016630,,MW620091,MW024658,MW474616,MW533978,


In [42]:
# Keep only the records whose current label does not appear in the taxa table's 'longLabel' column
df_to_add = df_accepted[~df_accepted['record_current_label'].isin(df_taxa_table['longLabel'])]

In [43]:
# Preview the first rows of the records still to be added
df_to_add.head()

Unnamed: 0,record_current_label,record_label,record_submitter,record_year,record_link,label_changes,count_sequence
0,Fusarium aberrans,Fusarium aberrans,"J.W. Xia, L. Lombard, Sand.-Den., X.G. Zhang a...",2019.0,http://www.indexfungorum.org/Names/NamesRecord...,0,15
2,Fusarium acaciae,Fusarium acaciae,Cooke and Harkn.,1884.0,http://www.indexfungorum.org/Names/NamesRecord...,0,148
3,Fusarium acaciae,Fusarium acaciae,Berl. and Voglino,1886.0,http://www.indexfungorum.org/Names/NamesRecord...,0,148
4,Fusarium acaciae,Fusarium acaciae,Sacc.,1891.0,http://www.indexfungorum.org/Names/NamesRecord...,0,148
6,Fusarium acicola,Fusarium acicola,Bres.,1910.0,http://www.indexfungorum.org/Names/NamesRecord...,0,0


In [44]:
# Save the records still to be added. Raw string: the Windows path contains
# backslashes that would otherwise be read as (invalid) escape sequences.
df_to_add.to_excel(r'D:\PhD_Thesis\chapter2\Sordariomycetes_Species\Fusarium\Fusarium_to_add.xlsx')