In [10]:
from nltk.tokenize import word_tokenize
import itertools

#IEEE Xplore

# Define the search keywords
antibiotics_keywords = ["antibiotic resistance", "antimicrobial resistance", "AMR"]
ai_keywords = ["deep learning", "neural network", "embedding", "interpretable", "autoencoders", "CNN", "convolutional", "LSTM", "long short-term memory", "NLP", "Natural Language Processing", "transformer", "BERT"]

# Tokenize the keywords for easier matching
antibiotics_tokens = set(word_tokenize(" ".join(antibiotics_keywords)))
ai_tokens = set(word_tokenize(" ".join(ai_keywords)))

import pandas as pd

def generate_ieee_urls(keyword_pair, num_pages=5):
    urls = []
    for page in range(num_pages):
        query = "%20".join(keyword_pair).replace(' ', '%20')  # Join the keywords with %20 to form the query
        url = f"https://ieeexplore.ieee.org/search/searchresult.jsp?queryText={query}&highlight=true&returnType=SEARCH&matchPubs=true&ranges=2010_2024_Year&returnFacets=ALL&refinements=ContentType:Journals&pageNumber={page+1}"
        urls.append(url)
    return urls

# Generate all pairs of keywords
keyword_pairs = list(itertools.product(antibiotics_keywords, ai_keywords))

# Generate URLs for each pair of keywords
ieee_urls = [generate_ieee_urls(pair, 1) for pair in keyword_pairs]

# Flatten the list of lists
ieee_urls = [url for sublist in ieee_urls for url in sublist]

# Convert the list of URLs to a DataFrame
ieee_urls = pd.DataFrame(ieee_urls, columns=['URL'])

In [11]:
ieee_urls.shape

(39, 1)

In [12]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from tqdm import tqdm
import pandas as pd
import re
import random, time

def parse_links_selenium(url):
    # Setup the webdriver
    webdriver_service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-popups")
    options.add_argument("--headless")  # Run in headless mode
    
    # List of user-agent strings
    user_agents = [
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.48',
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
        'user-agent=Mozilla/5.0 (Linux; Android 10; SM-A205U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36',
        'user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1'
    ]

    # Use the current time as the seed
    random.seed(time.time())

    # Randomly select a user-agent string
    user_agent = random.choice(user_agents)

    # Add the user-agent string to the options
    options.add_argument(user_agent)


    driver = webdriver.Chrome(service=webdriver_service, options=options)

    # Get the page
    driver.get(url)

    # Find all the <a> tags
    a_tags = driver.find_elements(By.TAG_NAME, 'a')

    # Extract the href attribute from each <a> tag
    links = [a.get_attribute('href') for a in a_tags if a.get_attribute('href') is not None]

    # Filter the links
    links = [re.match(r'(https://ieeexplore.ieee.org/document/\d+)/.*', link).group(1) 
             for link in links 
             if re.match(r'https://ieeexplore.ieee.org/document/\d+', link)]

    # Close the driver
    driver.quit()
    return links

# Create a list to store the results
results = []

#  Apply the parse_links_selenium function to each URL in the dataframe
for url in tqdm(ieee_urls['URL']):
    links = parse_links_selenium(url)
    for link in links:
        results.append({'URL': link})

# Create a DataFrame from the results
results_df = pd.DataFrame(results)

# Remove duplicates
results_df = results_df.drop_duplicates()

results_df.head()

100%|██████████| 39/39 [02:05<00:00,  3.23s/it]


Unnamed: 0,URL
0,https://ieeexplore.ieee.org/document/10110982
5,https://ieeexplore.ieee.org/document/10164211
10,https://ieeexplore.ieee.org/document/10185002


In [13]:
ieee_results = results_df
ieee_results.shape

(3, 1)

In [14]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm
import pandas as pd
from datetime import datetime
import random
import time

def parse_info_selenium(url):
    # Check that the URL is valid
    if url is None or not isinstance(url, str) or not url.startswith('http'):
        return pd.Series([None]*6)

    # Setup the webdriver
    webdriver_service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-popups")
    options.add_argument("--headless")  # Run in headless mode

    # List of user-agent strings
    user_agents = [
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.48',
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
        'user-agent=Mozilla/5.0 (Linux; Android 10; SM-A205U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36',
        'user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1'
    ]

    # Use the current time as the seed
    random.seed(time.time())

    # Randomly select a user-agent string
    user_agent = random.choice(user_agents)

    # Add the user-agent string to the options
    options.add_argument(user_agent) 

    driver = webdriver.Chrome(service=webdriver_service, options=options)

    # Get the page
    driver.get(url)

    # Create a WebDriverWait object
    wait = WebDriverWait(driver, 10)  # Wait for up to 10 seconds

    # Extract the required information
    try:
        journal = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a.stats-document-abstract-publishedIn'))).text
    except:
        journal = None

    try:
        doi = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a[href^="https://doi.org/"]'))).text
    except:
        doi = None
        
    try:
        publication_div = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.u-pb-1.doc-abstract-pubdate')))
        date_string = publication_div.text.split("Date of Publication:")[1].strip()
        date_object = datetime.strptime(date_string, "%d %B %Y")
        date_of_publication = date_object.strftime("%Y-%m-%d")
    except:
        date_of_publication = None

    try:
        first_author = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a[triggers="hover"] span'))).text
    except:
        first_author = None

    try:
        title = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1.document-title'))).text
    except:
        title = None

    try:
        abstract = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.abstract-text'))).text
    except:
        abstract = None

    # Close the driver
    driver.quit()
    return pd.Series([journal, doi, date_of_publication, first_author, title, abstract])

# Assuming ieee_results is a DataFrame with a column 'Link' containing the URLs
# Initialize an empty list to hold the results
results = []

# Use tqdm to show progress
for url in tqdm(ieee_results['URL']):
    results.append(parse_info_selenium(url))

# Convert the list of results to a DataFrame
info_df = pd.DataFrame(results, columns=['Journal', 'DOI', 'Date of Publication', 'First Author', 'Title', 'Abstract'])

# Concatenate the original dataframe with the new information
final_df = pd.concat([ieee_results, info_df], axis=1)

# Display the final DataFrame
final_df.head()


100%|██████████| 3/3 [00:17<00:00,  5.99s/it]


Unnamed: 0,URL,Journal,DOI,Date of Publication,First Author,Title,Abstract
0,https://ieeexplore.ieee.org/document/10110982,,,,,,
5,https://ieeexplore.ieee.org/document/10164211,,,,,,
10,https://ieeexplore.ieee.org/document/10185002,,,,,,
1,,,,,,,
2,,,,,,,


In [15]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm
import pandas as pd
from datetime import datetime
import random
import time

def parse_info_selenium(url):
    # Check that the URL is valid
    if url is None or not isinstance(url, str) or not url.startswith('http'):
        return pd.Series([None]*6)

    # Setup the webdriver
    webdriver_service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-popups")
    options.add_argument("--headless")  # Run in headless mode

    # List of user-agent strings
    user_agents = [
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
        'user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.48',
        'user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
        'user-agent=Mozilla/5.0 (Linux; Android 10; SM-A205U) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36',
        'user-agent=Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Mobile/15E148 Safari/604.1'
    ]

    # Use the current time as the seed
    random.seed(time.time())

    # Randomly select a user-agent string
    user_agent = random.choice(user_agents)

    # Add the user-agent string to the options
    options.add_argument(user_agent) 

    driver = webdriver.Chrome(service=webdriver_service, options=options)

    # Get the page
    driver.get(url)

    # Create a WebDriverWait object
    wait = WebDriverWait(driver, 20)  # Wait for up to 20 seconds

    # Extract the required information
    try:
        journal = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a.stats-document-abstract-publishedIn'))).text
    except Exception as e:
        journal = None
        print(f"Error fetching journal for URL {url}: {e}")

    try:
        doi = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a[href^="https://doi.org/"]'))).text
    except Exception as e:
        doi = None
        print(f"Error fetching DOI for URL {url}: {e}")
        
    try:
        publication_div = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.u-pb-1.doc-abstract-pubdate')))
        date_string = publication_div.text.split("Date of Publication:")[1].strip()
        date_object = datetime.strptime(date_string, "%d %B %Y")
        date_of_publication = date_object.strftime("%Y-%m-%d")
    except Exception as e:
        date_of_publication = None
        print(f"Error fetching date of publication for URL {url}: {e}")

    try:
        first_author = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a[triggers="hover"] span'))).text
    except Exception as e:
        first_author = None
        print(f"Error fetching first author for URL {url}: {e}")

    try:
        title = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'h1.document-title'))).text
    except Exception as e:
        title = None
        print(f"Error fetching title for URL {url}: {e}")

    try:
        abstract = wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, 'div.abstract-text'))).text
    except Exception as e:
        abstract = None
        print(f"Error fetching abstract for URL {url}: {e}")

    # Close the driver
    driver.quit()
    return pd.Series([journal, doi, date_of_publication, first_author, title, abstract])

# Assuming ieee_results is a DataFrame with a column 'Link' containing the URLs
# Initialize an empty list to hold the results
results = []

# Use tqdm to show progress
for url in tqdm(ieee_results['URL']):
    results.append(parse_info_selenium(url))

# Convert the list of results to a DataFrame
info_df = pd.DataFrame(results, columns=['Journal', 'DOI', 'Date of Publication', 'First Author', 'Title', 'Abstract'])

# Concatenate the original dataframe with the new information
final_df = pd.concat([ieee_results, info_df], axis=1)

# Display the final DataFrame
print(final_df.head())


100%|██████████| 3/3 [00:14<00:00,  4.94s/it]

                                              URL  Journal  DOI  \
0   https://ieeexplore.ieee.org/document/10110982      NaN  NaN   
5   https://ieeexplore.ieee.org/document/10164211      NaN  NaN   
10  https://ieeexplore.ieee.org/document/10185002      NaN  NaN   
1                                             NaN      NaN  NaN   
2                                             NaN      NaN  NaN   

    Date of Publication  First Author  Title  Abstract  
0                   NaN           NaN    NaN       NaN  
5                   NaN           NaN    NaN       NaN  
10                  NaN           NaN    NaN       NaN  
1                   NaN           NaN    NaN       NaN  
2                   NaN           NaN    NaN       NaN  



