In [54]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import re
from tqdm import tqdm
import time

# Define the search equation: a boolean query that requires one resistance
# term AND one deep-learning/NLP term.
# NOTE(review): multi-word phrases are wrapped in single quotes here; confirm
# each target search engine treats them as phrase delimiters (double quotes
# are the more common convention).
search_equation = "(antibiotic resistance OR antimicrobial resistance OR AMR) AND ('deep learning' OR 'neural network' OR embedding OR interpretable OR autoencoders OR CNN OR convolutional OR LSTM OR 'long short-term memory' OR NLP OR 'Natural Language Processing' OR transformer OR BERT)"

# Function to search Google Scholar with pagination and error handling
def search_google_scholar(query, num_pages=10):
    """Scrape Google Scholar result pages restricted to the years 2023-2024.

    Args:
        query: Search expression. It is sent through ``params`` so that
            requests percent-encodes spaces, quotes and reserved characters.
        num_pages: Number of result pages to request (offsets step by 10).

    Returns:
        A list of dicts with the keys 'Title', 'Authors', 'Snippet', 'Link'
        and 'Source'. Sub-elements missing from a result are stored as ''.
        A page that still fails after three attempts is skipped.
    """
    articles = []
    for page in range(num_pages):
        params = {'q': query, 'start': page * 10, 'as_ylo': 2023, 'as_yhi': 2024}
        for attempt in range(3):  # Retry mechanism
            try:
                # The timeout keeps one stalled connection from hanging the run.
                response = requests.get("https://scholar.google.com/scholar", params=params, timeout=30)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')

                # Collect into a page-local list so a failure halfway through
                # a page cannot leave partial rows that a retry would duplicate.
                page_articles = []
                for item in soup.select('[data-lid]'):
                    title_tag = item.select_one('.gs_rt')
                    if title_tag is None:
                        continue  # Nothing usable in this entry
                    authors_tag = item.select_one('.gs_a')
                    snippet_tag = item.select_one('.gs_rs')
                    link_tag = item.select_one('.gs_rt a')
                    page_articles.append({
                        'Title': title_tag.text,
                        'Authors': authors_tag.text if authors_tag is not None else '',
                        'Snippet': snippet_tag.text if snippet_tag is not None else '',
                        # Some entries have a title without a hyperlink.
                        'Link': link_tag.get('href', '') if link_tag is not None else '',
                        'Source': 'Google Scholar',
                    })
                articles.extend(page_articles)
                break  # Exit retry loop if successful
            except Exception as e:
                # Best-effort scraping: report the failure and try again.
                print(f"Error fetching Google Scholar page {page+1}: {e}")
                time.sleep(2)  # Wait before retrying
    return articles

# Function to search PubMed with pagination and error handling
def search_pubmed(query, num_pages=10):
    """Scrape PubMed result pages restricted to the years 2023-2024.

    Args:
        query: Search expression. It is sent through ``params`` so that
            requests percent-encodes spaces, quotes and reserved characters.
        num_pages: Number of result pages to request, starting at page 1.

    Returns:
        A list of dicts with the keys 'Title', 'Authors', 'Snippet', 'Link'
        and 'Source'. Sub-elements missing from a result are stored as ''.
        A page that still fails after three attempts is skipped.
    """
    base_url = "https://pubmed.ncbi.nlm.nih.gov"
    articles = []
    for page in range(num_pages):
        params = {'term': query, 'filter': 'years.2023-2024', 'page': page + 1}
        for attempt in range(3):  # Retry mechanism
            try:
                # The timeout keeps one stalled connection from hanging the run.
                response = requests.get(base_url + "/", params=params, timeout=30)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')

                # Collect into a page-local list so a failure halfway through
                # a page cannot leave partial rows that a retry would duplicate.
                page_articles = []
                for item in soup.select('.docsum-content'):
                    title_tag = item.select_one('.docsum-title')
                    if title_tag is None:
                        continue  # Nothing usable in this entry
                    authors_tag = item.select_one('.docsum-authors')
                    snippet_tag = item.select_one('.full-view-snippet')
                    href = title_tag.get('href')
                    page_articles.append({
                        'Title': title_tag.text.strip(),
                        'Authors': authors_tag.text.strip() if authors_tag is not None else '',
                        'Snippet': snippet_tag.text.strip() if snippet_tag is not None else '',
                        # The href is site-relative, so prefix the host.
                        'Link': base_url + href if href else '',
                        'Source': 'PubMed',
                    })
                articles.extend(page_articles)
                break  # Exit retry loop if successful
            except Exception as e:
                # Best-effort scraping: report the failure and try again.
                print(f"Error fetching PubMed page {page+1}: {e}")
                time.sleep(2)  # Wait before retrying
    return articles

# Function to search IEEE Xplore with pagination and error handling
def search_ieee(query, num_pages=10):
    """Scrape IEEE Xplore search-result pages restricted to 2023-2024.

    NOTE(review): a later cell switches to Selenium for IEEE Xplore, which
    suggests the result list is rendered client-side and may be absent from
    the raw HTML fetched here -- confirm this function returns any rows.

    Args:
        query: Search expression. It is sent through ``params`` so that
            requests percent-encodes spaces, quotes and reserved characters.
        num_pages: Number of result pages to request, starting at page 1.

    Returns:
        A list of dicts with the keys 'Title', 'Authors', 'Snippet', 'Link'
        and 'Source'. Sub-elements missing from a result are stored as ''.
        A page that still fails after three attempts is skipped.
    """
    base_url = "https://ieeexplore.ieee.org"
    articles = []
    for page in range(num_pages):
        params = {'queryText': query, 'ranges': '2023_2024_Year', 'pageNumber': page + 1}
        for attempt in range(3):  # Retry mechanism
            try:
                # The timeout keeps one stalled connection from hanging the run.
                response = requests.get(base_url + "/search/searchresult.jsp", params=params, timeout=30)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')

                # Collect into a page-local list so a failure halfway through
                # a page cannot leave partial rows that a retry would duplicate.
                page_articles = []
                for item in soup.select('.List-results-items'):
                    title_tag = item.select_one('.title')
                    if title_tag is None:
                        continue  # Nothing usable in this entry
                    authors_tag = item.select_one('.author')
                    snippet_tag = item.select_one('.description')
                    link_tag = item.select_one('.title a')
                    href = link_tag.get('href') if link_tag is not None else None
                    page_articles.append({
                        'Title': title_tag.text.strip(),
                        'Authors': authors_tag.text.strip() if authors_tag is not None else '',
                        'Snippet': snippet_tag.text.strip() if snippet_tag is not None else '',
                        # The href is site-relative, so prefix the host.
                        'Link': base_url + href if href else '',
                        'Source': 'IEEE Xplore',
                    })
                articles.extend(page_articles)
                break  # Exit retry loop if successful
            except Exception as e:
                # Best-effort scraping: report the failure and try again.
                print(f"Error fetching IEEE Xplore page {page+1}: {e}")
                time.sleep(2)  # Wait before retrying
    return articles

# Perform searches with pagination sequentially
google_scholar_results = search_google_scholar(search_equation)
pubmed_results = search_pubmed(search_equation)
ieee_results = search_ieee(search_equation)

# Combine results and remove duplicates (first occurrence of each title wins).
all_results = google_scholar_results + pubmed_results + ieee_results
# The columns are declared explicitly: if every search came back empty, a
# column-less frame would make drop_duplicates(subset=['Title']) raise KeyError.
df = pd.DataFrame(
    all_results,
    columns=['Title', 'Authors', 'Snippet', 'Link', 'Source'],
).drop_duplicates(subset=['Title'])

# Function to extract references from articles
def extract_references(article_url):
    """Return the hrefs on an article page that contain the substring 'doi'.

    This is a heuristic: every ``<a href=...>`` whose target contains 'doi'
    is kept, in document order, without de-duplication.

    Args:
        article_url: URL of the page to download.

    Returns:
        A list of href strings; an empty list when the page cannot be
        fetched or parsed (the error is printed, not raised).
    """
    try:
        # Without a timeout a single unresponsive host would stall the whole
        # DataFrame.apply that drives this function.
        response = requests.get(article_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        return [
            anchor['href']
            for anchor in soup.find_all('a', href=True)
            if 'doi' in anchor['href']
        ]
    except Exception as e:
        # Best-effort: one bad link must not abort the whole column.
        print(f"Error extracting references from {article_url}: {e}")
        return []

def _first_author(author_text):
    """Return everything before the first comma of the author string."""
    return author_text.split(',')[0]


def _other_authors(author_text):
    """Return the comma-separated pieces after the first, re-joined with ', '."""
    remaining = author_text.split(',')[1:]
    return ', '.join(remaining)


def _publication_year(author_text):
    """Return the first run of four digits in the text, or 'Unknown'."""
    found = re.search(r'\d{4}', author_text)
    if found is None:
        return 'Unknown'
    return found.group()


def _link_segment(link, position):
    """Return one '/'-separated piece of a link containing 'doi', else 'Unknown'."""
    if 'doi' not in link:
        return 'Unknown'
    return link.split('/')[position]


# DOI-looking hyperlinks found on each article's page
df['References'] = df['Link'].apply(extract_references)

# Author string split into the leading name and the remainder
df['First Author'] = df['Authors'].apply(_first_author)
df['Other Authors'] = df['Authors'].apply(_other_authors)

# Publication year is read from the author string
df['Year'] = df['Authors'].apply(_publication_year)

# For links containing 'doi': third '/'-separated piece and last piece
df['Journal'] = df['Link'].apply(lambda link: _link_segment(link, 2))
df['DOI'] = df['Link'].apply(lambda link: _link_segment(link, -1))

# Check if all links are connectable
def check_link(link):
    """Report whether a GET request to ``link`` answers with HTTP 200.

    Args:
        link: URL to probe.

    Returns:
        True when the final response status is exactly 200; False for any
        other status or when the request fails (the error is printed).
    """
    try:
        # stream=True defers the body download since only the status line is
        # needed; the context manager releases the connection afterwards.
        # The timeout stops one dead host from stalling the whole column.
        with requests.get(link, timeout=30, stream=True) as response:
            return response.status_code == 200
    except Exception as e:
        print(f"Error checking link {link}: {e}")
        return False

# Probe every article link once; this issues one HTTP request per row.
df['Link Connectable'] = df['Link'].apply(check_link)

# Display up to the first 50 rows of the derived columns
print(df[['Title', 'First Author', 'Other Authors', 'Year', 'Journal', 'DOI', 'Link Connectable']].head(50))


KeyboardInterrupt: 

In [None]:
# Quick size check of the combined table: (rows, columns)
df.shape

# Save the combined table as an Excel workbook in the working directory
df.to_excel("AMR_Deep_Learning_Prediction_Publications.xlsx")

In [55]:
df.shape


(200, 11)

In [56]:
from nltk.tokenize import word_tokenize
import itertools

#IEEE Xplore

# Define the search keywords: one list of resistance terms and one list of
# deep-learning / NLP terms.
antibiotics_keywords = ["antibiotic resistance", "antimicrobial resistance", "AMR"]
ai_keywords = ["deep learning", "neural network", "embedding", "interpretable", "autoencoders", "CNN", "convolutional", "LSTM", "long short-term memory", "NLP", "Natural Language Processing", "transformer", "BERT"]

# Tokenize the keywords for easier matching: each list is joined into one
# string, tokenized, and reduced to a set of unique tokens.
# NOTE(review): these token sets are not referenced in the cells visible
# here -- confirm they are still needed.
antibiotics_tokens = set(word_tokenize(" ".join(antibiotics_keywords)))
ai_tokens = set(word_tokenize(" ".join(ai_keywords)))

import pandas as pd

def generate_ieee_urls(keyword_pair, num_pages=5):
    """Build IEEE Xplore journal-search URLs for one keyword combination.

    Args:
        keyword_pair: Iterable of keyword strings; they are joined with
            spaces into a single query.
        num_pages: Number of consecutive result pages, starting at page 1.

    Returns:
        A list with one URL string per page (empty when num_pages is 0).
    """
    from urllib.parse import quote

    # Encode the query once, outside the loop. safe='' also escapes '/', '&'
    # and '#', which the previous space-only replacement left raw and which
    # would otherwise cut the query string short.
    query = quote(" ".join(keyword_pair), safe='')
    return [
        f"https://ieeexplore.ieee.org/search/searchresult.jsp?queryText={query}&highlight=true&returnType=SEARCH&matchPubs=true&ranges=2010_2024_Year&returnFacets=ALL&refinements=ContentType:Journals&pageNumber={page_number}"
        for page_number in range(1, num_pages + 1)
    ]

# Every (resistance term, AI term) combination becomes one search query
keyword_pairs = list(itertools.product(antibiotics_keywords, ai_keywords))

# Build the first results-page URL for each combination, chain the per-pair
# lists into one flat list, and store it as a single-column DataFrame
ieee_urls = pd.DataFrame(
    list(itertools.chain.from_iterable(
        generate_ieee_urls(pair, 1) for pair in keyword_pairs
    )),
    columns=['URL'],
)

In [57]:
ieee_urls

Unnamed: 0,URL
0,https://ieeexplore.ieee.org/search/searchresul...
1,https://ieeexplore.ieee.org/search/searchresul...
2,https://ieeexplore.ieee.org/search/searchresul...
3,https://ieeexplore.ieee.org/search/searchresul...
4,https://ieeexplore.ieee.org/search/searchresul...
5,https://ieeexplore.ieee.org/search/searchresul...
6,https://ieeexplore.ieee.org/search/searchresul...
7,https://ieeexplore.ieee.org/search/searchresul...
8,https://ieeexplore.ieee.org/search/searchresul...
9,https://ieeexplore.ieee.org/search/searchresul...


In [59]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from tqdm import tqdm
import pandas as pd
import re

def parse_links_selenium(url):
    """Load a page in headless Chrome and return the IEEE document links on it.

    Args:
        url: Address of the page to open (here, an IEEE Xplore search page).

    Returns:
        A list of strings of the form
        'https://ieeexplore.ieee.org/document/<digits>', one per matching
        anchor in page order. Duplicates are not removed.
    """
    # One pattern does both the filtering and the extraction. The previous
    # extraction pattern demanded a '/' after the digits while the filter did
    # not, so a link without the trailing slash crashed on None.group().
    document_pattern = re.compile(r'https://ieeexplore\.ieee\.org/document/\d+')

    # Setup the webdriver
    webdriver_service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-popups")
    options.add_argument("--headless")  # Run in headless mode
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3')

    driver = webdriver.Chrome(service=webdriver_service, options=options)
    try:
        # Get the page
        driver.get(url)

        links = []
        for a_tag in driver.find_elements(By.TAG_NAME, 'a'):
            # Read the attribute once; each call is a round trip to the browser.
            href = a_tag.get_attribute('href')
            if href is None:
                continue
            found = document_pattern.match(href)
            if found:
                # Keep only the canonical document URL and drop any trailing
                # path such as '/' or '/citations'.
                links.append(found.group(0))
        return links
    finally:
        # Always close the browser, even when loading or parsing fails.
        driver.quit()

# Accumulate one {'URL': ...} record per document link found on any search page
results = []

# Visit each search URL with a progress bar and add its links in bulk
for url in tqdm(ieee_urls['URL']):
    links = parse_links_selenium(url)
    results.extend({'URL': link} for link in links)

# Tabulate the records and keep one row per distinct link
results_df = pd.DataFrame(results).drop_duplicates()

results_df.head()

  3%|▎         | 1/39 [00:03<02:09,  3.42s/it]

In [None]:
# Bind the de-duplicated document-link table to ieee_results, the name the
# detail-scraping cell reads from, then show its (rows, columns).
ieee_results = results_df
ieee_results.shape

(19, 1)

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm
import pandas as pd
from datetime import datetime


def parse_info_selenium(url):
    """Open an IEEE Xplore document page and extract its bibliographic fields.

    Args:
        url: Document page URL. Anything that is not a string starting with
            'http' is rejected without launching a browser.

    Returns:
        A pandas Series of six values in this order: journal, DOI, date of
        publication ('YYYY-MM-DD'), first author, title, abstract. A field
        that cannot be located or parsed is None; an invalid ``url`` yields
        six None values.
    """
    # Check that the URL is valid (isinstance also rejects None)
    if not isinstance(url, str) or not url.startswith('http'):
        return pd.Series([None]*6)

    # Setup the webdriver
    webdriver_service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument("--disable-popups")
    options.add_argument("--headless")  # Run in headless mode
    options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

    driver = webdriver.Chrome(service=webdriver_service, options=options)
    try:
        # Get the page
        driver.get(url)

        # Each lookup waits for up to 10 seconds for its element to appear
        wait = WebDriverWait(driver, 10)

        def text_of(css_selector):
            """Return the text of the first matching element, or None."""
            try:
                locator = (By.CSS_SELECTOR, css_selector)
                return wait.until(EC.presence_of_element_located(locator)).text
            except Exception:
                # Best-effort field: a timeout or stale element means "missing".
                # Exception (not a bare except) lets Ctrl-C still interrupt.
                return None

        journal = text_of('a.stats-document-abstract-publishedIn')
        doi = text_of('a[href^="https://doi.org/"]')

        publication_text = text_of('div.u-pb-1.doc-abstract-pubdate')
        try:
            date_string = publication_text.split("Date of Publication:")[1].strip()
            date_of_publication = datetime.strptime(date_string, "%d %B %Y").strftime("%Y-%m-%d")
        except (AttributeError, IndexError, ValueError):
            # Element missing, label missing, or date not in 'DD Month YYYY' form
            date_of_publication = None

        first_author = text_of('a[triggers="hover"] span')
        title = text_of('h1.document-title')
        abstract = text_of('div.abstract-text')

        return pd.Series([journal, doi, date_of_publication, first_author, title, abstract])
    finally:
        # Always close the browser, even if loading the page raised.
        driver.quit()

# Apply parse_info_selenium to each URL with one progress bar for the column.
# tqdm has to wrap the iteration itself: wrapping each returned Series in
# tqdm() hands pandas a tqdm object instead of a Series, so the six fields
# are not expanded into columns.
tqdm.pandas()
info_df = ieee_results['URL'].progress_apply(parse_info_selenium)

# Rename the columns (same order as the Series built by parse_info_selenium)
info_df.columns = ['Journal', 'DOI', 'Date of Publication', 'First Author', 'Title', 'Abstract']

# Concatenate the original dataframe with the new information; rows are
# aligned on the shared index
final_df = pd.concat([ieee_results, info_df], axis=1)

final_df

Unnamed: 0,URL,Journal,DOI,Date of Publication,First Author,Title,Abstract
0,https://ieeexplore.ieee.org/document/10081356,IEEE Journal of Biomedical and Health Informatics,10.1109/JBHI.2023.3261319,2023-03-27,Yue Wang,An Effective Model for Predicting Phage-Host I...,Abstract:\nIn the treatment of bacterial infec...
5,https://ieeexplore.ieee.org/document/8804188,IEEE Access,10.1109/ACCESS.2019.2935911,2019-08-16,C. Brandon Ogbunugafor,Genetic Background Modifies the Topography of ...,Abstract:\nUnderstanding the forces that drive...
10,https://ieeexplore.ieee.org/document/10164211,IEEE Journal of Biomedical and Health Informatics,10.1109/JBHI.2023.3290014,2023-06-27,Zhen Cui,DeepTPpred: A Deep Learning Approach With Matr...,Abstract:\nThe abuse of traditional antibiotic...
15,https://ieeexplore.ieee.org/document/10185002,IEEE Access,10.1109/ACCESS.2023.3296221,2023-07-17,Mukunthan Tharmakulasingam,TransAMR: An Interpretable Transformer Model f...,Abstract:\nAntimicrobial Resistance (AMR) is a...
20,https://ieeexplore.ieee.org/document/9585364,IEEE/ACM Transactions on Computational Biology...,10.1109/TCBB.2021.3122183,2021-10-26,Zhen Cui,RMSCNN: A Random Multi-Scale Convolutional Neu...,Abstract:\nThe abuse of traditional antibiotic...
25,https://ieeexplore.ieee.org/document/9187776,IEEE Access,10.1109/ACCESS.2020.3022829,2020-09-08,Nairveen Ali,Predictive Modeling of Antibiotic Susceptibili...,Abstract:\nThe antibiotic resistance of bacter...
31,https://ieeexplore.ieee.org/document/8962015,IEEE Sensors Journal,10.1109/JSEN.2020.2967058,2020-01-17,Rafael Iriya,Rapid Antibiotic Susceptibility Testing Based ...,Abstract:\nAntibiotic resistance is an increas...
37,https://ieeexplore.ieee.org/document/9928273,IEEE Access,10.1109/ACCESS.2022.3216896,2022-10-25,Mukunthan Tharmakulasingam,Explainable Deep Learning Approach for Multila...,Abstract:\nPredicting Antimicrobial Resistance...
48,https://ieeexplore.ieee.org/document/10110982,IEEE Journal of Biomedical and Health Informatics,10.1109/JBHI.2023.3271611,2023-04-28,Ritesh Sharma,Artificial Intelligence-Based Model for Predic...,Abstract:\nIn response to environmental threat...
53,https://ieeexplore.ieee.org/document/9541099,IEEE Journal of Biomedical and Health Informatics,10.1109/JBHI.2021.3113700,2021-09-20,Lin Deng,Scale-Adaptive Deep Model for Bacterial Raman ...,Abstract:\nThe combination of Raman spectrosco...


In [None]:
# Save the enriched IEEE table as CSV in the working directory
final_df.to_csv("ieee_final.csv")