In [9]:
import itertools
import re
import time
from urllib.parse import quote

import pandas as pd
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

# Boolean search expression shared by all three literature sources:
# an antimicrobial-resistance term AND at least one deep-learning / NLP term.
search_equation = (
    "(antibiotic resistance OR antimicrobial resistance OR AMR) AND "
    "('deep learning' OR 'neural network' OR embedding OR interpretable OR "
    "autoencoders OR CNN OR convolutional OR LSTM OR 'long short-term memory' OR "
    "NLP OR 'Natural Language Processing' OR transformer OR BERT)"
)

# Function to search Google Scholar with pagination and error handling
def search_google_scholar(query, num_pages=10):
    """Scrape Google Scholar result pages for ``query``, limited to 2023-2024.

    Args:
        query: Free-text search expression. It is passed through ``params`` so
            that ``requests`` URL-encodes it (spaces, quotes, ``&``, ``#``...).
        num_pages: Number of result pages to request; the ``start`` offset
            advances by 10 for each page.

    Returns:
        A list of dicts with the keys ``Title``, ``Authors``, ``Snippet``,
        ``Link`` and ``Source`` (always ``'Google Scholar'``). Pages that still
        fail after three attempts contribute no rows.
    """
    articles = []
    for page in range(num_pages):
        params = {'q': query, 'start': page * 10, 'as_ylo': 2023, 'as_yhi': 2024}
        for attempt in range(3):  # Retry mechanism
            try:
                # A timeout keeps one stalled connection from hanging the notebook.
                response = requests.get("https://scholar.google.com/scholar", params=params, timeout=30)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')

                # Collect into a per-attempt list so that a failure halfway
                # through a page cannot leave duplicates behind after a retry.
                page_articles = []
                for item in soup.select('[data-lid]'):
                    title_tag = item.select_one('.gs_rt')
                    if title_tag is None:
                        continue  # element carries data-lid but has no title node
                    authors_tag = item.select_one('.gs_a')
                    snippet_tag = item.select_one('.gs_rs')
                    link_tag = title_tag.select_one('a[href]')
                    page_articles.append({
                        'Title': title_tag.text,
                        'Authors': authors_tag.text if authors_tag else '',
                        'Snippet': snippet_tag.text if snippet_tag else '',
                        # Entries without an anchor get an empty link.
                        'Link': link_tag['href'] if link_tag else '',
                        'Source': 'Google Scholar',
                    })
                articles.extend(page_articles)
                break  # Exit retry loop if successful
            except Exception as e:
                print(f"Error fetching Google Scholar page {page+1}: {e}")
                time.sleep(2)  # Wait before retrying
    return articles

# Function to search PubMed with pagination and error handling
def search_pubmed(query, num_pages=10):
    """Scrape PubMed result pages for ``query``, filtered to 2023-2024.

    Args:
        query: Free-text search expression. It is passed through ``params`` so
            that ``requests`` URL-encodes it.
        num_pages: Number of result pages to request (pages are 1-based).

    Returns:
        A list of dicts with the keys ``Title``, ``Authors``, ``Snippet``,
        ``Link`` and ``Source`` (always ``'PubMed'``). Pages that still fail
        after three attempts contribute no rows.
    """
    articles = []
    for page in range(num_pages):
        params = {'term': query, 'filter': 'years.2023-2024', 'page': page + 1}
        for attempt in range(3):  # Retry mechanism
            try:
                # A timeout keeps one stalled connection from hanging the notebook.
                response = requests.get("https://pubmed.ncbi.nlm.nih.gov/", params=params, timeout=30)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')

                # Collect into a per-attempt list so that a failure halfway
                # through a page cannot leave duplicates behind after a retry.
                page_articles = []
                for item in soup.select('.docsum-content'):
                    title_tag = item.select_one('.docsum-title')
                    if title_tag is None:
                        continue  # nothing usable without a title element
                    authors_tag = item.select_one('.docsum-authors')
                    snippet_tag = item.select_one('.full-view-snippet')
                    href = title_tag.get('href')
                    page_articles.append({
                        'Title': title_tag.text.strip(),
                        'Authors': authors_tag.text.strip() if authors_tag else '',
                        'Snippet': snippet_tag.text.strip() if snippet_tag else '',
                        # The href is appended to the site root; empty when absent.
                        'Link': ("https://pubmed.ncbi.nlm.nih.gov" + href) if href else '',
                        'Source': 'PubMed',
                    })
                articles.extend(page_articles)
                break  # Exit retry loop if successful
            except Exception as e:
                print(f"Error fetching PubMed page {page+1}: {e}")
                time.sleep(2)  # Wait before retrying
    return articles

# Function to search IEEE Xplore with pagination and error handling
def search_ieee(query, num_pages=10):
    """Scrape IEEE Xplore search result pages for ``query`` (2023-2024 range).

    Args:
        query: Free-text search expression. It is passed through ``params`` so
            that ``requests`` URL-encodes it.
        num_pages: Number of result pages to request (pages are 1-based).

    Returns:
        A list of dicts with the keys ``Title``, ``Authors``, ``Snippet``,
        ``Link`` and ``Source`` (always ``'IEEE Xplore'``). Pages that still
        fail after three attempts contribute no rows.
    """
    # NOTE(review): IEEE Xplore appears to render its result list client-side;
    # confirm that these CSS selectors match anything in the raw HTML response.
    articles = []
    for page in range(num_pages):
        params = {'queryText': query, 'ranges': '2023_2024_Year', 'pageNumber': page + 1}
        for attempt in range(3):  # Retry mechanism
            try:
                # A timeout keeps one stalled connection from hanging the notebook.
                response = requests.get("https://ieeexplore.ieee.org/search/searchresult.jsp", params=params, timeout=30)
                response.raise_for_status()
                soup = BeautifulSoup(response.text, 'html.parser')

                # Collect into a per-attempt list so that a failure halfway
                # through a page cannot leave duplicates behind after a retry.
                page_articles = []
                for item in soup.select('.List-results-items'):
                    title_tag = item.select_one('.title')
                    if title_tag is None:
                        continue  # nothing usable without a title element
                    authors_tag = item.select_one('.author')
                    snippet_tag = item.select_one('.description')
                    link_tag = title_tag.select_one('a[href]')
                    page_articles.append({
                        'Title': title_tag.text.strip(),
                        'Authors': authors_tag.text.strip() if authors_tag else '',
                        'Snippet': snippet_tag.text.strip() if snippet_tag else '',
                        # The href is appended to the site root; empty when absent.
                        'Link': ("https://ieeexplore.ieee.org" + link_tag['href']) if link_tag else '',
                        'Source': 'IEEE Xplore',
                    })
                articles.extend(page_articles)
                break  # Exit retry loop if successful
            except Exception as e:
                print(f"Error fetching IEEE Xplore page {page+1}: {e}")
                time.sleep(2)  # Wait before retrying
    return articles

# Perform searches with pagination sequentially
google_scholar_results = search_google_scholar(search_equation)
pubmed_results = search_pubmed(search_equation)
ieee_results = search_ieee(search_equation)

# Combine results and remove duplicates (first occurrence of each title wins).
# The column list is given explicitly so that an empty result set still yields
# a frame with a 'Title' column instead of a KeyError in drop_duplicates.
all_results = google_scholar_results + pubmed_results + ieee_results
df = pd.DataFrame(
    all_results,
    columns=['Title', 'Authors', 'Snippet', 'Link', 'Source'],
).drop_duplicates(subset=['Title'])

# Function to extract references from articles
def extract_references(article_url):
    """Collect the DOI-looking hyperlinks found on an article page.

    Args:
        article_url: URL of the page to download and scan.

    Returns:
        The ``href`` values of all ``<a>`` tags whose href contains the
        substring ``'doi'``, in page order with repeats removed. An empty list
        is returned (and the error printed) when the page cannot be fetched.
    """
    try:
        # A timeout keeps one unresponsive site from stalling the whole apply().
        response = requests.get(article_url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        doi_links = (
            anchor['href']
            for anchor in soup.find_all('a', href=True)
            if 'doi' in anchor['href']
        )
        # dict.fromkeys drops repeated links while preserving first-seen order.
        return list(dict.fromkeys(doi_links))
    except Exception as e:
        print(f"Error extracting references from {article_url}: {e}")
        return []

# Extract references for each article (one HTTP request per row)
df['References'] = df['Link'].apply(extract_references)

# Extract first author and other authors from the comma-separated author text.
# Missing values are treated as empty text and every name is stripped, so the
# re-joined "Other Authors" no longer contains doubled spaces.
author_parts = df['Authors'].fillna('').apply(
    lambda text: [part.strip() for part in text.split(',')]
)
df['First Author'] = author_parts.apply(lambda parts: parts[0])
df['Other Authors'] = author_parts.apply(lambda parts: ', '.join(parts[1:]))

# Extract year of publication: first standalone 19xx/20xx number in the author
# text (a bare \d{4} would also match the start of any longer digit run).
df['Year'] = (
    df['Authors']
    .fillna('')
    .str.extract(r'\b((?:19|20)\d{2})\b', expand=False)
    .fillna('Unknown')
)

# Extract journal and DOI
# NOTE(review): for links containing 'doi', 'Journal' receives the third
# '/'-separated piece of the URL (the host) and 'DOI' the last path segment --
# confirm that this is the intended content of these columns.
links = df['Link'].fillna('')
df['Journal'] = links.apply(
    lambda url: url.split('/')[2] if 'doi' in url and url.count('/') >= 2 else 'Unknown'
)
df['DOI'] = links.apply(lambda url: url.split('/')[-1] if 'doi' in url else 'Unknown')

# Check if all links are connectable
def check_link(link):
    """Report whether ``link`` answers an HTTP GET with status code 200.

    Args:
        link: URL to probe.

    Returns:
        ``True`` when the final response status is exactly 200; ``False`` for
        any other status or when the request raises (the error is printed).
    """
    try:
        # stream=True defers the body download because only the status code is
        # needed; the context manager closes the connection afterwards. The
        # timeout prevents a dead host from blocking the whole apply().
        with requests.get(link, timeout=10, stream=True) as response:
            return response.status_code == 200
    except Exception as e:
        print(f"Error checking link {link}: {e}")
        return False

# Probe every article link and store the boolean outcome next to it
link_status = df['Link'].apply(check_link)
df['Link Connectable'] = link_status

# Print up to 50 rows restricted to the summary columns
summary_columns = [
    'Title',
    'First Author',
    'Other Authors',
    'Year',
    'Journal',
    'DOI',
    'Link Connectable',
]
print(df[summary_columns].head(50))


                                                Title  \
0   Deep learning-guided discovery of an antibioti...   
1             Childhood community-acquired pneumonia.   
2   CARD 2023: expanded curation, support for mach...   
3   Artificial intelligence, machine learning and ...   
4   Discovery of a structural class of antibiotics...   
5   Jumping into the future: overcoming pharmacoki...   
6   Antimicrobial susceptibility testing: An updat...   
7     Antibiotic resistance in bacterial communities.   
8   Detection, Genophenotypic Characterization, an...   
9   Antimicrobial resistance and machine learning:...   
10  Photomodulation Approaches to Overcome Antimic...   
11  Incidence of infection with multidrug-resistan...   
12  Cell-free biosynthesis combined with deep lear...   
13  Antimicrobial resistance in patients with COVI...   
14  Exploiting genomics for antimicrobial resistan...   
15  Antimicrobial resistance and its relationship ...   
16  Electrokinetics in antimicr

In [12]:
# Write the results table to an Excel workbook in the working directory.
# (A bare `df.shape` placed before this call was never displayed, because only
# a cell's last expression is echoed, so it has been dropped.)
df.to_excel("AMR_Deep_Learning_Prediction_Publications.xlsx")

In [11]:
# Display the (rows, columns) dimensions of the results table.
df.shape


(50, 12)

In [14]:
#IEEE Xplore

# Define the search keywords
antibiotics_keywords = ["antibiotic resistance", "antimicrobial resistance", "AMR"]
ai_keywords = ["deep learning", "neural network", "embedding", "interpretable", "autoencoders", "CNN", "convolutional", "LSTM", "long short-term memory", "NLP", "Natural Language Processing", "transformer", "BERT"]

# Tokenize the keywords for easier matching.
# `word_tokenize` is not imported anywhere in this notebook, so the cell used to
# fail on a fresh kernel. The keywords contain no punctuation other than the
# hyphen in "short-term", so whitespace splitting is used instead; it is
# expected to yield the same tokens here.
antibiotics_tokens = set(" ".join(antibiotics_keywords).split())
ai_tokens = set(" ".join(ai_keywords).split())

import pandas as pd

def generate_ieee_urls(keyword_pair, num_pages=5):
    """Build IEEE Xplore journal-search URLs for a group of keywords.

    Args:
        keyword_pair: Iterable of keyword strings; they are joined with spaces
            into a single ``queryText`` value.
        num_pages: How many consecutive result pages (1-based) to generate.

    Returns:
        A list with ``num_pages`` URLs that differ only in ``pageNumber``.
    """
    # Percent-encode the query once (it does not change between pages). quote()
    # turns spaces into %20 as before and also escapes reserved characters such
    # as '&' or '#', which would otherwise corrupt the query string.
    query = quote(" ".join(keyword_pair), safe="")
    return [
        f"https://ieeexplore.ieee.org/search/searchresult.jsp?queryText={query}&highlight=true&returnType=SEARCH&matchPubs=true&ranges=2023_2024_Year&returnFacets=ALL&refinements=ContentType:Journals&pageNumber={page+1}"
        for page in range(num_pages)
    ]

# Cartesian product: each antibiotic-resistance term paired with each AI term
keyword_pairs = list(itertools.product(antibiotics_keywords, ai_keywords))

# One search URL (first results page only) per keyword pair, built directly as
# a flat list instead of a list of lists that is flattened afterwards
ieee_urls = [
    url
    for pair in keyword_pairs
    for url in generate_ieee_urls(pair, 1)
]

# Single-column table of the URLs; the trailing expression displays it
ieee_results = pd.DataFrame(ieee_urls, columns=['URL'])
ieee_results

Unnamed: 0,URL
0,https://ieeexplore.ieee.org/search/searchresul...
1,https://ieeexplore.ieee.org/search/searchresul...
2,https://ieeexplore.ieee.org/search/searchresul...
3,https://ieeexplore.ieee.org/search/searchresul...
4,https://ieeexplore.ieee.org/search/searchresul...
5,https://ieeexplore.ieee.org/search/searchresul...
6,https://ieeexplore.ieee.org/search/searchresul...
7,https://ieeexplore.ieee.org/search/searchresul...
8,https://ieeexplore.ieee.org/search/searchresul...
9,https://ieeexplore.ieee.org/search/searchresul...


In [15]:
# Save the table of generated IEEE Xplore search URLs as a CSV file in the
# current working directory.
ieee_results.to_csv('ieee_urls.csv')