## Note that the code below is only used if you are working with Kaggle!

Code used for installing and setting up the browser environment.

In [None]:
# # install google chrome
# !wget https://dl.google.com/linux/linux_signing_key.pub
# !sudo apt-key add linux_signing_key.pub
# !echo 'deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main' >> /etc/apt/sources.list.d/google-chrome.list
# !sudo apt-get -y update
# !sudo apt-get install -y google-chrome-stable

# # install chromedriver
# # !apt-get install -y qq unzip
# !wget -O /tmp/chromedriver.zip http://chromedriver.storage.googleapis.com/`curl -sS chromedriver.storage.googleapis.com/LATEST_RELEASE`/chromedriver_linux64.zip
# !unzip /tmp/chromedriver.zip chromedriver -d /usr/bin/

In [None]:
# # To check Google Chrome's version
# !google-chrome --version

# # To check Chrome Driver's version
# !chromedriver -v

In [None]:
# # Install Firefox
# !sudo install -d -m 0755 /etc/apt/keyrings
# !wget -q https://packages.mozilla.org/apt/repo-signing-key.gpg -O- | sudo tee /etc/apt/keyrings/packages.mozilla.org.asc > /dev/null
# !gpg -n -q --import --import-options import-show /etc/apt/keyrings/packages.mozilla.org.asc | awk '/pub/{getline; gsub(/^ +| +$/,""); if($0 == "35BAA0B33E9EB396F59CA838C0BA5CE6DC6315A3") print "\nThe key fingerprint matches ("$0").\n"; else print "\nVerification failed: the fingerprint ("$0") does not match the expected one.\n"}'
# !echo "deb [signed-by=/etc/apt/keyrings/packages.mozilla.org.asc] https://packages.mozilla.org/apt mozilla main" | sudo tee -a /etc/apt/sources.list.d/mozilla.list > /dev/null
# !echo 'Package: * Pin: origin packages.mozilla.org Pin-Priority: 1000' | sudo tee /etc/apt/preferences.d/mozilla
# !sudo apt-get update && sudo apt-get install firefox -y

In [None]:
# Install Microsoft Edge
# Download Microsoft's package signing key and convert it from ASCII-armored to binary form.
!curl https://packages.microsoft.com/keys/microsoft.asc | gpg --dearmor > microsoft.gpg
# Copy the key into apt's trusted keyring directory (owner root, group root, mode 644).
!sudo install -o root -g root -m 644 microsoft.gpg /etc/apt/trusted.gpg.d/
# Register the Edge apt repository. The list file name says "dev", but the entry points at `stable main`.
!sudo sh -c 'echo "deb [arch=amd64] https://packages.microsoft.com/repos/edge stable main" > /etc/apt/sources.list.d/microsoft-edge-dev.list'
# The local copy of the key is no longer needed once it has been installed above.
!sudo rm microsoft.gpg
# Refresh the package index and install the stable Edge build.
!sudo apt update && sudo apt install microsoft-edge-stable -y

In [None]:
# Install the distribution-packaged Selenium bindings through apt (non-interactive because of -y).
!sudo apt install -y python3-selenium

In [None]:
# Python packages imported by the cells below:
# biopython provides Bio.SeqIO, selenium drives the browser,
# and webdriver-manager supplies EdgeChromiumDriverManager.
!pip install biopython
!pip install selenium
!pip install webdriver-manager

In [None]:
# !pip install tqdm
# !pip install urllib3

In [None]:
# Path configuration for the Kaggle runtime.
# KAGGLE_DIR is the base for generated output; SRC_DIR holds the input dataset.
KAGGLE_DIR = '/kaggle/working'
SRC_DIR = '/kaggle/input/dataset'

# Derived locations: the PDB output folder and the FASTA file that is parsed.
SAVE_DIR = f'{KAGGLE_DIR}/pdb'
SP6_PATH = f'{SRC_DIR}/train_set.fasta'

In [None]:
from Bio import SeqIO

def read_fasta(fasta_file=SP6_PATH):
    """Collect the first ``|``-separated field of every record ID in a FASTA file.

    Args:
        fasta_file: Path (or handle) passed to ``SeqIO.parse``; defaults to
            ``SP6_PATH``.

    Returns:
        A list with one string per record, in file order: the part of the
        record ID that precedes the first ``|`` (the whole ID when it
        contains no ``|``).
    """
    return [
        str(entry.id).partition('|')[0]
        for entry in SeqIO.parse(fasta_file, 'fasta')
    ]

In [None]:
from selenium import webdriver
from selenium.webdriver.edge.options import Options
from selenium.webdriver.edge.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
# from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support.ui import WebDriverWait
# from webdriver_manager.microsoft import GeckoDriverManager
from webdriver_manager.microsoft import EdgeChromiumDriverManager

from tqdm import tqdm
import logging
import urllib.request as req
import os
import numpy as np

def crawl():
    """Download the AlphaFold PDB file for every protein ID read from ``SP6_PATH``.

    For each ID returned by ``read_fasta`` the function:

    1. opens the UniProt entry page in a headless Edge browser,
    2. reads the ``href`` of the link whose text is ``AlphaFold``,
    3. opens that page and reads the ``href`` of the ``PDB file `` link,
    4. saves the linked file into ``SAVE_DIR`` under its original file name.

    Any exception raised while processing an ID is logged and the ID is
    recorded; processing then continues with the next ID.  After the loop
    the recorded IDs are written, one per line, to
    ``KAGGLE_DIR/error_ids.txt``.

    The browser session is always shut down with ``driver.quit()``, even if
    the loop is interrupted, so no browser/driver process is left behind.
    """
    # Read fasta file:
    prots = read_fasta(SP6_PATH)

    # Create the output directory once instead of re-checking it per protein.
    os.makedirs(SAVE_DIR, exist_ok=True)

    # Open driver (headless Edge; flags as used by the original notebook).
    options = Options()
    options.add_argument('--headless')
    options.add_argument('--no-sandbox')
    options.add_argument('--disable-dev-shm-usage')
    driver = webdriver.Edge(service=Service(EdgeChromiumDriverManager().install()), options=options)

    error_ids = []

    try:
        # Iterate the list directly so tqdm knows its length and can show
        # a total / ETA (wrapping it in enumerate() hides the length).
        for prot_id in tqdm(prots):
            try:
                # access Uniprot page
                driver.get(f'https://www.uniprot.org/uniprotkb/{prot_id}/entry#structure')
                alphafold_url = WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located((By.XPATH, "//a[text()='AlphaFold']"))
                ).get_attribute('href')

                # redirect to AlphaFold page
                driver.get(alphafold_url)
                download_url = WebDriverWait(driver, 30).until(
                    EC.presence_of_element_located((By.XPATH, "//a[text()='PDB file ']"))
                ).get_attribute('href')

                # Keep the remote file name (last URL path component).
                filename = SAVE_DIR + '/' + download_url.split('/')[-1]
                req.urlretrieve(download_url, filename)

            except Exception as e:
                # Best effort: remember the failing ID and keep crawling.
                error_ids.append(prot_id)
                logging.exception(e)
    finally:
        # quit() ends the whole WebDriver session; close() would only close
        # the current window and leave the driver process running.
        driver.quit()

    np.savetxt(KAGGLE_DIR + '/error_ids.txt', error_ids, fmt="%s")

In [None]:
# Run the crawler: PDB files are saved under SAVE_DIR and failing IDs go to KAGGLE_DIR/error_ids.txt.
crawl()

In [None]:
# !pip list