In [None]:
import re
import time

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException, ElementClickInterceptedException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

In [None]:
# Load the input table into the module-level `df` that the later cells read and extend.
# NOTE(review): the path ends in .csv but is parsed with pd.read_excel - confirm the file
# really is an Excel workbook; if it is plain CSV, pd.read_csv is the matching reader.
df = pd.read_excel("./Data.csv")
# Bare expression so the notebook renders the loaded frame as this cell's output.
df

Unnamed: 0,Epitope
0,FNCLGMSNRDFLEGVSGATW
1,APTRVVAAEM
2,TPRMCTREEF
3,GLDFSDLYY
4,ALVEFKDAHAKRQTV
...,...
121,LLLGHGPIRMVLAILAFLRF
122,LSHLMGRREEGATIGFSMDI
123,MRNKGIGKMGFGMVTLGASA
124,MVLAILAFLRFTAIKPSLGL


In [None]:
# Sentinel written to the table when an optional ProtParam field is absent.
_MISSING = 'NIL'

# Signed decimal number such as "20", "4.37" or "-0.035".
_NUMBER = r"-?\d+(?:\.\d+)?"


def _required_field(pattern, paragraph, cast, label):
    """Return the first capture group of `pattern` converted with `cast`; raise ValueError if absent."""
    match = re.search(pattern, paragraph)
    if match is None:
        raise ValueError(f"ProtParam report is missing required field: {label}")
    return cast(match.group(1))


def _optional_float(pattern, paragraph):
    """Return the first capture group of `pattern` as a float, or None when nothing matches."""
    match = re.search(pattern, paragraph)
    return float(match.group(1)) if match else None


def extract_data(paragraph, i, frame=None):
    """Parse one ProtParam text report and store its features in row `i` of a DataFrame.

    Parameters
    ----------
    paragraph : str
        Text of the ProtParam result block.
    i : hashable
        Row label (used with ``.loc``) of the row that receives the features.
    frame : pandas.DataFrame, optional
        Table to write into. When omitted, the module-level ``df`` is used,
        which is what the original two-argument call did.

    Returns
    -------
    dict
        Column name -> extracted value, in the same order the columns are
        written. Optional fields that are absent from the report hold 'NIL'.

    Raises
    ------
    ValueError
        If the amino-acid count, molecular weight, theoretical pI or total
        atom count cannot be found in `paragraph`.

    Side effects
    ------------
    Prints every extracted value and assigns ``frame.loc[i, columns]``.
    """
    # --- Header fields -----------------------------------------------------
    sequence_length = _required_field(
        r"Number of amino acids: (\d+)", paragraph, int, "Number of amino acids")
    mw = _required_field(
        r"Molecular weight: (\d+(?:\.\d+)?)", paragraph, float, "Molecular weight")
    pi = _required_field(
        r"Theoretical pI: (\d+(?:\.\d+)?)", paragraph, float, "Theoretical pI")

    print(f"Number of amino acids: {sequence_length}")
    print(f"Molecular weight: {mw}")
    print(f"Theoretical pI: {pi}")

    colheads = ['mw', 'aa_count', 'pi']
    coldata = [mw, sequence_length, pi]

    # --- Amino acid composition ("Ala (A)   1   5.0%") -----------------------
    # One column per one-letter code holding the percentage; the per-residue
    # counts are kept only to derive the charged-residue totals below.
    aa_composition = {}
    aa_pattern = r"([A-Z][a-z]{2}) \(([A-Z])\)[ \t]+(\d+)[ \t]+([\d.]+)%"
    for _three_letter, aa_code, count_text, percent_text in re.findall(aa_pattern, paragraph):
        residue_percentage = float(percent_text)
        aa_composition[aa_code] = {"percentage": residue_percentage, "count": int(count_text)}
        colheads.append(aa_code)
        coldata.append(residue_percentage)

    print("Amino acid composition:")
    for aa_code, entry in aa_composition.items():
        print(f"{aa_code}: percentage = {entry['percentage']}%")

    # A residue that is not listed contributes zero instead of raising KeyError.
    neg_charge = sum(aa_composition.get(code, {}).get('count', 0) for code in ('D', 'E'))
    pos_charge = sum(aa_composition.get(code, {}).get('count', 0) for code in ('R', 'K'))

    colheads.extend(['total_neg_res', 'total_pos_res'])
    coldata.extend([neg_charge, pos_charge])

    print(f"Total number of negatively charged residues (Asp + Glu): {neg_charge}")
    print(f"Total number of positively charged residues (Arg + Lys): {pos_charge}")

    # --- Atomic composition ("Carbon      C        96") ----------------------
    atom_composition = {}
    atom_pattern = r"([A-Z][a-z]*) +([A-Z]+) +(\d+)"
    for atom_name, atom_symbol, count_text in re.findall(atom_pattern, paragraph):
        atom_count = int(count_text)
        atom_composition[atom_name] = {"symbol": atom_symbol, "count": atom_count}
        colheads.append(atom_name + ' count')
        coldata.append(atom_count)

    print(f"Atomic composition:")
    for entry in atom_composition.values():
        print(f"{entry['symbol']} {entry['count']}")

    total_atoms = _required_field(
        r"Total number of atoms: (\d+)", paragraph, int, "Total number of atoms")
    colheads.append('total_atoms')
    coldata.append(total_atoms)
    print(f"Total number of atoms: {total_atoms}")

    # --- Extinction coefficient ----------------------------------------------
    # The value may be an integer, so the decimal part is optional. A report can
    # list the coefficient more than once; every match is printed, but exactly
    # one value (the first) is stored so headers and values stay aligned.
    ext_coeff_pattern = r"Ext\.? coeff(?:icient)?\s+(\d+(?:\.\d+)?)"
    ext_coeff_matches = re.findall(ext_coeff_pattern, paragraph)

    colheads.append('ext_coeff')
    if ext_coeff_matches:
        print("Extinction coefficients found:")
        for match in ext_coeff_matches:
            print(match)
        coldata.append(float(ext_coeff_matches[0]))
    else:
        coldata.append(_MISSING)
        print("No extinction coefficients found.")

    # --- Half-life -------------------------------------------------------------
    half_life = _optional_float(
        r"(\d+(?:\.\d+)?) hours \(mammalian reticulocytes, in vitro\)", paragraph)
    colheads.append('half_life')
    if half_life is not None:
        print("The half-life in mammalian reticulocytes is:", half_life, "hours.")
        coldata.append(half_life)
    else:
        coldata.append(_MISSING)
        print("No half-life information found for mammalian reticulocytes.")

    # --- Stability / hydropathy indices ----------------------------------------
    # _NUMBER accepts a leading minus sign so negative values are parsed too.
    instability_index = _optional_float(
        rf"instability index \(II\) is computed to be ({_NUMBER})", paragraph)
    aliphatic_index = _optional_float(rf"Aliphatic index: ({_NUMBER})", paragraph)
    gravy = _optional_float(
        rf"Grand average of hydropathicity \(GRAVY\): ({_NUMBER})", paragraph)

    colheads.append('instability_index')
    if instability_index is not None:
        coldata.append(instability_index)
        print(f"Instability index: {instability_index}")
    else:
        coldata.append(_MISSING)
        print("No instability index found.")

    colheads.append('aliphatic_index')
    if aliphatic_index is not None:
        print(f"Aliphatic index: {aliphatic_index}")
        coldata.append(aliphatic_index)
    else:
        coldata.append(_MISSING)
        print("No Aliphatic index found.")

    colheads.append('GRAVY')
    if gravy is not None:
        coldata.append(gravy)
        print(f"Gravy Match: {gravy}")
    else:
        coldata.append(_MISSING)
        print("No Gravy Match found.")

    # --- Store -----------------------------------------------------------------
    # Fall back to the notebook-level table when no explicit frame is supplied.
    target = df if frame is None else frame
    target.loc[i, colheads] = coldata

    return dict(zip(colheads, coldata))

In [None]:
# Address of the Expasy ProtParam web page.
url = "https://web.expasy.org/protparam/"

In [None]:

# Seconds to wait on each result page before reading it. The value is kept from
# the original fixed sleep; presumably it lets the page settle and also spaces
# out requests to the public server - tune as needed.
RESULT_PAUSE_S = 10

# XPath locators for the ProtParam form and its result block.
SEQUENCE_BOX = (By.XPATH, '//*[@id="sib_body"]/form/textarea')
SUBMIT_BUTTON = (By.XPATH, '//*[@id="sib_body"]/form/p[1]/input[2]')
RESULT_BLOCK = (By.XPATH, '//*[@id="sib_body"]/pre[2]')

# One browser session serves every sequence. Passing the driver path through a
# Service object replaces the deprecated positional argument.
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()))
try:
    wait = WebDriverWait(driver, 10)

    # Select the sequences by column name and keep their real index labels, so
    # an extra leading column cannot be mistaken for the sequence and
    # extract_data writes to the matching row. The list() snapshot keeps the
    # iteration independent of the columns extract_data adds to df.
    for i, sequence in list(df['Epitope'].items()):
        if not isinstance(sequence, str) or not sequence.strip():
            print(f"Row {i}: no sequence to submit; skipped.")
            continue

        try:
            driver.get(url)

            seq_field = wait.until(EC.presence_of_element_located(SEQUENCE_BOX))
            seq_field.clear()
            seq_field.send_keys(sequence)

            submit_btn = wait.until(EC.element_to_be_clickable(SUBMIT_BUTTON))
            submit_btn.click()

            data_div = wait.until(EC.presence_of_element_located(RESULT_BLOCK))
            time.sleep(RESULT_PAUSE_S)
            report_text = data_div.text
        except TimeoutException:
            # A slow or failed page costs one row, not the whole run.
            print(f"Row {i}: timed out waiting for ProtParam; skipped {sequence!r}.")
            continue

        extract_data(report_text, i)
finally:
    # quit() ends the browser and the driver process even if a row raised.
    driver.quit()

  driver = webdriver.Chrome(ChromeDriverManager().install())


Number of amino acids: 20
Molecular weight: 2204.46
Theoretical pI: 4.37
Amino acid composition:
A: percentage = 5.0%
R: percentage = 5.0%
N: percentage = 10.0%
D: percentage = 5.0%
C: percentage = 5.0%
Q: percentage = 0.0%
E: percentage = 5.0%
G: percentage = 15.0%
H: percentage = 0.0%
I: percentage = 0.0%
L: percentage = 10.0%
K: percentage = 0.0%
M: percentage = 5.0%
F: percentage = 10.0%
P: percentage = 0.0%
S: percentage = 10.0%
T: percentage = 5.0%
W: percentage = 5.0%
Y: percentage = 0.0%
V: percentage = 5.0%
O: percentage = 0.0%
U: percentage = 0.0%
Total number of negatively charged residues (Asp + Glu): 2
Total number of positively charged residues (Arg + Lys): 1
Atomic composition:
C 96
H 142
N 26
O 30
S 2
Total number of atoms: 296
No extinction coefficients found.
The half-life in mammalian reticulocytes is: 1.1 hours.
Instability index: 24.13
Aliphatic index: 58.5
Gravy Match: 0.035


  driver = webdriver.Chrome(ChromeDriverManager().install())


NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=111.0.5563.65)
Stacktrace:
Backtrace:
	(No symbol) [0x00E9DCE3]
	(No symbol) [0x00E339D1]
	(No symbol) [0x00D44DA8]
	(No symbol) [0x00D2D0D3]
	(No symbol) [0x00D8EA8B]
	(No symbol) [0x00D9D093]
	(No symbol) [0x00D8ACC6]
	(No symbol) [0x00D66F68]
	(No symbol) [0x00D680CD]
	GetHandleVerifier [0x01113832+2506274]
	GetHandleVerifier [0x01149794+2727300]
	GetHandleVerifier [0x0114E36C+2746716]
	GetHandleVerifier [0x00F46690+617600]
	(No symbol) [0x00E3C712]
	(No symbol) [0x00E41FF8]
	(No symbol) [0x00E420DB]
	(No symbol) [0x00E4C63B]
	BaseThreadInitThunk [0x76A97D69+25]
	RtlInitializeExceptionChain [0x7760BB9B+107]
	RtlClearBits [0x7760BB1F+191]


In [None]:
# Write the table (input columns plus the feature columns added by extract_data) to CSV.
# NOTE(review): to_csv also writes the index as an unnamed first column by default, which
# shows up as "Unnamed: 0" when the file is read back - confirm whether that is wanted.
df.to_csv('./test1.csv')