In [14]:
import mysql.connector as sql
import pandas as pd
import matplotlib.pyplot as plt
import requests
from bs4 import BeautifulSoup
from time import sleep
import time

In [18]:
import os
import time
import warnings

import pandas as pd
import requests
from bs4 import BeautifulSoup

def _fetch_subcellular_locations(uniprot_id, timeout=30):
    """Download one UniProt XML record and return its subcellular-location text.

    Parameters:
    uniprot_id: Accession inserted into the UniProt entry URL.
    timeout (float): Seconds to wait for the server before `requests` raises.

    Returns:
    str: Text of every 'subcellularLocation' element joined with ', '.
         An empty string when the record has no such element, or when the
         server answers with a non-200 status (a warning is emitted then).
    """
    url = f"https://www.uniprot.org/uniprot/{uniprot_id}.xml"
    response = requests.get(url, timeout=timeout)

    # An error page contains no location elements; report it instead of
    # letting it pass silently as "protein has no annotated location".
    if response.status_code != 200:
        warnings.warn(
            f"UniProt returned HTTP {response.status_code} for {uniprot_id!r}; "
            "recording an empty location.",
            stacklevel=2,
        )
        return ''

    soup = BeautifulSoup(response.content, 'lxml-xml')
    return ', '.join(
        element.get_text(separator=' ', strip=True)
        for element in soup.find_all('subcellularLocation')
    )


def get_protein_locations_from_file(file_path, uniprot_id_column='uniprot'):
    """
    Reads a CSV or TSV file containing UniProt IDs, retrieves the subcellular locations
    for each unique UniProt ID from the UniProt website, and returns a DataFrame with the
    UniProt ID and its corresponding subcellular locations.

    Parameters:
    file_path (str): Path to the input file. A '.tsv' extension (any case) is read as
                     tab-separated; every other extension is read as comma-separated.
    uniprot_id_column (str): Name of the column holding the UniProt IDs
                             (default 'uniprot').

    Returns:
    pd.DataFrame: A DataFrame with columns 'uni_prot_id' and 'location', one row per
                  unique non-missing ID in order of first appearance in the file.
                  'location' is the ', '-joined text of the entry's subcellular
                  locations, or '' when none were found.

    Raises:
    KeyError: If `uniprot_id_column` is not a column of the file.
    requests.RequestException: If a request fails or times out.
    """
    # Determine file separator based on file extension
    _, file_extension = os.path.splitext(file_path)
    sep = '\t' if file_extension.lower() == '.tsv' else ','

    # Load the file
    final_df = pd.read_csv(file_path, sep=sep)

    # Blank cells are read as NaN and must not be looked up; drop_duplicates keeps
    # first-appearance order, so the output is the same on every run.
    unique_ids = final_df[uniprot_id_column].dropna().drop_duplicates()

    # Dictionary to store locations for each UniProt ID
    get_cell_location = {}
    for position, uniprot_id in enumerate(unique_ids):
        if position:
            time.sleep(1)  # Pause between requests to avoid overwhelming the server
        get_cell_location[uniprot_id] = _fetch_subcellular_locations(uniprot_id)

    # Create DataFrame from results
    return pd.DataFrame(list(get_cell_location.items()), columns=['uni_prot_id', 'location'])


In [None]:
def _export_protein_locations(source_path, output_csv):
    """Look up subcellular locations for the IDs in `source_path` and save them.

    The table returned by `get_protein_locations_from_file` is written to
    `output_csv` without the index column and handed back to the caller so it
    stays available in the notebook.
    """
    location_table = get_protein_locations_from_file(source_path)
    location_table.to_csv(output_csv, index=False)
    return location_table


# One lookup-and-save pass per input file, run in the same order as before.
df_drug_central = _export_protein_locations(
    'tchem_drugs_05122020.tsv', 'drug_central_sub_location.csv'
)
df_chembl = _export_protein_locations(
    'dataset/chembl.csv', 'chembl_sub_location.csv'
)
df_pharos = _export_protein_locations(
    'dataset/pharos.csv', 'pharos_sub_location.csv'
)
