# In [1]:
import requests
import csv
import pandas as pd


# Excel workbook to read; the first column of its first sheet is used as the
# list of protein names.
# NOTE(review): the leading "/" makes this an absolute path at the filesystem
# root -- adjust to where the workbook actually lives.
input_file = "/proteins_list.xlsx"  # path to the input workbook; edit for your setup
# CSV file that receives one row per phosphorylation feature found.
output_file = "phosphorylation_sites_from_proteins.csv"

# Base URLs

# Search endpoint used to resolve a protein name to an accession.
search_url = "https://rest.uniprot.org/uniprotkb/search"
# Feature endpoint; the accession is appended directly after the trailing slash.
features_url = "https://www.ebi.ac.uk/proteins/api/features/"

# Function to fetch UniProt ID by protein name

def fetch_uniprot_id(protein_name, timeout=30):
    """Look up the primary accession of the first search hit for a name.

    Sends ``protein_name`` unchanged as the ``query`` parameter of a GET
    request to ``search_url`` and reads the ``results`` list of the JSON
    response.

    Args:
        protein_name: Query string for the search endpoint.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``primaryAccession`` value of the first result, or ``None`` when
        the request fails, the status is not 200, the body is not valid JSON,
        or the search returns no results. A message is printed in each of
        those failure cases.
    """
    params = {
        'query': protein_name,
        'fields': 'accession,id',
        'format': 'json'
    }
    try:
        # Without a timeout a single stalled connection blocks the whole run.
        response = requests.get(search_url, params=params, timeout=timeout)
    except requests.RequestException as exc:
        # Best-effort: report and let the caller continue with the next name.
        print(f"Error fetching UniProt ID for {protein_name}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching UniProt ID for {protein_name}: {response.status_code}")
        return None

    try:
        results = response.json().get('results', [])
    except ValueError:
        print(f"Invalid JSON response for {protein_name}")
        return None

    if not results:
        # A successful but empty search is not an HTTP error; say so explicitly.
        print(f"No UniProt entry found for {protein_name}")
        return None

    # Only the first hit is used.
    return results[0].get('primaryAccession')

# Function to fetch PTM data using UniProt ID

def fetch_ptm_data(uniprot_id, timeout=30):
    """Fetch the feature record for an accession as decoded JSON.

    Issues a GET request to ``features_url`` with ``uniprot_id`` appended,
    asking for a JSON response via the ``Accept`` header.

    Args:
        uniprot_id: Accession appended to ``features_url``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded JSON body on a 200 response, or ``None`` when the request
        fails, the status is not 200, or the body is not valid JSON. A message
        is printed in each of those failure cases.
    """
    url = f"{features_url}{uniprot_id}"
    headers = {'Accept': 'application/json'}
    try:
        # Without a timeout a single stalled connection blocks the whole run.
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # Best-effort: report and let the caller continue with the next ID.
        print(f"Error fetching PTM data for {uniprot_id}: {exc}")
        return None

    if response.status_code != 200:
        print(f"Error fetching PTM data for {uniprot_id}: {response.status_code}")
        return None

    try:
        return response.json()
    except ValueError:
        print(f"Invalid JSON response for {uniprot_id}")
        return None

# Load the protein names from the first column of the Excel workbook.
# Replace this step if your names come from a different source.

proteins_df = pd.read_excel(input_file)
protein_names = proteins_df.iloc[:, 0].to_list()

# Main loop for protein names

# One dict per phosphorylation feature found, across all proteins.
all_phosphorylation_data = []

for protein_name in protein_names:
    # Blank spreadsheet cells are read as NaN; skip them instead of querying
    # the literal string "nan_HUMAN".
    if pd.isna(protein_name):
        continue
    protein_name = str(protein_name).strip()
    if not protein_name:
        continue

    # The search query is the name with a "_HUMAN" suffix appended.
    full_protein_name = f"{protein_name}_HUMAN"
    uniprot_id = fetch_uniprot_id(full_protein_name)
    if not uniprot_id:
        continue

    ptm_data = fetch_ptm_data(uniprot_id)
    if not ptm_data or 'features' not in ptm_data:  # Check if 'features' key exists
        continue

    for feature in ptm_data['features']:
        if feature.get('type') != 'MOD_RES':
            continue
        # "or ''" also covers a description that is present but null.
        description = feature.get('description') or ''
        if 'phospho' not in description.lower():
            continue
        all_phosphorylation_data.append({
            "Protein Name": protein_name,
            "UniProt ID": uniprot_id,
            "Description": feature.get('description'),
            "Position": feature.get('begin'),
            "Evidence": feature.get('evidences', [])
        })

# Save results to CSV

# Pin UTF-8 so the output does not depend on the platform's default encoding
# (feature descriptions may contain non-ASCII characters). newline='' lets the
# csv module control line endings itself.
with open(output_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=["Protein Name", "UniProt ID", "Description", "Position", "Evidence"])
    writer.writeheader()
    # Each row is a dict keyed by the field names above; the "Evidence" value
    # is written as its string representation.
    writer.writerows(all_phosphorylation_data)

print(f"Phosphorylation data saved to {output_file}")


# Output: Phosphorylation data saved to phosphorylation_sites_from_proteins.csv
