In [3]:
# Retrieve basic protein information for each UniProt ID via the UniProt REST API
import pandas as pd
import requests
import time

# --- Configuration -----------------------------------------------------------
# Excel workbook listing the UniProt IDs to look up (replace with your own).
input_file = "data/uniprotids.xlsx"
# CSV file the retrieved information is written to (replace as desired).
output_file = "data/uniprot_full_info.csv"
# Header of the worksheet column that holds the UniProt IDs.
column_name = 'UniprotID'

# One dict per UniProt ID is appended here by the download loop.
results = []

# Load the Excel workbook into a DataFrame.
df = pd.read_excel(io=input_file)

def _first(items):
    """Return the first element of *items*, or an empty dict when there is none.

    Covers both a missing key (``None``) and a key that is present but holds
    an empty list, so chained ``.get`` calls never hit an ``IndexError``.
    """
    return items[0] if items else {}


def _extract_fields(uniprot_id, data):
    """Flatten one UniProtKB JSON entry into the columns written to the output.

    Args:
        uniprot_id: The ID the entry was requested with.
        data: Decoded JSON body of the UniProt REST response.

    Returns:
        A dict with the keys "UniprotID", "Protein Name", "Gene Name",
        "Organism", "Length", "Sequence" and "Function". Fields absent from
        *data* are returned as empty strings.
    """
    description = data.get("proteinDescription", {})
    # Entries without a recommendedName (typically unreviewed ones) carry
    # their name under submissionNames instead.
    name_block = description.get("recommendedName") or _first(
        description.get("submissionNames")
    )
    sequence = data.get("sequence", {})

    function_texts = []
    for comment in data.get("comments", []):
        # rest.uniprot.org labels comments with "commentType": "FUNCTION" and
        # stores the text under "texts"; the older "type"/"text" spelling is
        # still accepted so both response layouts work.
        comment_type = comment.get("commentType") or comment.get("type") or ""
        if comment_type.upper() != "FUNCTION":
            continue
        text = _first(comment.get("texts") or comment.get("text")).get("value", "")
        if text:
            function_texts.append(text)

    return {
        "UniprotID": uniprot_id,
        "Protein Name": name_block.get("fullName", {}).get("value", ""),
        "Gene Name": _first(data.get("genes")).get("geneName", {}).get("value", ""),
        "Organism": data.get("organism", {}).get("scientificName", ""),
        "Length": sequence.get("length", ""),
        "Sequence": sequence.get("value", ""),
        "Function": "; ".join(function_texts),
    }


# Iterate through each UniProt ID, reusing one HTTP session so the connection
# to the server is kept alive between requests.
with requests.Session() as session:
    # Blank spreadsheet cells are read as NaN; skip them rather than
    # requesting the literal ID "nan".
    for uniprot_id in df[column_name].dropna():
        uniprot_id = str(uniprot_id).strip()
        if not uniprot_id:
            continue

        # Query the API
        url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.json"
        try:
            # The timeout keeps one stalled connection from hanging the batch.
            response = session.get(url, timeout=30)
            response.raise_for_status()  # Raise exception for HTTP errors
            results.append(_extract_fields(uniprot_id, response.json()))
        except Exception as e:
            # Deliberately broad: this is the per-record boundary, so one bad
            # ID is reported and recorded without aborting the whole run.
            print(f"Error processing UniProt ID {uniprot_id}: {e}")
            results.append({"UniprotID": uniprot_id, "Error": str(e)})

        # Pause to avoid overloading the server
        time.sleep(0.1)

# Build a table from the collected per-ID dicts. Rows that lack a key
# (e.g. error rows, which only carry "UniprotID" and "Error") get NaN there.
output_df = pd.DataFrame(data=results)

# Write the table to the output CSV without the DataFrame index column.
output_df.to_csv(path_or_buf=output_file, index=False)

# Tell the user where the results were written.
print(f"All information saved to {output_file}")

All information saved to data/uniprot_full_info.csv
