In [1]:
# Essential imports
import re

from io import StringIO

import pandas as pd
import numpy as np

from Bio import SeqIO
from bioservices import UniProt

import tqdm

# Connect to UniProt service
u = UniProt(verbose=False)

In [2]:
df = pd.read_csv('protein-sequences.tsv', sep = '\t')

In [3]:
df

Unnamed: 0,Accession,Source Database,Name,Tax ID,Tax Name,Length
0,A0A011VQM7,unreviewed,Ammonium transporter AmtB-like domain-containi...,1341156,Ruminococcus albus SY3,606
1,A0A063ZJU6,unreviewed,Ammonium transporter AmtB-like domain-containi...,1495067,Halostagnicola sp. A56,586
2,A0A081EVR3,unreviewed,Ammonium transporter AmtB-like domain-containi...,2248,Halorubrum saccharovorum,559
3,A0A099I9L1,unreviewed,Ammonium transporter AmtB-like domain-containi...,1522,Clostridium innocuum,570
4,A0A0D8IWY3,unreviewed,Ammonium transporter AmtB-like domain-containi...,1550024,Ruthenibacterium lactatiformans,567
...,...,...,...,...,...,...
1401,W0JIB3,unreviewed,Ammonium transporter AmtB-like domain-containi...,797299,Halostagnicola larsenii XH-48,584
1402,W3ANU5,unreviewed,Ammonium transporter AmtB-like domain-containi...,1165092,Lachnospiraceae bacterium JC7,584
1403,W4V1Q8,unreviewed,Ammonium transporter AmtB-like domain-containi...,1294263,Acetivibrio straminisolvens JCM 21531,356
1404,W4V8S6,unreviewed,Ammonium transporter AmtB-like domain-containi...,1294263,Acetivibrio straminisolvens JCM 21531,584


In [4]:
accession = df['Accession'].to_list()

In [5]:
with open('Accession.txt', 'w') as fp:
    for items in accession:
        # write each item on a new line
        fp.write("%s\n" % items)
    print('Done')

Done


In [6]:

import json
import ssl
import time
from urllib import request
from urllib.error import HTTPError

BASE_URL = "https://www.ebi.ac.uk/interpro/api/entry/pfam/protein/UniProt/"

def fetch_interpro_data(uniprotid):
    """Query InterPro API for a single UniProt ID and return results as a list"""

    context = ssl._create_unverified_context()
    next_url = f"{BASE_URL}{uniprotid}"
    last_page = False
    results = []

    attempts = 0
    while next_url:
        try:
            req = request.Request(next_url, headers={"Accept": "application/json"})
            res = request.urlopen(req, context=context)

            if res.status == 408:
                time.sleep(61)
                continue
            elif res.status == 204:
                break

            payload = json.loads(res.read().decode())
            next_url = payload.get("next", None)
            attempts = 0

            if not next_url:
                last_page = True

        except HTTPError as e:
            if e.code == 408:
                time.sleep(61)
                continue
            elif attempts < 3:
                attempts += 1
                time.sleep(61)
                continue
            else:
                print(f"ERROR: {uniprotid} - {e}")
                return []

        # Append results for this UniProt ID
        results.extend(payload["results"])

        if next_url:
            time.sleep(1)  # Prevent API overload

    return results

# List of UniProt IDs to search
# extradomains = ["P69905", "P68871", "Q9Y2Z9", "P05067"]  # Replace with your actual IDs

# Dictionary to store results
interpro_results = {}

# Loop through each UniProt ID and store results
for uniprotid in accession:
    interpro_results[uniprotid] = fetch_interpro_data(uniprotid)

# Now interpro_results is a dictionary with UniProt IDs as keys and InterPro data as values

# Print the first few results for testing
for uniprotid, data in interpro_results.items():
    print(f"{uniprotid}: {data[:2]}")  # Show only first 2 entries per ID for readability

# Optionally, save results to a JSON file
with open("interpro_results.json", "w") as f:
    json.dump(interpro_results, f, indent=4)

A0A011VQM7: [{'metadata': {'accession': 'PF00543', 'name': 'Nitrogen regulatory protein P-II', 'source_database': 'pfam', 'type': 'domain', 'integrated': 'IPR002187', 'member_databases': None, 'go_terms': None}, 'proteins': [{'accession': 'a0a011vqm7', 'protein_length': 606, 'source_database': 'unreviewed', 'organism': '1341156', 'in_alphafold': True, 'entry_protein_locations': [{'fragments': [{'start': 485, 'end': 590, 'dc-status': 'CONTINUOUS'}], 'representative': False, 'model': 'PF00543', 'score': 7.9e-17}]}]}, {'metadata': {'accession': 'PF00909', 'name': 'Ammonium Transporter Family', 'source_database': 'pfam', 'type': 'family', 'integrated': 'IPR024041', 'member_databases': None, 'go_terms': None}, 'proteins': [{'accession': 'a0a011vqm7', 'protein_length': 606, 'source_database': 'unreviewed', 'organism': '1341156', 'in_alphafold': True, 'entry_protein_locations': [{'fragments': [{'start': 21, 'end': 423, 'dc-status': 'CONTINUOUS'}], 'representative': False, 'model': 'PF00909', 

In [8]:
with open('interproresults.txt', 'w') as fp:
    for items in interpro_results:
        # write each item on a new line
        fp.write("%s\n" % items)
    print('Done')

Done


In [7]:
domain_info = []
name = []
id_name = []
length = []
sted = []
domain_dictionary ={}
for itom in accession:

    for item in interpro_results[itom]:
        # domain_id = item["metadata"]["proteins"]["accession"]  # Extract domain ID
        domain_name = item["metadata"]["name"]  # Extract domain name
        name.append(domain_name)
        print(domain_name)
        protein_lenth = item["proteins"][0]["protein_length"]
        length.append(protein_lenth)
        print(protein_lenth)
        str_end = item["proteins"][0]['entry_protein_locations'][0]["fragments"]
        sted.append(str_end)
        print(str_end)
        domain_info.append((f"{itom}: {domain_name} {str_end} {protein_lenth}"))
        domain_dictionary[str(itom)] = (domain_name, name, str_end, protein_lenth)
        id_name.append(f"{itom}: {name}")


Nitrogen regulatory protein P-II
606
[{'start': 485, 'end': 590, 'dc-status': 'CONTINUOUS'}]
Ammonium Transporter Family
606
[{'start': 21, 'end': 423, 'dc-status': 'CONTINUOUS'}]
Nitrogen regulatory protein P-II
586
[{'start': 476, 'end': 579, 'dc-status': 'CONTINUOUS'}]
Ammonium Transporter Family
586
[{'start': 20, 'end': 429, 'dc-status': 'CONTINUOUS'}]
Nitrogen regulatory protein P-II
559
[{'start': 449, 'end': 552, 'dc-status': 'CONTINUOUS'}]
Ammonium Transporter Family
559
[{'start': 22, 'end': 412, 'dc-status': 'CONTINUOUS'}]
Nitrogen regulatory protein P-II
570
[{'start': 456, 'end': 560, 'dc-status': 'CONTINUOUS'}]
Ammonium Transporter Family
570
[{'start': 10, 'end': 406, 'dc-status': 'CONTINUOUS'}]
Nitrogen regulatory protein P-II
567
[{'start': 453, 'end': 557, 'dc-status': 'CONTINUOUS'}]
Ammonium Transporter Family
567
[{'start': 10, 'end': 397, 'dc-status': 'CONTINUOUS'}]
Nitrogen regulatory protein P-II
565
[{'start': 452, 'end': 556, 'dc-status': 'CONTINUOUS'}]
Ammoniu

In [8]:
with open('domaininfo.txt', 'w') as fp:
    for items in domain_info:
        # write each item on a new line
        fp.write("%s\n" % items)
    print('Done')

Done


In [9]:
domain_info

["A0A011VQM7: Nitrogen regulatory protein P-II [{'start': 485, 'end': 590, 'dc-status': 'CONTINUOUS'}] 606",
 "A0A011VQM7: Ammonium Transporter Family [{'start': 21, 'end': 423, 'dc-status': 'CONTINUOUS'}] 606",
 "A0A063ZJU6: Nitrogen regulatory protein P-II [{'start': 476, 'end': 579, 'dc-status': 'CONTINUOUS'}] 586",
 "A0A063ZJU6: Ammonium Transporter Family [{'start': 20, 'end': 429, 'dc-status': 'CONTINUOUS'}] 586",
 "A0A081EVR3: Nitrogen regulatory protein P-II [{'start': 449, 'end': 552, 'dc-status': 'CONTINUOUS'}] 559",
 "A0A081EVR3: Ammonium Transporter Family [{'start': 22, 'end': 412, 'dc-status': 'CONTINUOUS'}] 559",
 "A0A099I9L1: Nitrogen regulatory protein P-II [{'start': 456, 'end': 560, 'dc-status': 'CONTINUOUS'}] 570",
 "A0A099I9L1: Ammonium Transporter Family [{'start': 10, 'end': 406, 'dc-status': 'CONTINUOUS'}] 570",
 "A0A0D8IWY3: Nitrogen regulatory protein P-II [{'start': 453, 'end': 557, 'dc-status': 'CONTINUOUS'}] 567",
 "A0A0D8IWY3: Ammonium Transporter Family [

In [10]:
ammonium = []
nitrogen = []

for entry in domain_info:
    if "Ammonium Transporter Family" in entry:
        ammonium.append(entry)
    elif "Nitrogen regulatory protein P-II" in entry:
        nitrogen.append(entry)

# Output results
print("Ammonium List:", ammonium)
print("Nitrogen List:", nitrogen)

Ammonium List: ["A0A011VQM7: Ammonium Transporter Family [{'start': 21, 'end': 423, 'dc-status': 'CONTINUOUS'}] 606", "A0A063ZJU6: Ammonium Transporter Family [{'start': 20, 'end': 429, 'dc-status': 'CONTINUOUS'}] 586", "A0A081EVR3: Ammonium Transporter Family [{'start': 22, 'end': 412, 'dc-status': 'CONTINUOUS'}] 559", "A0A099I9L1: Ammonium Transporter Family [{'start': 10, 'end': 406, 'dc-status': 'CONTINUOUS'}] 570", "A0A0D8IWY3: Ammonium Transporter Family [{'start': 10, 'end': 397, 'dc-status': 'CONTINUOUS'}] 567", "A0A0E2HBL2: Ammonium Transporter Family [{'start': 10, 'end': 402, 'dc-status': 'CONTINUOUS'}] 565", "A0A0F0CIE1: Ammonium Transporter Family [{'start': 10, 'end': 402, 'dc-status': 'CONTINUOUS'}] 565", "A0A0F9HQB3: Ammonium Transporter Family [{'start': 1, 'end': 405, 'dc-status': 'CONTINUOUS'}] 561", "A0A0G3WG46: Ammonium Transporter Family [{'start': 13, 'end': 403, 'dc-status': 'CONTINUOUS'}] 559", "A0A0G9LEF6: Ammonium Transporter Family [{'start': 10, 'end': 415,

In [11]:
desc = []

for record in SeqIO.parse("idmapping_2025_03_04.fasta", "fasta"):
    print(record.description)
    desc.append(record.description)

tr|A0A011VQM7|A0A011VQM7_RUMAL Adenylate cyclase OS=Ruminococcus albus SY3 OX=1341156 GN=RASY3_13175 PE=3 SV=1
tr|A0A063ZJU6|A0A063ZJU6_9EURY Ammonium transporter OS=Halostagnicola sp. A56 OX=1495067 GN=EL22_10090 PE=3 SV=1
tr|A0A081EVR3|A0A081EVR3_9EURY Ammonium transporter OS=Halorubrum saccharovorum OX=2248 GN=FK85_01485 PE=3 SV=1
tr|A0A099I9L1|A0A099I9L1_CLOIN Adenylate cyclase OS=Clostridium innocuum OX=1522 GN=CIAN88_08185 PE=3 SV=1
tr|A0A0D8IWY3|A0A0D8IWY3_9FIRM Adenylate cyclase OS=Ruthenibacterium lactatiformans OX=1550024 GN=TQ39_16150 PE=3 SV=1
tr|A0A0E2HBL2|A0A0E2HBL2_9FIRM Ammonium transporter OS=[Clostridium] clostridioforme 90A8 OX=999408 GN=HMPREF1090_02007 PE=3 SV=1
tr|A0A0F0CIE1|A0A0F0CIE1_9CLOT Ammonium transporter NrgA OS=Clostridium sp. FS41 OX=1609975 GN=nrgA PE=3 SV=1
tr|A0A0F9HQB3|A0A0F9HQB3_9ZZZZ Ammonium transporter AmtB-like domain-containing protein (Fragment) OS=marine sediment metagenome OX=412755 GN=LCGC14_1754210 PE=3 SV=1
tr|A0A0G3WG46|A0A0G3WG46_9BACT 

In [12]:
domain_info

["A0A011VQM7: Nitrogen regulatory protein P-II [{'start': 485, 'end': 590, 'dc-status': 'CONTINUOUS'}] 606",
 "A0A011VQM7: Ammonium Transporter Family [{'start': 21, 'end': 423, 'dc-status': 'CONTINUOUS'}] 606",
 "A0A063ZJU6: Nitrogen regulatory protein P-II [{'start': 476, 'end': 579, 'dc-status': 'CONTINUOUS'}] 586",
 "A0A063ZJU6: Ammonium Transporter Family [{'start': 20, 'end': 429, 'dc-status': 'CONTINUOUS'}] 586",
 "A0A081EVR3: Nitrogen regulatory protein P-II [{'start': 449, 'end': 552, 'dc-status': 'CONTINUOUS'}] 559",
 "A0A081EVR3: Ammonium Transporter Family [{'start': 22, 'end': 412, 'dc-status': 'CONTINUOUS'}] 559",
 "A0A099I9L1: Nitrogen regulatory protein P-II [{'start': 456, 'end': 560, 'dc-status': 'CONTINUOUS'}] 570",
 "A0A099I9L1: Ammonium Transporter Family [{'start': 10, 'end': 406, 'dc-status': 'CONTINUOUS'}] 570",
 "A0A0D8IWY3: Nitrogen regulatory protein P-II [{'start': 453, 'end': 557, 'dc-status': 'CONTINUOUS'}] 567",
 "A0A0D8IWY3: Ammonium Transporter Family [

In [13]:
disc = desc

In [14]:
id_str_end = []

for t in ammonium:
    a = t.split(':')[0]
    b = t.split(' ')[5:6]
    c = t.split(' ')[7:8]
    id_str_end.append(f"{a}: /{b}-{c}")

In [15]:
id_str_end

["A0A011VQM7: /['21,']-['423,']",
 "A0A063ZJU6: /['20,']-['429,']",
 "A0A081EVR3: /['22,']-['412,']",
 "A0A099I9L1: /['10,']-['406,']",
 "A0A0D8IWY3: /['10,']-['397,']",
 "A0A0E2HBL2: /['10,']-['402,']",
 "A0A0F0CIE1: /['10,']-['402,']",
 "A0A0F9HQB3: /['1,']-['405,']",
 "A0A0G3WG46: /['13,']-['403,']",
 "A0A0G9LEF6: /['10,']-['415,']",
 "A0A0H5SG89: /['11,']-['415,']",
 "A0A0J6WZA7: /['1,']-['383,']",
 "A0A0J6ZRU4: /['10,']-['397,']",
 "A0A0J9EMW5: /['10,']-['402,']",
 "A0A0M2NHW4: /['8,']-['398,']",
 "A0A0M6WC26: /['17,']-['412,']",
 "A0A0M6WCZ2: /['12,']-['414,']",
 "A0A0M9AS13: /['22,']-['425,']",
 "A0A0R2HL96: /['8,']-['399,']",
 "A0A0S2W112: /['9,']-['401,']",
 "A0A0U5JBG5: /['10,']-['417,']",
 "A0A0W7TMH6: /['10,']-['397,']",
 "A0A0X8VAS2: /['17,']-['411,']",
 "A0A136Q6K1: /['8,']-['398,']",
 "A0A136WGT4: /['17,']-['411,']",
 "A0A140DTE7: /['12,']-['400,']",
 "A0A143Y517: /['11,']-['404,']",
 "A0A143Y9S4: /['11,']-['404,']",
 "A0A143YNF2: /['11,']-['404,']",
 "A0A143YTA3: /['9,'

In [16]:
cleaned_data = [item.replace("[", "").replace("']", "").replace("'", "").replace(",", "") for item in id_str_end]

# Output cleaned list
print(cleaned_data)

['A0A011VQM7: /21-423', 'A0A063ZJU6: /20-429', 'A0A081EVR3: /22-412', 'A0A099I9L1: /10-406', 'A0A0D8IWY3: /10-397', 'A0A0E2HBL2: /10-402', 'A0A0F0CIE1: /10-402', 'A0A0F9HQB3: /1-405', 'A0A0G3WG46: /13-403', 'A0A0G9LEF6: /10-415', 'A0A0H5SG89: /11-415', 'A0A0J6WZA7: /1-383', 'A0A0J6ZRU4: /10-397', 'A0A0J9EMW5: /10-402', 'A0A0M2NHW4: /8-398', 'A0A0M6WC26: /17-412', 'A0A0M6WCZ2: /12-414', 'A0A0M9AS13: /22-425', 'A0A0R2HL96: /8-399', 'A0A0S2W112: /9-401', 'A0A0U5JBG5: /10-417', 'A0A0W7TMH6: /10-397', 'A0A0X8VAS2: /17-411', 'A0A136Q6K1: /8-398', 'A0A136WGT4: /17-411', 'A0A140DTE7: /12-400', 'A0A143Y517: /11-404', 'A0A143Y9S4: /11-404', 'A0A143YNF2: /11-404', 'A0A143YTA3: /9-397', 'A0A143YWY4: /9-396', 'A0A143YZV1: /9-397', 'A0A143Z212: /11-404', 'A0A143ZCA8: /10-403', 'A0A151AI36: /15-411', 'A0A169X118: /14-420', 'A0A173R3X4: /16-411', 'A0A173R662: /16-418', 'A0A173SMJ4: /17-412', 'A0A173TUR6: /5-393', 'A0A173V441: /9-405', 'A0A173W538: /20-415', 'A0A173X4T2: /12-414', 'A0A173XV88: /10-409'

In [17]:
id_str_end

["A0A011VQM7: /['21,']-['423,']",
 "A0A063ZJU6: /['20,']-['429,']",
 "A0A081EVR3: /['22,']-['412,']",
 "A0A099I9L1: /['10,']-['406,']",
 "A0A0D8IWY3: /['10,']-['397,']",
 "A0A0E2HBL2: /['10,']-['402,']",
 "A0A0F0CIE1: /['10,']-['402,']",
 "A0A0F9HQB3: /['1,']-['405,']",
 "A0A0G3WG46: /['13,']-['403,']",
 "A0A0G9LEF6: /['10,']-['415,']",
 "A0A0H5SG89: /['11,']-['415,']",
 "A0A0J6WZA7: /['1,']-['383,']",
 "A0A0J6ZRU4: /['10,']-['397,']",
 "A0A0J9EMW5: /['10,']-['402,']",
 "A0A0M2NHW4: /['8,']-['398,']",
 "A0A0M6WC26: /['17,']-['412,']",
 "A0A0M6WCZ2: /['12,']-['414,']",
 "A0A0M9AS13: /['22,']-['425,']",
 "A0A0R2HL96: /['8,']-['399,']",
 "A0A0S2W112: /['9,']-['401,']",
 "A0A0U5JBG5: /['10,']-['417,']",
 "A0A0W7TMH6: /['10,']-['397,']",
 "A0A0X8VAS2: /['17,']-['411,']",
 "A0A136Q6K1: /['8,']-['398,']",
 "A0A136WGT4: /['17,']-['411,']",
 "A0A140DTE7: /['12,']-['400,']",
 "A0A143Y517: /['11,']-['404,']",
 "A0A143Y9S4: /['11,']-['404,']",
 "A0A143YNF2: /['11,']-['404,']",
 "A0A143YTA3: /['9,'

In [18]:
updated_entries_am = []
for entry in cleaned_data:
    prefix, range_part = entry.split("/")

    # Extract the two numbers and subtract 1 from the first
    first, second = map(int, range_part.split("-"))
    first -= 1  # Subtract 1

    # Rebuild the string
    updated_entries_am.append(f"{prefix}/{first}-{second}")

# Output the updated list
for item in updated_entries_am:
    print(item)

A0A011VQM7: /20-423
A0A063ZJU6: /19-429
A0A081EVR3: /21-412
A0A099I9L1: /9-406
A0A0D8IWY3: /9-397
A0A0E2HBL2: /9-402
A0A0F0CIE1: /9-402
A0A0F9HQB3: /0-405
A0A0G3WG46: /12-403
A0A0G9LEF6: /9-415
A0A0H5SG89: /10-415
A0A0J6WZA7: /0-383
A0A0J6ZRU4: /9-397
A0A0J9EMW5: /9-402
A0A0M2NHW4: /7-398
A0A0M6WC26: /16-412
A0A0M6WCZ2: /11-414
A0A0M9AS13: /21-425
A0A0R2HL96: /7-399
A0A0S2W112: /8-401
A0A0U5JBG5: /9-417
A0A0W7TMH6: /9-397
A0A0X8VAS2: /16-411
A0A136Q6K1: /7-398
A0A136WGT4: /16-411
A0A140DTE7: /11-400
A0A143Y517: /10-404
A0A143Y9S4: /10-404
A0A143YNF2: /10-404
A0A143YTA3: /8-397
A0A143YWY4: /8-396
A0A143YZV1: /8-397
A0A143Z212: /10-404
A0A143ZCA8: /9-403
A0A151AI36: /14-411
A0A169X118: /13-420
A0A173R3X4: /15-411
A0A173R662: /15-418
A0A173SMJ4: /16-412
A0A173TUR6: /4-393
A0A173V441: /8-405
A0A173W538: /19-415
A0A173X4T2: /11-414
A0A173XV88: /9-409
A0A173XVK8: /8-397
A0A173Z4K3: /9-408
A0A173ZTK6: /8-406
A0A174C5U1: /9-402
A0A174D1D6: /9-408
A0A174GH26: /0-292
A0A174H6C4: /8-397
A0A174JG2

In [19]:
id_map = {entry.split(":")[0]: entry.split(":")[1].strip().rstrip(",") for entry in updated_entries_am}

# Process list2 to insert the /number..number after the underscore
updated_list2 = []

for entry in desc:
    parts = entry.split("|")
    if len(parts) > 2:
        protein_id = parts[1]  # Extract the ID
        id_info = parts[2]  # Extract the part that includes the second ID and underscore
        
        if protein_id in id_map:
            # Split id_info into the ID part and the next part after the underscore
            id_part, rest = id_info.split(" ", 1)
            # Rebuild id_info by inserting the number sequence after the underscore
            modified_id_info = f"{id_part}{id_map[protein_id]} {rest}"
            # Rebuild the full entry
            modified_entry = f"{parts[0]}|{parts[1]}|{modified_id_info} " + " ".join(parts[3:])
            updated_list2.append(modified_entry)
        else:
            updated_list2.append(entry)  # If no match, keep as is
    else:
        updated_list2.append(entry)  # If format is unexpected, keep as is

# Output the updated list
for item in updated_list2:
    print(item)

tr|A0A011VQM7|A0A011VQM7_RUMAL/20-423 Adenylate cyclase OS=Ruminococcus albus SY3 OX=1341156 GN=RASY3_13175 PE=3 SV=1 
tr|A0A063ZJU6|A0A063ZJU6_9EURY/19-429 Ammonium transporter OS=Halostagnicola sp. A56 OX=1495067 GN=EL22_10090 PE=3 SV=1 
tr|A0A081EVR3|A0A081EVR3_9EURY/21-412 Ammonium transporter OS=Halorubrum saccharovorum OX=2248 GN=FK85_01485 PE=3 SV=1 
tr|A0A099I9L1|A0A099I9L1_CLOIN/9-406 Adenylate cyclase OS=Clostridium innocuum OX=1522 GN=CIAN88_08185 PE=3 SV=1 
tr|A0A0D8IWY3|A0A0D8IWY3_9FIRM/9-397 Adenylate cyclase OS=Ruthenibacterium lactatiformans OX=1550024 GN=TQ39_16150 PE=3 SV=1 
tr|A0A0E2HBL2|A0A0E2HBL2_9FIRM/9-402 Ammonium transporter OS=[Clostridium] clostridioforme 90A8 OX=999408 GN=HMPREF1090_02007 PE=3 SV=1 
tr|A0A0F0CIE1|A0A0F0CIE1_9CLOT/9-402 Ammonium transporter NrgA OS=Clostridium sp. FS41 OX=1609975 GN=nrgA PE=3 SV=1 
tr|A0A0F9HQB3|A0A0F9HQB3_9ZZZZ/0-405 Ammonium transporter AmtB-like domain-containing protein (Fragment) OS=marine sediment metagenome OX=412755 G

In [20]:
fasta1 = []

for item in updated_list2:
    fasta1.append(f">{item}")

In [21]:
with open('fasta1.fasta', 'w') as fp:
    for iteme in fasta1:
        # write each item on a new line
        fp.write("%s\n" % iteme)
    print('Done')

Done


In [22]:
nitrogen

["A0A011VQM7: Nitrogen regulatory protein P-II [{'start': 485, 'end': 590, 'dc-status': 'CONTINUOUS'}] 606",
 "A0A063ZJU6: Nitrogen regulatory protein P-II [{'start': 476, 'end': 579, 'dc-status': 'CONTINUOUS'}] 586",
 "A0A081EVR3: Nitrogen regulatory protein P-II [{'start': 449, 'end': 552, 'dc-status': 'CONTINUOUS'}] 559",
 "A0A099I9L1: Nitrogen regulatory protein P-II [{'start': 456, 'end': 560, 'dc-status': 'CONTINUOUS'}] 570",
 "A0A0D8IWY3: Nitrogen regulatory protein P-II [{'start': 453, 'end': 557, 'dc-status': 'CONTINUOUS'}] 567",
 "A0A0E2HBL2: Nitrogen regulatory protein P-II [{'start': 452, 'end': 556, 'dc-status': 'CONTINUOUS'}] 565",
 "A0A0F0CIE1: Nitrogen regulatory protein P-II [{'start': 451, 'end': 556, 'dc-status': 'CONTINUOUS'}] 565",
 "A0A0F9HQB3: Nitrogen regulatory protein P-II [{'start': 450, 'end': 554, 'dc-status': 'CONTINUOUS'}] 561",
 "A0A0G3WG46: Nitrogen regulatory protein P-II [{'start': 442, 'end': 546, 'dc-status': 'CONTINUOUS'}] 559",
 "A0A0G9LEF6: Nitro

In [23]:
id_str_end_nit = []
id_nit = []

for t in nitrogen:
    a = t.split(':')[0]
    print(a)
    b = t.split(' ')[6:7]
    print(b)
    c = t.split(' ')[8:9]
    print(c)
    id_nit.append(a)
    id_str_end_nit.append(f"{a}: /{b}-{c}")

A0A011VQM7
['485,']
['590,']
A0A063ZJU6
['476,']
['579,']
A0A081EVR3
['449,']
['552,']
A0A099I9L1
['456,']
['560,']
A0A0D8IWY3
['453,']
['557,']
A0A0E2HBL2
['452,']
['556,']
A0A0F0CIE1
['451,']
['556,']
A0A0F9HQB3
['450,']
['554,']
A0A0G3WG46
['442,']
['546,']
A0A0G9LEF6
['464,']
['569,']
A0A0H5SG89
['474,']
['578,']
A0A0J6WZA7
['426,']
['529,']
A0A0J6ZRU4
['460,']
['564,']
A0A0J9EMW5
['451,']
['556,']
A0A0M2NHW4
['460,']
['565,']
A0A0M6WC26
['468,']
['573,']
A0A0M6WCZ2
['471,']
['575,']
A0A0M9AS13
['465,']
['568,']
A0A0R2HL96
['444,']
['546,']
A0A0S2W112
['458,']
['562,']
A0A0U5JBG5
['462,']
['567,']
A0A0W7TMH6
['453,']
['557,']
A0A0X8VAS2
['470,']
['574,']
A0A136Q6K1
['450,']
['555,']
A0A136WGT4
['467,']
['571,']
A0A140DTE7
['449,']
['554,']
A0A143Y517
['467,']
['571,']
A0A143Y9S4
['482,']
['585,']
A0A143YNF2
['482,']
['585,']
A0A143YTA3
['445,']
['547,']
A0A143YWY4
['444,']
['546,']
A0A143YZV1
['445,']
['547,']
A0A143Z212
['482,']
['585,']
A0A143ZCA8
['464,']
['569,']
A0A151AI36
['4

In [24]:
id_str_end_nit

["A0A011VQM7: /['485,']-['590,']",
 "A0A063ZJU6: /['476,']-['579,']",
 "A0A081EVR3: /['449,']-['552,']",
 "A0A099I9L1: /['456,']-['560,']",
 "A0A0D8IWY3: /['453,']-['557,']",
 "A0A0E2HBL2: /['452,']-['556,']",
 "A0A0F0CIE1: /['451,']-['556,']",
 "A0A0F9HQB3: /['450,']-['554,']",
 "A0A0G3WG46: /['442,']-['546,']",
 "A0A0G9LEF6: /['464,']-['569,']",
 "A0A0H5SG89: /['474,']-['578,']",
 "A0A0J6WZA7: /['426,']-['529,']",
 "A0A0J6ZRU4: /['460,']-['564,']",
 "A0A0J9EMW5: /['451,']-['556,']",
 "A0A0M2NHW4: /['460,']-['565,']",
 "A0A0M6WC26: /['468,']-['573,']",
 "A0A0M6WCZ2: /['471,']-['575,']",
 "A0A0M9AS13: /['465,']-['568,']",
 "A0A0R2HL96: /['444,']-['546,']",
 "A0A0S2W112: /['458,']-['562,']",
 "A0A0U5JBG5: /['462,']-['567,']",
 "A0A0W7TMH6: /['453,']-['557,']",
 "A0A0X8VAS2: /['470,']-['574,']",
 "A0A136Q6K1: /['450,']-['555,']",
 "A0A136WGT4: /['467,']-['571,']",
 "A0A140DTE7: /['449,']-['554,']",
 "A0A143Y517: /['467,']-['571,']",
 "A0A143Y9S4: /['482,']-['585,']",
 "A0A143YNF2: /['482

In [25]:
cleaned_data2 = [item.replace("[", "").replace("]", "").replace("'", "").replace(",", "") for item in id_str_end_nit]

# Output cleaned list
print(cleaned_data2)

['A0A011VQM7: /485-590', 'A0A063ZJU6: /476-579', 'A0A081EVR3: /449-552', 'A0A099I9L1: /456-560', 'A0A0D8IWY3: /453-557', 'A0A0E2HBL2: /452-556', 'A0A0F0CIE1: /451-556', 'A0A0F9HQB3: /450-554', 'A0A0G3WG46: /442-546', 'A0A0G9LEF6: /464-569', 'A0A0H5SG89: /474-578', 'A0A0J6WZA7: /426-529', 'A0A0J6ZRU4: /460-564', 'A0A0J9EMW5: /451-556', 'A0A0M2NHW4: /460-565', 'A0A0M6WC26: /468-573', 'A0A0M6WCZ2: /471-575', 'A0A0M9AS13: /465-568', 'A0A0R2HL96: /444-546', 'A0A0S2W112: /458-562', 'A0A0U5JBG5: /462-567', 'A0A0W7TMH6: /453-557', 'A0A0X8VAS2: /470-574', 'A0A136Q6K1: /450-555', 'A0A136WGT4: /467-571', 'A0A140DTE7: /449-554', 'A0A143Y517: /467-571', 'A0A143Y9S4: /482-585', 'A0A143YNF2: /482-585', 'A0A143YTA3: /445-547', 'A0A143YWY4: /444-546', 'A0A143YZV1: /445-547', 'A0A143Z212: /482-585', 'A0A143ZCA8: /464-569', 'A0A151AI36: /440-544', 'A0A169X118: /449-553', 'A0A173R3X4: /466-571', 'A0A173R662: /473-576', 'A0A173SMJ4: /468-573', 'A0A173TUR6: /441-545', 'A0A173V441: /464-569', 'A0A173W538: /4

In [26]:
cleaned_data2

['A0A011VQM7: /485-590',
 'A0A063ZJU6: /476-579',
 'A0A081EVR3: /449-552',
 'A0A099I9L1: /456-560',
 'A0A0D8IWY3: /453-557',
 'A0A0E2HBL2: /452-556',
 'A0A0F0CIE1: /451-556',
 'A0A0F9HQB3: /450-554',
 'A0A0G3WG46: /442-546',
 'A0A0G9LEF6: /464-569',
 'A0A0H5SG89: /474-578',
 'A0A0J6WZA7: /426-529',
 'A0A0J6ZRU4: /460-564',
 'A0A0J9EMW5: /451-556',
 'A0A0M2NHW4: /460-565',
 'A0A0M6WC26: /468-573',
 'A0A0M6WCZ2: /471-575',
 'A0A0M9AS13: /465-568',
 'A0A0R2HL96: /444-546',
 'A0A0S2W112: /458-562',
 'A0A0U5JBG5: /462-567',
 'A0A0W7TMH6: /453-557',
 'A0A0X8VAS2: /470-574',
 'A0A136Q6K1: /450-555',
 'A0A136WGT4: /467-571',
 'A0A140DTE7: /449-554',
 'A0A143Y517: /467-571',
 'A0A143Y9S4: /482-585',
 'A0A143YNF2: /482-585',
 'A0A143YTA3: /445-547',
 'A0A143YWY4: /444-546',
 'A0A143YZV1: /445-547',
 'A0A143Z212: /482-585',
 'A0A143ZCA8: /464-569',
 'A0A151AI36: /440-544',
 'A0A169X118: /449-553',
 'A0A173R3X4: /466-571',
 'A0A173R662: /473-576',
 'A0A173SMJ4: /468-573',
 'A0A173TUR6: /441-545',


In [27]:
updated_entries = []
for entry in cleaned_data2:
    prefix, range_part = entry.split("/")

    # Extract the two numbers and subtract 1 from the first
    first, second = map(int, range_part.split("-"))
    first -= 1  # Subtract 1

    # Rebuild the string
    updated_entries.append(f"{prefix}/{first}-{second}")

# Output the updated list
for item in updated_entries:
    print(item)

A0A011VQM7: /484-590
A0A063ZJU6: /475-579
A0A081EVR3: /448-552
A0A099I9L1: /455-560
A0A0D8IWY3: /452-557
A0A0E2HBL2: /451-556
A0A0F0CIE1: /450-556
A0A0F9HQB3: /449-554
A0A0G3WG46: /441-546
A0A0G9LEF6: /463-569
A0A0H5SG89: /473-578
A0A0J6WZA7: /425-529
A0A0J6ZRU4: /459-564
A0A0J9EMW5: /450-556
A0A0M2NHW4: /459-565
A0A0M6WC26: /467-573
A0A0M6WCZ2: /470-575
A0A0M9AS13: /464-568
A0A0R2HL96: /443-546
A0A0S2W112: /457-562
A0A0U5JBG5: /461-567
A0A0W7TMH6: /452-557
A0A0X8VAS2: /469-574
A0A136Q6K1: /449-555
A0A136WGT4: /466-571
A0A140DTE7: /448-554
A0A143Y517: /466-571
A0A143Y9S4: /481-585
A0A143YNF2: /481-585
A0A143YTA3: /444-547
A0A143YWY4: /443-546
A0A143YZV1: /444-547
A0A143Z212: /481-585
A0A143ZCA8: /463-569
A0A151AI36: /439-544
A0A169X118: /448-553
A0A173R3X4: /465-571
A0A173R662: /472-576
A0A173SMJ4: /467-573
A0A173TUR6: /440-545
A0A173V441: /463-569
A0A173W538: /469-574
A0A173X4T2: /470-575
A0A173XV88: /462-567
A0A173XVK8: /449-555
A0A173Z4K3: /455-560
A0A173ZTK6: /455-560
A0A174C5U1: /

In [28]:
id_map = {entry.split(":")[0]: entry.split(":")[1].strip().rstrip(",") for entry in updated_entries}

# Process list2 to insert the /number..number after the underscore
updated_list_nit = []

for entry in disc:
    parts = entry.split("|")
    if len(parts) > 2:
        protein_id = parts[1]  # Extract the ID
        id_info = parts[2]  # Extract the part that includes the second ID and underscore
        
        if protein_id in id_map:
            # Split id_info into the ID part and the next part after the underscore
            id_part, rest = id_info.split(" ", 1)
            # Rebuild id_info by inserting the number sequence after the underscore
            modified_id_info = f"{id_part}{id_map[protein_id]} {rest}"
            # Rebuild the full entry
            modified_entry = f"{parts[0]}|{parts[1]}|{modified_id_info} " + " ".join(parts[3:])
            updated_list_nit.append(modified_entry)
        else:
            updated_list_nit.append(entry)  # If no match, keep as is
    else:
        updated_list_nit.append(entry)  # If format is unexpected, keep as is

# Output the updated list
for item in updated_list_nit:
    print(item)

tr|A0A011VQM7|A0A011VQM7_RUMAL/484-590 Adenylate cyclase OS=Ruminococcus albus SY3 OX=1341156 GN=RASY3_13175 PE=3 SV=1 
tr|A0A063ZJU6|A0A063ZJU6_9EURY/475-579 Ammonium transporter OS=Halostagnicola sp. A56 OX=1495067 GN=EL22_10090 PE=3 SV=1 
tr|A0A081EVR3|A0A081EVR3_9EURY/448-552 Ammonium transporter OS=Halorubrum saccharovorum OX=2248 GN=FK85_01485 PE=3 SV=1 
tr|A0A099I9L1|A0A099I9L1_CLOIN/455-560 Adenylate cyclase OS=Clostridium innocuum OX=1522 GN=CIAN88_08185 PE=3 SV=1 
tr|A0A0D8IWY3|A0A0D8IWY3_9FIRM/452-557 Adenylate cyclase OS=Ruthenibacterium lactatiformans OX=1550024 GN=TQ39_16150 PE=3 SV=1 
tr|A0A0E2HBL2|A0A0E2HBL2_9FIRM/451-556 Ammonium transporter OS=[Clostridium] clostridioforme 90A8 OX=999408 GN=HMPREF1090_02007 PE=3 SV=1 
tr|A0A0F0CIE1|A0A0F0CIE1_9CLOT/450-556 Ammonium transporter NrgA OS=Clostridium sp. FS41 OX=1609975 GN=nrgA PE=3 SV=1 
tr|A0A0F9HQB3|A0A0F9HQB3_9ZZZZ/449-554 Ammonium transporter AmtB-like domain-containing protein (Fragment) OS=marine sediment metagenom

In [29]:
fasta2

NameError: name 'fasta2' is not defined

In [30]:
with open('fasta2.fasta', 'w') as fp:
    for iteme in fasta2:
        # write each item on a new line
        fp.write("%s\n" % iteme)
    print('Done')

NameError: name 'fasta2' is not defined

In [31]:
u.search('A0A011VQM7', frmt="tsv", columns='ft_transmem')

'Transmembrane\nTRANSMEM 20..40; /note="Helical"; /evidence="ECO:0000256|SAM:Phobius"; TRANSMEM 61..78; /note="Helical"; /evidence="ECO:0000256|SAM:Phobius"; TRANSMEM 98..119; /note="Helical"; /evidence="ECO:0000256|SAM:Phobius"; TRANSMEM 131..152; /note="Helical"; /evidence="ECO:0000256|SAM:Phobius"; TRANSMEM 164..186; /note="Helical"; /evidence="ECO:0000256|SAM:Phobius"; TRANSMEM 207..226; /note="Helical"; /evidence="ECO:0000256|SAM:Phobius"; TRANSMEM 241..262; /note="Helical"; /evidence="ECO:0000256|SAM:Phobius"; TRANSMEM 269..287; /note="Helical"; /evidence="ECO:0000256|SAM:Phobius"; TRANSMEM 293..313; /note="Helical"; /evidence="ECO:0000256|SAM:Phobius"; TRANSMEM 325..344; /note="Helical"; /evidence="ECO:0000256|SAM:Phobius"; TRANSMEM 374..399; /note="Helical"; /evidence="ECO:0000256|SAM:Phobius"\n'

In [32]:
# def parse_ft_transmembrane(nitrogen):
#     for d in nitrogen:
#         positions = re.findall("(?<='start': )(?P<tmemstart>[0-9]*), 'end': (?P<tmemend>[0-9]*)", d)
#         return ((int(start), int(end)) for start, end in positions)

In [33]:

def parse_ft_transmembrane(nitrogen):
    positions_list = []
    for d in nitrogen:
        positions = re.findall(r"(?<='start': )(?P<tmemstart>[0-9]*), 'end': (?P<tmemend>[0-9]*)", d)
        if positions:
            positions_list.append((int(positions[0][0]), int(positions[0][1])))
    return positions_list

In [34]:
id_nit_short = ['A0A011VQM7',
 'A0A063ZJU6',
 'A0A081EVR3',
 'A0A099I9L1',
 'A0A0D8IWY3',
 'A0A0E2HBL2',
 'A0A0F0CIE1',
 'A0A0F9HQB3',
 'A0A0G3WG46',
 'A0A0G9LEF6',
 'A0A0H5SG89',
 'A0A0J6WZA7',
 'A0A0J6ZRU4',
 'A0A0J9EMW5',]

In [36]:
idandseq_test = []
seqlist = []
all_records = []

# Extract start-end positions and map to UniProt IDs
positions_dict = dict(zip(id_nit, parse_ft_transmembrane(nitrogen)))

for k in id_nit:
    # Fetch sequence from UniProt
    record = SeqIO.read(StringIO(u.search(k, frmt="fasta")), "fasta")
    
    # Get correct start-end positions for this UniProt ID
    if k in positions_dict:
        start, end = positions_dict[k]  # Unpack start and end

        # Extract the relevant sequence region
        seq = record.seq[start-2:end]  # Adjust for 0-based indexing
        
        # Create a new SeqRecord
        new_record = SeqIO.SeqRecord(seq, id=k, description="")
        all_records.append(new_record)

        print(len(seq))  # Print length of extracted sequence

# Write sequences to a FASTA file
with open("nitrogen.fasta", "w") as output_handle:
    SeqIO.write(all_records, output_handle, "fasta")



107
105
105
106
106
106
107
106
106
107
106
105
106
107
107
107
106
105
104
106
107
106
106
107
106
107
106
105
105
104
104
104
105
107
106
106
107
105
107
106
107
106
106
106
107
106
106
107
106
106
107
106
106
106
107
106
105
106
106
107
107
105
106
106
105
106
105
106
108
107
106
106
107
106
106
105
107
108
107
107
107
106
106
107
106
106
106
106
107
106
105
105
106
105
105
106
107
105
106
106
105
106
105
107
105
105
105
107
106
106
107
107
106
106
107
106
106
107
106
105
106
106
106
106
106
107
108
106
107
106
106
106
105
106
107
58
107
107
106
106
106
106
106
106
106
107
107
106
106
107
104
106
106
105
105
107
106
105
105
107
106
107
103
105
104
107
107
107
105
106
107
105
106
106
105
105
106
105
107
107
106
107
103
106
106
106
106
106
106
107
107
106
106
107
107
106
106
106
107
106
106
105
107
106
105
107
106
106
107
107
107
107
106
105
105
107
106
105
107
107
106
106
106
106
107
106
106
107
103
105
104
106
107
107
106
106
106
106
103
106
105
105
105
106
107
106
105
107
107
106
1

In [None]:
!seqkit grep --invert-match -f id_to_use.txt nitrogen.fasta > nitrogen_to_use.fasta


In [37]:
nit_dict = {}

for line in updated_list_nit:
    parts = line.split("|")
    if len(parts) > 2:
        key = parts[1]
        nit_dict[key] = line

# Print the resulting dictionary
for k, v in nit_dict.items():
    print(f"{k}: {v}")

A0A011VQM7: tr|A0A011VQM7|A0A011VQM7_RUMAL/484-590 Adenylate cyclase OS=Ruminococcus albus SY3 OX=1341156 GN=RASY3_13175 PE=3 SV=1 
A0A063ZJU6: tr|A0A063ZJU6|A0A063ZJU6_9EURY/475-579 Ammonium transporter OS=Halostagnicola sp. A56 OX=1495067 GN=EL22_10090 PE=3 SV=1 
A0A081EVR3: tr|A0A081EVR3|A0A081EVR3_9EURY/448-552 Ammonium transporter OS=Halorubrum saccharovorum OX=2248 GN=FK85_01485 PE=3 SV=1 
A0A099I9L1: tr|A0A099I9L1|A0A099I9L1_CLOIN/455-560 Adenylate cyclase OS=Clostridium innocuum OX=1522 GN=CIAN88_08185 PE=3 SV=1 
A0A0D8IWY3: tr|A0A0D8IWY3|A0A0D8IWY3_9FIRM/452-557 Adenylate cyclase OS=Ruthenibacterium lactatiformans OX=1550024 GN=TQ39_16150 PE=3 SV=1 
A0A0E2HBL2: tr|A0A0E2HBL2|A0A0E2HBL2_9FIRM/451-556 Ammonium transporter OS=[Clostridium] clostridioforme 90A8 OX=999408 GN=HMPREF1090_02007 PE=3 SV=1 
A0A0F0CIE1: tr|A0A0F0CIE1|A0A0F0CIE1_9CLOT/450-556 Ammonium transporter NrgA OS=Clostridium sp. FS41 OX=1609975 GN=nrgA PE=3 SV=1 
A0A0F9HQB3: tr|A0A0F9HQB3|A0A0F9HQB3_9ZZZZ/449-554 

In [38]:
nit_dict

{'A0A011VQM7': 'tr|A0A011VQM7|A0A011VQM7_RUMAL/484-590 Adenylate cyclase OS=Ruminococcus albus SY3 OX=1341156 GN=RASY3_13175 PE=3 SV=1 ',
 'A0A063ZJU6': 'tr|A0A063ZJU6|A0A063ZJU6_9EURY/475-579 Ammonium transporter OS=Halostagnicola sp. A56 OX=1495067 GN=EL22_10090 PE=3 SV=1 ',
 'A0A081EVR3': 'tr|A0A081EVR3|A0A081EVR3_9EURY/448-552 Ammonium transporter OS=Halorubrum saccharovorum OX=2248 GN=FK85_01485 PE=3 SV=1 ',
 'A0A099I9L1': 'tr|A0A099I9L1|A0A099I9L1_CLOIN/455-560 Adenylate cyclase OS=Clostridium innocuum OX=1522 GN=CIAN88_08185 PE=3 SV=1 ',
 'A0A0D8IWY3': 'tr|A0A0D8IWY3|A0A0D8IWY3_9FIRM/452-557 Adenylate cyclase OS=Ruthenibacterium lactatiformans OX=1550024 GN=TQ39_16150 PE=3 SV=1 ',
 'A0A0E2HBL2': 'tr|A0A0E2HBL2|A0A0E2HBL2_9FIRM/451-556 Ammonium transporter OS=[Clostridium] clostridioforme 90A8 OX=999408 GN=HMPREF1090_02007 PE=3 SV=1 ',
 'A0A0F0CIE1': 'tr|A0A0F0CIE1|A0A0F0CIE1_9CLOT/450-556 Ammonium transporter NrgA OS=Clostridium sp. FS41 OX=1609975 GN=nrgA PE=3 SV=1 ',
 'A0A0F9H

In [40]:
with open('nitrogen.fasta') as original, open('nitrogen_headers.fasta', 'w') as corrected:
    for seq_record in SeqIO.parse(original, 'fasta'):
        if seq_record.id in nit_dict:
            seq_record.id = nit_dict[seq_record.id]
        SeqIO.write(seq_record, corrected, 'fasta')