In [1]:
import polars as pl

In [2]:
#!/usr/bin/env python3
import re
from collections import defaultdict

# Read FASTA headers
# Maps the digits of a gene id (str, e.g. "527" for ">g527.t2") to the list of
# transcript numbers (int) found for that gene, in file order.
transcript_variants = defaultdict(list)

header_pattern = re.compile(r'>g(\d+)\.t(\d+)')

with open('../Data/tea_convert_akitsu/Mgigas_akitsu_galba.HypotheticalTrans.fa', 'r') as fasta:
    for record_line in fasta:
        if not record_line.startswith('>'):
            continue  # not a header line
        header = header_pattern.match(record_line)
        if header is None:
            continue  # header that does not start with g<digits>.t<digits>
        gene_digits, transcript_digits = header.groups()
        transcript_variants[gene_digits].append(int(transcript_digits))

# Analyze each gene
# For every gene in `transcript_variants`, record:
#   * missing_lower_variants: (gene_id, lowest transcript number, all numbers)
#     when the numbering does not start at .t1
#   * gaps_in_variants: (gene_id, number before gap, number after gap, all numbers)
#     for every pair of neighbouring numbers that is not consecutive
missing_lower_variants = []
gaps_in_variants = []

# Gene ids are digit strings, so order them numerically rather than lexically.
for gene_id in sorted(transcript_variants.keys(), key=int):
    # De-duplicate before comparing neighbours: a transcript number that occurs
    # twice (repeated FASTA header) would otherwise satisfy
    # `next != current + 1` and be reported as a gap such as ".t1 -> .t1".
    variants = sorted(set(transcript_variants[gene_id]))
    if not variants:
        # An empty entry has nothing to analyse and would make the lookup below fail.
        continue
    min_variant = variants[0]  # list is sorted, so the first element is the minimum

    # Check if starts from .t1
    if min_variant > 1:
        missing_lower_variants.append((gene_id, min_variant, variants))

    # Check for gaps between neighbouring transcript numbers
    for lower, upper in zip(variants, variants[1:]):
        if upper != lower + 1:
            gaps_in_variants.append((gene_id, lower, upper, variants))

# Print results
# Each section prints its total, then at most `preview_limit` entries, then a
# one-line count of whatever was not shown.
preview_limit = 20

print("=== Genes missing lower transcript variants ===")
print(f"Total: {len(missing_lower_variants)} genes")
for gene, first_seen, seen in missing_lower_variants[:preview_limit]:
    # Transcript numbers below the lowest one present, i.e. 1 .. first_seen - 1.
    missing = [*range(1, first_seen)]
    print(f"g{gene}: starts from .t{first_seen}, missing {missing} (has: {seen})")

hidden_missing = len(missing_lower_variants) - preview_limit
if hidden_missing > 0:
    print(f"... and {hidden_missing} more")

print("\n=== Genes with gaps in transcript variants ===")
print(f"Total: {len(gaps_in_variants)} gaps")
for gene, before_gap, after_gap, seen in gaps_in_variants[:preview_limit]:
    print(f"g{gene}: gap between .t{before_gap} and .t{after_gap} (has: {seen})")

hidden_gaps = len(gaps_in_variants) - preview_limit
if hidden_gaps > 0:
    print(f"... and {hidden_gaps} more gaps")

=== Genes missing lower transcript variants ===
Total: 44 genes
g527: starts from .t2, missing [1] (has: [2])
g1290: starts from .t4, missing [1, 2, 3] (has: [4])
g2405: starts from .t3, missing [1, 2] (has: [3])
g4165: starts from .t2, missing [1] (has: [2])
g5085: starts from .t2, missing [1] (has: [2])
g6014: starts from .t2, missing [1] (has: [2])
g6247: starts from .t2, missing [1] (has: [2])
g6961: starts from .t2, missing [1] (has: [2])
g7339: starts from .t2, missing [1] (has: [2])
g7451: starts from .t2, missing [1] (has: [2])
g7693: starts from .t3, missing [1, 2] (has: [3])
g9505: starts from .t2, missing [1] (has: [2])
g10995: starts from .t3, missing [1, 2] (has: [3])
g12709: starts from .t3, missing [1, 2] (has: [3])
g15194: starts from .t4, missing [1, 2, 3] (has: [4, 5])
g18450: starts from .t3, missing [1, 2] (has: [3, 4])
g18459: starts from .t2, missing [1] (has: [2])
g20313: starts from .t2, missing [1] (has: [2])
g20447: starts from .t2, missing [1] (has: [2])
g230

In [3]:
# Load the tab-separated search results, drop the two header columns, and add a
# gene-level id column derived from the query name.
mmseqs_hits_path = "../out/akitsu_uniref50/mmseqs2_easy_search_akitsu_galba_uniref50_cov_mode_0_rescore_mode_3.tsv.gz"

# Extract only the gene ID (e.g., "g39065") from the start of the query column.
gene_level_expr = pl.col("query").str.extract(r'^(g\d+)', group_index=1).alias("Gene_level")

mmseqs_result = (
    pl.read_csv(mmseqs_hits_path, separator="\t")
    .drop("qheader", "theader")
    .with_columns(gene_level_expr)
)

# Distinct gene-level ids as a plain-text table, then the full hit table.
print(mmseqs_result.select("Gene_level").unique())
display(mmseqs_result)

shape: (418, 1)
┌────────────┐
│ Gene_level │
│ ---        │
│ str        │
╞════════════╡
│ g43472     │
│ g42388     │
│ g9741      │
│ g7778      │
│ g3698      │
│ …          │
│ g16042     │
│ g15556     │
│ g22564     │
│ g4885      │
│ g31527     │
└────────────┘


query,target,pident,fident,nident,qcov,tcov,alnlen,mismatch,gapopen,qlen,qstart,qend,tlen,tstart,tend,evalue,bits,qaln,taln,qframe,tframe,mismatch_duplicated_0,Gene_level
str,str,f64,f64,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,str,str,i64,i64,i64,str
"""g11904.t1|H=0.318""","""UniRef50_K1Q3H1""",66.6,0.666,182,0.968,0.771,273,88,3,282,2,274,350,76,345,1.0250e-98,367,"""wCHWAchtTttmGPHPhTTTtYyHYAcvWA…","""WCHWA-HTTtTTgPHPtfTThYYHvACHWA…",1,3,88,"""g11904"""
"""g19387.t1|H=0.129""","""UniRef50_A0AAD8BRS1""",74.2,0.742,202,0.906,0.79,272,53,5,287,1,260,338,7,273,3.3420e-139,500,"""TttTTTstTmiTTTTtTTTTTTTTTTftnP…","""TTTTTTTTTttttTTTtTdtTTTTTTTtNP…",0,3,53,"""g19387"""
"""g19387.t1|H=0.129""","""UniRef50_A0A6J8ARR5""",72.6,0.726,207,0.955,0.831,285,62,6,287,12,285,337,32,311,1.3990e-138,498,"""TTTTtTTTTTTTTTTftnPIhCIMGPLfSM…","""TThtEthftTTTTTTTTNPIhCIMGPlTtt…",2,1,62,"""g19387"""
"""g19387.t1|H=0.129""","""UniRef50_A0AAN8J484""",70.5,0.705,201,0.962,0.865,285,72,4,287,12,287,326,4,285,1.8750e-133,481,"""TTTTtTTTTTTTTTTftnPIhCIMGPLfSM…","""TTTTTthttTTTTTTttNPIlcImG--sTt…",2,3,72,"""g19387"""
"""g19387.t1|H=0.129""","""UniRef50_A0AAE0YYS3""",69.6,0.696,207,0.99,0.888,297,74,7,287,1,284,331,3,296,6.7020e-132,476,"""TttTTTstTmiTTTTtTTTTTTTTTTftnP…","""tttTTttfHfyyftttTTTTTTTTThftng…",0,2,74,"""g19387"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""g14358.t1|H=0.136""","""UniRef50_UPI0014254CA3""",76.7,0.767,89,0.943,0.75,116,22,2,123,8,123,148,37,147,4.9890e-60,236,"""TTTTTTTMNPIayIMGPQFFSYAIVWACIW…","""TTTTTTtMnPIscIMGPQFFSYAIVAACIw…",1,3,22,"""g14358"""
"""g14358.t1|H=0.136""","""UniRef50_R7U3B0""",71.0,0.71,81,0.927,0.898,114,25,2,123,6,119,118,13,118,7.9560e-50,202,"""TTTTTTTTTMNPIayIMGPQFFSYAIVWAC…","""TTTTTTtTtMNPIFCIMGPQfFSYAIVWAC…",2,3,25,"""g14358"""
"""g14358.t1|H=0.136""","""UniRef50_A0A7R9NPB9""",79.7,0.797,79,0.805,0.931,99,15,2,123,4,102,101,4,97,4.1860e-48,197,"""TTTTTTTTTTTMNPIayIMGPQFFSYAIVW…","""TTTTTTTTTTtMnPIfcIMGPQFFSYAIVW…",3,3,15,"""g14358"""
"""g19363.t1|H=0.156""","""UniRef50_A0A6J8ADW2""",75.8,0.758,110,1.0,0.824,145,35,0,145,1,145,176,19,163,7.5100e-98,359,"""MNVpEPEFQFSYGvTQFDsygIVcRVWCRV…","""MMVPEPEFQFSYGvAQFDyYcIVAlVWCRV…",0,3,35,"""g19363"""


In [4]:
# Distinct UniRef50 target ids across all hits, sorted by id.
# Bind the frame itself: chaining `.write_csv(path)` onto the assignment would
# store write_csv's return value (None when a path is given) in UniRef50_list.
UniRef50_list = mmseqs_result.select(
    pl.col("target")
).unique().sort(
    "target"
)

# Write one id per line without a header row.
# NOTE(review): the frame has a single column, so no field separator should
# ever be emitted; separator="\n" is kept as in the original call.
UniRef50_list.write_csv(
    "../out/akitsu_uniref50/akitsu_uniref50_list_full.txt",
    separator="\n",
    include_header=False
)

&nbsp;

&nbsp;

&nbsp;

## Filtering Step (1): query and target coverage > 0.5

In [5]:
# Keep only hits whose qcov and tcov values are both above 0.5.
qcov_above_half = pl.col("qcov") > 0.5
tcov_above_half = pl.col("tcov") > 0.5

mmseqs_result_filter1 = mmseqs_result.filter(qcov_above_half & tcov_above_half)

display(mmseqs_result_filter1)

query,target,pident,fident,nident,qcov,tcov,alnlen,mismatch,gapopen,qlen,qstart,qend,tlen,tstart,tend,evalue,bits,qaln,taln,qframe,tframe,mismatch_duplicated_0,Gene_level
str,str,f64,f64,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,str,str,i64,i64,i64,str
"""g11904.t1|H=0.318""","""UniRef50_K1Q3H1""",66.6,0.666,182,0.968,0.771,273,88,3,282,2,274,350,76,345,1.0250e-98,367,"""wCHWAchtTttmGPHPhTTTtYyHYAcvWA…","""WCHWA-HTTtTTgPHPtfTThYYHvACHWA…",1,3,88,"""g11904"""
"""g19387.t1|H=0.129""","""UniRef50_A0AAD8BRS1""",74.2,0.742,202,0.906,0.79,272,53,5,287,1,260,338,7,273,3.3420e-139,500,"""TttTTTstTmiTTTTtTTTTTTTTTTftnP…","""TTTTTTTTTttttTTTtTdtTTTTTTTtNP…",0,3,53,"""g19387"""
"""g19387.t1|H=0.129""","""UniRef50_A0A6J8ARR5""",72.6,0.726,207,0.955,0.831,285,62,6,287,12,285,337,32,311,1.3990e-138,498,"""TTTTtTTTTTTTTTTftnPIhCIMGPLfSM…","""TThtEthftTTTTTTTTNPIhCIMGPlTtt…",2,1,62,"""g19387"""
"""g19387.t1|H=0.129""","""UniRef50_A0AAN8J484""",70.5,0.705,201,0.962,0.865,285,72,4,287,12,287,326,4,285,1.8750e-133,481,"""TTTTtTTTTTTTTTTftnPIhCIMGPLfSM…","""TTTTTthttTTTTTTttNPIlcImG--sTt…",2,3,72,"""g19387"""
"""g19387.t1|H=0.129""","""UniRef50_A0AAE0YYS3""",69.6,0.696,207,0.99,0.888,297,74,7,287,1,284,331,3,296,6.7020e-132,476,"""TttTTTstTmiTTTTtTTTTTTTTTTftnP…","""tttTTttfHfyyftttTTTTTTTTThftng…",0,2,74,"""g19387"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""g14358.t1|H=0.136""","""UniRef50_UPI0014254CA3""",76.7,0.767,89,0.943,0.75,116,22,2,123,8,123,148,37,147,4.9890e-60,236,"""TTTTTTTMNPIayIMGPQFFSYAIVWACIW…","""TTTTTTtMnPIscIMGPQFFSYAIVAACIw…",1,3,22,"""g14358"""
"""g14358.t1|H=0.136""","""UniRef50_R7U3B0""",71.0,0.71,81,0.927,0.898,114,25,2,123,6,119,118,13,118,7.9560e-50,202,"""TTTTTTTTTMNPIayIMGPQFFSYAIVWAC…","""TTTTTTtTtMNPIFCIMGPQfFSYAIVWAC…",2,3,25,"""g14358"""
"""g14358.t1|H=0.136""","""UniRef50_A0A7R9NPB9""",79.7,0.797,79,0.805,0.931,99,15,2,123,4,102,101,4,97,4.1860e-48,197,"""TTTTTTTTTTTMNPIayIMGPQFFSYAIVW…","""TTTTTTTTTTtMnPIfcIMGPQFFSYAIVW…",3,3,15,"""g14358"""
"""g19363.t1|H=0.156""","""UniRef50_A0A6J8ADW2""",75.8,0.758,110,1.0,0.824,145,35,0,145,1,145,176,19,163,7.5100e-98,359,"""MNVpEPEFQFSYGvTQFDsygIVcRVWCRV…","""MMVPEPEFQFSYGvAQFDyYcIVAlVWCRV…",0,3,35,"""g19363"""


In [6]:
# Distinct UniRef50 target ids among the coverage-filtered hits, sorted by id.
# Bind the frame itself: chaining `.write_csv(path)` onto the assignment would
# store write_csv's return value (None when a path is given) in UniRef50_list.
UniRef50_list = mmseqs_result_filter1.select(
    pl.col("target")
).unique().sort(
    "target"
)

# Write one id per line without a header row.
# NOTE(review): the frame has a single column, so no field separator should
# ever be emitted; separator="\n" is kept as in the original call.
UniRef50_list.write_csv(
    "../out/akitsu_uniref50/akitsu_uniref50_list.txt",
    separator="\n",
    include_header=False
)

In [7]:
# UniRef50 idmapping (2025-12-21)
# NOTE(review): the file name below carries 20260102 while the date above says
# 2025-12-21 — confirm which one is the actual retrieval date.
idmapping_path = "../out/akitsu_uniref50/akitsu_uniref50_idmapping_20260102.tsv.gz"

# Largest clusters first; "From" is renamed to "target" so the column name
# matches the "target" column of the search-result table.
akitsu_uniref50_idmapping = (
    pl.read_csv(idmapping_path, separator="\t")
    .sort("Size", descending=True)
    .rename({"From": "target"})
)

display(akitsu_uniref50_idmapping)

target,Cluster ID,Cluster Name,Common taxon ID,Common taxon,Organism IDs,Types,Size,Organisms,Length,Identity,Cluster members,Date of last modification
str,str,str,i64,str,str,str,i64,str,i64,f64,str,str
"""UniRef50_A0A4W4G752""","""UniRef50_A0A4W4G752""","""Cluster: Rapunzel""",117571,"""Euteleostomi""","""8005; 47308; 7932; 143900; 153…","""UniProtKB Unreviewed (TrEMBL);…",285,"""Electrophorus electricus (Elec…",227,0.5,"""A0A4W4G752; A0A8C6SGQ9; A0A9D3…","""2025-10-08"""
"""UniRef50_W6R6I5""","""UniRef50_W6R6I5""","""Cluster: Putative Cro/CI famil…",379,"""Rhizobium""","""348824; 1864509; 990285; 12117…","""UniProtKB Unreviewed (TrEMBL);…",157,"""Rhizobium favelukesii; Rhizobi…",122,0.5,"""W6R6I5; A0A109J366; S3IMF5; K0…","""2025-10-08"""
"""UniRef50_A0A7K9E3E5""","""UniRef50_A0A7K9E3E5""","""Cluster: TTC3 ligase (Fragment…",8825,"""Neognathae""","""176943; 54386; 337173; 441696;…","""UniProtKB Unreviewed (TrEMBL)""",140,"""Baryphthengus martii (Rufous m…",259,0.5,"""A0A7K9E3E5; A0A7K6S777; A0A7K4…","""2025-10-08"""
"""UniRef50_A0A401PQB9""","""UniRef50_A0A401PQB9""","""Cluster: Uncharacterized prote…",7778,"""Elasmobranchii""","""75743; 137246; 7782; 685728; 1…","""UniProtKB Unreviewed (TrEMBL);…",93,"""Scyliorhinus torazame (Cloudy …",276,0.5,"""A0A401PQB9; A0A401RHJ6; UPI003…","""2025-10-08"""
"""UniRef50_A0A917DZQ8""","""UniRef50_A0A917DZQ8""","""Cluster: KfrA N-terminal DNA-b…",1224,"""Pseudomonadota""","""1703339; 3050082; 2692619; 330…","""UniProtKB Unreviewed (TrEMBL);…",85,"""Croceicoccus mobilis; Sphingos…",67,0.5,"""A0A917DZQ8; A0ABU3Q5F0; A0A7X4…","""2025-10-08"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""UniRef50_A0A9Q1BBE6""","""UniRef50_A0A9Q1BBE6""","""Cluster: Retrotransposon gag d…",206669,"""Holothuria leucospilota""","""206669""","""UniProtKB Unreviewed (TrEMBL)""",1,"""Holothuria leucospilota (Black…",204,0.5,"""A0A9Q1BBE6""","""2023-11-08"""
"""UniRef50_A0A5N5TF71""","""UniRef50_A0A5N5TF71""","""Cluster: Uncharacterized prote…",96803,"""Armadillidium nasatum""","""96803""","""UniProtKB Unreviewed (TrEMBL)""",1,"""Armadillidium nasatum""",416,0.5,"""A0A5N5TF71""","""2023-02-22"""
"""UniRef50_A0A9Q1B9A3""","""UniRef50_A0A9Q1B9A3""","""Cluster: Uncharacterized prote…",206669,"""Holothuria leucospilota""","""206669""","""UniProtKB Unreviewed (TrEMBL)""",1,"""Holothuria leucospilota (Black…",113,0.5,"""A0A9Q1B9A3""","""2023-09-13"""
"""UniRef50_A0A9W9TAG4""","""UniRef50_A0A9W9TAG4""","""Cluster: Helitron helicase-lik…",70096,"""Penicillium cinerascens""","""70096""","""UniProtKB Unreviewed (TrEMBL)""",1,"""Penicillium cinerascens""",454,0.5,"""A0A9W9TAG4""","""2024-01-24"""


In [8]:
# Row-level flag: the cluster's "Types" text mentions a Swiss-Prot (reviewed) entry.
is_reviewed_expr = (
    pl.col("Types")
    .str.contains(r"UniProtKB Reviewed \(Swiss-Prot\)")
    .alias("is_reviewed")
)
# Gene-level flag: true on every row of a gene when any of its rows is reviewed.
gene_has_reviewed_expr = (
    pl.col("is_reviewed")
    .any()
    .over("Gene_level")
    .alias("gene_has_reviewed")
)

# Attach cluster metadata to the coverage-filtered hits (inner join on "target"),
# keep hits with evalue < 1e-5, add both flags, and order by query name descending.
# The second with_columns must stay separate: it reads the column the first creates.
mmseqs_result_join = (
    mmseqs_result_filter1
    .join(akitsu_uniref50_idmapping, on="target", how="inner")
    .filter(pl.col("evalue") < 1e-5)
    .with_columns(is_reviewed_expr)
    .with_columns(gene_has_reviewed_expr)
    .sort("query", descending=True)
)

# Persist the annotated table as tab-separated text.
mmseqs_result_join.write_csv(
    "../out/akitsu_uniref50/mmseqs2_akitsu_galba_uniref50_join.tsv",
    separator="\t"
)

# Allow up to 100 characters per string cell in printed/displayed tables.
pl.Config.set_fmt_str_lengths(100)
# Optional: pl.Config.set_tbl_rows(100) to show more rows.
print(mmseqs_result_join.select("Gene_level").unique())
display(mmseqs_result_join)

shape: (418, 1)
┌────────────┐
│ Gene_level │
│ ---        │
│ str        │
╞════════════╡
│ g11054     │
│ g6459      │
│ g4885      │
│ g19898     │
│ g46056     │
│ …          │
│ g34225     │
│ g42776     │
│ g20729     │
│ g29214     │
│ g15269     │
└────────────┘


query,target,pident,fident,nident,qcov,tcov,alnlen,mismatch,gapopen,qlen,qstart,qend,tlen,tstart,tend,evalue,bits,qaln,taln,qframe,tframe,mismatch_duplicated_0,Gene_level,Cluster ID,Cluster Name,Common taxon ID,Common taxon,Organism IDs,Types,Size,Organisms,Length,Identity,Cluster members,Date of last modification,is_reviewed,gene_has_reviewed
str,str,f64,f64,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,str,str,i64,i64,i64,str,str,str,i64,str,str,str,i64,str,i64,f64,str,str,bool,bool
"""g995.t1|H=0.217""","""UniRef50_UPI002905E56B""",55.2,0.552,136,0.928,0.83,246,69,8,223,1,207,294,29,272,1.9530e-47,196,"""VaCHWAHVW----CHWAhmGsy------GHWACVWAHWACHWACWAAHsAgayFytta-------------------------ifytatg-AhwAaVAAH…","""VACHWACVWCHWACHWAHTSSYFSAGVWAHWACVAAHWACVWAHWACeWAHTTTTTTyytyytyhytHftafhfyYyYhfahIyfyTTtytasYgAvAAc…",0,1,69,"""g995""","""UniRef50_UPI002905E56B""","""Cluster: uncharacterized protein LOC132561648""",509963,"""Ylistrum balloti""","""509963""","""UniParc""",1,"""Ylistrum balloti""",294,0.5,"""UPI002905E56B""","""2024-01-24""",false,false
"""g9939.t1|H=0.236""","""UniRef50_UPI001F04033D""",58.4,0.584,114,0.761,0.828,195,74,4,251,10,200,232,13,204,3.7980e-50,206,"""NPRpQFIYYYTTTTtThFssSsasHwcHwaHwAtfTTTTHh---pHfqFtftyThhttttTtTtsfsgtfHfttfthgthfytsttftTttTTTHaHHah…","""NPRPQFIYYTTTTTftHSscHtTsyTtTttfttsttHHtHHfapphfqftYthttHFtyfTptttfythfttthqPhpq-FytHtAHHfyttlTthHfHt…",3,3,74,"""g9939""","""UniRef50_UPI001F04033D""","""Cluster: uncharacterized protein LOC124454281""",2897299,"""Xenia sp. Carnegie-2017""","""2897299""","""UniParc""",1,"""Xenia sp. Carnegie-2017""",232,0.5,"""UPI001F04033D""","""2022-04-27""",false,false
"""g9782.t1|H=0.243""","""UniRef50_K1PXM0""",84.5,0.845,82,0.942,0.924,97,15,0,103,1,97,105,1,97,4.6490e-51,206,"""TahHaAHWAhHWAHWAtTTTTTdaftTThTtTyyffyttstdyFsytghhptYtyyHtltttTtTthTTTTTTttTQFSYGVWCRVACEWAMwAtTT""","""TaHHwAHWAhHWAHwAaTTtTtdaftTtHttTyyfytTtptfyFtygghHphtTytHhttTttTTthTTTTTTTTTQFSYGVWCRVACEWAMmAgtt""",0,0,15,"""g9782""","""UniRef50_K1PXM0""","""Cluster: Uncharacterized protein""",29159,"""Magallana gigas""","""29159""","""UniProtKB Unreviewed (TrEMBL)""",1,"""Magallana gigas (Pacific oyster)""",105,0.5,"""K1PXM0""","""2025-10-08""",false,false
"""g9782.t1|H=0.243""","""UniRef50_A0A8S9RKW4""",68.6,0.686,57,0.806,0.872,83,25,1,103,19,101,94,1,82,2.0160e-21,109,"""TTTTdaftTThTtTyyffyttstdyFsytghhptYtyyHtltttTtTthTTTTTTttTQFSYGVWCRVACEWAMwAtTTgftf""","""TTTTTTTTTTttTTTTtThTTTTTyghY-gHhttTTTTTTTTTtTTTTTTTTTTTTTTQFSYGVWCRVACEWAmsYtHtTTTf""",3,0,25,"""g9782""","""UniRef50_A0A8S9RKW4""","""Cluster: Myb/SANT-like DNA-binding domain-containing protein""",69181,"""Brassica cretica""","""69181""","""UniProtKB Unreviewed (TrEMBL)""",2,"""Brassica cretica (Mustard)""",94,0.5,"""A0A8S9RKW4; A0A8S9K4T2""","""2024-11-27""",false,false
"""g9741.t1|H=0.232""","""UniRef50_A0A8S3RF68""",65.7,0.657,142,0.78,0.905,216,57,3,277,59,274,220,1,199,7.0520e-77,294,"""HyaHWACHWAHWAChWAHWACHWAHyayttTyTTyPHpqFYfHhtTtTyttTttttHtyyyytTTYTTTyTtThyyYTtyHthtTytthtttyTThwtht…","""hyAHWACHWAHWACeWAHWACHWAH---ttTHtT---------ThSttyThtsttHahyTyttLthtytTtTth-----yhHhattTtHtHttttHHtht…",1,0,57,"""g9741""","""UniRef50_A0A8S3RF68""","""Cluster: Uncharacterized protein""",6550,"""Mytilus edulis""","""6550""","""UniProtKB Unreviewed (TrEMBL)""",1,"""Mytilus edulis (Blue mussel)""",220,0.5,"""A0A8S3RF68""","""2025-10-08""",false,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""g10056.t1|H=0.215""","""UniRef50_A0A8B6DVS4""",61.6,0.616,385,0.819,0.786,625,199,10,736,124,726,771,37,642,3.5990e-269,934,"""hVACHWAgSYGgREEVREEREErRECcRcchtTTTTTHHHfHwTtPHPQFfytsYHvachwacVacHWhtt--ttthtltHHshTTTTTgMgpNPRpQFI…","""HVACHWAISYGHREEVREERREcREECeEectctTThHFSfHggHlhfghgfssyHvACcwAalathTatthiFttTTTThhtTiFsTtmMgdNPEEeFI…",3,3,199,"""g10056""","""UniRef50_A0A8B6DVS4""","""Cluster: SSD domain-containing protein""",6548,"""Mytilus""","""29158; 6549""","""UniProtKB Unreviewed (TrEMBL); UniParc""",2,"""Mytilus galloprovincialis (Mediterranean mussel); Mytilus californianus""",771,0.5,"""A0A8B6DVS4; UPI002245C333""","""2023-02-22""",false,false
"""g10056.t1|H=0.215""","""UniRef50_A0ABQ9F577""",59.4,0.594,357,0.808,0.979,601,230,7,736,138,732,606,13,605,1.1380e-245,856,"""EVREEREErRECcRcchtTTTTTHHHfHwTtPHPQFfytsYHvachwacVacHWhttttthtltHHshTTTTTgMgpNPRpQFIYTTfHqFSYGVWCHWA…","""EvREEdREvREElwchHmtmHygHssgHWpHfhpHfi------nghwAivAChWAhWwaHyTygHhHtittHTmdGDNPREqFIYTTFHqFSYGVWCHWA…",2,3,230,"""g10056""","""UniRef50_A0ABQ9F577""","""Cluster: Uncharacterized protein""",220873,"""Tegillarca granosa""","""220873""","""UniProtKB Unreviewed (TrEMBL)""",1,"""Tegillarca granosa (Malaysian cockle)""",606,0.5,"""A0ABQ9F577""","""2025-10-08""",false,false
"""g10056.t1|H=0.215""","""UniRef50_UPI00234F8357""",57.5,0.575,393,0.819,0.829,683,198,15,736,124,726,809,33,703,5.9710e-240,837,"""hVACHWAgSYGgREEVREEREErRECcRcchtTTTTTHHHfHwTtPHPQFfytsYHvachwacVacHWhttttthtltH------HshTTTTTgMgpNPR…","""HVACEWAiSYGHrEEvREEREEeREEVwEChtCtTfhHgHHhsThSH------vgrVaCHwavia--ahtttttHtLTHttlttTsTtTTTTTtDdDNPR…",3,2,198,"""g10056""","""UniRef50_UPI00234F8357""","""Cluster: uncharacterized protein LOC123533803""",6596,"""Mercenaria mercenaria""","""6596""","""UniParc""",1,"""Mercenaria mercenaria""",809,0.5,"""UPI00234F8357""","""2023-05-03""",false,false
"""g10056.t1|H=0.215""","""UniRef50_A0A7S1CF38""",55.1,0.551,353,0.825,0.795,640,225,13,736,120,726,769,25,635,1.0690e-202,714,"""HWachVACHWAgSYGgREEVREEREErRECcRcchtTTTTTHHHfHwTtPHPQFfytsYHvachwacVacHWhtttttht-ltHHshTTTTTgMgpNPRp…","""HYAcVWChvWcSSGghAaerrEErEErREEreelh----wnhhhfsSyGHHhfTfSSwAhVACHWAHEAChWTTTTtTTTstttTtstttTTmmmdNPRP…",2,3,225,"""g10056""","""UniRef50_A0A7S1CF38""","""Cluster: SSD domain-containing protein (Fragment)""",1486930,"""Bicosoecida sp. CB-2014""","""1486930""","""UniProtKB Unreviewed (TrEMBL)""",2,"""Bicosoecida sp. CB-2014""",769,0.5,"""A0A7S1CF38; A0A7S1CGL1""","""2025-04-23""",false,false


In [10]:
# Keep every row whose gene-level flag "gene_has_reviewed" is true; the boolean
# column is used directly as the filter predicate.
reviewed_gene_rows = pl.col("gene_has_reviewed")
mmseqs_result_join_filter2 = mmseqs_result_join.filter(reviewed_gene_rows)

# Show up to 100 table rows so the filtered result is displayed in full.
pl.Config.set_tbl_rows(100)
print(mmseqs_result_join_filter2.select("Gene_level").unique())
display(mmseqs_result_join_filter2)

shape: (2, 1)
┌────────────┐
│ Gene_level │
│ ---        │
│ str        │
╞════════════╡
│ g37089     │
│ g22068     │
└────────────┘


query,target,pident,fident,nident,qcov,tcov,alnlen,mismatch,gapopen,qlen,qstart,qend,tlen,tstart,tend,evalue,bits,qaln,taln,qframe,tframe,mismatch_duplicated_0,Gene_level,Cluster ID,Cluster Name,Common taxon ID,Common taxon,Organism IDs,Types,Size,Organisms,Length,Identity,Cluster members,Date of last modification,is_reviewed,gene_has_reviewed
str,str,f64,f64,i64,f64,f64,i64,i64,i64,i64,i64,i64,i64,i64,i64,f64,i64,str,str,i64,i64,i64,str,str,str,i64,str,str,str,i64,str,i64,f64,str,str,bool,bool
"""g37089.t1|H=0.213""","""UniRef50_Q4J8L6""",75.7,0.757,53,0.769,0.909,70,17,0,91,8,77,77,7,76,8.523e-33,146,"""FSYGhWAHfACgWACHWACewtcsyyswACfWAgvWACvACHWAHvWAfSyAHWACHWAHwWAHWAcHWA""","""FSYGHWAHVACHWACHWACVWACVAAHWACHWACVWAhVACHWACVWACvWAHWACHWACVWAHWACHWA""",1,3,17,"""g37089""","""UniRef50_Q4J8L6""","""Cluster: Membrane-associated ATPase epsilon chain""",2285,"""Sulfolobus acidocaldarius""","""330779; 2285; 1028567; 1028566""","""UniProtKB Reviewed (Swiss-Prot); UniProtKB Unreviewed (TrEMBL)""",4,"""Sulfolobus acidocaldarius (strain ATCC 33909 / DSM 639 / JCM 8929 / NBRC15157 / NCIMB 11770); Sulfol…",77,0.5,"""Q4J8L6; A0A0U3HJE7; M1IZ49; M1IW08""","""2023-05-03""",True,True
"""g37089.t1|H=0.213""","""UniRef50_A0A1F7UTD3""",64.8,0.648,48,0.813,0.871,74,26,0,91,8,81,85,6,79,1.113e-26,126,"""FSYGhWAHfACgWACHWACewtcsyyswACfWAgvWACvACHWAHvWAfSyAHWACHWAHwWAHWAcHWAHWwA""","""FSYGHWAHVACHWACEWACHWACHWAHvACHWACHWAHVACHWACHWAHVACHWACHWAcvWAHVACHWAHvaC""",1,2,26,"""g37089""","""UniRef50_A0A1F7UTD3""","""Cluster: Uncharacterized protein""",1802402,"""Candidatus Uhrbacteria bacterium RIFCSPLOWO2_01_FULL_47_25""","""1802402""","""UniProtKB Unreviewed (TrEMBL)""",1,"""Candidatus Uhrbacteria bacterium RIFCSPLOWO2_01_FULL_47_25""",85,0.5,"""A0A1F7UTD3""","""2025-10-08""",False,True
"""g37089.t1|H=0.213""","""UniRef50_T1BNM0""",59.4,0.594,47,0.868,0.767,79,32,0,91,7,85,103,24,102,6.745e-26,123,"""fFSYGhWAHfACgWACHWACewtcsyyswACfWAgvWACvACHWAHvWAfSyAHWACHWAHwWAHWAcHWAHWwAtlTT""","""ffSyGHWaHVACHWACVAAHwAshssaGWWCHWAqHWAGVWAHWAchwAGHWAgVAAHWACVWAHWAChWAHWACTTTT""",3,2,32,"""g37089""","""UniRef50_T1BNM0""","""Cluster: Methyl-accepting chemotaxis protein (Fragment)""",410659,"""mine drainage metagenome""","""410659""","""UniProtKB Unreviewed (TrEMBL)""",1,"""mine drainage metagenome""",103,0.5,"""T1BNM0""","""2023-05-03""",False,True
"""g37089.t1|H=0.213""","""UniRef50_A0A9P8AI69""",62.1,0.621,46,0.813,0.804,74,28,0,91,8,81,92,10,83,1.3870000000000002e-25,122,"""FSYGhWAHfACgWACHWACewtcsyyswACfWAgvWACvACHWAHvWAfSyAHWACHWAHwWAHWAcHWAHWwA""","""FSYGhWAcVAAHWACVWACVAAHWACHWAHFsAgVAAHWACVWAHWACHWAcHWAcVWAHWWCHWACHWAHWWA""",1,3,28,"""g37089""","""UniRef50_A0A9P8AI69""","""Cluster: Uncharacterized protein""",45513,"""Scheffersomyces spartinae""","""45513""","""UniProtKB Unreviewed (TrEMBL)""",1,"""Scheffersomyces spartinae""",92,0.5,"""A0A9P8AI69""","""2023-09-13""",False,True
"""g37089.t1|H=0.213""","""UniRef50_A0AB34Z524""",65.7,0.657,46,0.769,0.854,70,24,0,91,8,77,82,4,73,5.086e-24,117,"""FSYGhWAHfACgWACHWACewtcsyyswACfWAgvWACvACHWAHvWAfSyAHWACHWAHwWAHWAcHWA""","""FSYGHWCHVACHWACvwaHftFFSYGHVACHWACHWAHVACHWAHfttFSYGHWAcVWAHVACHWACHWA""",1,3,24,"""g37089""","""UniRef50_A0AB34Z524""","""Cluster: Plasmid partition protein""",64895,"""Borreliella""","""29518; 64895""","""UniProtKB Unreviewed (TrEMBL); UniParc""",3,"""Borreliella afzelii (Borrelia afzelii); Borreliella""",82,0.5,"""A0AB34Z524; UPI00017F3DF0; UPI001F1A5C45""","""2025-04-23""",False,True
"""g22068.t1|H=0.168""","""UniRef50_K1QKN5""",81.4,0.814,167,0.83,0.949,205,38,0,247,43,247,216,12,216,5.223e-139,499,"""WCHVWhHtssgHVWCHWARVWChWArVsrVwcRVAREWChEAhtTQsYHsACHWACVACHWACVWAHVACHWAHVACHgaHgCcaCHcACvACHWACVAC…","""wAHhwchfwwfHVWChWARVWChWarVsRVWcRVAREWChEaTTTqyyHvACHWACVACHWACVWCHWACHWAHVACHfaChHccCHwACvACHWACVAC…",3,2,38,"""g22068""","""UniRef50_K1QKN5""","""Cluster: Uncharacterized protein""",29159,"""Magallana gigas""","""29159""","""UniProtKB Unreviewed (TrEMBL)""",1,"""Magallana gigas (Pacific oyster)""",216,0.5,"""K1QKN5""","""2025-10-08""",False,True
"""g22068.t1|H=0.168""","""UniRef50_UPI0014254D8B""",69.5,0.695,162,0.927,0.885,233,57,5,247,7,235,252,4,226,4.4950000000000004e-114,417,"""hrvReeEEEekrpLtttTfTHFSYHWAchWCrVAreWCHVWhHtssgH---VWCHWARVWChWArVsrVwcRVAREWChEAhtTQsYHsACHWACVACHW…","""hrrRREREEEEnpL-----ygSYyHVAhHWARVArvWArvAHEWaSgHsAgHWchVARVWcewSrVSCvWaRVAREWcHE-tTTqssGvACHWACVWCHW…",3,3,57,"""g22068""","""UniRef50_UPI0014254D8B""","""Cluster: uncharacterized protein LOC117115383""",1529436,"""Anneissia japonica""","""1529436""","""UniParc""",1,"""Anneissia japonica""",252,0.5,"""UPI0014254D8B""","""2020-06-17""",False,True
"""g22068.t1|H=0.168""","""UniRef50_A0A8S3VCH5""",68.9,0.689,158,0.915,0.914,229,62,6,247,7,232,244,4,226,5.7849999999999996e-111,407,"""hrvReeEEEekrpLtttTfTHFSYHWAchWCrVAreWCHVWhH--tssgHVWCHWARVWChWArVsrVwcRVAREWChEAhtTQsYHsACHWACVACHWA…","""HrvREEREVe-npLfttthFh-YYgvAgHWaRVArvWARvWChWalhSYGhWcrVARrWArWArVSCvWARVWREWAHE-tTTQwSHVACHWACVaCHWA…",3,3,62,"""g22068""","""UniRef50_A0A8S3VCH5""","""Cluster: Uncharacterized protein""",6550,"""Mytilus edulis""","""6550""","""UniProtKB Unreviewed (TrEMBL)""",1,"""Mytilus edulis (Blue mussel)""",244,0.5,"""A0A8S3VCH5""","""2023-02-22""",False,True
"""g22068.t1|H=0.168""","""UniRef50_V4AIB3""",65.2,0.652,163,0.996,0.881,250,70,6,247,2,247,269,9,245,5.436e-106,390,"""FshhHhrvReeEEEekrpLtttTfTHFSYHWAchWCrVAreWCHVWhHtssgHVWCHWARVWChWArVsrVwcRVAREWChEAhtTQsYHsACHWACVAC…","""fsYhhRRRreeEeEepndLltH-----yyhvahHWARVAhvWCrVWceWAqfHvaRVWaRVWCrWarvSCvWARVWREWAHE-tTTqfSHVACHWACVAC…",1,2,70,"""g22068""","""UniRef50_V4AIB3""","""Cluster: Uncharacterized protein""",225164,"""Lottia gigantea""","""225164""","""UniProtKB Unreviewed (TrEMBL)""",2,"""Lottia gigantea (Giant owl limpet)""",269,0.5,"""V4AIB3; V3ZYS2""","""2023-02-22""",False,True
"""g22068.t1|H=0.168""","""UniRef50_A0AA89BKK7""",69.8,0.698,160,0.919,0.802,229,53,7,247,8,234,268,9,223,2.274e-105,388,"""rvReeEEEekrpLtttTfTHFSY-HWAchWCrVAreWCHVWhHtssgHVWCHWARVWChWArVsrVwcRVAREWChEAhtTQsYHsACHWACVACHWACV…","""rRReEEehvprpLTTq----FSYGHWAcVWChVARvWCrVWHH-vaghvWch---VACHwaRVaCvVARVWrEWAH--HTTqfSHvACHWACVACHWACV…",1,2,53,"""g22068""","""UniRef50_A0AA89BKK7""","""Cluster: Uncharacterized protein""",66713,"""Pinctada imbricata""","""66713""","""UniProtKB Unreviewed (TrEMBL)""",1,"""Pinctada imbricata (Atlantic pearl-oyster)""",268,0.5,"""A0AA89BKK7""","""2024-03-27""",False,True
