In [1]:
import polars as pl

In [19]:
# Load the headerless, tab-separated ggsearch result table, naming its 13 columns,
# then order the hits by query (descending) and, within a query, by e-value (ascending).
ggsearch_result_akitsu_uniref50 = (
    pl.read_csv(
        "../out/akitsu_uniref50/ggsearch_akitsu_galba_uniref50.tsv.gz",
        has_header=False,
        separator="\t",
        new_columns=[
            "query", "target",
            "% identity", "alignment length", "mismatches", "gap opens",
            "q. start", "q. end", "s. start", "s. end",
            "e-value", "bit score", "aln_code",
        ],
    )
    .sort("query", "e-value", descending=[True, False])
    .with_columns(
        # Keep only the leading gene ID (e.g., "g39065") of each query name.
        Gene_level=pl.col("query").str.extract(r'^(g\d+)', group_index=1)
    )
)

# Distinct query names first, then the full hit table.
print(ggsearch_result_akitsu_uniref50.select(pl.col("query")).unique())
display(ggsearch_result_akitsu_uniref50)

shape: (673, 1)
┌───────────────────┐
│ query             │
│ ---               │
│ str               │
╞═══════════════════╡
│ g995.t1|H=0.217   │
│ g9939.t1|H=0.236  │
│ g9846.t1|H=0.240  │
│ g9844.t1|H=0.196  │
│ g9815.t1|H=0.224  │
│ …                 │
│ g10605.t1|H=0.168 │
│ g10527.t1|H=0.152 │
│ g10526.t1|H=0.228 │
│ g10266.t1|H=0.254 │
│ g10056.t1|H=0.215 │
└───────────────────┘


query,target,% identity,alignment length,mismatches,gap opens,q. start,q. end,s. start,s. end,e-value,bit score,aln_code,Gene_level
str,str,f64,i64,i64,i64,i64,i64,i64,i64,f64,f64,str,str
"""g995.t1|H=0.217""","""UniRef50_A0A7S0KRN1""",65.15,198,69,41,1,223,1,214,0.55,26.3,"""14M5D21M15D21M1I13M2I28M13I53M…","""g995"""
"""g995.t1|H=0.217""","""UniRef50_A0A7J7ZJC7""",54.13,218,100,17,1,223,1,230,1.4,26.1,"""1I6M8I10M1I26M2D31M3D16M2I129M""","""g995"""
"""g995.t1|H=0.217""","""UniRef50_UPI002905E56B""",60.54,223,88,71,1,223,1,294,1.8,26.4,"""15M5I33M58I7M1I73M1I85M4I7M2I3…","""g995"""
"""g995.t1|H=0.217""","""UniRef50_A0AAD4GF09""",53.05,213,100,23,1,223,1,226,2.8,25.8,"""14M2I29M9D43M1D38M2I65M9I24M""","""g995"""
"""g995.t1|H=0.217""","""UniRef50_A0A7S2BYD6""",59.09,198,81,29,1,223,1,202,2.9,25.7,"""2D12M2D29M10D40M1D20M8D58M2D20…","""g995"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""g10056.t1|H=0.215""","""UniRef50_UPI00295B9E7A""",54.83,600,271,171,1,736,1,635,4.4,28.5,"""5M6D41M4D59M3I9M3D11M3I29M8I30…","""g10056"""
"""g10056.t1|H=0.215""","""UniRef50_A0ABP0PNC2""",58.26,642,268,267,1,736,1,815,5.6,28.8,"""29M5D43M1I22M11D56M7D24M5I7M18…","""g10056"""
"""g10056.t1|H=0.215""","""UniRef50_A0A8B7YIE5""",55.24,639,286,176,1,736,1,718,6.0,28.5,"""8M4D63M23D7M15D36M4D13M2D23M14…","""g10056"""
"""g10056.t1|H=0.215""","""UniRef50_L8GH18""",54.12,643,295,185,1,736,1,735,6.3,28.6,"""1M3I24M13I32M5I67M3I39M9D7M4D1…","""g10056"""


&nbsp;

&nbsp;

## Filtering Condition (1)

In [20]:
# Condition (1): keep only the hits whose e-value is strictly below 1e-5.
ggsearch_result_akitsu_uniref50_filter1 = ggsearch_result_akitsu_uniref50.filter(
    pl.col("e-value").lt(1e-5)
)

# Distinct query names that still have at least one hit, then the filtered table.
print(ggsearch_result_akitsu_uniref50_filter1.select(pl.col("query")).unique())
display(ggsearch_result_akitsu_uniref50_filter1)

shape: (254, 1)
┌───────────────────┐
│ query             │
│ ---               │
│ str               │
╞═══════════════════╡
│ g9939.t1|H=0.236  │
│ g9844.t1|H=0.196  │
│ g9782.t1|H=0.243  │
│ g9741.t1|H=0.232  │
│ g9634.t1|H=0.158  │
│ …                 │
│ g10796.t1|H=0.162 │
│ g10752.t1|H=0.229 │
│ g10605.t1|H=0.168 │
│ g10527.t1|H=0.152 │
│ g10056.t1|H=0.215 │
└───────────────────┘


query,target,% identity,alignment length,mismatches,gap opens,q. start,q. end,s. start,s. end,e-value,bit score,aln_code,Gene_level
str,str,f64,i64,i64,i64,i64,i64,i64,i64,f64,f64,str,str
"""g9939.t1|H=0.236""","""UniRef50_A0A8B6HFM2""",56.4,250,109,34,1,251,1,283,0.0000053,30.2,"""1M16I25M7I131M1I25M1D68M9I""","""g9939"""
"""g9844.t1|H=0.196""","""UniRef50_E9I502""",68.45,168,53,36,1,168,1,204,2.1000e-11,32.0,"""3M2I3M13I12M11I77M1I31M1I3M1I6…","""g9844"""
"""g9844.t1|H=0.196""","""UniRef50_A0AAD9IWH6""",70.39,152,45,40,1,168,1,176,3.1000e-11,31.7,"""1M8I20M9I91M4I15M3I25M16D""","""g9844"""
"""g9844.t1|H=0.196""","""UniRef50_UPI001F5F1F47""",65.77,149,51,23,1,168,1,153,1.2000e-10,31.2,"""13M3I55M1D42M1D14M17D16M1I9M""","""g9844"""
"""g9844.t1|H=0.196""","""UniRef50_A0A8S3UHN7""",61.35,163,63,22,1,168,1,180,5.8000e-10,31.1,"""3M4D110M1I12M2I3M14I32M1D3M""","""g9844"""
…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""g10527.t1|H=0.152""","""UniRef50_A0A6J8BHQ9""",72.68,366,100,141,1,413,1,460,0.0000021,31.8,"""32D47M14D128M1I22M3I5M35I10M40…","""g10527"""
"""g10527.t1|H=0.152""","""UniRef50_A0A8S3TG76""",77.96,363,80,182,1,413,1,495,0.0000044,31.7,"""6I8M49I232M1I3M3I3M9I7M32I7M26…","""g10527"""
"""g10527.t1|H=0.152""","""UniRef50_A0A8J2S6D4""",63.25,400,147,50,1,413,1,437,0.0000053,31.5,"""29M3I27M3I66M3D70M5I5M9I17M2D2…","""g10527"""
"""g10056.t1|H=0.215""","""UniRef50_K1R9P2""",78.37,726,157,12,1,736,1,728,1.7000e-25,41.6,"""1M9D698M1D20M1I4M1I3M""","""g10056"""


In [21]:
# Distinct UniRef50 target IDs among the hits that passed condition (1), sorted by ID.
UniRef50_list = (
    ggsearch_result_akitsu_uniref50_filter1
    .select(pl.col("target"))
    .unique()
    .sort("target")
)

# `write_csv` returns None when it is given a path, so it is called as a separate
# statement; chaining it onto the assignment above would bind `UniRef50_list` to None.
# The frame has a single column and no header is written, so the file holds one ID per line.
UniRef50_list.write_csv(
    "../out/akitsu_uniref50/akitsu_uniref50_list_ggsearch.txt",
    separator="\n",
    include_header=False,
)

In [22]:
# Read the tab-separated ID-mapping table, put the largest clusters ("Size") first,
# and rename its "From" column to "target" so it shares a key name with the hit table.
akitsu_uniref50_list_ggsearch = (
    pl.read_csv(
        "../out/akitsu_uniref50/akitsu_uniref50_ggsearch_idmapping_20251226.tsv.gz",
        separator="\t",
    )
    .sort(by="Size", descending=True)
    .rename({"From": "target"})
)

display(akitsu_uniref50_list_ggsearch)

target,Cluster ID,Cluster Name,Common taxon ID,Common taxon,Organism IDs,Types,Size,Organisms,Length,Identity,Cluster members,Date of last modification
str,str,str,i64,str,str,str,i64,str,i64,f64,str,str
"""UniRef50_A0A2X4UG93""","""UniRef50_A0A2X4UG93""","""Cluster: Exodeoxyribonuclease …",2,"""Bacteria""","""38310; 619741; 372655; 1179773…","""UniProtKB Unreviewed (TrEMBL);…",6373,"""Rhodococcus coprophilus; Longi…",302,0.5,"""A0A2X4UG93; A0A8J7H4X2; A0ABR9…","""2025-10-08"""
"""UniRef50_P0A8U2""","""UniRef50_P0A8U2""","""Cluster: UPF0294 protein YafD""",131567,"""cellular organisms""","""83333; 331111; 574521; 585035;…","""UniProtKB Reviewed (Swiss-Prot…",2804,"""Escherichia coli (strain K12);…",266,0.5,"""P0A8U2; A7ZHU5; B7UJA5; B7MBI5…","""2025-10-08"""
"""UniRef50_A9BTM9""","""UniRef50_A9BTM9""","""Cluster: Glutamine cyclotransf…",3379134,"""Pseudomonadati""","""398578; 80866; 352475; 930166;…","""UniProtKB Unreviewed (TrEMBL);…",2662,"""Delftia acidovorans (strain DS…",218,0.5,"""A9BTM9; A0AAJ2R215; A0A6N8TH80…","""2025-10-08"""
"""UniRef50_A0A4W5M1C4""","""UniRef50_A0A4W5M1C4""","""Cluster: exodeoxyribonuclease …",131567,"""cellular organisms""","""62062; 2747562; 667676; 170157…","""UniProtKB Unreviewed (TrEMBL);…",2203,"""Hucho hucho (huchen); Paraburk…",290,0.5,"""A0A4W5M1C4; A0ABU8IQ76; A0A1H6…","""2025-10-08"""
"""UniRef50_Q9UEW8""","""UniRef50_Q9UEW8""","""Cluster: STE20/SPS1-related pr…",33213,"""Bilateria""","""9606; 10116; 9823; 10090; 9555…","""UniProtKB Reviewed (Swiss-Prot…",1614,"""Homo sapiens (Human); Rattus n…",545,0.5,"""Q9UEW8; O88506; Q863I2; Q9Z1W9…","""2025-10-08"""
…,…,…,…,…,…,…,…,…,…,…,…,…
"""UniRef50_UPI0034E58395""","""UniRef50_UPI0034E58395""","""Cluster: uncharacterized prote…",159855,"""Acropora muricata""","""159855""","""UniParc""",1,"""Acropora muricata""",521,0.5,"""UPI0034E58395""","""2024-11-27"""
"""UniRef50_A0AAD4ES69""","""UniRef50_A0AAD4ES69""","""Cluster: Mitochondrial divisio…",669026,"""Staphylotrichum longicolle""","""669026""","""UniProtKB Unreviewed (TrEMBL)""",1,"""Staphylotrichum longicolle""",229,0.5,"""A0AAD4ES69""","""2025-02-05"""
"""UniRef50_UPI003B27FD00""","""UniRef50_UPI003B27FD00""","""Cluster: uncharacterized prote…",7038,"""Bemisia tabaci""","""7038""","""UniParc""",1,"""Bemisia tabaci""",287,0.5,"""UPI003B27FD00""","""2025-06-18"""
"""UniRef50_UPI003B284458""","""UniRef50_UPI003B284458""","""Cluster: uncharacterized prote…",7038,"""Bemisia tabaci""","""7038""","""UniParc""",1,"""Bemisia tabaci""",388,0.5,"""UPI003B284458""","""2025-06-18"""


In [24]:
# Attach the cluster annotation to every filtered hit and flag reviewed clusters.
ggsearch_result_join = (
    ggsearch_result_akitsu_uniref50_filter1
    # Inner join: hits whose target is absent from the mapping table are dropped.
    .join(akitsu_uniref50_list_ggsearch, on="target", how="inner")
    # Same e-value cut-off as condition (1); kept as a guard on the joined table.
    .filter(pl.col("e-value").lt(1e-5))
    .with_columns(
        # True when the cluster's "Types" field mentions a Swiss-Prot (reviewed) member.
        is_reviewed=pl.col("Types").str.contains(r"UniProtKB Reviewed \(Swiss-Prot\)")
    )
    .with_columns(
        # Per gene: True on every row when any hit of that gene is reviewed.
        gene_has_reviewed=pl.col("is_reviewed").any().over("Gene_level")
    )
    # Query descending, then e-value ascending within each query.
    .sort("query", "e-value", descending=[True, False])
)

display(ggsearch_result_join)

query,target,% identity,alignment length,mismatches,gap opens,q. start,q. end,s. start,s. end,e-value,bit score,aln_code,Gene_level,Cluster ID,Cluster Name,Common taxon ID,Common taxon,Organism IDs,Types,Size,Organisms,Length,Identity,Cluster members,Date of last modification,is_reviewed,gene_has_reviewed
str,str,f64,i64,i64,i64,i64,i64,i64,i64,f64,f64,str,str,str,str,i64,str,str,str,i64,str,i64,f64,str,str,bool,bool
"""g9939.t1|H=0.236""","""UniRef50_A0A8B6HFM2""",56.4,250,109,34,1,251,1,283,0.0000053,30.2,"""1M16I25M7I131M1I25M1D68M9I""","""g9939""","""UniRef50_A0A8B6HFM2""","""Cluster: Uncharacterized prote…",6548,"""Mytilus""","""29158; 6550""","""UniProtKB Unreviewed (TrEMBL)""",2,"""Mytilus galloprovincialis (Med…",283,0.5,"""A0A8B6HFM2; A0A8S3Q6A3""","""2025-04-23""",false,false
"""g9844.t1|H=0.196""","""UniRef50_E9I502""",68.45,168,53,36,1,168,1,204,2.1000e-11,32.0,"""3M2I3M13I12M11I77M1I31M1I3M1I6…","""g9844""","""UniRef50_E9I502""","""Cluster: Endonuclease/exonucle…",6669,"""Daphnia pulex""","""6669""","""UniProtKB Unreviewed (TrEMBL)""",2,"""Daphnia pulex (Water flea)""",204,0.5,"""E9I502; E9I5P8""","""2025-02-05""",false,false
"""g9844.t1|H=0.196""","""UniRef50_A0AAD9IWH6""",70.39,152,45,40,1,168,1,176,3.1000e-11,31.7,"""1M8I20M9I91M4I15M3I25M16D""","""g9844""","""UniRef50_A0AAD9IWH6""","""Cluster: Endonuclease/exonucle…",53620,"""Paralvinella palmiformis""","""53620""","""UniProtKB Unreviewed (TrEMBL)""",3,"""Paralvinella palmiformis""",176,0.5,"""A0AAD9IWH6; A0AAD9IV16; A0AAD9…","""2024-11-27""",false,false
"""g9844.t1|H=0.196""","""UniRef50_UPI001F5F1F47""",65.77,149,51,23,1,168,1,153,1.2000e-10,31.2,"""13M3I55M1D42M1D14M17D16M1I9M""","""g9844""","""UniRef50_UPI001F5F1F47""","""Cluster: endonuclease/exonucle…",1561004,"""Candidatus Ichthyocystis sparu…","""1561004""","""UniParc""",1,"""Candidatus Ichthyocystis sparu…",153,0.5,"""UPI001F5F1F47""","""2023-11-08""",false,false
"""g9844.t1|H=0.196""","""UniRef50_A0A8S3UHN7""",61.35,163,63,22,1,168,1,180,5.8000e-10,31.1,"""3M4D110M1I12M2I3M14I32M1D3M""","""g9844""","""UniRef50_A0A8S3UHN7""","""Cluster: Endonuclease/exonucle…",6550,"""Mytilus edulis""","""6550""","""UniProtKB Unreviewed (TrEMBL)""",1,"""Mytilus edulis (Blue mussel)""",180,0.5,"""A0A8S3UHN7""","""2023-05-03""",false,false
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
"""g10527.t1|H=0.152""","""UniRef50_A0A6J8BHQ9""",72.68,366,100,141,1,413,1,460,0.0000021,31.8,"""32D47M14D128M1I22M3I5M35I10M40…","""g10527""","""UniRef50_A0A6J8BHQ9""","""Cluster: Envelope fusion prote…",42192,"""Mytilus coruscus""","""42192""","""UniProtKB Unreviewed (TrEMBL)""",2,"""Mytilus coruscus (Sea mussel)""",460,0.5,"""A0A6J8BHQ9; A0A6J8C9A6""","""2025-04-23""",false,false
"""g10527.t1|H=0.152""","""UniRef50_A0A8S3TG76""",77.96,363,80,182,1,413,1,495,0.0000044,31.7,"""6I8M49I232M1I3M3I3M9I7M32I7M26…","""g10527""","""UniRef50_A0A8S3TG76""","""Cluster: Envelope fusion prote…",6548,"""Mytilus""","""6550; 6549""","""UniProtKB Unreviewed (TrEMBL);…",3,"""Mytilus edulis (Blue mussel); …",495,0.5,"""A0A8S3TG76; UPI002246A8B2; UPI…","""2023-05-03""",false,false
"""g10527.t1|H=0.152""","""UniRef50_A0A8J2S6D4""",63.25,400,147,50,1,413,1,437,0.0000053,31.5,"""29M3I27M3I66M3D70M5I5M9I17M2D2…","""g10527""","""UniRef50_A0A8J2S6D4""","""Cluster: Uncharacterized prote…",27404,"""Daphnia galeata""","""27404""","""UniProtKB Unreviewed (TrEMBL)""",1,"""Daphnia galeata""",437,0.5,"""A0A8J2S6D4""","""2023-02-22""",false,false
"""g10056.t1|H=0.215""","""UniRef50_K1R9P2""",78.37,726,157,12,1,736,1,728,1.7000e-25,41.6,"""1M9D698M1D20M1I4M1I3M""","""g10056""","""UniRef50_K1R9P2""","""Cluster: Dispatched-like prote…",29159,"""Magallana gigas""","""29159""","""UniProtKB Unreviewed (TrEMBL)""",1,"""Magallana gigas (Pacific oyste…",728,0.5,"""K1R9P2""","""2025-10-08""",false,false
