In [3]:
import pandas as pd

quadruplexes = pd.read_csv("quadruplexes.txt", sep="\t", header=None)
quadruplexes.columns = ["seqid", "start", "end", "name", "score", "strand"]

gff = pd.read_csv("genomic.gff", sep="\t", comment='#', header=None)
gff.columns = ["seqid", "source", "type", "start", "end", "score", "strand", "phase", "attributes"]

In [4]:
quadruplexes

Unnamed: 0,seqid,start,end,name,score,strand
0,NC_081552.1,255454,255479,G4_+,0,+
1,NC_081552.1,357388,357409,G4_+,0,+
2,NC_081552.1,392981,393000,G4_+,0,+
3,NC_081552.1,497424,497452,G4_+,0,+
4,NC_081552.1,516530,516554,G4_+,0,+
...,...,...,...,...,...,...
19425,NW_026739410.1,48301,48328,G4_+,0,+
19426,NW_026739413.1,15247,15274,G4_+,0,+
19427,NW_026739413.1,28565,28585,G4_+,0,+
19428,NW_026739413.1,29088,29111,G4_+,0,+


In [5]:
gff

Unnamed: 0,seqid,source,type,start,end,score,strand,phase,attributes
0,NC_081552.1,RefSeq,region,1,271964629,.,+,.,ID=NC_081552.1:1..271964629;Dbxref=taxon:35570...
1,NC_081552.1,Gnomon,gene,966,22098,.,+,.,ID=gene-LOC131997755;Dbxref=GeneID:131997755;N...
2,NC_081552.1,Gnomon,lnc_RNA,966,22098,.,+,.,ID=rna-XR_009398415.1;Parent=gene-LOC131997755...
3,NC_081552.1,Gnomon,exon,966,1124,.,+,.,ID=exon-XR_009398415.1-1;Parent=rna-XR_0093984...
4,NC_081552.1,Gnomon,exon,1185,3100,.,+,.,ID=exon-XR_009398415.1-2;Parent=rna-XR_0093984...
...,...,...,...,...,...,...,...,...,...
378417,NW_026739412.1,RefSeq,region,1,29435,.,+,.,ID=NW_026739412.1:1..29435;Dbxref=taxon:35570;...
378418,NW_026739413.1,RefSeq,region,1,29751,.,+,.,ID=NW_026739413.1:1..29751;Dbxref=taxon:35570;...
378419,NW_026739414.1,RefSeq,region,1,38434,.,+,.,ID=NW_026739414.1:1..38434;Dbxref=taxon:35570;...
378420,NW_026739415.1,RefSeq,region,1,26270,.,+,.,ID=NW_026739415.1:1..26270;Dbxref=taxon:35570;...


In [6]:
def annotate_feature(quad, feature_df, label):
    results = []
    for i, row in quad.iterrows():
        overlap = feature_df[
            (feature_df["seqid"] == row["seqid"]) &
            (feature_df["start"] <= row["end"]) &
            (feature_df["end"] >= row["start"])
        ]
        if not overlap.empty:
            results.append(label)
        else:
            results.append(None)
    return results

In [7]:
quadruplexes["Exon"] = annotate_feature(quadruplexes, gff[gff["type"] == "exon"], "exon")
quadruplexes["Intron"] = annotate_feature(quadruplexes, gff[gff["type"] == "intron"], "intron")
quadruplexes["Gene"] = annotate_feature(quadruplexes, gff[gff["type"] == "gene"], "gene")

genes = gff[gff["type"] == "gene"].copy()
promoters = genes.copy()
promoters["start"], promoters["end"] = promoters["start"] - 1000, promoters["start"]

quadruplexes["Promoter"] = annotate_feature(quadruplexes, promoters, "promoter")

downstream = genes.copy()
downstream["start"], downstream["end"] = downstream["end"], downstream["end"] + 200
quadruplexes["Downstream"] = annotate_feature(quadruplexes, downstream, "downstream")

In [8]:
def count_and_fraction(df, col):
    count = df[col].notna().sum()
    fraction = count / len(df)
    return count, round(fraction, 3)

stats = {
    "Exons": count_and_fraction(quadruplexes, "Exon"),
    "Introns": count_and_fraction(quadruplexes, "Intron"),
    "Promoters": count_and_fraction(quadruplexes, "Promoter"),
    "Downstream": count_and_fraction(quadruplexes, "Downstream"),
    "Genes": count_and_fraction(quadruplexes, "Gene"),
    "Intergenic": (len(quadruplexes) - sum(quadruplexes[["Exon", "Intron", "Gene"]].notna().any(axis=1)),
                   round((len(quadruplexes) - sum(quadruplexes[["Exon", "Intron", "Gene"]].notna().any(axis=1))) / len(quadruplexes), 3))
}

table = pd.DataFrame(stats, index=["Count", "Fraction"]).T
table

Unnamed: 0,Count,Fraction
Exons,504.0,0.026
Introns,0.0,0.0
Promoters,279.0,0.014
Downstream,89.0,0.005
Genes,9878.0,0.508
Intergenic,9545.0,0.491
