In [1]:
from collections import Counter

import plotly.express as px
import pyfaidx 
import pandas as pd

In [28]:
files = ('../data/IIPR002547.genes.fasta', '../random.fasta', "../IPR045863.faa")

# Step 1: open each FASTA file with pyfaidx, keyed by its path.
datasets = dict(zip(files, map(pyfaidx.Fasta, files)))

# Step 2: per file, keep only the records whose string form consists solely of
# the upper-case characters A, C, G and T. The surviving entries stay as the
# objects yielded by iterating the pyfaidx.Fasta (they are not converted to str).
datasets = {
    path: [record for record in fasta if set(str(record)) <= set("ACGT")]
    for path, fasta in datasets.items()
}



In [29]:
# Long-format table: one (source file, sequence length) row per retained gene.
df = pd.DataFrame.from_records(
    [
        (path, len(record))
        for path, records in datasets.items()
        for record in records
    ],
    columns=['filename', 'gene_length'],
)

# Violin plot with gene length on the x axis, coloured by source file. Being the
# last expression of the cell, the figure is what the cell displays.
px.violin(df, x='gene_length', color='filename')

In [30]:
# Build (file, nucleotide, relative frequency) rows, where the frequency is the
# character's share of all characters across that file's retained genes.
records = []
for path, sequences in datasets.items():
    tally = Counter()
    for sequence in sequences:
        # Count characters gene by gene; keys appear in first-seen order.
        tally.update(str(sequence))
    total = sum(tally.values())
    # An empty tally yields no rows, so the division never sees total == 0.
    records.extend((path, base, hits / total) for base, hits in tally.items())

df = pd.DataFrame.from_records(records, columns=["filename", "nucleotide", "Relative Frequency"])

# Grouped bars: one bar per (nucleotide, file) pair.
px.bar(df, x="nucleotide", y="Relative Frequency", color="filename", barmode="group")

In [31]:
def kmerize(sequence: str, k: int = 3) -> list:
    """Split ``sequence`` into all of its overlapping substrings of length ``k``.

    Args:
        sequence: The string to split.
        k: Length of each k-mer; must be at least 1. Defaults to 3.

    Returns:
        The k-mers ordered by start position, one for every start index from
        0 through ``len(sequence) - k`` inclusive. The list is empty when
        ``sequence`` is shorter than ``k``.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be a positive integer, got {k}")
    # A string of length n has n - k + 1 windows of width k; the "+ 1" keeps the
    # final window, which ends exactly on the last character.
    return [sequence[i:i + k] for i in range(len(sequence) - k + 1)]

# Build (file, k-mer, relative frequency) rows: each k-mer's count divided by
# the total number of k-mers that kmerize() returned for that file's genes.
records = []
for path, sequences in datasets.items():
    tally = Counter(
        kmer
        for sequence in sequences
        for kmer in kmerize(str(sequence))
    )
    total = sum(tally.values())
    # An empty tally yields no rows, so the division never sees total == 0.
    records.extend((path, kmer, hits / total) for kmer, hits in tally.items())

df = pd.DataFrame.from_records(records, columns=["filename", "kmer", "Relative Frequency"])

# One violin per file over the k-mer frequencies; every point is drawn and
# labelled with its k-mer on hover.
px.violin(df, y="Relative Frequency", color="filename", violinmode="group", points="all", hover_name="kmer")