In [None]:
# libraries
import os
import re
import numpy as np
import pandas as pd
from Bio import SeqIO
from tqdm.auto import tqdm
import pickle as pkl
import itertools
from scipy.stats import pearsonr

In [None]:
# Lookup tables between integer ids (0..63) and codon strings.
# Ids follow itertools.product order over 'A', 'T', 'C', 'G' (last base varies fastest).
id_to_codon = dict(enumerate(map(''.join, itertools.product('ATCG', repeat=3))))
codon_to_id = {codon: idx for idx, codon in id_to_codon.items()}

def make_dataframe(
    ribo_fpath: str, df_trans_to_seq, count_norm: str = "mean"
):
    """Load one ribosome count file and add per-gene normalised counts.

    Parameters
    ----------
    ribo_fpath : str
        Path to a space-separated file. The columns ``gene``, ``transcript``,
        ``position_A_site`` and ``count`` are used; ``count`` is renamed to
        ``counts``.
    df_trans_to_seq : pandas.DataFrame
        Frame merged onto the count data on their shared columns. It must
        provide ``n_codons``, which is used to drop positions past the end of
        the sequence.
    count_norm : {"mean", "max", "sum"}
        Statistic each gene's counts are divided by.

    Returns
    -------
    pandas.DataFrame
        The merged rows with ``position_A_site <= n_codons * 3``, plus the
        columns ``fname`` (file name up to its first dot) and ``norm_counts``.

    Raises
    ------
    ValueError
        If ``count_norm`` is not one of the supported names. This is checked
        before the file is read so that a typo fails immediately.
    """
    # Count normalisation functions, keyed by their public name
    normalisers = {
        "max": lambda x: x / x.max(),
        "mean": lambda x: x / x.mean(),
        "sum": lambda x: x / x.sum(),
    }
    if count_norm not in normalisers:
        raise ValueError(
            f"Unknown count_norm {count_norm!r}; expected one of {sorted(normalisers)}"
        )
    f_norm = normalisers[count_norm]

    ribo_fname = ribo_fpath.split("/")[-1].split(".")[0]

    # Import dataset with ribosome data
    df_ribo = pd.read_csv(
        ribo_fpath,
        sep=" ",
        on_bad_lines="warn",
        dtype=dict(gene="category", transcript="category"),
    ).rename(columns={"count": "counts"})

    # Create final dataframe
    final_df = (
        df_ribo.merge(df_trans_to_seq).assign(fname=ribo_fname)
        # Filter spurious positions at the end of the sequence
        .query("position_A_site <= n_codons * 3")
        # Normalise after filtering so dropped positions do not affect the statistic
        .assign(
            norm_counts=lambda df: df.groupby("gene", observed=True)[
                "counts"
            ].transform(f_norm)
        )
    )

    return final_df


def make_all_dataframes(
    fasta_path: str,
    rb_path: str,
    max_n_codons: int = 2000,
    count_norm: str = "mean",
):
    """Build one row per transcript with its sequence and normalised counts.

    Parameters
    ----------
    fasta_path : str
        FASTA file; record ids are used as transcript ids.
    rb_path : str
        Ribosome count file, passed to ``make_dataframe``.
    max_n_codons : int
        Transcripts with ``n_codons >= max_n_codons`` are dropped.
    count_norm : str
        Normalisation name forwarded to ``make_dataframe``.

    Returns
    -------
    pandas.DataFrame
        Columns: ``transcript``, ``norm_counts`` (list), ``codon_idx`` (list,
        ``position_A_site // 3``), ``gene``, ``sequence``, ``n_codons``,
        ``n_annot`` (number of annotated positions) and ``perc_annot``
        (``n_annot / n_codons``).
    """
    # Import FASTA
    with open(fasta_path, mode="r") as handle:
        fasta_records = [
            [record.id, str(record.seq)] for record in SeqIO.parse(handle, "fasta")
        ]

    # Create transcripts to sequences mapping
    df_trans_to_seq = pd.DataFrame(fasta_records, columns=["transcript", "sequence"])

    # Remove sequences that contain Ns
    sequence_has_n = df_trans_to_seq.sequence.str.contains("N", regex=False)
    df_trans_to_seq = df_trans_to_seq.loc[~sequence_has_n]

    # Number of codons in sequence
    df_trans_to_seq = df_trans_to_seq.assign(
        n_codons=lambda df: df.sequence.str.len() // 3
    )

    # Load the count data (the sequence column is not needed at this stage)
    counts_df = make_dataframe(
        ribo_fpath=rb_path,
        df_trans_to_seq=df_trans_to_seq.drop("sequence", axis=1),
        count_norm=count_norm,
    ).astype({col: "category" for col in ["transcript", "gene", "fname"]})

    # Average entries that share the same transcript and position
    per_position = (
        counts_df.groupby(["transcript", "position_A_site"], observed=True)
        .agg(dict(norm_counts="mean", gene="first"))
        .reset_index()
        .assign(codon_idx=lambda df: df.position_A_site // 3)
    )

    # Collapse to one row per transcript, keeping counts and codon indices as lists
    per_transcript = (
        per_position.groupby("transcript", observed=True)
        .agg(
            {
                "norm_counts": lambda x: x.tolist(),
                "codon_idx": lambda x: x.tolist(),
                "gene": "first",
            }
        )
        .reset_index()
        .merge(df_trans_to_seq)
    )

    # n_annot is the raw number of annotated positions; dividing it by n_codons
    # exactly once gives the annotated fraction. (Dividing in both columns would
    # normalise perc_annot by the sequence length twice.)
    per_transcript = per_transcript.assign(
        n_annot=lambda df: df.norm_counts.map(len)
    )
    per_transcript = per_transcript.assign(
        perc_annot=lambda df: df.n_annot / df.n_codons
    )

    # Filter by max sequence length
    per_transcript = per_transcript.query("n_codons<@max_n_codons")

    return per_transcript

# Convert a nucleotide sequence into the ids of its codons
def sequence2codonids(seq):
    """Return the codon ids of ``seq``, read in consecutive triplets from index 0.

    A trailing fragment shorter than three characters is ignored. Each triplet
    is looked up in ``codon_to_id``.
    """
    # Start positions of all complete triplets: the last one begins at len(seq) - 3
    triplet_starts = range(0, len(seq) - 2, 3)
    return [codon_to_id[seq[start:start + 3]] for start in triplet_starts]

def _parse_list_field(raw, cast):
    """Return ``raw`` as a list of ``cast`` values.

    ``raw`` may be the string form written to CSV (``"[1, 2, 3]"``) or an
    in-memory sequence of values; an empty list in either form yields ``[]``.
    """
    if isinstance(raw, str):
        inner = raw.strip()[1:-1]
        return [cast(token) for token in inner.split(',') if token.strip()]
    return [cast(value) for value in raw]


def process_merged_df(df):
    """Expand the sparse per-transcript counts into dense per-codon annotations.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``gene``, ``transcript``, ``sequence``,
        ``norm_counts`` and ``codon_idx``. The last two hold, per row, either a
        list or its string form ``"[...]"``.

    Returns
    -------
    pandas.DataFrame
        Columns ``gene``, ``transcript``, ``codon_sequence`` (codon ids from
        ``sequence2codonids``), ``annotations`` (one value per codon, 0.0 where
        no count is given) and ``perc_non_zero_annots`` (fraction of codons
        whose annotation is not 0.0; 0.0 for a sequence without any codon).
    """
    # Remove transcripts with N in the sequence (missing sequences are dropped too)
    has_n = df['sequence'].str.contains('N', regex=False, na=True).astype(bool)
    df = df[~has_n]

    records = []
    row_fields = zip(
        df['gene'], df['transcript'], df['sequence'], df['norm_counts'], df['codon_idx']
    )
    for gene, transcript, sequence, raw_counts, raw_codon_idx in row_fields:
        codon_ids = sequence2codonids(sequence)
        codon_positions = _parse_list_field(raw_codon_idx, int)
        counts = _parse_list_field(raw_counts, float)

        # Several positions can fall on the same codon index; the first
        # occurrence wins, as with list.index().
        count_by_codon = {}
        for position, value in zip(codon_positions, counts):
            count_by_codon.setdefault(position, value)

        annotations = [count_by_codon.get(j, 0.0) for j in range(len(codon_ids))]

        # Fraction of codons with a non-zero annotation
        if annotations:
            n_non_zero = sum(1 for value in annotations if value != 0.0)
            perc_non_zero = n_non_zero / len(annotations)
        else:
            perc_non_zero = 0.0

        records.append((gene, transcript, codon_ids, annotations, perc_non_zero))

    final_df = pd.DataFrame(
        records,
        columns=['gene', 'transcript', 'codon_sequence', 'annotations', 'perc_non_zero_annots'],
    )

    return final_df

In [None]:
# Locations of the raw data sets
DATA_FOLDER = '/net/lts2gdk0/mnt/scratch/lts2/nallapar/rb-prof/data/Jan_2024/Lina/'
LIVER_FOLDER = '/net/lts2gdk0/mnt/scratch/lts2/nallapar/rb-prof/data/Jan_2024/Liver/'

# Reference CDS FASTA below DATA_FOLDER
fa_path = DATA_FOLDER + '/reference/ensembl.cds.fa'

# Test-set CSV, and the folder holding one sub-folder of files per condition
test_data_path = '/nfs_home/nallapar/final/riboclette/riboclette/models/data/orig/test_0.3_NZ_20_PercNan_0.05.csv'
out_folder = '/net/lts2gdk0/mnt/scratch/lts2/nallapar/rb-prof/data/Jan_2024/cleaned_data/'

In [None]:
# Load the test set and keep its gene / transcript ids as plain lists
test_data = pd.read_csv(test_data_path)
genes_test_full, transcripts_test_full = (
    list(test_data[column]) for column in ('gene', 'transcript')
)

In [None]:
def annot_PCC_sim(x, y):
    """Pearson correlation of two equal-length vectors, ignoring NaN positions.

    Parameters
    ----------
    x, y : array-like of numbers
        Positions where either vector is NaN are removed from both before the
        correlation is computed.

    Returns
    -------
    float
        The Pearson correlation coefficient, or ``np.nan`` when fewer than two
        positions remain (the coefficient is undefined there and ``pearsonr``
        rejects such input).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # Keep only the positions that are valid in both vectors
    valid = ~(np.isnan(x) | np.isnan(y))
    x = x[valid]
    y = y[valid]

    if len(x) < 2:
        return np.nan
    return pearsonr(x, y)[0]
    

In [None]:
# Condition names; each one is used as a sub-folder name of out_folder
conditions = [
    'CTRL',
    'LEU',
    'ILE',
    'VAL',
    'LEU_ILE',
    'LEU_ILE_VAL',
]

In [None]:

# Settings for the replicate-split analysis
SPLIT_SEED = 0                # seed for the random splits drawn when there are more than 5 replicates
MAX_SPLITS = 5                # at most this many splits are evaluated per condition
PCC_OUT_DIR = "df_reps_sims/"  # per-split CSVs are written here


def _parse_annotations(raw):
    """Parse the CSV string form ``"[0.1, nan, ...]"`` into a list of floats."""
    return [float(token) for token in raw[1:-1].split(',')]


split_rng = np.random.default_rng(SPLIT_SEED)
os.makedirs(PCC_OUT_DIR, exist_ok=True)

for condition in conditions:
    # For every test transcript of this condition, split the replicates into two
    # groups, average the annotations inside each group and correlate the two means.
    print("Condition: ", condition)
    condition_dir = out_folder + condition + '/'
    # os.listdir returns entries in arbitrary order; sorting keeps the replicate
    # indices (and therefore the splits) stable between runs.
    ctrl_files = sorted(os.listdir(condition_dir))
    num_reps = len(ctrl_files)
    ctrl_dfs = []
    rep_annotations = []  # one {transcript: [float, ...]} dict per replicate

    for fname in ctrl_files:
        ctrl_df = pd.read_csv(condition_dir + fname)
        ctrl_df = ctrl_df[ctrl_df['gene'].isin(genes_test_full)]
        ctrl_df = ctrl_df[ctrl_df['transcript'].isin(transcripts_test_full)]
        ctrl_dfs.append(ctrl_df)

        # Parse every annotation string once. Only the first row of a transcript
        # is used; a non-string (missing) value counts as "replicate has no data".
        parsed = {}
        seen = set()
        for transcript, raw in zip(ctrl_df['transcript'], ctrl_df['annotations']):
            if transcript in seen:
                continue
            seen.add(transcript)
            if isinstance(raw, str):
                parsed[transcript] = _parse_annotations(raw)
        rep_annotations.append(parsed)

    # Transcripts with data in at least one replicate (sorted for a stable CSV row order)
    ctrl_genes = sorted(set().union(*rep_annotations), key=str)

    if num_reps < 2:
        print("Skipping " + condition + ": need at least 2 replicates, found", num_reps)
        continue

    # Choose the replicate subsets that form the first group of each split
    half = num_reps // 2
    if num_reps > 5:
        # too many combinations to enumerate: draw random subsets instead
        perms = [split_rng.permutation(num_reps)[:half] for _ in range(MAX_SPLITS)]
    else:
        perms = list(itertools.combinations(range(num_reps), half))
    perms = perms[:MAX_SPLITS]
    num_splits = len(perms)

    group1_full = [[int(j) for j in perm] for perm in perms]
    group2_full = [[j for j in range(num_reps) if j not in group] for group in group1_full]

    print("Split Groups")

    print(group1_full)
    print(group2_full)

    # One (num_reps x num_codons) matrix per transcript. Row r holds replicate r
    # (NaN-filled when that replicate lacks the transcript), so the split indices
    # address exactly the replicate they name.
    rep_matrix = {}
    for gene in ctrl_genes:
        num_codons = next(len(rep[gene]) for rep in rep_annotations if gene in rep)
        missing = [np.nan] * num_codons
        rep_matrix[gene] = np.array(
            [rep.get(gene, missing) for rep in rep_annotations], dtype=float
        )

    split_pcc = []
    for k in range(num_splits):
        group1 = group1_full[k]
        group2 = group2_full[k]

        # Mean annotation of each group, then their Pearson correlation
        gene_annotations = {}
        gene_PCC = {}
        for gene in ctrl_genes:
            reps = rep_matrix[gene]
            gene_annotations[gene] = [
                np.nanmean(reps[group1], axis=0),
                np.nanmean(reps[group2], axis=0),
            ]
            gene_PCC[gene] = annot_PCC_sim(gene_annotations[gene][0], gene_annotations[gene][1])

        # Save the per-transcript PCC values and average them for this split
        df = pd.DataFrame.from_dict(gene_PCC, orient='index', columns=['PCC'])
        df.to_csv(PCC_OUT_DIR + condition + '_split_' + str(k) + '_gene_PCC.csv')
        gene_PCC_values_ctrl = list(gene_PCC.values())
        gene_PCC_avg_ctrl = np.nanmean(gene_PCC_values_ctrl)

        split_pcc.append(gene_PCC_avg_ctrl)

        print("Split " + str(k) + " gene PCC average: ", gene_PCC_avg_ctrl)

    print(condition + " final split PCC average: ", np.mean(split_pcc))

In [None]:
# Test instances only
# LEU_ILE_VAL final split PCC average:  0.9170877492072175
# LEU_ILE final split PCC average:  0.924799669884757
# VAL final split PCC average:  0.9239766993445738
# ILE final split PCC average:  0.8620134134078624
# LEU final split PCC average:  0.9322250754557618
# CTRL final split PCC average:  0.9770054420545804

In [None]:
# LEU_ILE_VAL final split PCC average:  0.7500969373121209
# VAL final split PCC average:  0.7611953402793622
# ILE final split PCC average:  0.7049178068697376
# LEU final split PCC average:  0.7758710738503477
# LEU_ILE final split PCC average:  0.7666773313577476
# CTRL final split PCC average:  0.8370606904527795