In [1]:
import pandas as pd
import os

In [2]:
# Output directory for every table this notebook writes. The trailing slash is
# intentional: later cells build file paths by plain string concatenation.
out = '../results/ModelDatasets/'
# Create the directory (and any missing parents); an existing one is not an error.
os.makedirs(name=out, exist_ok=True)

## Processing feature tables

In [3]:
# Load the three input tables used by the rest of the notebook.

# Compiled per-transcript feature table. low_memory=False makes pandas read the
# file in one pass instead of chunk-wise dtype inference.
dfMain = pd.read_csv(
    '../data/out_compiled_tx_features.txt',
    sep='\t',
    low_memory=False,
)

# GeRM peak regions. The gene_id column is truncated at the first '.', i.e. the
# version suffix is dropped so that only the stable gene id remains.
dfGerms = pd.read_csv(
    '../data/2023_Faraway/GeRMregions.tsv',
    sep='\t',
).assign(gene_id=lambda germs: germs['gene_id'].str.split('.').str[0])

# Summed multivalency score per cluster per transcript, annotated with the
# gene_id that dfMain lists for each transcript_id. The left join keeps every
# score row even when the transcript is absent from dfMain (gene_id is NaN then).
# NOTE(review): assumes transcript_id is unique in dfMain; duplicates would
# multiply score rows -- TODO confirm.
dfMV = pd.read_csv(
    '../data/2023_Faraway/summed_multivalency_score_per_cluster_per_transcript.tsv',
    sep='\t',
).merge(
    dfMain[['transcript_id', 'gene_id']],
    on='transcript_id',
    how='left',
)

In [4]:
# Build three feature tables, each indexed by gene_id:
#   dfPeakNumber - number of GeRM peaks per gene and cluster_name
#   dfTxInfo     - transcript/CDS/exon length and count columns
#   dfPurineMv   - purine multivalency score, class and location columns
# (dfMV is not used in this cell.)

# Count rows of dfGerms per (gene_id, cluster_name) pair, in long format.
# reset_index(name=...) names the count column directly instead of renaming
# the implicit column 0 afterwards.
peak_counts_long = (
    dfGerms
    .groupby(['gene_id', 'cluster_name'])
    .size()
    .reset_index(name='peak_number')
)
# Reshape to wide format: one row per gene_id, one column per cluster_name.
dfPeakNumber = (
    peak_counts_long
    .pivot(index='gene_id', columns='cluster_name', values='peak_number')
    # Gene/cluster pairs without any peak become 0 (columns that had gaps are
    # float after the NaN fill).
    .fillna(0)
    # Genes with the most 'A-rich Pur.' peaks first. Non-inplace sort keeps the
    # whole construction a single expression; raises KeyError if that cluster
    # name is absent from dfGerms.
    .sort_values(by='A-rich Pur.', ascending=False)
)

# Transcript info. Column selection already yields a new frame, so the whole
# of dfMain no longer needs to be deep-copied first. Rows with a missing value
# in any selected column (including gene_id) are dropped.
tx_info_columns = [
    'gene_id',
    'cds_length',
    'tx_length',
    'longest_all_exons',
    'number_all_exons',
    'longest_coding_exon',
    'number_coding_exon',
]
dfTxInfo = dfMain[tx_info_columns].dropna().set_index('gene_id')

# Purine multivalency features on the same gene_id index. Unlike dfTxInfo,
# rows with missing values are kept here.
# NOTE(review): if dfMain holds several rows per gene, gene_id repeats in the
# index of dfTxInfo and dfPurineMv -- TODO confirm this is intended.
purine_mv_columns = [
    'gene_id',
    'purine_multivalency_score',
    'percentage_score_distant_from_junction',
    'purine_multivalency_class',
    'purine_multivalency_location',
]
dfPurineMv = dfMain[purine_mv_columns].set_index('gene_id')

In [5]:
# Write each feature table to the output directory as a tab-separated file.
# The index (gene_id) is written as the first column.
tables_to_write = (
    (dfPeakNumber, f'{out}2023-Faraway_NGeRMPeaks.tsv'),
    (dfTxInfo, f'{out}2023-Faraway_TranscriptInfo.tsv'),
    (dfPurineMv, f'{out}2023-Faraway_PurineMvFeatures.tsv'),
)
for table, destination in tables_to_write:
    table.to_csv(destination, sep='\t')