In [13]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.manifold import TSNE

In [8]:
# Build three tables (RNA abundance, ribosome abundance, ribosomes per RNA) with one
# row per gene and one column per GEO dataset, then convert each to a
# (datasets x genes) array for t-SNE.

# Totals that the per-gene values of each dataset are scaled to sum to.
# NOTE(review): presumably mRNAs per cell and ribosomes per cell -- confirm the source.
TOTAL_RNAS = 60000
TOTAL_RIBOS = 170000

genes = pd.read_csv('Data/sacCer3 genes.csv')[['name','length']]
genes.columns = ['ORF','length']
datasets = pd.read_csv('Data/Datasets.csv')
RNA_dats,Ribo_dats,RperR_dats = genes[['ORF']],genes[['ORF']],genes[['ORF']]
dataset_list,cluster_list = [],[]
for dataset in datasets['SeriesID']:
    #read in dataset
    this_set = pd.read_csv('Data/' + dataset + '.csv')
    #ignore datasets without RNA data
    if 'RNA_Prints' not in this_set.columns:
        continue
    #combine dataset with gene length information (only genes present in both are kept)
    this_set = genes.merge(this_set,how='inner',on='ORF')
    #length-normalise RNA footprints (per kb), then scale RNAs and ribosomes to fixed totals
    this_set['RNA_RPKM'] = this_set['RNA_Prints'] / (this_set['length'] / 1000)
    this_set['RNAs'] = this_set['RNA_RPKM'] / this_set['RNA_RPKM'].sum() * TOTAL_RNAS
    this_set['Ribos'] = this_set['Ribo_Prints'] / this_set['Ribo_Prints'].sum() * TOTAL_RIBOS
    #ribosomes per RNA; genes with zero RNA give inf/nan here and are dropped below
    this_set['RperR'] = this_set['Ribos'] / this_set['RNAs']
    #merge info for this dataset with all other datasets.
    #Each value column is renamed to the dataset ID first: merging identically named
    #columns over and over makes pandas append _x/_y suffixes, which collide from the
    #fourth dataset on (recent pandas versions reject that with a MergeError).
    RNA_dats = RNA_dats.merge(
        this_set[['ORF','RNAs']].rename(columns={'RNAs': dataset}),how='inner',on='ORF')
    Ribo_dats = Ribo_dats.merge(
        this_set[['ORF','Ribos']].rename(columns={'Ribos': dataset}),how='inner',on='ORF')
    RperR_dats = RperR_dats.merge(
        this_set[['ORF','RperR']].rename(columns={'RperR': dataset}),how='inner',on='ORF')
#remove rows (genes) with inf or nan in any dataset; each table is filtered independently
Ribo_dats = Ribo_dats.replace([np.inf,-np.inf],np.nan).dropna()
RNA_dats = RNA_dats.replace([np.inf,-np.inf],np.nan).dropna()
RperR_dats = RperR_dats.replace([np.inf,-np.inf],np.nan).dropna()
#drop the ORF column and transpose so that each row is one dataset, for use with tsne
Ribo_dats = np.transpose(np.array(Ribo_dats.iloc[:,1:]))
RNA_dats = np.transpose(np.array(RNA_dats.iloc[:,1:]))
RperR_dats = np.transpose(np.array(RperR_dats.iloc[:,1:]))


In [None]:
# Run t-SNE `repeats` times on each (datasets x genes) matrix and pool the 2-D
# coordinates of all runs, so the density of the pooled points can be plotted.
repeats = 50

# One list of per-run embeddings per table. Collecting in lists and concatenating once
# avoids re-copying the growing array on every np.append and removes the need for a
# dummy [[0,0]] first row that has to be stripped afterwards.
Ribo_runs,RNA_runs,RperR_runs = [],[],[]

for n in range(repeats):
    # random_state=n gives every repeat its own initialisation while keeping the
    # whole cell reproducible across kernel restarts.
    Ribo_tsne = TSNE(random_state=n).fit_transform(Ribo_dats)
    Ribo_runs.append(Ribo_tsne)
    RNA_tsne = TSNE(random_state=n).fit_transform(RNA_dats)
    RNA_runs.append(RNA_tsne)
    RperR_tsne = TSNE(random_state=n).fit_transform(RperR_dats)
    RperR_runs.append(RperR_tsne)


def _pool_runs(runs):
    """Stack per-run (n_points, 2) embeddings row-wise; (0, 2) array if there are no runs."""
    if not runs:
        return np.empty((0,2))
    # float64 keeps the dtype the int-seeded np.append accumulation used to yield
    return np.concatenate(runs,axis=0).astype(np.float64)


Ribos = _pool_runs(Ribo_runs)
RNAs = _pool_runs(RNA_runs)
RperRs = _pool_runs(RperR_runs)


In [None]:
# Three-panel figure: 2-D kernel density of the pooled t-SNE coordinates for
# RNAs, ribosomes and ribosomes-per-RNA (panel titled 'TE'), on shared axes.

# The style has to be set before the axes are created; it does not restyle existing axes.
sns.set_style("white")
fig,ax = plt.subplots(1,3,sharey=True,sharex=True,figsize=(9,3))

# (pooled coordinates, panel title) per panel; the titles are the ones that were
# previously only set on the commented-out scatter overlay lines.
panels = ((RNAs,'RNAs'),(Ribos,'Ribosomes'),(RperRs,'TE'))
for panel_ax,(panel_xy,panel_title) in zip(ax,panels):
    # x/y must be passed by keyword (positional vectors after `data` are rejected by
    # seaborn >= 0.12) and `fill` replaces the deprecated `shade` argument.
    sns.kdeplot(x=panel_xy[:,0],y=panel_xy[:,1],ax=panel_ax,cmap='Blues',fill=True)
    panel_ax.set_title(panel_title)

ax[0].set_ylabel('Dimension 2')
ax[1].set_xlabel('Dimension 1')
# Axes are shared, so limits and ticks set on the first panel apply to all three.
ax[0].set_xlim((-250,+250))
ax[0].set_ylim((-250,+250))
ax[0].set_xticks([0])
ax[0].set_yticks([0])
#plt.savefig('Dataset_Similarities.svg')
plt.show()