In [5]:
import GEOparse as geo
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import os
import sys
from sklearn.preprocessing import StandardScaler
import sklearn as sk
import scipy as sp

from tqdm import tqdm
from preprocess import *

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 500)

preprocess imported
module name : preprocess module package: 


***
# Loading sci-CAR-A549 Dataset
***

In [6]:
''' List the needed files for the analysis '''
sci_car_data_path = os.path.join(os.getcwd(), 'data', 'sci-CAR', 'GSE117089_RAW')
sci_car_files = []
for file in sorted(os.listdir(sci_car_data_path)):
    if not file.endswith('.gz') and not file == '.DS_Store':
        sci_car_files.append(file)

In [7]:
''' Load in all files as DataFrames, save in dictionary '''
sci_car_DEX_dict = {}
for f in tqdm(sci_car_files):
    if 'A549' in f:
        file_path = os.path.join(sci_car_data_path, f)  # get path of file 
#         print(file_path)
        s = f[:-4].split('_') # split data, remove .txt
        mod_name = f'{s[1]}_{s[2]}' # get modality name
        if 'count' in s: sep = ' '
        else: sep = ','
        df = pd.read_csv(file_path, sep=sep, header=0) #read df
        # add to dictionary
        if mod_name in sci_car_DEX_dict:
            sci_car_DEX_dict[mod_name][s[-1]] = df
        else:
            sci_car_DEX_dict[mod_name] = {s[-1]:df}
# data = pd.read_csv('output_list.txt', sep=" ", header=None)
print('Data loaded')

  interactivity=interactivity, compiler=compiler, result=result)
100%|██████████| 20/20 [00:05<00:00,  3.83it/s]

Data loaded





***
# Formatting RNA & ATAC Data Matrices
***

## Creating RNA Data Matrix

In [8]:
''' Make dataframe for RNA-sciCAR'''
# Inspect the RNA-sciCAR data
sci_RNA_dict = sci_car_DEX_dict['RNA_sciCAR']

cell, counts = sci_RNA_dict['cell'], sci_RNA_dict['count']

# add matrix column to gene counts
matrix_col = pd.RangeIndex(start=1, stop=6094)
cell['matrix'] = matrix_col

# Join cells and genes 
sci_RNA = pd.merge(cell, counts)

# Only select A549 cells 
sci_RNA = sci_RNA[sci_RNA['cell_name'] == 'A549']
sci_RNA = sci_RNA.drop(['integer', 'general'], axis=1)

# Now add gene_ids
sci_RNA = sci_RNA.rename(columns={'%%MatrixMarket': 'gene_ix'})
genes_dir = sci_car_DEX_dict['RNA_sciCAR']['gene']
genes_dir['gene_ix'] = pd.RangeIndex(start=1, stop=113154)
sci_RNA = pd.merge(sci_RNA, genes_dir)

In [9]:
'Plcxd3' in sci_car_DEX_dict['RNA_sciCAR']['gene']['gene_short_name'].values
# sci_car_DEX_dict['RNA_sciCAR']['gene']['gene_short_name']

True

In [10]:
''' Reformatting to get gene_ix and coordinate as features'''
sci_RNA_Mat = sci_RNA.copy(deep=True)
# Uses Pivot table to get each gene_ix as a new column/feature, value is coordinate value
sci_RNA_Mat = sci_RNA_Mat.pivot_table('coordinate', ['sample', 'matrix', 'treatment_time'], 'gene_ix')

# Ugly workaround to flatten the pivot-table and get back to a dataFrame
sci_RNA_Matrix = pd.DataFrame(sci_RNA_Mat.to_records())

In [11]:
'''Handling NaNs and Standardizing the Data'''
# Large value outlier?? - Coordinate value of 9251101 for matrix 6093, MatrixMarket113153
# Setting NaNs to 0
sci_RNA_Matrix.fillna(0, inplace=True)

# Standardizing all Gene_ix columns
stdScaler = StandardScaler()
sci_RNA_Matrix[sci_RNA_Matrix.columns[3:]] = stdScaler.fit_transform(sci_RNA_Matrix[sci_RNA_Matrix.columns[3:]])

# Adding prefix to feature columns
sci_RNA_Matrix.columns = ["rna_" + x if x not in sci_RNA_Matrix.columns[:3] else x for x in sci_RNA_Matrix.columns]

In [13]:
sci_RNA_Matrix["treatment_time"].value_counts()

3.0    1609
1.0    1491
0.0    1177
Name: treatment_time, dtype: int64

## Creating ATAC Data Matrix

In [196]:
''' Make dataframe for ATAC-sciCAR'''
sci_ATAC_dict = sci_car_DEX_dict['ATAC_sciCAR']

peak, count = sci_ATAC_dict['peak'], sci_ATAC_dict['count']

# count joins with cell using matrix and id
cell = sci_ATAC_dict['cell']

cell['matrix'] = pd.RangeIndex(start=1, stop=6086)
if 'mat' in cell.columns:
    cell = cell.drop('mat',axis=1)

# merge cells with counts
sci_ATAC = pd.merge(cell, count)

# merge with peaks
sci_ATAC = pd.merge(sci_ATAC, peak, left_on='%%MatrixMarket', right_on='id')
sci_ATAC = sci_ATAC[sci_ATAC['group'] != '293T_3T3']
sci_ATAC = sci_ATAC.drop(['real', 'general'],axis=1)

# split columns, drop group
sci_ATAC = split_column(sci_ATAC, col_to_split='group', col1_name='cell_name', col2_name='treatment_time', col1_ix=3, col2_ix=4)
if 'group' in sci_ATAC.columns:
    sci_ATAC = sci_ATAC.drop('group', axis=1) #drop if there

In [197]:
''' Reformatting to get gene_ix and coordinate as features'''
sci_ATAC_Mat = sci_ATAC.copy(deep=True)
# Uses Pivot table to get each %%MatrixMarket as a new column/feature, value is coordinate value
sci_ATAC_Mat = sci_ATAC_Mat.pivot_table('coordinate', ['sample', 'matrix', 'treatment_time'], '%%MatrixMarket')

# Ugly workaround to flatten the pivot-table and get back to a dataFrame
sci_ATAC_Matrix = pd.DataFrame(sci_ATAC_Mat.to_records())

In [None]:
'''Handling NaNs and Standardizing the Data'''
# Large value outlier?? - Coordinate value of 9251101 for matrix 6093, MatrixMarket113153
# Setting NaNs to 0
sci_ATAC_Matrix.fillna(0, inplace=True)

# Standardizing all Gene_ix columns
stdScaler = StandardScaler()
sci_ATAC_Matrix[sci_ATAC_Matrix.columns[3:]] = stdScaler.fit_transform(sci_ATAC_Matrix[sci_ATAC_Matrix.columns[3:]])

# Adding prefix to feature columns
sci_ATAC_Matrix.columns = ["atac_" + x if x not in sci_ATAC_Matrix.columns[:3] else x for x in sci_ATAC_Matrix.columns]

In [None]:
sci_ATAC["treatment_time"].value_counts()

## Creating Bimodal Data Matrix - Early Fusion

In [17]:
# Checking Intersection between the two datasets

# Matrix Elements in RNA not in ATAC
print("Number of elements in RNA Data that aren't in ATAC by Matrix: {}".format(len(set(sci_RNA_Matrix["matrix"].values) - set(sci_ATAC_Matrix["matrix"].values))))
# Matrix Elements in ATAC not in RNA
print("Number of elements in ATAC Data that aren't in RNA by Matrix: {}".format(len(set(sci_ATAC_Matrix["matrix"].values) - set(sci_RNA_Matrix["matrix"].values))))
# Matrix Total Intersection of Elements
print("Number of elements in both, by Matrix: {}".format(len(set(sci_ATAC_Matrix["matrix"].values) & set(sci_RNA_Matrix["matrix"].values))))

# Sample Elements in RNA not in ATAC
print("\nNumber of elements in RNA Data that aren't in ATAC by Sample: {}".format(len(set(sci_RNA_Matrix["sample"].values) - set(sci_ATAC_Matrix["sample"].values))))
# Sample Elements in ATAC not in RNA
print("Number of elements in ATAC Data that aren't in RNA by Sample: {}".format(len(set(sci_ATAC_Matrix["sample"].values) - set(sci_RNA_Matrix["sample"].values))))
# Sample Total Intersection of Elements
print("Number of elements in both, by Sample: {}".format(len(set(sci_ATAC_Matrix["sample"].values) & set(sci_RNA_Matrix["sample"].values))))


Number of elements in RNA Data that aren't in ATAC by Matrix: 1323
Number of elements in ATAC Data that aren't in RNA by Matrix: 1304
Number of elements in both, by Matrix: 2954

Number of elements in RNA Data that aren't in ATAC by Sample: 1017
Number of elements in ATAC Data that aren't in RNA by Sample: 998
Number of elements in both, by Sample: 3260


In [18]:
# Merging Datframes based on sample
# pd.concat([sci_RNA_Matrix, sci_ATAC_Matrix], axis=)
biModal_Matrix = sci_RNA_Matrix.merge(sci_ATAC_Matrix, how="inner", on="sample")

# Dropping columns matrix_y, and treatment_time_y
biModal_Matrix.drop(['matrix_y', 'treatment_time_y'], axis=1, inplace=True);

## Saving the DataFrames as CSVs and HDFs

In [19]:
# Minimizing RNA Matrix
rnaColDict = {k:"float16" for k in sci_RNA_Matrix.columns[3:]}
rnaColDict["treatment_time"] = 'int'
sci_RNA_Matrix = sci_RNA_Matrix.astype(rnaColDict)

# Minimizing ATAC Matrix
atacColDict = {k:"float16" for k in sci_ATAC_Matrix.columns[3:]}
atacColDict["treatment_time"] = 'int'
sci_ATAC_Matrix["treatment_time"] = sci_ATAC_Matrix["treatment_time"].str.rstrip('h')
sci_ATAC_Matrix = sci_ATAC_Matrix.astype(atacColDict)

# Minimizing Bimodal Matrix
biModalDict = {k:"float16" for k in biModal_Matrix.columns[3:]}
biModalDict["treatment_time_x"] = 'int'
biModal_Matrix = biModal_Matrix.astype(biModalDict)

In [20]:
# sci_RNA_Matrix.to_csv("rnaMatrix.csv",
#                      compression = 'gzip')
# sci_ATAC_Matrix.to_csv("atacMatrix.csv",
#                       compression = 'gzip')
# biModal_Matrix.to_csv("bimodalEarlyFusionMatrix.csv",
#                      compression = 'gzip')

In [21]:
sci_RNA_Matrix.to_hdf("rnaMatrix1",
                      key = 'stage',
                     mode='w')
sci_ATAC_Matrix.to_hdf("atacMatrix1",
                       key = 'stage',
                      mode='w')
biModal_Matrix.to_hdf("bimodalEarlyFusionMatrix1",
                      key = 'stage',
                     mode='w')

***
# Using Data-Clean Up and Feature Selection Methods from Paper
***

## Simple Clean-up from Supplementary Methods Section


### scRNA
* Remove Genes expressed in less than 10 cells
* Remove Cells with expression counts lower than 500
* Remove Cells with expression counts higher than 9100

### scATAC
* Remove Loci present in fewer than 5 cells
* Remove Cells with fewer than 200 accessible loci

Open Question: If cell fails in one modality, should it be removed from the other? Probably otherwise those datasets have more information than what's goin into the bimodal. Will probably have to go back to fix above

Should result in 2641 cells for use in downstream analysis

## scRNA Basic Cleanup

In [245]:
''' Make dataframe for RNA-sciCAR'''
# Inspect the RNA-sciCAR data
sci_RNA_dict = sci_car_DEX_dict['RNA_sciCAR']

cell, counts = sci_RNA_dict['cell'], sci_RNA_dict['count']

# add matrix column to gene counts
matrix_col = pd.RangeIndex(start=1, stop=6094)
cell['matrix'] = matrix_col

# Join cells and genes 
sci_RNA = pd.merge(cell, counts)

# Only select A549 cells 
sci_RNA = sci_RNA[sci_RNA['cell_name'] == 'A549']
sci_RNA = sci_RNA.drop(['integer', 'general'], axis=1)

# Now add gene_ids
sci_RNA = sci_RNA.rename(columns={'%%MatrixMarket': 'gene_ix'})
genes_dir = sci_car_DEX_dict['RNA_sciCAR']['gene']
genes_dir['gene_ix'] = pd.RangeIndex(start=1, stop=113154)
sci_RNA = pd.merge(sci_RNA, genes_dir)

In [246]:
''' Reformatting to get gene_ix and coordinate as features'''
sci_RNA_Mat = sci_RNA.copy(deep=True)
# Uses Pivot table to get each gene_ix as a new column/feature, value is coordinate value
sci_RNA_Mat = sci_RNA_Mat.pivot_table('coordinate', ['sample', 'matrix', 'treatment_time'], 'gene_ix')

# Ugly workaround to flatten the pivot-table and get back to a dataFrame
sci_RNA_Matrix = pd.DataFrame(sci_RNA_Mat.to_records())

In [247]:
'''Handling NaNs and Standardizing the Data'''
# Large value outlier?? - Coordinate value of 9251101 for matrix 6093, MatrixMarket113153
# Setting NaNs to 0
sci_RNA_Matrix.fillna(0, inplace=True)

# # Standardizing all Gene_ix columns
# stdScaler = StandardScaler()
# sci_RNA_Matrix[sci_RNA_Matrix.columns[3:]] = stdScaler.fit_transform(sci_RNA_Matrix[sci_RNA_Matrix.columns[3:]])

# Adding prefix to feature columns
sci_RNA_Matrix.columns = ["rna_" + x if x not in sci_RNA_Matrix.columns[:3] else x for x in sci_RNA_Matrix.columns]

In [248]:
rnaCellsToRemove = []
rnaGenesToRemove = []

# Collecting Cells/Row indices to remove based on total counts across all Genes
for cell in range(sci_RNA_Matrix.shape[0]):
    
    # Testing R Code -  Using the cutoff of cells with fewer than 200 expressed genes
    cellCount = sci_RNA_Matrix.iloc[cell,3:][sci_RNA_Matrix.iloc[cell,3:] > 0.0].count()
    if cellCount < 200:
#         print(sci_RNA_Matrix.iloc[cell,3:][sci_RNA_Matrix.iloc[cell,3:] > 0.0].count())
        rnaCellsToRemove.append(cell)

    
    # This method should probably be used for UMI counts, and these cutoffs
#     # Method that evaluates sum of values across all genes
#     if sci_RNA_Matrix.iloc[cell,3:].sum() < 500 or sci_RNA_Matrix.iloc[cell,3:].sum() > 9100:
# #         print(sci_RNA_Matrix.iloc[i,3:].sum())
#         rnaCellsToRemove.append(cell)

    # This method should be used for the total number of genes expressed by cells
#     # Method that evaluates counts of genes that are > 0.0
#     cellCount = sci_RNA_Matrix.iloc[cell,3:][sci_RNA_Matrix.iloc[cell,3:] > 0.0].count()
#     if cellCount < 500 or cellCount > 9100:
# #         print(sci_RNA_Matrix.iloc[cell,3:][sci_RNA_Matrix.iloc[cell,3:] > 0.0].count())
#         rnaCellsToRemove.append(cell)


# Collecting Genes/Column indices to remove based on total counts across all Genes
for gene in range(sci_RNA_Matrix.shape[1] - 3):
        
    #Testing R code - Using Min Cells of 1
    if (sci_RNA_Matrix.iloc[:, gene+3][sci_RNA_Matrix.iloc[:, gene+3] > 0.0].count() < 1):
#         print(sci_RNA_Matrix.iloc[:, gene+3][sci_RNA_Matrix.iloc[:, gene+3] > 0.0].count())
        rnaGenesToRemove.append(gene+3)
    
    
#     # Method that evaluates sum of values across all cells
#     if (sci_RNA_Matrix.iloc[:, gene+3].sum() < 10):
# #         print(sci_RNA_Matrix.iloc[:, gene+3].sum())
#         rnaGenesToRemove.append(gene+3)
    
    # Method that evaluates counts of cells that are > 0.0
#     if (sci_RNA_Matrix.iloc[:, gene+3][sci_RNA_Matrix.iloc[:, gene+3] > 0.0].count() < 10):
# #         print(sci_RNA_Matrix.iloc[:, gene+3][sci_RNA_Matrix.iloc[:, gene+3] > 0.0].count())
#         rnaGenesToRemove.append(gene+3)
    

In [249]:
# Remove the columns and rows based on indices collected

# Removing Genes/Columns
sci_RNA_Matrix_clean = sci_RNA_Matrix.drop(sci_RNA_Matrix.columns[rnaGenesToRemove], axis=1)

# Removing the Cells/Rows
sci_RNA_Matrix_clean.drop(rnaCellsToRemove, axis=0, inplace=True)
sci_RNA_Matrix_clean.reset_index();

## scATAC Basic Cleanup

In [250]:
''' Make dataframe for ATAC-sciCAR'''
sci_ATAC_dict = sci_car_DEX_dict['ATAC_sciCAR']

peak, count = sci_ATAC_dict['peak'], sci_ATAC_dict['count']

# count joins with cell using matrix and id
cell = sci_ATAC_dict['cell']

cell['matrix'] = pd.RangeIndex(start=1, stop=6086)
if 'mat' in cell.columns:
    cell = cell.drop('mat',axis=1)

# merge cells with counts
sci_ATAC = pd.merge(cell, count)

# merge with peaks
sci_ATAC = pd.merge(sci_ATAC, peak, left_on='%%MatrixMarket', right_on='id')
sci_ATAC = sci_ATAC[sci_ATAC['group'] != '293T_3T3']
sci_ATAC = sci_ATAC.drop(['real', 'general'],axis=1)

# split columns, drop group
sci_ATAC = split_column(sci_ATAC, col_to_split='group', col1_name='cell_name', col2_name='treatment_time', col1_ix=3, col2_ix=4)
if 'group' in sci_ATAC.columns:
    sci_ATAC = sci_ATAC.drop('group', axis=1) #drop if there

In [251]:
''' Reformatting to get gene_ix and coordinate as features'''
sci_ATAC_Mat = sci_ATAC.copy(deep=True)
# Uses Pivot table to get each %%MatrixMarket as a new column/feature, value is coordinate value
sci_ATAC_Mat = sci_ATAC_Mat.pivot_table('coordinate', ['sample', 'matrix', 'treatment_time'], '%%MatrixMarket')

# Ugly workaround to flatten the pivot-table and get back to a dataFrame
sci_ATAC_Matrix = pd.DataFrame(sci_ATAC_Mat.to_records())

In [252]:
'''Handling NaNs and Standardizing the Data'''
# Large value outlier?? - Coordinate value of 9251101 for matrix 6093, MatrixMarket113153
# Setting NaNs to 0
sci_ATAC_Matrix.fillna(0, inplace=True)

# # Standardizing all Gene_ix columns
# stdScaler = StandardScaler()
# sci_ATAC_Matrix[sci_ATAC_Matrix.columns[3:]] = stdScaler.fit_transform(sci_ATAC_Matrix[sci_ATAC_Matrix.columns[3:]])

# Adding prefix to feature columns
sci_ATAC_Matrix.columns = ["atac_" + x if x not in sci_ATAC_Matrix.columns[:3] else x for x in sci_ATAC_Matrix.columns]

In [253]:
atacCellsToRemove = []
atacLociToRemove = []

# Collecting Cells/Row indices to remove based on total counts across all Genes
for cell in range(sci_ATAC_Matrix.shape[0]):
    
    
    # Testing R Code -  Using the cutoff of cells with fewer than 200 expressed genes
    cellCount = sci_ATAC_Matrix.iloc[cell,3:][sci_ATAC_Matrix.iloc[cell,3:] > 0.0].count()
    if cellCount < 200:
#         print(sci_RNA_Matrix.iloc[cell,3:][sci_RNA_Matrix.iloc[cell,3:] > 0.0].count())
        rnaCellsToRemove.append(cell)

    
#     # Method that evaluates sum of values across all genes
#     cellCount = sci_ATAC_Matrix.iloc[cell,3:].sum()
#     if cellCount < 500 or cellCount > 9100:
# #         print(sci_ATAC_Matrix.iloc[i,3:].sum())
#         atacCellsToRemove.append(cell)

    
    # Method that evaluates counts of genes that are > 0.0
    cellCount = sci_ATAC_Matrix.iloc[cell,3:][sci_ATAC_Matrix.iloc[cell,3:] > 0.0].count()
    if cellCount < 200:
#         print(sci_RNA_Matrix.iloc[cell,3:][sci_RNA_Matrix.iloc[cell,3:] > 0.0].count())
        atacCellsToRemove.append(cell)


# Collecting Locus/Column indices to remove based on total counts across all Genes
for locus in range(sci_ATAC_Matrix.shape[1] - 3):
    
    #Testing R code - Using Min Cells of 1
    if (sci_ATAC_Matrix.iloc[:, locus+3][sci_ATAC_Matrix.iloc[:, locus+3] > 0.0].count() < 1):
#         print(sci_RNA_Matrix.iloc[:, gene+3][sci_RNA_Matrix.iloc[:, gene+3] > 0.0].count())
        rnaGenesToRemove.append(locus+3)
    
# #     Method that evaluates sum of values across all cells
#     if (sci_ATAC_Matrix.iloc[:, locus+3].sum() < 10):
# #         print(sci_ATAC_Matrix.iloc[:, locus+3].sum())
#         atacLociToRemove.append(locus+3)
    
# #     Method that evaluates counts of cells that are > 0.0
#     if (sci_ATAC_Matrix.iloc[:, locus+3][sci_ATAC_Matrix.iloc[:, locus+3] > 0.0].count() < 5):
# #         print(sci_ATAC_Matrix.iloc[:, locus+3][sci_ATAC_Matrix.iloc[:, locus+3] > 0.0].count())
#         atacLociToRemove.append(locus+3)
    

In [254]:
# Remove the columns and rows based on indices collected
# Removing Lic/Columns
sci_ATAC_Matrix_clean = sci_ATAC_Matrix.drop(sci_ATAC_Matrix.columns[atacLociToRemove], axis=1)

# Removing the Cells/Rows
sci_ATAC_Matrix_clean.drop(atacCellsToRemove, axis=0, inplace=True)
sci_ATAC_Matrix_clean.reset_index();


## Data Normalization and Creating Bimodal Data Matrix - Early Fusion

In [255]:
# Checking Intersection between the two datasets

# Matrix Elements in RNA not in ATAC
print("Number of elements in RNA Data that aren't in ATAC by Matrix: {}".format(len(set(sci_RNA_Matrix_clean["matrix"].values) - set(sci_ATAC_Matrix_clean["matrix"].values))))
# Matrix Elements in ATAC not in RNA
print("Number of elements in ATAC Data that aren't in RNA by Matrix: {}".format(len(set(sci_ATAC_Matrix_clean["matrix"].values) - set(sci_RNA_Matrix_clean["matrix"].values))))
# Matrix Total Intersection of Elements
print("Number of elements in both, by Matrix: {}".format(len(set(sci_ATAC_Matrix_clean["matrix"].values) & set(sci_RNA_Matrix_clean["matrix"].values))))

# Sample Elements in RNA not in ATAC
print("\nNumber of elements in RNA Data that aren't in ATAC by Sample: {}".format(len(set(sci_RNA_Matrix_clean["sample"].values) - set(sci_ATAC_Matrix_clean["sample"].values))))
# Sample Elements in ATAC not in RNA
print("Number of elements in ATAC Data that aren't in RNA by Sample: {}".format(len(set(sci_ATAC_Matrix_clean["sample"].values) - set(sci_RNA_Matrix_clean["sample"].values))))
# Sample Total Intersection of Elements
print("Number of elements in both, by Sample: {}".format(len(set(sci_ATAC_Matrix_clean["sample"].values) & set(sci_RNA_Matrix_clean["sample"].values))))


Number of elements in RNA Data that aren't in ATAC by Matrix: 2557
Number of elements in ATAC Data that aren't in RNA by Matrix: 790
Number of elements in both, by Matrix: 1720

Number of elements in RNA Data that aren't in ATAC by Sample: 2285
Number of elements in ATAC Data that aren't in RNA by Sample: 518
Number of elements in both, by Sample: 1992


In [256]:
# Modifying Unimodal Dataset so only cells in both are kept

# sci_RNA_Matrix_clean 
sci_RNA_Matrix_clean = sci_RNA_Matrix_clean[sci_RNA_Matrix_clean["sample"].isin(sci_ATAC_Matrix_clean["sample"])]
sci_ATAC_Matrix_clean = sci_ATAC_Matrix_clean[sci_ATAC_Matrix_clean["sample"].isin(sci_RNA_Matrix_clean["sample"])]

sci_RNA_Matrix_clean.reset_index(drop=True, inplace=True)
sci_ATAC_Matrix_clean.reset_index(drop=True, inplace=True);

# Make Test values here
sci_RNA_Matrix_clean_test = sci_RNA_Matrix_clean.copy(deep=True)
sci_ATAC_Matrix_clean_test = sci_ATAC_Matrix_clean.copy(deep=True)


In [257]:
# Creates a column with the row-sum for ease of later calculations
sci_RNA_Matrix_clean_test ["sum"] = sci_RNA_Matrix_clean_test .iloc[:,3:].sum(axis=1)
sci_ATAC_Matrix_clean_test["sum"] = sci_ATAC_Matrix_clean_test.iloc[:,3:].sum(axis=1)

# Divides each element in the row by the row sum, multiplies each value by 10,000
# Also adds a 1 to each value as a pseudocount before log transformation
sci_RNA_Matrix_clean_test .iloc[:, 3:-1] = np.log(sci_RNA_Matrix_clean_test .iloc[:, 3:-1].div(sci_RNA_Matrix_clean_test["sum"], axis=0)*10000 + 1)
sci_ATAC_Matrix_clean_test .iloc[:, 3:-1] = np.log(sci_ATAC_Matrix_clean_test .iloc[:, 3:-1].div(sci_ATAC_Matrix_clean_test["sum"], axis=0)*10000 + 1)

# Drop the Sum Column
sci_RNA_Matrix_clean_test.drop(columns=["sum"], inplace=True)
sci_ATAC_Matrix_clean_test.drop(columns=["sum"], inplace=True)


In [258]:
# Merging Datframes based on sample
# pd.concat([sci_RNA_Matrix, sci_ATAC_Matrix], axis=)
biModal_Matrix_clean = sci_RNA_Matrix_clean_test.merge(sci_ATAC_Matrix_clean_test, how="inner", on="sample")

# Dropping columns matrix_y, and treatment_time_y
biModal_Matrix_clean.drop(['matrix_y', 'treatment_time_y'], axis=1, inplace=True);

In [274]:
biModal_Matrix_clean

Unnamed: 0,sample,matrix_x,treatment_time_x,rna_2,rna_8,rna_11,rna_12,rna_15,rna_18,rna_20,rna_25,rna_31,rna_33,rna_35,rna_36,rna_37,rna_38,rna_39,rna_40,rna_41,rna_50,rna_51,rna_52,rna_54,rna_56,rna_57,rna_59,rna_60,rna_61,rna_62,rna_64,rna_65,rna_67,rna_68,rna_69,rna_70,rna_71,rna_72,rna_74,rna_75,rna_76,rna_78,rna_81,rna_86,rna_87,rna_88,rna_89,rna_90,rna_91,rna_92,rna_93,rna_94,rna_95,rna_96,rna_97,rna_98,rna_99,rna_100,rna_101,rna_102,rna_103,rna_104,rna_105,rna_106,rna_108,rna_109,rna_110,rna_111,rna_112,rna_114,rna_115,rna_116,rna_117,rna_118,rna_119,rna_120,rna_121,rna_122,rna_124,rna_125,rna_126,rna_127,rna_128,rna_130,rna_131,rna_132,rna_133,rna_134,rna_135,rna_136,rna_137,rna_138,rna_139,rna_140,rna_141,rna_142,rna_143,rna_145,rna_146,rna_147,rna_149,rna_150,rna_151,rna_152,rna_153,rna_154,rna_155,rna_156,rna_157,rna_158,rna_159,rna_160,rna_161,rna_162,rna_163,rna_164,rna_165,rna_166,rna_167,rna_168,rna_169,rna_171,rna_172,rna_173,rna_174,rna_176,rna_179,rna_180,rna_182,rna_188,rna_191,rna_192,rna_193,rna_196,rna_197,rna_198,rna_199,rna_200,rna_201,rna_202,rna_204,rna_207,rna_208,rna_210,rna_211,rna_212,rna_214,rna_215,rna_217,rna_218,rna_222,rna_225,rna_227,rna_229,rna_230,rna_231,rna_232,rna_233,rna_234,rna_236,rna_237,rna_238,rna_239,rna_240,rna_243,rna_244,rna_245,rna_247,rna_248,rna_249,rna_250,rna_251,rna_252,rna_253,rna_254,rna_256,rna_257,rna_258,rna_262,rna_264,rna_266,rna_267,rna_268,rna_269,rna_270,rna_271,rna_272,rna_274,rna_276,rna_277,rna_278,rna_279,rna_281,rna_282,rna_283,rna_285,rna_286,rna_287,rna_288,rna_289,rna_291,rna_292,rna_293,rna_294,rna_295,rna_296,rna_297,rna_298,rna_299,rna_301,rna_302,rna_303,rna_304,rna_306,rna_307,rna_309,rna_311,rna_312,rna_313,rna_315,rna_318,rna_319,rna_320,rna_321,rna_323,rna_324,rna_325,rna_327,rna_328,rna_329,rna_330,rna_331,rna_333,rna_334,rna_335,rna_336,rna_337,rna_338,rna_341,rna_342,rna_343,rna_345,rna_346,rna_347,rna_348,rna_349,rna_350,rna_351,rna_352,rna_353,...,atac_189349,atac_189350,atac_189351,atac_189352,atac_189353,atac_189354,atac_189355,atac_189356,atac_189357,atac_189358,atac_189359,atac_189360,atac_189361,atac_189362,atac_189363,atac_189364,atac_189365,atac_189366,atac_189367,atac_189368,atac_189370,atac_189372,atac_189373,atac_189374,atac_189375,atac_189376,atac_189377,atac_189378,atac_189379,atac_189380,atac_189381,atac_189382,atac_189383,atac_189384,atac_189385,atac_189386,atac_189387,atac_189388,atac_189389,atac_189391,atac_189392,atac_189393,atac_189394,atac_189395,atac_189396,atac_189397,atac_189398,atac_189399,atac_189400,atac_189401,atac_189402,atac_189403,atac_189404,atac_189405,atac_189406,atac_189407,atac_189408,atac_189409,atac_189410,atac_189412,atac_189413,atac_189414,atac_189415,atac_189416,atac_189417,atac_189418,atac_189419,atac_189420,atac_189421,atac_189422,atac_189423,atac_189424,atac_189425,atac_189426,atac_189427,atac_189428,atac_189429,atac_189430,atac_189431,atac_189432,atac_189433,atac_189434,atac_189435,atac_189436,atac_189437,atac_189438,atac_189439,atac_189440,atac_189441,atac_189442,atac_189443,atac_189444,atac_189445,atac_189446,atac_189447,atac_189448,atac_189449,atac_189450,atac_189451,atac_189452,atac_189453,atac_189454,atac_189455,atac_189456,atac_189457,atac_189458,atac_189459,atac_189460,atac_189461,atac_189462,atac_189463,atac_189464,atac_189465,atac_189466,atac_189467,atac_189468,atac_189469,atac_189470,atac_189471,atac_189472,atac_189473,atac_189474,atac_189475,atac_189476,atac_189477,atac_189478,atac_189480,atac_189481,atac_189482,atac_189483,atac_189484,atac_189485,atac_189486,atac_189487,atac_189488,atac_189489,atac_189490,atac_189491,atac_189492,atac_189493,atac_189494,atac_189495,atac_189496,atac_189497,atac_189498,atac_189499,atac_189500,atac_189501,atac_189502,atac_189503,atac_189504,atac_189505,atac_189506,atac_189507,atac_189508,atac_189509,atac_189510,atac_189511,atac_189512,atac_189513,atac_189514,atac_189515,atac_189516,atac_189517,atac_189518,atac_189519,atac_189520,atac_189521,atac_189522,atac_189523,atac_189524,atac_189525,atac_189526,atac_189527,atac_189528,atac_189529,atac_189530,atac_189531,atac_189532,atac_189533,atac_189534,atac_189535,atac_189536,atac_189537,atac_189538,atac_189539,atac_189540,atac_189541,atac_189542,atac_189543,atac_189544,atac_189545,atac_189546,atac_189547,atac_189548,atac_189549,atac_189550,atac_189551,atac_189552,atac_189553,atac_189554,atac_189555,atac_189556,atac_189557,atac_189558,atac_189559,atac_189560,atac_189561,atac_189562,atac_189563,atac_189564,atac_189565,atac_189566,atac_189567,atac_189568,atac_189569,atac_189570,atac_189571,atac_189572,atac_189573,atac_189574,atac_189575,atac_189576,atac_189577,atac_189578,atac_189579,atac_189580,atac_189581,atac_189582,atac_189583,atac_189584,atac_189585,atac_189586,atac_189587,atac_189588,atac_189589,atac_189590,atac_189591,atac_189592,atac_189593,atac_189594,atac_189595,atac_189596,atac_189597,atac_189598,atac_189599,atac_189600,atac_189601,atac_189602,atac_189603
0,sci-RNA-A-001.AAGTACGTTA,2,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,1.870492,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,1.870492,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.860741,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,1.870492,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,sci-RNA-A-001.CGTATTGAGA,16,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.190850,0.0,0.0,0.0,0.0,0.0,3.489552,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.190850,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,2.190850,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,3.211990,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,3.211990,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.826463,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,2.190850,0.000000,0.0,0.0,0.0,0.0,0.0,2.190850,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.337741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,sci-RNA-A-001.GACCAATGCG,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.468026,0.0,0.000000,2.468026,0.0,0.0,0.000000,2.468026,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.117873,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,2.468026,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.468026,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.468026,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,3.508478,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,sci-RNA-A-001.TCTCTCATCC,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.291769,1.291769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.823479,0.0,0.0,0.0,0.0,0.0,1.291769,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.291769,0.0,1.291769,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.837121,0.000000,0.0,0.000000,0.0,1.291769,0.0,0.0,1.291769,0.0,0.0,0.0,0.0,0.0,0.0,2.188033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,1.291769,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.837121,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.291769,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,1.291769,0.0,0.0,0.0,0.0,0.0,1.837121,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,sci-RNA-A-002.AAGTACGTTA,18,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,2.058039,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,2.058039,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.485076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.485076,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,1.485076,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,1.485076,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.343653,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,1.485076,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.485076,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,2.420004,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1987,sci-RNA-E-095.GGCATCTACC,6069,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,1.601486,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.188385,1.601486,0.0,0.000000,0.0,0.000000,0.0,0.0,2.188385,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,2.823851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.601486,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,1.601486,0.0,0.0,0.0,0.0,2.188385,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1988,sci-RNA-E-096.AATCGAACTC,6087,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.547767,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.547767,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,2.128461,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.128461,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1.547767,0.0,0.0,0.0,1.547767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,2.493444,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,2.128461,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.760254,1.547767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.637839,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,1.547767,0.000000,0.0,0.0,0.0,0.0,0.0,2.128461,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,2.128461,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1989,sci-RNA-E-096.CCTATCATAA,6081,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,1.891222,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.891222,0.0,0.0,0.0,0.0,0.0,2.505928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,2.505928,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.891222,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.00000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.505928,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,1.891222,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1990,sci-RNA-E-096.CGAATCTCCT,6089,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,1.441456,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,2.632420,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,2.008710,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,2.368424,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.368424,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,2.00871,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.441456,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.225449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Saving the DataFrames as CSVs and HDFs

In [260]:
# # Minimizing RNA Matrix
# rnaColDict = {k:"float16" for k in sci_RNA_Matrix.columns[3:]}
# rnaColDict["treatment_time"] = 'int'
# sci_RNA_Matrix = sci_RNA_Matrix.astype(rnaColDict)

# # Minimizing ATAC Matrix
# atacColDict = {k:"float16" for k in sci_ATAC_Matrix.columns[3:]}
# atacColDict["treatment_time"] = 'int'
# sci_ATAC_Matrix["treatment_time"] = sci_ATAC_Matrix["treatment_time"].str.rstrip('h')
# sci_ATAC_Matrix = sci_ATAC_Matrix.astype(atacColDict)

# # Minimizing Bimodal Matrix
# biModalDict = {k:"float16" for k in biModal_Matrix.columns[3:]}
# biModalDict["treatment_time_x"] = 'int'
# biModal_Matrix = biModal_Matrix.astype(biModalDict)

In [261]:
sci_RNA_Matrix_clean_test.to_hdf("rnaMatrix_clean1",
                    key = 'stage',
                     mode='w')
sci_ATAC_Matrix_clean_test.to_hdf("atacMatrix1_clean1",
                       key = 'stage',
                      mode='w')
biModal_Matrix_clean.to_hdf("bimodalEarlyFusionMatrix_clean1",
                      key = 'stage',
                     mode='w')

***
# Using Data-Clean Up and Feature Selection Methods from Paper
***

In [None]:
# # Reading in the clean and scaled matrices from the R outputs, Sparse matrix style
# rnaCleanMat = sp.io.mmread("rnaRawClean")
# atacCleanMat = sp.io.mmread("atacRawClean")
# labels = pd.read_csv("treatTimelabels.csv")

In [None]:
# # Columns are cells, rows are genesR ex
# pd.DataFrame.sparse.from_spmatrix(rnaCleanMat)

## Loading in the data as CSVs

In [15]:
timeLabelsDf = pd.read_csv("treatTimelabels_upSampled.csv")
rnaRawDf = pd.read_csv("rnaRawClean_upSampled.csv")
atacRawDf = pd.read_csv("atacRawClean_upSampled.csv")
scAiDf = pd.read_csv("scAiOutput.csv")
scAiCellDf = pd.read_csv("scAi_CellLoading.csv")
scAiCell_20Factor_Df = pd.read_csv("scAi_CellLoad_20Factors_upSampled.csv")
scAiCell_20Factor_GeneLoading_Df = pd.read_csv("scAi_GeneLoad_20Factors_upSampled.csv")
scAiCell_20Factor_LocusLoading_Df = pd.read_csv("scAi_LocusLoad_20Factors_upSampled.csv")

In [194]:
rnaRawDf_test = pd.read_csv("rnaRawClean_upSampled.csv")

In [193]:
atacRawDf["treatTime"].value_counts()

3    1075
1    1075
0    1075
Name: treatTime, dtype: int64

In [159]:
# rnaRawDf = rnaRawDf.T
# rnaRawDf.columns = rnaRawDf.iloc[0,:]
# rnaRawDf.drop(rnaRawDf.index[0], inplace=True)
# rnaRawDf.reset_index(inplace=True)
# rnaRawDf.rename(columns={"index": "sample","Unnamed:0": "index"}, inplace=True)
# rnaRawDf.insert(1, "treatTime", timeLabelsDf.iloc[:,1])

In [16]:
# Reformatting all of the CSVs 

# Transposing the matrices

rnaRawDf = rnaRawDf.T
atacRawDf = atacRawDf.T
# scAiDf = scAiDf.T
# scAiCellDf = scAiCellDf.T
scAiCell_20Factor_Df = scAiCell_20Factor_Df.T

# Setting the genes/locii as the headers

# Raw RNA data
rnaRawDf.columns = rnaRawDf.iloc[0,:]
rnaRawDf.drop(rnaRawDf.index[0], inplace=True)
rnaRawDf.reset_index(inplace=True)
rnaRawDf.rename(columns={"index": "sample","Unnamed:0": "index"}, inplace=True)

# Raw ATAC data
atacRawDf.columns = atacRawDf.iloc[0,:]
atacRawDf.drop(atacRawDf.index[0], inplace=True)
atacRawDf.reset_index(inplace=True)
atacRawDf.rename(columns={"index": "sample"}, inplace=True)

#scAI Data - Aggregated Profile
# scAiDf.columns = scAiDf.iloc[0,:]
# scAiDf.drop(scAiDf.index[0], inplace=True)
# scAiDf.reset_index(inplace=True)
# scAiDf.rename(columns={"index": "sample","Unnamed:0": "index"}, inplace=True)


#scAI Data - Cell Load - 2 Factor
# scAiCellDf.columns = scAiCellDf.iloc[0,:]
# scAiCellDf.drop(scAiCellDf.index[0], inplace=True)
# scAiCellDf.reset_index(inplace=True)
# scAiCellDf.rename(columns={"index": "sample","Unnamed:0": "index"}, inplace=True)

#scAI Data - Cell Load - 20 Factor
scAiCell_20Factor_Df.columns = scAiCell_20Factor_Df.iloc[0,:]
scAiCell_20Factor_Df.drop(scAiCell_20Factor_Df.index[0], inplace=True)
scAiCell_20Factor_Df.reset_index(inplace=True)
scAiCell_20Factor_Df.rename(columns={"index": "sample","Unnamed:0": "index"}, inplace=True)

# Adding in the time of treatment labels
rnaRawDf.insert(1, "treatTime", timeLabelsDf.iloc[:,1])
atacRawDf.insert(1, "treatTime", timeLabelsDf.iloc[:,1])
# scAiDf.insert(1, "treatTime", timeLabelsDf.iloc[:,1])
# scAiCellDf.insert(1, "treatTime", timeLabelsDf.iloc[:,1])
scAiCell_20Factor_Df.insert(1, "treatTime", timeLabelsDf.iloc[:,1])

## Creating the Raw Bimodal Data

In [17]:
# Creating Raw Bimodal by merging the RNA and ATAC Raw datasets on sample
bimodalRawDf = rnaRawDf.merge(atacRawDf, how="inner", on=["sample", "treatTime"])

# Index 1187 is the first ATAC data, name 1-9963-10665

In [22]:
bimodalRawDf.shape
# bimodalRawDf.head()

(3225, 53948)

In [23]:
rnaRawDf.head()

Unnamed: 0,sample,treatTime,GPR153,ACOT7,ERRFI1,RP11-431K24.1,RERE,KAZN,TMEM51,IFFO2,NBL1,NBPF3,HSPG2,GALE,RUNX3,SFN,TRNP1,THRAP3,ZC3H12A,MACF1,HIVEP3,ERMAP,ST3GAL3,RNF220,ZCCHC11,ZYG11B,RP5-1024G6.8,LRP8,PPAP2B,NFIA,PGM1,ROR1,PDE4B,GADD45A,ST6GALNAC3,AK5,DNAJB4,FAM69A,FNBP1L,BCAR3,GCLM,VAV3,CELSR2,GSTM5,SLC16A1,RP11-389O22.1,MAGI3,PHTF1,ATP1A1,NOTCH2,NOTCH2NL,RP11-458D21.5,NBPF10,ITGA10,PIAS3,MCL1,SELENBP1,S100A10,IL6R,SHC1,ZBTB7B,ADAM15,ARHGEF2,KIRREL,PBX1,POU2F1,RP11-568K15.1,RALGPS2,TDRD5,C1orf21,PTGS2,PLA2G4A,KCNT2,CFH,MIR181A1HG,NR5A2,RNPEP,ELF3,PPP1R12B,SOX13,PIK3C2B,NUAK2,SRGAP2,PLXNA2,RCOR3,LPGAT1,KCTD3,TGFB2,CAPN8,EPHX1,ZNF678,NUP133,TAF5L,PCNXL2,MLK4,KCNK1,SLC35F3,LYST,MBOAT2,YWHAQ,RRM2,HPCAL1,LPIN1,MATN3,KLHL29,MFSD2B,SF3B14,FAM228B,TP53I3,DTNB,RAB10,FOSL2,PPP1CB,MEMO1,LTBP1,CRIM1,FEZ2,CDC42EP3,GALM,EML4,ZFP36L2,PLEKHH2,CAMKMT,RP11-89K21.1,PRKCE,EPAS1,RHOQ,TTC7A,CALM2,FOXN2,AC092839.3,KIAA1841,EHBP1,ANTXR1,AAK1,ANXA4,TMSB10,TCF7L1,RETSAT,ATOH8,ST3GAL5,LINC00152,INPP4A,LONRF2,NPAS2,AC092168.2,TBC1D8,FHL2,ST6GAL2,MIR4435-1HG,PTPN4,INHBB,TFCP2L1,CLASP1,GPR39,ZRANB3,LRP1B,KYNU,RND3,FMNL2,BAZ2B,TANK,COBLL1,CERS6,NOSTRIN,CHN1,NFE2L2,ITGAV,GULP1,COL5A2,INPP1,GLS,MYO1B,SDPR,ANKRD44,MARCH4,SMARCAL1,IGFBP2,PNKD,TMBIM1,ACSL3,DOCK10,IRS1,COL4A4,COL4A3,PID1,HJURP,ARL4C,SCLY,PER2,HDAC4,SNED1,LMCD1,SRGAP3,OGG1,CIDEC,HRH1,VGLL4,NUP210,TMEM43,UBE2E2,THRB,SLC4A7,RBMS3,CRTAP,STAC,ZNF445,LIMD1,CCDC12,NBEAL2,KIF9-AS1,PRKAR2A,LAMB2,USP4,PRKCD,ARHGEF3,SLMAP,PXK,ATXN7,VGLL3,DCBLD2,CCDC80,RP11-553L6.5,ZBTB20,RP11-197K3.1,LSAMP,IGSF11,GSK3B,HGD,ITGB5,SRPRB,AMOTL2,CEP70,PIK3CB,ZBTB38,RASA2,PLS1,PCOLCE2,TM4SF1,SERP1,RP11-454C18.2,RP11-64D22.2,AADAC,ARHGEF26,PLCH1,TIPARP,LINC00886,SKIL,PLD1,ECT2,...,SSH2,ASIC2,AP2B1,MMP28,ACACA,DDX52,HNF1B,IGFBP4,TNS4,ATP6V0A1,ETV4,TMUB2,GPATCH8,CTD-2020K17.1,MAP3K14,OSBPL7,CDK5RAP3,HOXB3,HOXB6,IGF2BP1,ITGA3,ABCC3,SPAG9,MSI2,VMP1,TANC2,ERN1,PRKCA,RP11-4F22.2,PITPNC1,BPTF,LINC00511,SLC39A11,SDK2,LLGL2,ITGB4,H3F3B,UNC13D,RNF157,RBFOX3,RNF213,BAIAP2,RP11-1055B8.7,SIRT7,MAFG,RAB40B,TGIF1,LAMA1,PTPRM,RAB12,SOGA2,LDLRAD4,GREB1L,CABLES1,RIOK3,ANKRD29,LAMA3,TTC39C,CDH2,DSG2,C18orf25,ZBTB7C,TCF4,ATP8B1,NEDD4L,PHLPP1,CTD-2354A18.1,MBP,FSTL3,ARID3A,MIDN,PLK5,GADD45B,NFIC,TJP3,PTPRS,GTF2F1,C3,ANGPTL4,DNM2,SPC24,KANK2,JUNB,IER2,ZSWIM4,LPHN1,PKN1,DNAJB1,NOTCH3,CYP4F12,TPM4,ARRDC2,GDF15,PBX4,GRAMD1A,SIPA1L3,SIRT2,ZFP36,ITPKC,CTC-490E21.12,PLAUR,RELB,VASP,OPA3,EHD2,FTL,LINC00085,ZNF611,MYADM,CACNG8,ZNF579,MRPS26,C20orf194,RNF24,SMOX,PLCB1,PLCB4,SNAP25,RRBP1,ZNF133,RIN2,THBD,ENTPD6,PYGB,ABHD12,ID1,BCL2L1,RP11-243J16.7,ACSS2,EDEM2,UQCC1,PHF20,SRC,VSTM2L,TGM2,RALGAPB,TOP1,PTPRT,ZNF335,CEBPB,BCAS4,BCAS1,CYP24A1,PMEPA1,NELFCD,CDH4,LAMA5,NRIP1,USP25,LINC00478,BACH1,AP000304.12,MRPS6,SETD4,SIM2,SIK1,AGPAT3,ATP6V1E1,CLTCL1,ZDHHC8P1,LRP5L,LIF,RNF215,SEC14L2,DEPDC5,LARGE,HMOX1,A4GALT,ARFGAP3,PACSIN2,FBLN1,ATXN10,FLJ27365,GRAMD4,TBC1D22A,PIM3,MID1,PIR,NHS,GPR64,SH3KBP1,RPS6KA3,SMS,PHEX,NR0B1,RP6-99M1.2,CHST7,RP2,PHF16,RP11-38O23.4,TSPYL2,PHF8,ZMYM3,FTX,RP3-368A4.5,BRWD3,POF1B,KLHL4,CENPI,TCEAL4,FAM199X,MORC4,RBM41,TSC22D3,COL4A6,COL4A5,TMEM164,LRCH2,ZNF280C,RP1-274L7.1,IGSF1,MAMLD1,NLGN4Y,TXLNG2P,AC011043.1,Fev,Zbtb37,Uck2,Nup210l,Kyat3,Gm31121,Sgip1,1700024P16Rik,Anapc4,Gm3289,Ckm,Fgfr2,Exph5,Stag1,Slc22a14,Eya4,Mgat4c,Tug1,Efcab5,Prkd1,Nrxn3,H2afy,Pde8b,Cdk7,1700112E06Rik,Plcxd3,Rnf19a,Xrcc6,Pi4ka,Smpd4,Map3k7cl,Fgf1,SOGA3,KIAA0391,PDXDC2P
0,sci_RNA_A_001_GACCAATGCG,0,0.0,0.0,0,0,2.47697,0.0,0.0,0.0,0,0,3.51795,0.0,0,0,2.47697,0.0,0,0.0,0,0,0,0,0.0,0.0,0,0.0,0,3.12721,0.0,2.47697,2.47697,0.0,0,0,0,0.0,2.47697,0,0,0.0,0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0,0.0,0,3.12721,0,0.0,0.0,2.47697,0,0,0.0,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,3.12721,0,0.0,0,2.47697,0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,0.0,0,0,0,3.12721,0.0,0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,0,0.0,3.12721,2.47697,0.0,0.0,0,0.0,0,0,0,0.0,0.0,0.0,2.47697,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0,2.47697,0.0,0,0.0,0,0,0,0.0,0,2.47697,0,0,0.0,2.47697,0.0,0,0.0,3.12721,0,0.0,0.0,0.0,0.0,0.0,2.47697,0,0,0,0,0.0,0.0,3.51795,3.12721,0,0,0.0,0,0,0,0.0,0,0,0,0.0,0.0,0.0,0,2.47697,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0.0,0,0,2.47697,0.0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,2.47697,0,0,0.0,0,0.0,0.0,0.0,0,3.12721,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0,0,0.0,0,0,0.0,2.47697,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0,0.0,0,0.0,0.0,2.47697,0.0,0.0,0.0,0.0,3.12721,3.12721,0.0,0.0,2.47697,0,2.47697,0,0.0,3.12721,2.47697,0.0,0,0,2.47697,0,0,0,0.0,0.0,0.0,0,0.0,0,0.0,0.0,0,3.12721,0,0,2.47697,0.0,0,0,0,0.0,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0,0.0,0,0.0,0.0,0,0,0.0,0.0,0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,3.12721,0.0,0,0,0,0,0.0,0,0,0,0.0,3.12721,0,0.0,0.0,0,0,0,0.0,0,0.0,0.0,2.47697,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0,0,0,0,0,0,0.0,0,0.0,2.47697,0.0,0,0,0,0,0,0.0,0,0,0,4.88175,0.0,2.47697,2.47697,0.0,0,0,0.0,0,0,0,0,0.0,0.0,0,0,0,0,0,0.0,0.0,0,0,0.0,0,0.0,0.0,0,0,0,2.47697,0.0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,2.47697,0.0,0.0,0.0,0,0,0,0,3.79819,0,0,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,2.47697,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.47697,0,0,0,0.0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,0,0,0
1,sci_RNA_A_001_TCTCTCATCC,0,1.2987,0.0,0,0,1.84515,0.0,1.2987,2.19651,0,0,0.0,1.84515,0,0,0.0,1.2987,0,1.2987,0,0,0,0,1.2987,1.84515,0,1.2987,0,1.2987,0.0,0.0,2.19651,0.0,0,0,0,1.2987,0.0,0,0,1.2987,0,0.0,0,0.0,1.84515,0,0,0,0,0,0,0,0,0.0,0,0.0,0,0.0,1.2987,0.0,0,0,2.19651,1.84515,0,1.2987,1.2987,0.0,0,0.0,0,1.2987,0.0,0,0.0,0,1.2987,0,1.2987,1.2987,0,1.84515,0.0,1.2987,0,0,0,1.84515,0,0,0,1.2987,1.2987,0,1.84515,1.2987,1.2987,0.0,1.2987,0,0.0,0,0,0,0,0,0,1.84515,2.66184,0.0,0.0,2.19651,0,0.0,0,0,0,1.2987,0.0,1.2987,2.97817,0.0,0.0,1.2987,1.2987,2.19651,0.0,1.2987,0.0,0,2.83246,0,1.84515,1.84515,0,0.0,0.0,0,0.0,0,0,0,1.84515,0,2.19651,0,0,0.0,0.0,1.84515,0,0.0,0.0,0,0.0,1.84515,0.0,1.2987,0.0,0.0,0,0,0,0,0.0,1.2987,1.2987,0.0,0,0,2.19651,0,0,0,1.2987,0,0,0,0.0,0.0,0.0,0,0.0,0.0,2.19651,1.84515,1.2987,0.0,0.0,0,0,0,1.2987,0,0,0.0,1.2987,0,1.2987,1.2987,0.0,0,0.0,0,0.0,0.0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,1.84515,0,0,2.66184,0,2.19651,1.2987,0.0,0,1.84515,0,0,0,1.84515,1.2987,1.2987,1.2987,0,0.0,0,0,1.2987,0,0,1.2987,0,0,0.0,1.2987,1.2987,...,1.2987,0,1.84515,0.0,1.2987,0.0,0,1.2987,0,0.0,0,0,1.2987,0,2.19651,2.19651,0.0,0.0,1.2987,1.84515,1.2987,0.0,0.0,2.19651,2.456,0.0,0,2.19651,0,1.2987,1.84515,3.49573,0.0,0,0,1.84515,0,0,0,1.84515,2.19651,0.0,0,3.64554,0,0.0,1.84515,0,2.97817,0,0,0.0,0.0,0,0,0,0.0,0,2.456,1.2987,0,0,0,0.0,1.84515,2.19651,1.84515,0,0.0,0,0.0,0,0,0.0,0,0.0,0.0,0,0,0.0,1.2987,0,0.0,0,0,0,0,1.2987,0,0.0,0,0,0.0,0,1.84515,0.0,0,0,0,0,1.84515,0,0,0,0.0,2.456,0,0.0,0.0,0,0,0,1.2987,0,1.2987,0.0,1.2987,0.0,0.0,0.0,1.2987,0,0,0.0,0.0,1.84515,0,0,0,0,0,0,0.0,0,0.0,0.0,0.0,0,0,0,0,0,2.97817,0,0,0,1.84515,0.0,1.2987,1.2987,0.0,0,0,0.0,0,0,0,0,0.0,0.0,0,0,0,0,0,1.84515,0.0,0,0,0.0,0,1.84515,1.2987,0,0,0,1.2987,1.84515,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0.0,0.0,0.0,0.0,0,0,0,0,2.19651,0,0,0,0.0,0.0,0,0,0,0.0,1.2987,0.0,0,1.84515,1.2987,0.0,1.84515,1.2987,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,1.2987,0,1.84515,0,0.0,0,0.0,2.66184,1.2987,0,0,0,0,0
2,sci_RNA_A_002_ATCTAGGTTC,0,0.0,0.0,0,0,3.52967,0.0,0.0,2.03111,0,0,0.0,0.0,0,0,0.0,0.0,0,0.0,0,0,0,0,0.0,0.0,0,0.0,0,2.65641,2.03111,2.03111,2.03111,0.0,0,0,0,0.0,0.0,0,0,0.0,0,0.0,0,0.0,2.03111,0,0,0,0,0,0,0,0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,3.31382,0.0,0,2.03111,2.03111,0.0,0,2.65641,0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0.0,2.03111,0.0,0,0.0,0,0,0,0,0,0,2.03111,2.03111,0.0,0.0,0.0,0,2.65641,0,0,0,0.0,0.0,0.0,0.0,3.85773,2.03111,0.0,0.0,0.0,0.0,0.0,0.0,0,2.03111,0,0.0,3.31382,0,0.0,0.0,0,0.0,0,0,0,2.03111,0,0.0,0,0,0.0,0.0,0.0,0,3.0382,0.0,0,0.0,0.0,0.0,0.0,2.03111,0.0,0,0,0,0,0.0,0.0,3.0382,0.0,0,0,2.03111,0,0,0,0.0,0,0,0,2.03111,3.0382,0.0,0,0.0,0.0,0.0,0.0,0.0,3.31382,2.65641,0,0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0,3.0382,0,2.03111,0.0,0,0,0,0,0,0,0,0.0,0.0,0,0,0,3.31382,0,0,2.03111,0,0.0,0.0,0.0,0,2.03111,0,0,0,2.03111,0.0,0.0,0.0,0,2.03111,0,0,0.0,0,0,2.03111,0,0,0.0,2.65641,2.03111,...,2.03111,0,0.0,2.03111,2.65641,0.0,0,2.65641,0,0.0,0,0,0.0,0,0.0,0.0,2.03111,3.31382,2.03111,0.0,2.03111,2.03111,0.0,0.0,0.0,0.0,0,3.0382,0,0.0,0.0,2.65641,3.31382,0,0,0.0,0,0,0,0.0,2.03111,0.0,0,0.0,0,0.0,2.03111,0,3.0382,0,0,0.0,0.0,0,0,0,0.0,0,0.0,0.0,0,0,0,3.52967,0.0,0.0,0.0,0,0.0,0,2.03111,0,0,0.0,0,0.0,0.0,0,0,2.03111,0.0,0,0.0,0,0,0,0,0.0,0,0.0,0,0,0.0,0,0.0,0.0,0,0,0,0,0.0,0,0,0,0.0,0.0,0,0.0,0.0,0,0,0,0.0,0,3.0382,2.03111,0.0,0.0,0.0,2.03111,0.0,0,0,2.65641,0.0,2.03111,0,0,0,0,0,0,0.0,0,0.0,0.0,2.03111,0,0,0,0,0,3.31382,0,0,0,0.0,2.03111,0.0,0.0,0.0,0,0,0.0,0,0,0,0,0.0,0.0,0,0,0,0,0,0.0,0.0,0,0,2.65641,0,0.0,2.03111,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,2.65641,0.0,0.0,0.0,0,0,0,0,0.0,0,0,0,2.03111,0.0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,0.0,2.03111,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,0,0.0,0,4.10434,0,0.0,2.03111,0.0,0,0,0,0,0
3,sci_RNA_A_003_ACGTTGAATG,0,0.0,0.0,0,0,0.0,1.5501,1.5501,0.0,0,0,0.0,0.0,0,0,0.0,0.0,0,0.0,0,0,0,0,1.5501,0.0,0,1.5501,0,1.5501,0.0,0.0,2.13107,1.5501,0,0,0,1.5501,0.0,0,0,0.0,0,3.73365,0,1.5501,0.0,0,0,0,0,0,0,0,0,2.13107,0,1.5501,0,0.0,0.0,0.0,0,0,2.13107,0.0,0,0.0,0.0,0.0,0,2.76303,0,0.0,0.0,0,1.5501,0,1.5501,0,0.0,0.0,0,2.13107,1.5501,0.0,0,0,0,0.0,0,0,0,0.0,0.0,0,2.97347,0.0,0.0,2.13107,0.0,0,1.5501,0,0,0,0,0,0,0.0,2.13107,0.0,1.5501,2.49616,0,1.5501,0,0,0,0.0,0.0,0.0,0.0,0.0,1.5501,0.0,1.5501,0.0,0.0,1.5501,0.0,0,2.13107,0,0.0,0.0,0,0.0,2.13107,0,2.76303,0,0,0,2.13107,0,2.13107,0,0,2.49616,0.0,0.0,0,1.5501,0.0,0,1.5501,1.5501,1.5501,0.0,0.0,0.0,0,0,0,0,0.0,0.0,1.5501,2.76303,0,0,0.0,0,0,0,0.0,0,0,0,0.0,3.29523,1.5501,0,0.0,1.5501,0.0,1.5501,0.0,0.0,0.0,0,0,0,0.0,0,0,0.0,2.13107,0,0.0,0.0,1.5501,0,1.5501,0,0.0,0.0,0,0,0,0,0,0,0,1.5501,0.0,0,0,0,3.42412,0,0,0.0,0,0.0,3.14724,0.0,0,2.13107,0,0,0,0.0,1.5501,0.0,0.0,0,0.0,0,0,1.5501,0,0,0.0,0,0,2.13107,0.0,0.0,...,2.13107,0,0.0,0.0,2.13107,1.5501,0,2.13107,0,0.0,0,0,2.49616,0,0.0,0.0,0.0,0.0,0.0,1.5501,1.5501,1.5501,1.5501,0.0,0.0,0.0,0,0.0,0,1.5501,1.5501,1.5501,3.42412,0,0,0.0,0,0,0,0.0,1.5501,0.0,0,0.0,0,2.13107,2.13107,0,1.5501,0,0,0.0,0.0,0,0,0,1.5501,0,0.0,0.0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,0.0,0,0,2.97347,0,2.49616,1.5501,0,0,1.5501,0.0,0,1.5501,0,0,0,0,0.0,0,0.0,0,0,1.5501,0,1.5501,1.5501,0,0,0,0,0.0,0,0,0,2.49616,2.49616,0,0.0,2.13107,0,0,0,0.0,0,0.0,0.0,1.5501,0.0,2.49616,0.0,1.5501,0,0,0.0,2.13107,0.0,0,0,0,0,0,0,0.0,0,1.5501,0.0,0.0,0,0,0,0,0,2.97347,0,0,0,0.0,1.5501,0.0,0.0,1.5501,0,0,1.5501,0,0,0,0,0.0,3.14724,0,0,0,0,0,0.0,1.5501,0,0,0.0,0,0.0,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,1.5501,0,0,0,0,0,0,0,0.0,2.76303,1.5501,1.5501,0,0,0,0,3.42412,0,0,0,2.49616,0.0,0,0,0,0.0,0.0,0.0,0,2.76303,0.0,0.0,0.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.13107,0,0,0,0.0,0,0.0,0,0.0,0,1.5501,2.13107,0.0,0,0,0,0,0
4,sci_RNA_A_003_CCATCGGACC,0,0.0,2.71039,0,0,2.71039,0.0,0.0,0.0,0,0,0.0,0.0,0,0,0.0,0.0,0,0.0,0,0,0,0,0.0,0.0,0,0.0,0,2.71039,0.0,0.0,0.0,2.71039,0,0,0,0.0,0.0,0,0,0.0,0,0.0,0,0.0,2.08163,0,0,0,0,0,0,0,0,0.0,0,0.0,0,2.08163,0.0,0.0,0,0,0.0,0.0,0,0.0,0.0,2.08163,0,0.0,0,0.0,0.0,0,0.0,0,0.0,0,0.0,0.0,0,0.0,0.0,0.0,0,0,0,0.0,0,0,0,0.0,0.0,0,2.71039,0.0,0.0,0.0,0.0,0,0.0,0,0,0,0,0,0,2.08163,0.0,0.0,0.0,0.0,0,2.08163,0,0,0,0.0,2.08163,2.08163,2.08163,0.0,0.0,0.0,0.0,0.0,2.08163,0.0,2.71039,0,0.0,0,0.0,2.71039,0,0.0,0.0,0,0.0,0,0,0,3.36971,0,2.08163,0,0,0.0,0.0,0.0,0,0.0,0.0,0,0.0,3.09343,2.71039,0.0,0.0,0.0,0,0,0,0,2.08163,0.0,2.08163,2.08163,0,0,0.0,0,0,0,0.0,0,0,0,0.0,0.0,0.0,0,0.0,0.0,2.08163,0.0,0.0,0.0,0.0,0,0,0,0.0,0,0,0.0,0.0,0,0.0,0.0,0.0,0,2.08163,0,0.0,2.71039,0,0,0,0,0,0,0,0.0,2.08163,0,0,0,0.0,0,0,2.71039,0,2.71039,0.0,2.71039,0,0.0,0,0,0,0.0,0.0,0.0,0.0,0,0.0,0,0,0.0,0,0,2.08163,0,0,0.0,0.0,0.0,...,0.0,0,2.08163,0.0,0.0,0.0,0,2.08163,0,2.08163,0,0,0.0,0,0.0,0.0,0.0,0.0,0.0,2.08163,2.08163,4.16135,0.0,0.0,0.0,0.0,0,0.0,0,0.0,2.08163,2.08163,0.0,0,0,0.0,0,0,0,0.0,0.0,2.08163,0,3.76365,0,0.0,0.0,0,2.71039,0,0,0.0,2.08163,0,0,0,0.0,0,0.0,0.0,0,0,0,0.0,0.0,0.0,2.71039,0,2.71039,0,0.0,0,0,0.0,0,4.16135,0.0,0,0,0.0,0.0,0,0.0,0,0,0,0,0.0,0,2.08163,0,0,0.0,0,2.08163,3.36971,0,0,0,0,0.0,0,0,0,0.0,0.0,0,4.35918,2.71039,0,0,0,0.0,0,2.08163,0.0,0.0,2.08163,2.71039,0.0,2.08163,0,0,0.0,0.0,0.0,0,0,0,0,0,0,2.08163,0,0.0,0.0,2.08163,0,0,0,0,0,0.0,0,0,0,3.36971,2.08163,0.0,2.08163,0.0,0,0,0.0,0,0,0,0,2.08163,0.0,0,0,0,0,0,0.0,0.0,0,0,0.0,0,2.08163,0.0,0,0,0,0.0,0.0,0,0,0,0,0,0,0,0.0,0,0,0,0,0,0,0,0.0,3.58595,0.0,0.0,0,0,0,0,0.0,0,0,0,0.0,2.08163,0,0,0,2.08163,0.0,2.08163,0,0.0,0.0,2.71039,2.71039,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0.0,0,0,0,0.0,0,2.71039,0,0.0,0,0.0,0.0,2.08163,0,0,0,0,0


In [20]:
atacRawDf.shape

(3225, 52763)

## Creating the Low-Dim Unimodal Data

In [171]:
# Creating the RNA PCA
rnaRawData = rnaRawDf.iloc[:, 2:].values
pca = sk.decomposition.PCA(n_components=30)
rnaRawDataPca = pca.fit_transform(rnaRawData)
rnaRawData_PCA_30 = rnaRawDf.iloc[:, 0:2]
rnaRawData_PCA_30 = pd.concat([rnaRawData_PCA_30, pd.DataFrame(rnaRawDataPca)], axis=1, ignore_index=False)

In [172]:
# Creating the ATAC spPCA 30 components
# atacRawData = atacRawDf.iloc[:, 2:].values
# pca = sk.decomposition.SparsePCA(n_components=30)
# atacRawDataPca = pca.fit_transform(atacRawData)
# atacRawData_sparsePCA_30 = atacRawDf.iloc[:, 0:2]
# atacRawData_sparsePCA_30 = pd.concat([atacRawData_sparsePCA_30, pd.DataFrame(atacRawDataPca)], axis=1, ignore_index=False)

In [173]:
# Creating the ATAC SVD 50 components
atacRawData = atacRawDf.iloc[:, 2:].values
svd = sk.decomposition.TruncatedSVD(n_components=50)
atacRawDataSvd = svd.fit_transform(atacRawData)
atacRawData_Svd_50 = atacRawDf.iloc[:, 0:2]
atacRawData_Svd_50 = pd.concat([atacRawData_Svd_50, pd.DataFrame(atacRawDataSvd)], axis=1, ignore_index=False)

In [189]:
sum(svd.explained_variance_ratio_)

0.07919954460608458

In [182]:
print(atacRawData_Svd_50.shape)
atacRawData_Svd_50


# scAiCell_20Factor_Df
# scAiCell_20Factor_GeneLoading_Df
# # scAiCell_20Factor_LocusLoading_Df

# rnaRawData_PCA_30
# atacRawData_Svd_50

(3225, 52)


Unnamed: 0,sample,treatTime,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49
0,sci_RNA_A_001_GACCAATGCG,0,9.585339,0.233518,-0.030506,-0.342995,-0.795858,0.092793,0.977699,-0.931610,-1.666707,-2.032973,-0.050365,-0.478985,1.055325,-4.303805,0.693007,0.770551,1.924137,1.080441,1.421353,-0.625928,-2.373559,-1.298082,2.564325,0.707810,-2.574708,4.804321,3.418033,-2.038193,-0.540361,0.291637,2.116222,-3.977261,-2.167981,-2.252563,-4.197829,-0.936967,0.338181,5.752700,-2.410214,-1.619709,-1.460697,5.996328,1.676356,-0.541403,-1.337915,2.483392,4.263233,-4.407256,-5.001629,-2.185640
1,sci_RNA_A_001_TCTCTCATCC,0,9.855526,-2.120526,1.297493,0.453832,-1.777350,-1.607536,1.164355,-2.654126,-1.532110,-0.055146,0.316035,-1.630817,0.507466,-0.917533,0.050910,0.627386,1.072600,-0.266435,0.353536,-0.517581,-0.868924,0.350627,-0.922803,-0.243326,-1.024717,-0.838942,1.479427,0.822939,-0.787553,0.590953,-0.294196,1.686629,1.350144,0.917912,-0.112240,0.379116,-0.287822,-0.222613,-1.571273,-1.281675,0.119744,-0.071771,0.856845,1.106357,-0.335169,-1.206505,0.061790,0.829423,0.834915,-1.179211
2,sci_RNA_A_002_ATCTAGGTTC,0,6.586096,-3.145767,0.951345,0.297891,-0.537572,-0.048996,0.614263,-1.196434,0.081524,0.016126,0.236386,-0.617011,0.209994,-1.548357,-0.555075,0.049011,0.671139,-0.481498,-0.361203,-0.672145,0.616163,0.165225,-1.411719,0.244782,-0.665474,-0.586884,-0.214256,-1.224867,-0.554541,-0.546665,0.158893,0.623040,1.045955,1.040145,-0.621931,-0.057554,-0.450698,0.510035,-0.005220,0.093076,1.112106,-0.732796,-0.741471,0.388719,0.252041,0.640884,0.089146,0.192911,-0.027436,0.855848
3,sci_RNA_A_003_ACGTTGAATG,0,7.125086,-2.811981,-0.199463,0.599112,-0.040541,-0.116058,-0.802284,0.755146,0.058169,-0.040381,-0.141030,-0.155865,1.354270,0.570501,-0.130264,-0.386345,-0.004942,0.732485,-0.212849,-0.625814,-0.373659,-0.694044,-0.104617,-0.852918,-0.140031,-0.634605,0.836305,-0.196725,-0.167643,1.381207,-0.906423,0.750541,0.102134,1.018754,0.561484,-0.680553,1.651933,0.750038,0.554343,-0.467514,0.463401,0.374391,-0.554139,-0.673925,-0.721972,0.545200,-1.268381,0.462370,-0.623073,0.106438
4,sci_RNA_A_003_CCATCGGACC,0,5.396401,-4.609705,0.166552,0.061653,-0.220632,-0.218617,0.319811,-0.488002,-0.096298,-0.012794,-0.149564,-0.164945,-0.172229,-0.225204,-0.222723,0.006055,-0.246577,0.385533,-0.655536,0.015253,0.462830,0.563562,0.365930,-0.053416,-0.199103,-0.061094,0.105469,-0.707550,-0.073938,0.810242,-1.099415,0.426772,0.948925,0.848186,-0.762261,-0.542378,-0.204779,-0.719546,0.532376,-0.144933,-0.086726,-0.126648,0.065002,1.279269,-0.047812,0.240565,-0.604141,-0.090078,0.304357,-0.034862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3220,sci_RNA_E_095_ACGATAGACT,3,9.429755,-2.715672,0.344137,-0.410200,-0.643642,-0.746327,-0.157737,-0.554612,-0.150297,0.239186,-0.469389,0.088947,0.112778,-0.645101,0.309120,-0.102027,-0.098503,-0.988442,-0.466468,-0.002868,-0.120887,-0.148483,-0.183292,0.133565,0.104377,-0.439484,-0.082810,-0.284058,-0.030610,-0.014559,-0.556811,-0.388763,0.101959,-0.807933,0.649342,0.061879,-0.954888,0.419723,0.384076,-0.033095,-0.052806,-0.917685,-0.328354,0.607095,0.401965,0.300233,-0.143259,-0.938816,0.627593,0.136198
3221,sci_RNA_E_096_ATATGCCATC,3,5.193335,-2.478574,0.415912,0.369321,-0.454684,-0.657947,0.301662,-0.366334,-0.161728,-0.119791,-0.477618,0.255071,0.570233,0.090939,-0.154532,0.091768,-0.390970,0.840650,0.415112,0.392680,0.240840,0.257448,-0.131894,0.024991,-0.518048,0.631545,1.113879,0.275517,0.570420,0.814870,-1.486155,0.588930,0.710771,0.839289,0.058651,0.382598,-0.155209,0.005093,0.156396,-0.572225,1.263927,0.280260,0.001296,0.738298,-0.455430,0.287175,-0.120873,-0.168073,-0.661977,0.170019
3222,sci_RNA_E_096_CGAATCTCCT,3,9.089717,0.812854,0.390226,-0.193175,0.747647,0.139885,-0.205246,0.545309,-1.118046,-0.420605,0.164245,-0.485811,-0.903867,0.931871,0.542676,0.162852,0.131138,0.296574,0.214009,0.530873,0.414908,0.272807,0.401827,-0.424647,0.189571,0.956446,0.552470,0.013564,-0.582781,0.473717,-1.141697,0.271126,-0.153661,-0.468996,-0.447681,-0.260327,0.995904,-0.183221,-1.291431,-0.684534,-0.446372,1.358964,0.191019,-0.387504,-1.128310,-0.117325,-0.603916,0.371842,0.999971,0.025060
3223,sci_RNA_E_096_GGCGGTTGAC,3,11.485504,1.752600,-0.580506,0.149255,-0.032707,-0.193403,-0.964751,0.070650,0.480737,0.131352,0.135761,0.368171,0.535704,-0.509935,-0.123804,-0.641957,0.873115,-0.595364,0.607043,0.655546,0.452735,-0.854250,-0.371834,-0.527312,-0.510732,0.486340,-0.076455,-0.416724,0.171460,-0.942760,0.312198,1.355757,-0.458092,-0.493716,0.703094,-0.737786,0.122600,0.141721,0.638474,0.902802,2.131180,0.277481,0.296285,-0.945499,-0.417181,0.550036,0.027103,0.234755,-0.284281,0.449862


## Saving all of the Data

In [183]:
# Exporting the raw data, unimodal and bimodal

# sci_RNA_Matrix.to_hdf("rnaMatrix1",
#                       key = 'stage',
#                      mode='w')
# sci_ATAC_Matrix.to_hdf("atacMatrix1",
#                        key = 'stage',
#                       mode='w')
# biModal_Matrix.to_hdf("bimodalEarlyFusionMatrix1",
#                       key = 'stage',
#                      mode='w')

rnaRawDf.to_hdf("rnaRaw_upSampled",
                key = 'stage',
                mode = 'w')
atacRawDf.to_hdf("atacRaw_upSampled",
                 key = 'stage',
                 mode = 'w')

bimodalRawDf.to_hdf("bimodalRaw_upSampled",
               key = 'stage',
               mode = 'w')

# scAiDf.to_hdf("scAi_AggProfile",
#              key = 'stage',
#              mode = 'w')


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['sample', 'GPR153', 'ACOT7', 'ERRFI1', 'RP11-431K24.1', 'RERE', 'KAZN',
       'TMEM51', 'IFFO2', 'NBL1',
       ...
       'Plcxd3', 'Rnf19a', 'Xrcc6', 'Pi4ka', 'Smpd4', 'Map3k7cl', 'Fgf1',
       'SOGA3', 'KIAA0391', 'PDXDC2P'],
      dtype='object', name='Unnamed: 0', length=1186)]

  encoding=encoding,
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['sample', '1-9963-10665', '1-360366-360566', '1-525043-525243',
       '1-534190-534416', '1-563942-565496', '1-565954-566454',
       '1-567637-568137', '1-568415-569780', '1-713650-714682',
       ...
       'hs37d5-35343512-35343790', 'hs37d5-35365182-35365397',
       'hs37d5-35372168-35372717', 'hs37d5-35383159-35383390',
       'hs37d5-35391131-35391550', 'hs37d

In [184]:
# Exporting the scAI outputs

# scAiDf.to_hdf("scAi_AggProfile",
#              key = 'stage',
#              mode = 'w')

# 5 Factor

# scAiCellDf.to_hdf("scAi_CellLoad",
#                  key='stage',
#                  mode='w')

# 20 Factor version

scAiCell_20Factor_Df.to_hdf('scAI_CellLoad_20Factor_upSampled',
                           key='stage',
                           mode='w')

# 20 Factor Version Gene Loading
scAiCell_20Factor_GeneLoading_Df.to_hdf('scAI_RnaGeneLoad_20Factor_upSampled',
                           key='stage',
                           mode='w')

scAiCell_20Factor_LocusLoading_Df.to_hdf('scAI_AtacLocusLoad_20Factor_upSampled',
                           key='stage',
                           mode='w')
# 20 Factor Version Locus Loading



your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['sample', 'factor1', 'factor2', 'factor3', 'factor4', 'factor5',
       'factor6', 'factor7', 'factor8', 'factor9', 'factor10', 'factor11',
       'factor12', 'factor13', 'factor14', 'factor15', 'factor16', 'factor17',
       'factor18', 'factor19', 'factor20'],
      dtype='object', name='Unnamed: 0')]

  encoding=encoding,


In [185]:
# Exporting the PCA and SVD 

rnaRawData_PCA_30.to_hdf('rnaRaw_LowDim_pca30_upSampled',
                        key='stage',
                        mode='w')


atacRawData_Svd_50.to_hdf('atacRaw_LowDim_svd50_upSampled',
                         key='stage',
                         mode='w')

# atacRawData_sparsePCA_30.to_hdf('atacRaw_LowDim_spPCA30_upSampled',
#                          key='stage',
#                          mode='w')


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->axis0] [items->None]

  f(store)
your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->integer,key->block2_items] [items->None]

  f(store)


## Reading the HDF files to confirm

In [137]:
# pd.read_hdf('scAi_AggProfile')
# pd.read_hdf('scAI_CellLoad_20Factor')

In [138]:
# pd.read_hdf('rnaRaw')
# pd.read_hdf('atacRaw')
# pd.read_hdf('bimodalRaw')
# pd.read_hdf('bimodalLowDim_ScAi')