# Exporting Diaz2019 CSV Files

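Code for exporting the Diaz2019 AnnData datasets (mmE9.5, mmESC, hsIPSC) as CSV files: the raw count matrix plus the cell/gene annotations for each dataset.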

### Import Packages

In [1]:
# Import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scanpy as sc
import scipy.sparse
import scrublet as scr
import os
import sys
import pickle

sys.path.append('../')
import helper_functions_dew as dew

In [2]:
# ScanPy settings
sc.settings.verbosity = 0  # verbosity: errors (0), warnings (1), info (2), hints (3)
# Figure defaults: `dpi` is the resolution of rendered figures, `dpi_save` that of saved figures
sc.settings.set_figure_params(dpi=120, dpi_save=600, vector_friendly=False)  
# Record the package versions this notebook was run with (printed below the cell)
sc.logging.print_versions()

# Matplotlib settings
plt.rcParams['pdf.fonttype']=42  # Necessary to export text (rather than curves) in pdf files

scanpy==0+unknown anndata==0.6.19 umap==0.3.8 numpy==1.16.2 scipy==1.2.1 pandas==0.24.2 scikit-learn==0.20.3 statsmodels==0.9.0 python-igraph==0.7.1 louvain==0.6.1 


In [3]:
# Helper used before each export: raw count matrices are cast to an integer dtype when that
# is lossless, which approximately *halves* the size of the final X.csv file
def safe_convert_to_int(x):
    """Cast `x` to an integer array only if every element is a whole number.

    Prints a one-line message saying which branch was taken.

    Returns `x.astype(int)` when every element satisfies `x % 1 == 0`;
    otherwise returns `x` itself, unchanged.
    """
    # Any element with a non-zero remainder (NaN included) blocks the conversion
    has_fractional_part = np.any(np.mod(x, 1) != 0)
    if has_fractional_part:
        print('Some elements are not integers, doing nothing')
        return x
    print('All elements are verified integers, converting to int array')
    return x.astype(int)

### mmE9.5

In [4]:
# Load the doublet-filtered version of the dataset and revert to 'raw.X'
mmE95_df = sc.read('../mmE95/Diaz2019_mmE95df.h5ad')
# Densify the matrix stored under `.raw` (the whole matrix is held in memory) and cast it
# to int when every value is a whole number; the result replaces the current X
mmE95_df.X = safe_convert_to_int(mmE95_df.raw.X.todense())
# skip_data=False: the X matrix is written out too, not only the annotations
mmE95_df.write_csvs('CSV_191007/mmE95', skip_data=False)
print(mmE95_df)

All elements are verified integers, converting to int array


writing '.csv' files to CSV_191007/mmE95


AnnData object with n_obs × n_vars = 4367 × 40523 
    obs: 'batch', 'cell_names', 'library_id', 'n_counts', 'unique_cell_id', 'doublet_scores', 'predicted_doublets', 'n_genes', 'n_counts_pre_norm', 'leiden', 'louvain'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'leiden', 'leiden_colors', 'louvain', 'louvain_colors', 'neighbors', 'pca'
    obsm: 'X_pca', 'X_tsne', 'X_umap'
    varm: 'PCs'


### mmESC

In [5]:
# For raw data matrix: load the original 'loaded' version of the dataset, repeat pp.filter_cells
mmESC = sc.read('../mmES/Diaz2019_mmES_loaded.h5ad')
# Keep only cells with at least 250 detected genes
sc.pp.filter_cells(mmESC, min_genes=250)
# Densify X (the whole matrix is held in memory) and cast to int when every value is whole
mmESC.X = safe_convert_to_int(mmESC.X.todense())
# skip_data=False: the X matrix is written out too; it goes to its own 'mmESC_X' directory
mmESC.write_csvs('CSV_191007/mmESC_X', skip_data=False)
print(mmESC)

# For annotations, use the 'analyzed' version
# NOTE: `mmESC` is rebound here, so the object exported above is no longer referenced.
# NOTE(review): this assumes the filtered 'loaded' object and the 'analyzed' object hold the
# same cells in the same order -- the printed shapes agree, but ordering is not checked here.
mmESC = sc.read('../mmES/Diaz2019_mmES_analyzed.h5ad')
# skip_data=True: only the annotations are written, X is skipped
mmESC.write_csvs('CSV_191007/mmESC', skip_data=True)
print(mmESC)

All elements are verified integers, converting to int array


writing '.csv' files to CSV_191007/mmESC_X


AnnData object with n_obs × n_vars = 21478 × 40523 
    obs: 'batch', 'cell_names', 'library_id', 'n_counts', 'unique_cell_id', 'time_id', 'sample_id', 'n_genes'


writing '.csv' files to CSV_191007/mmESC


AnnData object with n_obs × n_vars = 21478 × 40523 
    obs: 'batch', 'cell_names', 'library_id', 'n_counts', 'unique_cell_id', 'time_id', 'sample_id', 'n_genes', 'n_counts_pre_norm', 'leiden', 'louvain', 'pr_NearestNeighbors', 'pr_RandomForest', 'pr_NeuralNet', 'pr_LDA', '_', 'PAGAFlag', 'dpt_pseudotime'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    uns: 'PAGAFlag_colors', 'PAGAFlag_sizes', 'diffmap_evals', 'draw_graph', 'iroot', 'leiden', 'leiden_colors', 'library_id_colors', 'louvain', 'louvain_colors', 'louvain_sizes', 'neighbors', 'paga', 'pca', 'pr_LDA_colors', 'pr_NearestNeighbors_colors', 'pr_NeuralNet_colors', 'pr_RandomForest_colors', 'rank_genes_groups', 'time_id_colors'
    obsm: 'X_pca', 'X_draw_graph_fa', 'proba_NearestNeighbors', 'proba_RandomForest', 'proba_NeuralNet', 'proba_LDA', 'X_diffmap'
    varm: 'PCs'


### hsIPSC

In [6]:
# For raw data: use original 'loaded' version of the dataset, repeat pp.filter_cells and pp.filter_genes
hsIPSC = sc.read('../hsIPS/Diaz2019_hsiPS_loaded.h5ad')
# Keep only cells with at least 250 detected genes
sc.pp.filter_cells(hsIPSC, min_genes=250)
# Keep only genes detected in at least one cell
sc.pp.filter_genes(hsIPSC, min_cells=1)
# Densify X (the whole matrix is held in memory) and cast to int when every value is whole
hsIPSC.X = safe_convert_to_int(hsIPSC.X.todense())
# skip_data=False: the X matrix is written out too; it goes to its own 'hsIPSC_X' directory
hsIPSC.write_csvs('CSV_191007/hsIPSC_X', skip_data=False)
print(hsIPSC)

# For annotations, use the 'full' version
# NOTE: `hsIPSC` is rebound here, so the object exported above is no longer referenced.
# NOTE(review): this assumes the filtered 'loaded' object and the 'full' object hold the same
# cells and genes in the same order -- the printed shapes agree, but ordering is not checked here.
hsIPSC = sc.read('../hsIPS/Diaz2019_hsiPS_full.h5ad')
# skip_data=True: only the annotations are written, X is skipped
hsIPSC.write_csvs('CSV_191007/hsIPSC', skip_data=True)
print(hsIPSC)

All elements are verified integers, converting to int array


writing '.csv' files to CSV_191007/hsIPSC_X


AnnData object with n_obs × n_vars = 14750 × 24338 
    obs: 'batch', 'cell_names', 'library_id', 'n_counts', 'unique_cell_id', 'time_id', 'n_genes'
    var: 'n_cells'


writing '.csv' files to CSV_191007/hsIPSC


AnnData object with n_obs × n_vars = 14750 × 24338 
    obs: 'batch', 'cell_names', 'library_id', 'n_counts', 'unique_cell_id', 'time_id', 'n_genes', 'n_counts_pre_norm', 'S_score', 'G2M_score', 'phase', 'leiden', 'louvain', 'pr_NearestNeighbors', 'pr_RandomForest', 'pr_NeuralNet', 'pr_LDA', '_', 'PAGAFlag', 'dpt_pseudotime'
    var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'n_cells'
    uns: 'PAGAFlag_colors', 'PAGAFlag_sizes', 'diffmap_evals', 'draw_graph', 'iroot', 'leiden', 'leiden_colors', 'library_id_colors', 'louvain', 'louvain_colors', 'neighbors', 'paga', 'pca', 'time_id_colors'
    obsm: 'X_pca', 'X_draw_graph_fa', 'proba_NearestNeighbors', 'proba_RandomForest', 'proba_NeuralNet', 'proba_LDA', 'X_diffmap'
    varm: 'PCs'
