# CellChat RAW data creation

### 1. Import Required Packages
### 2. Import Prior Clustered Data
### 3. Prep and Export Data


## <br> 1. Import Required Packages

In [1]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import anndata as ad
import seaborn as sns
import gseapy as gp


from scipy import sparse
import scipy.io as sio
from anndata import AnnData
from anndata.experimental.multi_files import AnnCollection

## <br> 2. Import Prior Clustered Data

In [2]:
# Load the previously clustered and annotated AnnData object from the
# integration/analysis folder (stage 01f, per the file name).
clustered_h5ad_path = '../01_Libraries_Integration_and_Analysis---scVI/Adata_Objects/01f_AllGenes_Clustered_And_Annotated_Cells.h5ad'
adata = ad.read_h5ad(clustered_h5ad_path)


In [3]:
# Print the summary of adata.raw to confirm a raw matrix is attached and to check its dimensions.
print(adata.raw)

Raw AnnData with n_obs × n_vars = 263070 × 22917


In [4]:
# Print the AnnData summary: dimensions plus the available obs / uns / obsm / layers / obsp keys.
print(adata)

AnnData object with n_obs × n_vars = 263070 × 22917
    obs: 'Dose', 'Time', 'OrigIdent', 'Celltype', 'Cell_Subtype', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'n_genes', 'Barcode', 'OrigIdent-Barcode', 'Seurat_Celltype', 'doublet_scores', 'predicted_doublets', 'doublet_info', 'leiden_scVI_res_0.05', 'leiden_scVI_res_0.1', 'leiden_scVI_res_0.25', 'leiden_scVI_res_0.5', 'leiden_scVI_res_0.75', 'leiden_scVI_res_1.50', 'OrigIdent_Celltype', 'Dose_Time'
    uns: 'Cell_Subtype_colors', 'Celltype_Wilcoxon', 'Celltype_colors', 'Dose_colors', 'OrigIdent_colors', 'Time_colors', 'doublet_info_colors', 'leiden', 'leiden_scVI_res_0.25_colors', 'leiden_scVI_res_0.5_colors', 'log1p', 'neighbors', 'umap'
    obsm: 'X_scVI', 'X_umap'
    layers: 'counts', 'log1p_counts'
    obsp: 'connectivities', 'distances'


## <br> 3. Prep and Export Data

In [5]:
# adata.X is stored as observations x variables; the export below needs
# variables x observations, so transpose it.
# No explicit .copy() is taken: the matrix is only read (transposed and written
# to disk), never modified, so duplicating it just costs memory for a
# 263070 x 22917 matrix.
# NOTE(review): the notebook title says "RAW", but this exports adata.X rather
# than adata.raw.X or adata.layers['counts'] -- confirm adata.X holds the values
# the downstream CellChat step expects.
matrix = adata.X
transposed_matrix = matrix.transpose()


In [6]:
# Save the transposed (variables x observations) matrix in Matrix Market (.mtx) format.
# Create the output folder first: mmwrite does not create missing directories,
# so on a fresh checkout the export would otherwise fail on the missing path.
os.makedirs('./Data', exist_ok=True)
sio.mmwrite('./Data/TC_Sparse_Matrix.mtx', transposed_matrix)

In [7]:
# Per-cell annotation columns exported alongside the expression matrix.
metadata_columns = [
    # identifiers and experimental design
    'OrigIdent-Barcode', 'OrigIdent_Celltype', 'OrigIdent', 'Dose', 'Time', 'Dose_Time',
    # cell-type annotation
    'Celltype', 'Cell_Subtype',
    # QC metrics
    'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt',
    'n_genes', 'Barcode',
    # doublet detection results
    'doublet_scores', 'predicted_doublets', 'doublet_info',
]
metadata = adata.obs.loc[:, metadata_columns]

# Write the table as tab-separated text; the obs index is dropped because the
# 'OrigIdent-Barcode' column already carries the cell identifier.
metadata.to_csv('./Data/metadata.tsv', sep='\t', index=False)

In [8]:
# One-column table of the variable (gene) names, in adata.var order.
features = adata.var.index.to_frame(index=False)

# Write one name per line: tab-separated, no header row and no index column.
features.to_csv('./Data/features.tsv', sep='\t', header=False, index=False)

In [9]:
# One-column table of the per-cell 'OrigIdent-Barcode' identifiers, in adata.obs order.
barcodes = adata.obs.loc[:, ['OrigIdent-Barcode']]

# Write one barcode per line: tab-separated, no header row and no index column.
barcodes.to_csv('./Data/barcodes.tsv', sep='\t', header=False, index=False)

In [10]:
# Display the exported identifier column for a quick visual check of its format.
adata.obs.loc[:, 'OrigIdent-Barcode']

0         L001-AAACCCAAGTACAGAT
1         L001-AAACCCAAGTCGAATA
2         L001-AAACCCACATCCGAGC
3         L001-AAACCCACATCCTTCG
4         L001-AAACCCAGTAGAGCTG
                  ...          
263069    L138-TTTGTTGCACATAACC
263070    L138-TTTGTTGCACATGAAA
263071    L138-TTTGTTGGTCTCGGAC
263072    L138-TTTGTTGTCGGTAGGA
263073    L138-TTTGTTGTCTCTAAGG
Name: OrigIdent-Barcode, Length: 263070, dtype: object

In [11]:
# Sanity check: every 'OrigIdent-Barcode' should identify exactly one cell.
# Tally how many times each identifier occurs...
barcode_column = adata.obs.loc[:, 'OrigIdent-Barcode']
duplicates_count = barcode_column.value_counts()

# ...and keep only identifiers seen more than once (an empty Series means all are unique).
duplicates_greater_than_1 = duplicates_count.loc[duplicates_count.gt(1)]

print(duplicates_greater_than_1)

Series([], Name: count, dtype: int64)
