# PSN

In [250]:
import os
import anndata as ad
import scipy as sp

In [252]:
# Root data directory: the "data" folder that sits beside the current working directory.
data_path = os.path.abspath(os.path.join(os.getcwd(), "../data"))

# Sub-directories used by the notebook, all located under `data_path`:
#   pseudobulk    - pseudobulk workbooks that are read as input
#   corr_patients - output: correlation matrices labelled by patient
#   corr_diseases - output: correlation matrices labelled by disease
#   original      - the original (non-aggregated) datasets
pseudobulk, corr_patients, corr_diseases, original = (
    os.path.join(data_path, subdir)
    for subdir in ("pb", "corr/patients", "corr/diseases", "original")
)

## Similarity Measures

In [None]:
"""
Measure of similarity between two non-zero vectors defined in an inner product space. Cosine similarity is the cosine of the angle between the vectors.
It is the dot product of the vectors divided by the product of their lengths. 

Cosine similarity is defined as:
1 - cosine distance

input: patient1, patient2 : 1D arrays
output: cosine similarity: double value in range [-1,1]
"""
def cosine_sim(patient1, patient2):
    """Return the cosine similarity between two non-zero 1D vectors.

    input: patient1, patient2 : 1D arrays of the same length
    output: cosine similarity: float in range [-1, 1]
        (1 = same direction, 0 = orthogonal, -1 = opposite direction)
    """
    # Local import so the submodule is loaded regardless of how scipy was imported.
    from scipy.spatial.distance import cosine as cosine_distance

    # SciPy returns the cosine *distance* (1 - similarity); convert it back
    # so the function actually yields the similarity its name promises.
    return 1.0 - cosine_distance(patient1, patient2)

In [None]:
"""
Measures the linear relationship between two datasets and performs a test of the null hypothesis that the distributions underlying the samples are uncorrelated and normally distributed. 

input: patient1, patient2 : 1D arrays
output: the Pearson correlation coefficient, i.e. the `statistic` attribute of SciPy's PearsonRResult object
    statistic : float in range [-1,1] (returned)
    pvalue : float (computed by SciPy but not returned)
"""
def pearson_corr(patient1, patient2):
    """Return the Pearson correlation coefficient between two 1D vectors.

    input: patient1, patient2 : 1D arrays
    output: Pearson correlation coefficient: float in range [-1, 1]
        (only the statistic is returned; the p-value is discarded)
    """
    result = sp.stats.pearsonr(patient1, patient2)
    return result.statistic

In [None]:
# TODO: implement mean_of_normalized_diffs(patient1, patient2)


## Computing pairwise similarities
Every row is a patient 

### Correlation

In [261]:
# For every pseudobulk workbook: compute the patient-by-patient correlation
# matrix (pandas' default `DataFrame.corr`), save it labelled by patient, and,
# when disease labels can be looked up in the original data, save a second
# copy whose rows/columns are relabelled and sorted by disease.
for file in os.listdir(pseudobulk):
    filename = os.fsdecode(file)
    # Skip hidden files (e.g. macOS ".DS_Store") and Excel lock files ("~$...");
    # they are not readable workbooks and would make read_excel fail.
    if filename.startswith((".", "~$")):
        continue

    anndata_obj = ad.read_excel(os.path.join(pseudobulk, filename), "Sheet1", dtype=str)

    if filename.startswith(("scRNA", "adt")):
        # scRNA and ADT pseudobulks are handled identically; only the original
        # h5ad file that provides the disease labels differs.
        h5ad_name = "scRNA.h5ad" if filename.startswith("scRNA") else "adt.h5ad"
        corr = anndata_obj.to_df().set_index("scRNASeq_sample_ID").T.corr()  # Compute correlation from pseudobulks
        orig = ad.read_h5ad(os.path.join(original, h5ad_name))  # Read the original data
        orig.obs_names = orig.obs.scRNASeq_sample_ID
        diseases = orig.obs.DiseaseClassification  # Extract the diseases
        diseases = diseases[~diseases.index.duplicated(keep="first")]  # Keep one disease entry per sample ID
        diseases = corr.join(diseases, how="inner")["DiseaseClassification"]  # Map the correlations to diseases based on patients

    elif filename.startswith("cytof"):
        corr = anndata_obj.to_df().set_index("patient_id").T.corr()
        orig = ad.read_h5ad(os.path.join(original, "cytof_full.h5ad"))
        diseases = orig.uns["experiment_info"][["patient_id", "condition"]].set_index("patient_id")  # Extract the diseases
        diseases = corr.join(diseases, how="inner")["condition"]

    elif filename.startswith("luminex"):
        # The first five columns are excluded from the correlation
        # (presumably metadata rather than measurements - TODO confirm).
        corr = anndata_obj[:, 5:].to_df().T.corr()
        orig = ad.read_excel(os.path.join(original, "Oxford data_output_combined.xlsx"), "All data", dtype=str)
        diseases = orig.to_df()["severity"]  # Extract the diseases
        diseases = corr.join(diseases, how="inner")["severity"]

    elif filename.startswith("facs"):
        corr = anndata_obj.to_df().T.corr()
        orig = ad.read_h5ad(os.path.join(original, "facs_full.h5ad"))
        diseases = orig.uns["experiment_info"][["scRNASeq_sample_ID", "condition"]].set_index("scRNASeq_sample_ID")  # Extract the diseases
        diseases = corr.join(diseases, how="inner")["condition"]

    else:
        # Any other workbook (e.g. bulkRNA) has no disease information to attach.
        corr = anndata_obj.to_df().T.corr()
        diseases = None

    # Patient-labelled matrix is written first, before any relabelling below.
    corr.to_excel(os.path.join(corr_patients, filename))

    if diseases is not None:
        corr.rename(columns=diseases, index=diseases, inplace=True)  # Relabel rows and columns with the disease names
        corr.sort_index(inplace=True)  # Sort the row names
        corr.sort_index(inplace=True, axis=1)  # Sort the column names
        corr.to_excel(os.path.join(corr_diseases, filename))


  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")
AnnData expects .obs.index to contain strings, but got values like:
    ['S00109-Ja001E-PBCa', 'S00112-Ja003E-PBCa', 'G05153-Ja005E-PBCa', 'S00005-Ja005E-PBCa', 'S00061-Ja003E-PBCa']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")
  utils.warn_names_duplicates("obs")
AnnData expects .obs.index to contain strings, but got values like:
    ['S00109-Ja001E-PBCa', 'S00112-Ja003E-PBCa', 'G05153-Ja005E-PBCa', 'S00005-Ja005E-PBCa', 'S00061-Ja003E-PBCa']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")
  utils.warn_names_duplicates("obs")
AnnData expects .obs.index to contain strings, but got values like:
    ['S00109-Ja001E-PBCa', 'S00112-Ja003E-PBCa', 'G05153-Ja005E-PBCa', 'S00005-Ja005E-PBCa', 'S00061-Ja003E-PBCa']

    Inferred to be: categorical

  names = self._prep_dim_index(names, "obs")
  utils.warn_names_duplicates("obs")
AnnData expects .obs.index to

## Exploration