# Cross-Modality Prediction with Linear Regression

In [3]:
import numpy as np
import scanpy as sc
from sklearn.linear_model import LinearRegression, Ridge
from scipy.stats import pearsonr
from sklearn.metrics import mean_squared_error
import pandas as pd
import os
from self_supervision.paths import MULTIMODAL_FOLDER

### Load and Prepare Data

In [27]:
# Alternative input (currently disabled):
# adata = sc.read_h5ad(os.path.join(MULTIMODAL_FOLDER, "NeurIPS_multi_filtered_hvg_adata.h5ad"))

# Read the AnnData file from the multimodal data folder, then display its summary.
adata_path = os.path.join(MULTIMODAL_FOLDER, "NeurIPS_tfidf_filtered_hvg_adata.h5ad")
adata = sc.read_h5ad(adata_path)

adata

AnnData object with n_obs × n_vars = 69249 × 2000
    obs: 'GEX_pct_counts_mt', 'GEX_n_counts', 'GEX_n_genes', 'GEX_size_factors', 'GEX_phase', 'ATAC_nCount_peaks', 'ATAC_atac_fragments', 'ATAC_reads_in_peaks_frac', 'ATAC_blacklist_fraction', 'ATAC_nucleosome_signal', 'cell_type', 'batch', 'ATAC_pseudotime_order', 'GEX_pseudotime_order', 'Samplename', 'Site', 'DonorNumber', 'Modality', 'VendorLot', 'DonorID', 'DonorAge', 'DonorBMI', 'DonorBloodType', 'DonorRace', 'Ethnicity', 'DonorGender', 'QCMeds', 'DonorSmoker', 'split'
    var: 'feature_types', 'gene_id', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'
    obsm: 'atac'

In [28]:
# Inspect the dimensions of the matrix stored under the 'atac' key of adata.obsm.
adata.obsm['atac'].shape

(69249, 116490)

In [29]:
# Partition the cells using the 'split' column of adata.obs.
split_labels = adata.obs['split']
train_adata = adata[split_labels == 'train']
test_adata = adata[split_labels == 'test']

In [30]:
# Inputs (X) are densified into plain ndarrays; targets are taken from obsm['atac'] as stored.
train_mRNA = train_adata.X.toarray()
test_mRNA = test_adata.X.toarray()

train_atac = train_adata.obsm['atac']
test_atac = test_adata.obsm['atac']

In [31]:
# Ensure the data is in dense format
def _as_dense(matrix):
    """Return ``matrix.toarray()`` if that method exists, otherwise ``matrix`` unchanged."""
    if hasattr(matrix, 'toarray'):
        return matrix.toarray()
    return matrix


train_mRNA = _as_dense(train_mRNA)
train_atac = _as_dense(train_atac)
test_mRNA = _as_dense(test_mRNA)
test_atac = _as_dense(test_atac)

### Fit Linear Model (with and without regularization)

In [32]:
# Check the dimensions of the training targets before fitting.
train_atac.shape

(55398, 116490)

In [33]:
# Unregularised least-squares baseline: train_mRNA as inputs, train_atac as (multi-output) targets.
linear = LinearRegression()
linear.fit(X=train_mRNA, y=train_atac)

In [34]:
# Ridge regression with scikit-learn's default regularisation strength.
# copy_X is deliberately left at its default (True): with copy_X=False scikit-learn
# is allowed to overwrite the input matrix in place during fitting, and train_mRNA
# is used again later in this notebook to compute the train-set predictions.
ridge = Ridge()
ridge.fit(train_mRNA, train_atac)

### Evaluation Metric from NeurIPS Challenge

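Essentially a Pearson correlation: the coefficient is computed for each sample (row) between its true and predicted values, and the score is the average over all samples.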

In [35]:
def correlation_score(y_true, y_pred):
    """Score predictions by the mean per-sample Pearson correlation.

    For every row (sample) the Pearson correlation coefficient between the true
    and the predicted values is computed; the score is the average over rows.

    It is assumed that no row is constant: a constant row has zero variance, so
    its correlation is NaN and the returned score becomes NaN as well.

    Parameters
    ----------
    y_true, y_pred : numpy.ndarray or pandas.DataFrame
        Two-dimensional (samples x features) values of identical shape.
        DataFrames (including subclasses) are converted to their underlying
        arrays, so rows are always addressed by position.

    Returns
    -------
    float
        Average of the row-wise Pearson correlation coefficients.

    Raises
    ------
    ValueError
        If the shapes differ or if there are no samples to score.
    """
    # isinstance (instead of an exact type comparison) also unwraps DataFrame
    # subclasses; left as frames, `y_true[i]` would select a *column* by label.
    if isinstance(y_true, pd.DataFrame): y_true = y_true.values
    if isinstance(y_pred, pd.DataFrame): y_pred = y_pred.values
    if y_true.shape != y_pred.shape: raise ValueError("Shapes are different.")
    n_samples = len(y_true)
    if n_samples == 0:
        # Fail with a clear message rather than a ZeroDivisionError below.
        raise ValueError("Cannot compute a correlation score for zero samples.")
    # Row-by-row on purpose: avoids allocating full-size temporaries for large inputs.
    corrsum = 0
    for i in range(n_samples):
        corrsum += np.corrcoef(y_true[i], y_pred[i])[1, 0]
    return corrsum / n_samples

### Evaluation

In [36]:
# Predictions of the unregularised linear model on both splits.
linear_train_predictions = linear.predict(X=train_mRNA)
linear_test_predictions = linear.predict(X=test_mRNA)

# Predictions of the ridge model on both splits.
ridge_train_predictions = ridge.predict(X=train_mRNA)
ridge_test_predictions = ridge.predict(X=test_mRNA)

In [37]:
# Score the unregularised linear model on the train and test splits.

# Mean squared error (train, then test)
train_mse = mean_squared_error(y_true=train_atac, y_pred=linear_train_predictions)
test_mse = mean_squared_error(y_true=test_atac, y_pred=linear_test_predictions)

# Mean per-sample Pearson correlation (train, then test)
train_corr = correlation_score(y_true=train_atac, y_pred=linear_train_predictions)
test_corr = correlation_score(y_true=test_atac, y_pred=linear_test_predictions)

# Report both metrics, train split first.
print("Linear Train MSE:", train_mse)
print("Linear Train Correlation:", train_corr)
print("Linear Test MSE:", test_mse)
print("Linear Test Correlation:", test_corr)

Linear Train MSE: 0.395473450898623
Linear Train Correlation: 0.27767407257137994
Linear Test MSE: 0.4238630451872869
Linear Test Correlation: 0.16219452299615636


In [38]:
# Score the ridge model on the train and test splits.

# Mean squared error (train, then test)
train_mse = mean_squared_error(y_true=train_atac, y_pred=ridge_train_predictions)
test_mse = mean_squared_error(y_true=test_atac, y_pred=ridge_test_predictions)

# Mean per-sample Pearson correlation (train, then test)
train_corr = correlation_score(y_true=train_atac, y_pred=ridge_train_predictions)
test_corr = correlation_score(y_true=test_atac, y_pred=ridge_test_predictions)

# Report both metrics, train split first.
print("Ridge Train MSE:", train_mse)
print("Ridge Train Correlation:", train_corr)
print("Ridge Test MSE:", test_mse)
print("Ridge Test Correlation:", test_corr)

Ridge Train MSE: 0.39547340652406965
Ridge Train Correlation: 0.2776760653550196
Ridge Test MSE: 0.4238550043487647
Ridge Test Correlation: 0.16221148659388504
