In [1]:
import os
import lightgbm as lgb

import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import seaborn as sns
import numpy as np
from colorama import Fore, Back, Style
from matplotlib.ticker import MaxNLocator
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler, scale
from sklearn.decomposition import PCA,TruncatedSVD
from sklearn.dummy import DummyRegressor
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.multioutput import MultiOutputRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_squared_error
import os, gc, pickle

In [2]:
# Root directory of the competition data. It can be overridden without editing
# the notebook by setting the OPEN_PROBLEMS_DATA_DIR environment variable; the
# original hard-coded location is kept as the default so existing runs still work.
DATA_DIR = os.environ.get(
    "OPEN_PROBLEMS_DATA_DIR",
    "/Users/xzeng/Desktop/kaggle/open-problems-multimodal",
)

# Per-cell metadata table.
FP_CELL_METADATA = os.path.join(DATA_DIR,"metadata.csv")

# Multiome inputs / targets (HDF5).
FP_MULTIOME_TRAIN_INPUTS = os.path.join(DATA_DIR,"train_multi_inputs.h5")
FP_MULTIOME_TRAIN_TARGETS = os.path.join(DATA_DIR,"train_multi_targets.h5")
FP_MULTIOME_TEST_INPUTS = os.path.join(DATA_DIR,"test_multi_inputs.h5")

# Submission template and evaluation id table.
FP_SUBMISSION = os.path.join(DATA_DIR,"sample_submission.csv")
FP_EVALUATION_IDS = os.path.join(DATA_DIR,"evaluation_ids.csv")

In [100]:
def correlation_score(y_true, y_pred):
    """Score predictions with the Pearson correlation coefficient.

    It is assumed that neither the targets nor the predictions are constant
    (a constant vector has zero variance and yields NaN).

    Parameters
    ----------
    y_true, y_pred : array-like (ndarray, pandas Series/DataFrame, list)
        Either both 1-D (a single vector of values) or both 2-D with one
        sample per row. The two inputs must have the same shape.

    Returns
    -------
    numpy.float64
        For 1-D inputs, the Pearson correlation between ``y_true`` and
        ``y_pred``. For 2-D inputs, the average of each row's (sample's)
        Pearson correlation coefficient.

    Raises
    ------
    ValueError
        If the shapes differ or the inputs are not 1-D or 2-D.
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape: raise ValueError("Shapes are different.")

    if y_true.ndim == 1:
        return np.corrcoef(y_true, y_pred)[1,0]
    if y_true.ndim != 2:
        raise ValueError("Expected 1-D or 2-D inputs.")

    # np.corrcoef(y_true, y_pred) on 2-D inputs stacks all rows of both
    # matrices as variables, so element [1, 0] would be the correlation between
    # the first two rows of y_true. Compute the per-row correlation explicitly.
    true_centered = y_true - y_true.mean(axis=1, keepdims=True, dtype=np.float64)
    pred_centered = y_pred - y_pred.mean(axis=1, keepdims=True, dtype=np.float64)
    covariance = (true_centered * pred_centered).sum(axis=1)
    scale_factor = np.sqrt(
        (true_centered ** 2).sum(axis=1) * (pred_centered ** 2).sum(axis=1)
    )
    return np.mean(covariance / scale_factor)

In [5]:
# Read the per-cell metadata table and keep only the rows whose `technology`
# column equals "multiome"; the trailing expression displays the subset's shape.
df_cell = pd.read_csv(FP_CELL_METADATA)
is_multiome = df_cell["technology"] == "multiome"
df_cell_multi = df_cell.loc[is_multiome]
df_cell_multi.shape

(161877, 5)

In [None]:
def selectHighlyVariableGenes(multi_train_y):
    """Return the names of the genes scanpy flags as highly variable.

    ``multi_train_y`` is wrapped in an ``AnnData`` object and passed to
    ``sc.pp.highly_variable_genes`` with its default settings; the entries of
    ``adata.var`` whose ``highly_variable`` flag is true are returned.

    Parameters
    ----------
    multi_train_y
        Expression matrix accepted by ``sc.AnnData``
        (presumably cells x genes -- TODO confirm against the caller).

    Returns
    -------
    list
        Index labels of ``adata.var`` for the flagged genes.
    """
    adata = sc.AnnData(multi_train_y)
    sc.pp.highly_variable_genes(adata)

    gene_table = adata.var
    flagged = gene_table["highly_variable"].eq(True).to_numpy()
    return gene_table.index[flagged].to_list()

def selectExpressionMatrixByGeneName(multi_train_y,genes):
    """Select the expression columns of ``multi_train_y`` for the given genes.

    Parameters
    ----------
    multi_train_y : pandas.DataFrame
        Expression matrix whose columns are gene names.
    genes : str or list of str
        A single gene name (returns a Series) or a list of gene names
        (returns a DataFrame with the columns in the requested order).

    Returns
    -------
    pandas.Series or pandas.DataFrame
        The selected column(s).

    Raises
    ------
    KeyError
        If a requested gene is not a column of ``multi_train_y``.
    """
    # Use the `genes` argument (not the notebook-global `hvg_test`) and
    # actually return the selection to the caller.
    return multi_train_y[genes]

In [61]:
# Work on the first N_TRAIN_ROWS rows only, to keep memory and run time small.
N_TRAIN_ROWS = 15000

multi_train_x = pd.read_hdf(FP_MULTIOME_TRAIN_INPUTS,start=0, stop=N_TRAIN_ROWS)

# Later cells read `multi_train_y`, which no visible cell loaded. Read the same
# row range from the targets file so inputs and targets line up by position.
multi_train_y = pd.read_hdf(FP_MULTIOME_TRAIN_TARGETS,start=0, stop=N_TRAIN_ROWS)
assert len(multi_train_x) == len(multi_train_y), "inputs/targets row count mismatch"

In [58]:
# `hvg` was not defined by any cell of the notebook (it only existed as
# leftover kernel state); derive it explicitly so a fresh kernel can run this.
hvg = selectHighlyVariableGenes(multi_train_y)

# Small subset of highly variable genes used for quick experiments.
hvg_test = hvg[0:10]

In [59]:
# Tab-separated gene/peak table; later cells read its `gene_name` and
# `peak_name` columns (presumably peaks within 200 kb of each gene, judging
# by the file name -- TODO confirm).
gene_peak_dist = pd.read_table("../results/gene_peak_dist_within200K.txt")

In [64]:
# Restrict the input matrix to the peaks listed for the first test gene.
target_gene = hvg_test[0]
gene_rows = gene_peak_dist["gene_name"] == target_gene
peak_selected = gene_peak_dist.loc[gene_rows, "peak_name"].tolist()

# Plain ndarray (rows of multi_train_x x selected peaks) for scikit-learn.
multi_train_x_select = multi_train_x.loc[:, peak_selected].values

In [66]:
# Sanity check: rows come from multi_train_x, columns are the selected peaks.
multi_train_x_select.shape

(15000, 35)

In [94]:
# Target vector: the column of multi_train_y for the first test gene
# (a 1-D pandas Series); the trailing expression displays its shape.
multi_train_y_select = multi_train_y.loc[:, hvg_test[0]]
multi_train_y_select.shape

(15000,)

In [104]:
%%time
# Cross-validation

kf = KFold(n_splits=10, shuffle=True, random_state=1)
score_list = []
for fold, (idx_tr, idx_va) in enumerate(kf.split(multi_train_x_select)):
    model = None
    gc.collect()
    X_tr = multi_train_x_select[idx_tr] # creates a copy, https://numpy.org/doc/stable/user/basics.copies.html
    y_tr = multi_train_y_select[idx_tr]
    del idx_tr

    model = Ridge(copy_X=False,max_iter=15000)
    model.fit(X_tr, y_tr)
    del X_tr, y_tr
    gc.collect()

    # We validate the model
    X_va = multi_train_x_select[idx_va]
    y_va = multi_train_y_select[idx_va]
    del idx_va
    y_va_pred = model.predict(X_va)
    mse = mean_squared_error(y_va, y_va_pred)
    
    corrscore = correlation_score(y_va, y_va_pred)
    del X_va, y_va

    print(f"Fold {fold}: mse = {mse:.5f}, corr =  {corrscore:.3f}")
    score_list.append((mse, corrscore))

# Show overall score
result_df = pd.DataFrame(score_list, columns=['mse', 'corrscore'])
print(f"{Fore.GREEN}{Style.BRIGHT}{multi_train_x_select.shape} Average  mse = {result_df.mse.mean():.5f}; corr = {result_df.corrscore.mean():.3f}{Style.RESET_ALL}")

Fold 0: mse = 0.67023, corr =  -0.052
Fold 1: mse = 0.70060, corr =  -0.008
Fold 2: mse = 0.88981, corr =  0.004
Fold 3: mse = 0.77580, corr =  -0.023
Fold 4: mse = 0.99078, corr =  -0.020
Fold 5: mse = 0.71676, corr =  0.000
Fold 6: mse = 0.74280, corr =  -0.024
Fold 7: mse = 0.74884, corr =  0.006
Fold 8: mse = 0.78747, corr =  0.018
Fold 9: mse = 0.78049, corr =  0.015
[32m[1m(15000, 35) Average  mse = 0.78036; corr = -0.008[0m
CPU times: user 59.9 s, sys: 1.1 s, total: 1min 1s
Wall time: 5.96 s
