In [None]:
# Name of the project folder; it is joined onto the Drive projects root to form PROJECT_HOME.
PROJECT_NAME = "reverse-gene-finder"

In [None]:
import os
import sys

# Root folder of this project under the Drive projects directory.
PROJECT_HOME = os.path.join("/content/drive/My Drive/Projects", PROJECT_NAME)

# Put the project root on the module search path. The membership guard keeps
# the cell idempotent: re-running it no longer stacks duplicate sys.path entries.
if PROJECT_HOME not in sys.path:
    sys.path.append(PROJECT_HOME)

In [None]:
# Google Drive storage setup
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import joblib
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold

In [None]:
# 5-fold stratified splitter used by the cell that writes the id splits.
# Only n_splits is set here; shuffle/random_state are left at scikit-learn's defaults.
skf = StratifiedKFold(n_splits=5)

In [None]:
# Diagnosis groups to keep; compared against values of the 'ADdiag3types' metadata column.
selected_labels = [
    "nonAD",
    "earlyAD",
]

# Values of 'seurat_clusters' to keep (set to None to disable this filter).
selected_cell_clusters = [
    10,
]

# Values of 'brainRegion' to keep (set to None to disable this filter).
selected_brain_regions = [
    "PFC",
]

In [None]:
# Input locations: both CSV files live in the project's "data" folder.
data_dir = os.path.join(PROJECT_HOME, "data")
metadata_path, gene_info_path = (
    os.path.join(data_dir, csv_name)
    for csv_name in ("metadata.csv", "gene_info.csv")
)

In [None]:
# Load the gene table and report how many rows it has.
gene_info = pd.read_csv(gene_info_path)
print("# of genes: %d" % len(gene_info))

# Load the metadata table and collect the distinct values of the columns
# used for selection.
metadata = pd.read_csv(metadata_path)
subject_ids = metadata['subject'].unique().tolist()
labels = metadata['ADdiag3types'].unique().tolist()
brain_regions = metadata['brainRegion'].unique().tolist()

# Map subject -> diagnosis label. Zipping the two columns yields the same
# mapping as a row-by-row iterrows() loop (later rows overwrite earlier ones
# for a repeated subject) without building a Series object for every row.
subject_label_dict = dict(zip(metadata['subject'], metadata['ADdiag3types']))

In [None]:
# Re-read both tables from disk and drop metadata rows that have no subject id.
metadata = pd.read_csv(metadata_path)
metadata = metadata[~metadata['subject'].isnull()]
gene_info = pd.read_csv(gene_info_path)
gene_ids = gene_info['gene_id'][~gene_info['gene_id'].isnull()].unique()
print("# of selected genes: %d" % len(gene_ids))

# subject -> diagnosis label, and metadata row index label -> diagnosis label.
# Column zips replace the per-row iterrows() loop; for a repeated subject the
# last row still wins, exactly as with sequential dict assignment.
subject_label_dict = dict(zip(metadata['subject'], metadata['ADdiag3types']))
sample_label_dict = dict(zip(metadata.index, metadata['ADdiag3types']))

subject_ids = metadata['subject'].unique().tolist()

# Collect the subjects whose diagnosis is one of the selected labels,
# reporting the size of each group.
selected_subject_ids = []
for selected_label in selected_labels:
    selected_group_ids = [subject_id for subject_id in subject_ids if subject_label_dict[subject_id] == selected_label]
    print("# of %s subjects: %d" % (selected_label, len(selected_group_ids)))
    selected_subject_ids += selected_group_ids

# Set copy for O(1) membership tests in the row loop below (the list is kept
# unchanged for any later cell that uses it).
selected_subject_id_set = set(selected_subject_ids)

# Row-level filter thresholds on the 'nFeature_RNA' and 'percent.mt' columns.
MIN_N_FEATURE_RNA = 200
MAX_N_FEATURE_RNA = 2500
MAX_PERCENT_MT = 5

# Keep the index label of every metadata row that passes all filters.
# Only the columns that are actually inspected are iterated.
selected_sample_ids = []
for idx, subject_id, raw_cluster, brain_region, n_feature_RNA, percent_mt in zip(
        metadata.index,
        metadata['subject'],
        metadata['seurat_clusters'],
        metadata['brainRegion'],
        metadata['nFeature_RNA'],
        metadata['percent.mt']):
    cluster = int(raw_cluster)
    if selected_cell_clusters is not None and cluster not in selected_cell_clusters:
        continue
    if selected_brain_regions is not None and brain_region not in selected_brain_regions:
        continue
    # Written as two "reject" comparisons (not a range check) so that a NaN
    # value is not rejected, matching the original filter.
    if n_feature_RNA < MIN_N_FEATURE_RNA or n_feature_RNA > MAX_N_FEATURE_RNA:
        continue
    if percent_mt > MAX_PERCENT_MT:
        continue
    if subject_id in selected_subject_id_set:
        selected_sample_ids.append(idx)

# of selected genes: 15549
# of nonAD subjects: 219
# of earlyAD subjects: 132


In [None]:
# Folder that receives one joblib file per fold.
split_dir = os.path.join(PROJECT_HOME, "data", "id_splits")
os.makedirs(split_dir, exist_ok=True)

# Diagnosis label of every selected sample, aligned with selected_sample_ids.
selected_sample_labels = [sample_label_dict[sample_id] for sample_id in selected_sample_ids]

for i, (train_index, test_index) in enumerate(skf.split(selected_sample_ids, selected_sample_labels)):
    print(f"Fold {i}:")

    # Translate the positional indices returned by the splitter into sample ids.
    train_sample_ids = [selected_sample_ids[pos] for pos in train_index]
    testing_sample_ids = [selected_sample_ids[pos] for pos in test_index]

    # Carve a stratified 20% validation set out of the fold's training portion.
    training_sample_ids, validation_sample_ids = train_test_split(
        train_sample_ids,
        test_size=0.2,
        random_state=42,
        stratify=[selected_sample_labels[pos] for pos in train_index],
    )

    print("# of training samples: %d" % len(training_sample_ids))
    print("# of validation samples: %d" % len(validation_sample_ids))
    print("# of testing samples: %d" % len(testing_sample_ids))

    # Persist the (training, validation, testing) id lists for this fold.
    fold_ids = (training_sample_ids, validation_sample_ids, testing_sample_ids)
    joblib.dump(fold_ids, os.path.join(split_dir, "split_%d.joblib" % i))

Fold 0:
# of training samples: 489
# of validation samples: 123
# of testing samples: 153
Fold 1:
# of training samples: 489
# of validation samples: 123
# of testing samples: 153
Fold 2:
# of training samples: 489
# of validation samples: 123
# of testing samples: 153
Fold 3:
# of training samples: 489
# of validation samples: 123
# of testing samples: 153
Fold 4:
# of training samples: 489
# of validation samples: 123
# of testing samples: 153
