In [None]:
import numpy as np
import pandas as pd
import sklearn.metrics
import sklearn.linear_model
import sklearn.decomposition

import os
import tqdm
import matplotlib.pyplot as plt

from src import Selector

# Chromosome labels handled by this notebook: "1" .. "22" followed by "X".
chromosome_list = list(map(str, range(1, 23)))
chromosome_list.append("X")

# Per-chromosome upper bound for hash_idx, listed in chromosome_list order.
# The one-hot masks built later are allocated with hash_idx_max[c] * 1000 entries.
hash_idx_max = dict(zip(
    chromosome_list,
    (
        249000, 243000, 199000, 191000,   # 1-4
        182000, 171000, 160000, 146000,   # 5-8
        139000, 134000, 136000, 134000,   # 9-12
        115000, 108000, 102000,  91000,   # 13-16
         84000,  81000,  59000,  65000,   # 17-20
         47000,  51000, 157000,           # 21, 22, X
    ),
))

# Profile

In [None]:
# Export every selected feature bucket as a (chromosome, start, end) row.
# getFeature() yields { chromosome : (hash_idx, bucket_idx, distance) }.
feature = Selector("data/feature").getFeature()
# Drop the distance field and cast the remaining pair to int:
# { chromosome : (hash_idx, bucket_idx) }
feature = {
    chrom: np.array([entry[:2] for entry in feature[chrom]], dtype=int)
    for chrom in feature
}
# start = hash_idx*1000 + bucket_idx*100 and end = start + 100, emitted in
# chromosome_list order.
result = pd.DataFrame(
    [
        (
            chrom,
            hash_idx * 1000 + bucket_idx * 100,
            hash_idx * 1000 + (bucket_idx + 1) * 100,
        )
        for chrom in chromosome_list
        for hash_idx, bucket_idx in feature[chrom]
    ],
    columns=["chromosome", "start", "end"],
)
result.to_csv("data/temp/feature.csv", index=False)

In [None]:
## bed
# Read the BED intervals.
# NOTE(review): header=0 combined with names=... makes pandas treat the first
# line of the file as a header row and replace it, i.e. that line is discarded.
# Confirm the file really starts with a header line; otherwise the first
# interval is silently dropped.
bed = pd.read_csv(
    "data/temp/rgc_gxs_v1_hg38.bed",
    sep="\t", header=0, names=["chromosome", "start", "end"]
)
# remove chr prefix (drops the first three characters of every label)
bed["chromosome"] = bed["chromosome"].apply(lambda x: x[3:])
# keep chromosome 1-22 and X
bed = bed[bed["chromosome"].isin(chromosome_list)]
# transform bed from range list to one hot (one boolean per position)
bed_one_hot = {
    c: np.zeros((hash_idx_max[c] * 1000), dtype=bool) for c in chromosome_list
}
# itertuples yields plain tuples and avoids building a Series for every row.
for chromosome, start, end in bed[["chromosome", "start", "end"]].itertuples(
    index=False, name=None
):
    bed_one_hot[chromosome][start:end] = True

## feature
# { chromosome : (hash_idx, bucket_idx, distance) }
feature = Selector("data/feature").getFeature()
# drop the distance field and cast the remaining pair from float to int
# { chromosome : (hash_idx, bucket_idx) }
feature = {c: np.array([f[:2] for f in feature[c]], dtype=int) for c in feature}
# transform feature from bucket list to one hot (one boolean per position)
feature_one_hot = {
    c: np.zeros((hash_idx_max[c] * 1000), dtype=bool) for c in chromosome_list
}
for c in feature:
    for hash_idx, bucket_idx in feature[c]:
        # each bucket marks the 100 positions starting at hash_idx*1000 + bucket_idx*100
        feature_one_hot[c][
            hash_idx*1000 + bucket_idx*100 : hash_idx*1000 + (bucket_idx+1)*100
        ] = True

# calculate overlap
for c in bed_one_hot:
    overlap = np.logical_and(bed_one_hot[c], feature_one_hot[c])
    # Count each mask once; the arrays hold hash_idx_max[c] * 1000 entries, so
    # repeating the reduction for every printed field is wasteful.
    n_overlap = np.count_nonzero(overlap)
    n_bed = np.count_nonzero(bed_one_hot[c])
    n_feature = np.count_nonzero(feature_one_hot[c])
    # An empty mask would be 0/0; report NaN explicitly instead of triggering
    # a NumPy RuntimeWarning.
    overlap_bed = (n_overlap / n_bed) * 100 if n_bed else float("nan")
    overlap_feature = (n_overlap / n_feature) * 100 if n_feature else float("nan")
    print(
        f"Chromosome {c}:\t",
        f"overlap/bed: {n_overlap:7,} / {n_bed:9,} = {overlap_bed:5.2f}%\t",
        f"overlap/feature: {n_overlap:7,} / {n_feature:9,} = {overlap_feature:5.2f}%"
    )

In [None]:
p_val = 1e-3

snps = pd.read_csv("data/snps.csv", usecols=["Chr", "Pos", "Pval"])
# keep only SNPs whose Pval is below the threshold
snps = snps[snps["Pval"] < p_val][["Chr", "Pos"]]
# sort by Chr and then Pos
snps = snps.sort_values(["Chr", "Pos"])
# transform chromosome from int to str: values below 23 keep their number,
# everything else becomes "X"
snps["Chr"] = snps["Chr"].apply(lambda x: str(x) if x < 23 else "X")
# split by chromosome, { chromosome : [pos, ...] }
snps = snps.groupby("Chr")["Pos"].apply(list).to_dict()

# NOTE(review): this cell relies on bed_one_hot / feature_one_hot built by the
# overlap cell; it fails on a fresh kernel unless that cell has been run.
# NOTE(review): Pos is used directly as an index into the masks. If snps.csv
# stores 1-based positions while the masks are 0-based, every lookup is off by
# one - confirm the coordinate convention.
for c, pos in snps.items():
    # Look up all positions of the chromosome in one indexing operation
    # instead of one Python-level lookup per SNP; the counts are identical.
    positions = np.asarray(pos)
    snps_in_bed = np.count_nonzero(bed_one_hot[c][positions])
    snps_in_feature = np.count_nonzero(feature_one_hot[c][positions])
    # The denominator printed on both sides is the number of SNPs on the chromosome.
    print(
        f"Chromosome {c}:\t",
        f"snps_in_bed/bed: {snps_in_bed:6,} / {len(pos):6,} = {(snps_in_bed / len(pos)) * 100:5.2f}%\t",
        f"snps_in_feature/feature: {snps_in_feature:6,} / {len(pos):6,} = {(snps_in_feature / len(pos)) * 100:5.2f}%"
    )

In [None]:
p_val = 1e-2

snps = pd.read_csv("data/snps.csv", usecols=["Chr", "Pos", "Pval"])
# keep only SNPs whose Pval is below the threshold
snps = snps[snps["Pval"] < p_val][["Chr", "Pos"]]
# sort by Chr and then Pos
snps = snps.sort_values(["Chr", "Pos"])
# transform chromosome from int to str: values below 23 keep their number,
# everything else becomes "X"
snps["Chr"] = snps["Chr"].apply(lambda x: str(x) if x < 23 else "X")
# split by chromosome, { chromosome : [pos, ...] }
snps = snps.groupby("Chr")["Pos"].apply(list).to_dict()

# NOTE(review): this cell relies on bed_one_hot / feature_one_hot built by the
# overlap cell; it fails on a fresh kernel unless that cell has been run.
# NOTE(review): Pos is used directly as an index into the masks. If snps.csv
# stores 1-based positions while the masks are 0-based, every lookup is off by
# one - confirm the coordinate convention.
for c, pos in snps.items():
    # Look up all positions of the chromosome in one indexing operation
    # instead of one Python-level lookup per SNP; the counts are identical.
    positions = np.asarray(pos)
    snps_in_bed = np.count_nonzero(bed_one_hot[c][positions])
    snps_in_feature = np.count_nonzero(feature_one_hot[c][positions])
    # The denominator printed on both sides is the number of SNPs on the chromosome.
    print(
        f"Chromosome {c}:\t",
        f"snps_in_bed/bed: {snps_in_bed:7,} / {len(pos):7,} = {(snps_in_bed / len(pos)) * 100:5.2f}%\t",
        f"snps_in_feature/feature: {snps_in_feature:7,} / {len(pos):7,} = {(snps_in_feature / len(pos)) * 100:5.2f}%"
    )

# Visualization

In [25]:
import numpy as np
import pandas as pd

profile = pd.read_csv("data/profile.csv")
# Load the array stored at every non-missing repre_path (in row order) and
# stack the loaded arrays vertically into a single matrix.
repre = np.vstack([np.load(path) for path in profile["repre_path"].dropna()])
# fit
# random_state is pinned so the components are reproducible: with the default
# svd_solver="auto", scikit-learn may choose the randomized solver for large
# inputs, and that solver is otherwise seeded differently on every run.
pca = sklearn.decomposition.PCA(n_components=8, random_state=0).fit(repre)
# NOTE(review): only the fit happens in this cell; no transform is applied here.

(960, 131042)


In [None]:
# Column indices (0-based) of the two components to plot against each other.
pca1 = 1
pca2 = 3

# NOTE(review): tcgaskcm_repre_train / tcgaskcm_repre_valid are not defined in
# the visible cells; this cell fails on a fresh kernel unless they are created
# elsewhere first.
# Fit a 10-component PCA on the training representations. random_state is
# pinned because svd_solver="auto" may pick the randomized solver for large
# inputs, which would otherwise give slightly different components per run.
pca = sklearn.decomposition.PCA(n_components=10, random_state=0).fit(tcgaskcm_repre_train)
print(pca.explained_variance_ratio_)

# Project both splits with the PCA fitted on the training split.
tcgaskcm_pca_train = pca.transform(tcgaskcm_repre_train)
tcgaskcm_pca_valid = pca.transform(tcgaskcm_repre_valid)

# One scatter plot per split of component pca1 against component pca2.
# The axis labels print the 0-based indices, so "PCA 1" is the second component.
for split_name, projected in (
    ("train", tcgaskcm_pca_train),
    ("valid", tcgaskcm_pca_valid),
):
    plt.scatter(projected[:, pca1], projected[:, pca2], s=1)
    plt.xlabel(f"PCA {pca1}")
    plt.ylabel(f"PCA {pca2}")
    # Title added so the two otherwise identical-looking figures can be told apart.
    plt.title(split_name)
    plt.show()