# Toy file generation for eQTL testing

In [1]:
import scanpy as sc
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats
import itertools
# from pybedtools import BedTool

import statsmodels.formula.api as smf
import statsmodels.api as sm
import imp

import os
import pickle as pkl
%matplotlib inline

  import imp


In [2]:
import sys
# Extend the module search path before importing memento.
# NOTE(review): this absolute path is machine-specific; presumably it is where the
# memento package is checked out - adjust it for your environment.
sys.path.append('/home/ubuntu/Github/scrna-parameter-estimation/')
import memento

In [3]:
# Root directory that every input path in this notebook is built from (note the trailing slash,
# which the string concatenations below rely on).
data_path  = '/data_volume/lupus/'

## Read the inputs: variables of interest (SNPs), covariates, SNP-gene pairs.

For both the SNP and covariate tables, each row is an individual and each column is a different variable of interest.

For the tutorial, we use the genotypes and covariates from the 2022 Perez, Gordon, Subramaniam et al. paper from the lab. These inputs are identical to Matrix eQTL inputs - I just transpose them here because I think it makes more sense for observations to be rows.

For the tutorial, we just set up some random SNP-gene pairs to test; however, you can flexibly design this mapping to fit your needs. I purposefully didn't encode all possible variations of how you can define gene-SNP relationships.

#### Make toy gene-SNP pairs

In [20]:
# You can define this mapping DataFrame however you want - for example, you can find
# gene-SNP pairs by pairing each gene with the SNPs within 100kb of it.
# Here, to keep the toy files small, we just randomly take 50 pairs.
# NOTE: `pop` and `snps` are assigned in the "Make toy genotypes and covariates" cells
# further down, so those cells must have been run before this one.
gene_snp_pairs = pd.read_csv(data_path + 'mateqtl_input/{}/gene_snp_pairs_hg19_100kb.csv'.format(pop))
gene_snp_pairs.columns = ['gene', 'SNP']
# Keep only pairs whose SNP has a genotype column, then draw a fixed-seed sample so that
# re-running the notebook regenerates the same toy files.
gene_snp_pairs = gene_snp_pairs.query('SNP in @snps.columns').sample(50, random_state=0)

In [22]:
# SNP identifiers appearing in the sampled pairs, in first-seen order.
# De-duplicated because the same SNP can be paired with more than one gene; repeating it
# here would produce repeated genotype columns when this list is used to index `snps`.
selected_snps = gene_snp_pairs.SNP.unique().tolist()

In [29]:
# Save the sampled gene-SNP pairs to the working directory as the toy mapping file.
gene_snp_pairs.to_csv('toy_gene_snp_pairs.csv')

### Make toy genotypes and covariates

In [33]:
# Population label; it is substituted into every population-specific file name in this notebook.
pop = 'eur'
# Locations of the genotype and covariate tables for that population.
snps_path = data_path + 'mateqtl_input/{}_genos.tsv'.format(pop)
cov_path = data_path + 'mateqtl_input/{}_mateqtl_cov.txt'.format(pop)

In [34]:
# Load the tab-separated genotype and covariate tables (first column becomes the index),
# then transpose each one so that individuals are rows and SNPs / covariates are columns.
snps = pd.read_csv(snps_path, sep='\t', index_col=0).transpose()
cov = pd.read_csv(cov_path, sep='\t', index_col=0).transpose()

In [35]:
# Write the toy genotype file: the columns for the selected SNPs, restricted to the
# first 10 individuals (rows) of the genotype table.
snps.loc[:, selected_snps].head(10).to_csv('toy_genotypes.csv')

In [36]:
# Write the toy covariate file: all covariates for the first 10 individuals (rows) of `cov`.
cov.head(10).to_csv('toy_covariates.csv')

### Read h5ad object

Standard h5ad file in the scanpy workflow. Some things to keep in mind:

- `adata.X` should be the raw counts with all genes detected. Typically, this will have shape N cells by ~30k genes in a standard 10X experiment.
- Here, we will just use the T4 cells defined by one of the AnnData.obs columns.


In [37]:
# Cell-type label; it is substituted into the h5ad file name when the AnnData object is read.
ct = 'T4'

In [40]:
# Load the AnnData object for the chosen population and cell type.
h5ad_file = data_path + 'single_cell/{}_{}.h5ad'.format(pop, ct)
adata = sc.read(h5ad_file)

# Keep only cells whose `ind_cov` value is among the first 10 individuals of the covariate
# table, i.e. the same individuals written to the toy covariate file; copy so the result
# is an independent object rather than a view.
toy_individuals = cov.head(10).index
in_toy_set = adata.obs.ind_cov.isin(toy_individuals)
adata = adata[in_toy_set].copy()




In [41]:
# Report how many cells remain after subsetting. The label is taken from `ct` (rather than
# being hard-coded) so the message stays correct if a different cell type is selected.
print('We have {} cells labeled as {}'.format(adata.shape[0], ct))

We have 16907 cells labeled as T4


In [43]:
# Save the subsetted AnnData object to the working directory as the toy single-cell file.
adata.write('toy_adata.h5ad')

In [22]:
# Sanity check: adata.X is expected to hold counts in a sparse matrix.
# Show the per-cell totals for the first five cells, then the matrix repr itself.
print('Confirming that adata.X is a sparse matrix of counts.')
print('Row sums are:')
per_cell_totals = adata.X.sum(axis=1)
print(per_cell_totals[:5])
print('')
print('The matrix itself:')
adata.X

Confirming that adata.X is a sparse matrix of counts.
Row sums are:
[[1905.]
 [2104.]
 [2102.]
 [1209.]
 [2030.]]

The matrix itself:


<129531x32738 sparse matrix of type '<class 'numpy.float32'>'
	with 83322139 stored elements in Compressed Sparse Row format>