# X-inactivation analysis

This notebook uses differential variance to detect X-inactivation.

In [1]:
import imp
import importlib
import itertools
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy.api as sc
import scipy as sp
import scipy.stats as stats
import seaborn as sns
from scipy.integrate import dblquad

  from ._conv import register_converters as _register_converters


In [2]:
# Add the local simplesc checkout to the module search path so that
# `import simplesc` below resolves to it.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# not run elsewhere without editing it.
import sys
sys.path.append('/Users/mincheolkim/Github/scrna-parameter-estimation/simplesc')
import simplesc

In [3]:
# Directory containing the input data; a later cell appends the file name to it.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
data_path = '/Users/mincheolkim/Google Drive/UCSF/research/parameter_estimation/x_inactivation_data/'

### Read in the data file

In [4]:
# Read the AnnData file 'SLEcrossX_nonorm.h5ad' located under data_path.
h5ad_path = data_path + 'SLEcrossX_nonorm.h5ad'
adata = sc.read(h5ad_path)

In [26]:
# Number of cells per value of the 'ct_cov' column, shown as a plain dict.
dict(adata.obs['ct_cov'].value_counts())

{'CD4 T cells': 173441,
 'CD14+ Monocytes': 149311,
 'CD8 T cells': 73706,
 'B cells': 71755,
 'NK cells': 49650,
 'FCGR3A+ Monocytes': 27839,
 'Megakaryocytes': 14001,
 'Dendritic cells': 6750}

In [5]:
# Preview the first five rows of the per-observation annotation table.
adata.obs.iloc[:5]

Unnamed: 0_level_0,batch,batch_cov,ct_cov,disease_cov,ind_cov,pop_cov,well
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
AAACCTGAGCAATATG-1-0-0-0-0-0-0-0-0-0,0,lupus8.16,Megakaryocytes,sle,901560200_901560200,WHITE,YE_8-16-1
AAACCTGAGTGCAAGC-1-0-0-0-0-0-0-0-0-0,0,lupus8.16,CD8 T cells,sle,1597_1597,ASIAN,YE_8-16-1
AAACCTGCAAGGCTCC-1-0-0-0-0-0-0-0-0-0,0,lupus8.16,CD14+ Monocytes,sle,1775_1775,WHITE,YE_8-16-1
AAACCTGCACGACTCG-1-0-0-0-0-0-0-0-0-0,0,lupus8.16,NK cells,sle,1760_1760,WHITE,YE_8-16-1
AAACCTGCAGGAATCG-1-0-0-0-0-0-0-0-0-0,0,lupus8.16,CD14+ Monocytes,sle,900759200_900759200,WHITE,YE_8-16-1


In [21]:
# Number of cells per value of the 'ct_cov' column.
adata.obs['ct_cov'].value_counts()

CD4 T cells          173441
CD14+ Monocytes      149311
CD8 T cells           73706
B cells               71755
NK cells              49650
FCGR3A+ Monocytes     27839
Megakaryocytes        14001
Dendritic cells        6750
Name: ct_cov, dtype: int64

In [6]:
# Dimensions of the loaded data (rows of `adata.obs` x variables); in file order
# this is displayed before the filtering cells below.
adata.shape

(566453, 32738)

### Filter gene list

In [7]:
# Threshold passed as `min_cells` to scanpy's gene filter.
MIN_CELLS_PER_GENE = 5000

# The return value is not captured, so this relies on scanpy filtering `adata`
# in place (see the scanpy `filter_genes` documentation for `min_cells`/`inplace`).
sc.pp.filter_genes(adata, min_cells=MIN_CELLS_PER_GENE)

In [8]:
# Sum the matrix along axis 1 (one total per row) and flatten it to a 1-D array.
counts_per_row = adata.X.sum(axis=1).A1
adata.obs['n_counts'] = counts_per_row

# Keep only rows whose total count exceeds 200. Note that this rebinds `adata`
# to the filtered copy, so the unfiltered object is no longer reachable.
has_enough_counts = adata.obs['n_counts'] > 200
adata = adata[has_enough_counts, :].copy()

### Fit simplesc

In [19]:
# Re-import simplesc so that edits to the module on disk take effect without
# restarting the kernel. importlib.reload is the supported replacement for
# imp.reload (the imp module is deprecated and was removed in Python 3.12).
importlib.reload(simplesc)

<module 'simplesc' from '/Users/mincheolkim/Github/scrna-parameter-estimation/simplesc/simplesc.py'>

In [20]:
# Construct the simplesc estimator from the filtered `adata`.
# NOTE(review): group_label='ct_cov' presumably tells the estimator which obs
# column defines groups (later cells pass ct_cov values as `group`) — confirm.
# NOTE(review): the meaning of p=0.1 is defined inside simplesc — confirm and
# document it here.
estimator = simplesc.SingleCellEstimator(adata, p=0.1, group_label='ct_cov')

In [22]:
# Compute the observed statistics for two of the 'ct_cov' groups.
# NOTE(review): the saved output of this cell is a KeyboardInterrupt, so one or
# both of these calls may not have completed — re-run to completion before
# relying on the results.
estimator.compute_observed_statistics(group='CD4 T cells')
estimator.compute_observed_statistics(group='CD14+ Monocytes')

KeyboardInterrupt: 

In [63]:
# Distinct values of the 'ind_cov' column.
names = adata.obs.ind_cov.value_counts().index.tolist()

# Keep only labels that contain an underscore and are exactly 9 characters long
# (e.g. '1597_1597'), and store the integer that precedes the first underscore.
# Labels of any other length (e.g. '901560200_901560200') are skipped.
real_ids = [
    int(label.split('_')[0])
    for label in names
    if '_' in label and len(label) == 9
]

In [54]:
# Load the subject metadata table. The file has a .csv extension but is read
# with a tab separator.
metadata_path = '/Users/mincheolkim/Google Drive/UCSF/research/parameter_estimation/misc/cluestime1.csv'
metadata = pd.read_csv(metadata_path, sep='\t')

In [56]:
# Show the metadata rows whose subjectid equals 1760.
metadata[metadata['subjectid'] == 1760]

Unnamed: 0,subjectid,female,genderident,age,sledxyr,slesxdx,dxdrspecialty,dxdroth_spec,specialisttime,sledrnow,...,peskinoth,peskinothdesc,peedema,peedemadesc,pelowerextoth,pelowerextothdesc,penofindings,pecomments,pemuscoth,followupok


In [47]:
# Distinct subject IDs present in the metadata, in value_counts order.
subject_counts = metadata['subjectid'].value_counts()
all_people = subject_counts.index.tolist()

In [64]:
# Show the metadata rows whose subjectid is one of the IDs collected in real_ids.
metadata.loc[metadata['subjectid'].isin(real_ids)]

Unnamed: 0,subjectid,female,genderident,age,sledxyr,slesxdx,dxdrspecialty,dxdroth_spec,specialisttime,sledrnow,...,peskinoth,peskinothdesc,peedema,peedemadesc,pelowerextoth,pelowerextothdesc,penofindings,pecomments,pemuscoth,followupok
