In [1]:
import dask.array as da
import zarr
import pandas as pd
import numpy as np
import allel
import sgkit as sg
import pickle as pkl
import dask.dataframe as dd

In [3]:
#open and mark as TRUE / accessible if fewer than if at most 0.2% of inds have low pp
with open('data/pp_summary_stats.pkl', 'rb') as f:
    pp_bool = pkl.load(f) <= 0.998

    #open and mark as TRUE / accessible if fewer than if at most 10% of inds have low mapq
with open('data/mapq_summary_stats.pkl', 'rb') as f:
    z= pkl.load(f)
mq_bool = z <= 0.9

#open and mark as TRUE if
with open('data/stash/coverage_summary_stats.pkl', 'rb') as f:
    cov = pkl.load(f) 

#zero_cov_bool = cov[0] <=  0.998 #accessible/TRUE if at most 0.002% of inds have no coverage at a position
#low_cov_bool = cov[1] <= 0.9 #accessible/TRUE if at most 10% of inds have low coverage at a position
#hi_cov_bool = cov[2] <=  0.98 #ccessible/TRUE if at most 2% of inds have high coverage at a position

#total_mask = np.logical_and(np.logical_and(pp_bool, mq_bool, zero_cov_bool), low_cov_bool, hi_cov_bool)

#bed = np.stack([chrom,pos,total_mask])
#bed.tofile('mask.bed', sep='\t')

In [8]:
with open('data/modes.pkl', 'rb') as f:
    modes = pkl.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'data/modes.pkl'

In [5]:
np.sum(total_mask) / total_mask.shape[0] * 100

82.3802828333847

So our preliminary mask has classified 82% of the genome as accessible based on coverage, mapping quality filters. Now we need to add repetitive element data. VectorBase has two tracks for repetitive elements - one for simple / tandem repeats and another for low complexity regions. 

In [49]:
!bedtools genomecov -g ~/lstm_data/cease/genomes/anstep/VectorBase-61_AstephensiUCISS2018_Genome.fasta.fai -d -i data/TandemRepeats-AllChrs.bed > TandemRepeatGenomeCov.txt &
!bedtools genomecov -g ~/lstm_data/cease/genomes/anstep/VectorBase-61_AstephensiUCISS2018_Genome.fasta.fai -d -i data/LowComplexityRegions-AllChrs.bed > LowComplexityGenomeCov.txt &

OSError: Background processes not supported.

Load the output of each, count how many BP they cover individually and together, then add to the total_mask, and recount how many base pairs

In [6]:
import dask
dask.config.set(num_workers=6) 

def get_bed_bool(file):
    ddf = dd.read_csv(file, sep='\t', header=None)
    is_covered = ddf[2].values.flatten()
    is_acc = is_covered == 0
    acc_inmem = is_acc.compute()
    return(acc_inmem)

In [7]:
#get data
tr_isacc = get_bed_bool('TandemRepeatGenomeCov.txt')
lc_isacc = get_bed_bool('LowComplexityGenomeCov.txt')

#combine with metric-based mask
mask_inctr = np.logical_and(total_mask, lc_isacc, tr_isacc)

In [8]:
#now let's get chrom and pos data
ddf = dd.read_csv('TandemRepeatGenomeCov.txt', sep='\t', header=None)
chrompos = ddf.iloc[:, :2]
chrompos = chrompos.compute()

In [9]:
#combine with the mask to make chrom/pos/is_acc
chrompos['is_acc'] = mask_inctr
chrompos.columns = ['chrom', 'pos', 'is_acc'] #set colnames

In [15]:
#make windowed mean accssibility across the genome for cnv calling

#script to make a 2d array of windows for analysis
def create_windows(chrom_length, window_size=300):
    # Calculate the number of windows
    num_windows = int(np.ceil(chrom_length / window_size))
    
    # Create an empty list to store the windows
    windows = []
    
    # Generate windows
    for i in range(num_windows):
        start = i * window_size 
        stop = start + window_size  
        # Make sure the stop position does not exceed chromosome length
        if stop > chrom_length:
            stop = chrom_length
        windows.append([start, stop])
    
    return np.array(windows)

chr = 'CM023248'
subset = chrompos[chrompos['chrom'] == chr]
pos = allel.SortedIndex(subset['pos'])
chrom_length = pos.shape[0] 
windows_array = create_windows(chrom_length)

In [17]:
windows_array.shape

(312354, 2)

In [None]:
#make windowed mean accssibility across the genome for cnv calling

#script to make a 2d array of windows for analysis
def create_windows(chrom_length, window_size=300):
    # Calculate the number of windows
    num_windows = int(np.ceil(chrom_length / window_size))
    
    # Create an empty list to store the windows
    windows = []
    
    # Generate windows
    for i in range(num_windows):
        start = i * window_size 
        stop = start + window_size  
        # Make sure the stop position does not exceed chromosome length
        if stop > chrom_length:
            stop = chrom_length
        windows.append([start, stop])
    
    return np.array(windows)

#function to get mean accessibility (always divide by winsize to account for missing sites in the genome)
def getmeanacc(win, winsize=300):
    meanacc = float(np.sum(win))/winsize
    return(meanacc)

#make dict of chroms and lengths
chroms = ['CM023248', 'CM023249', 'CM023250']
chromlens=[93706023, 88747589, 22713616]
chromdict = dict(zip(chroms, chromlens))

#for each chromosome, calculate mean acc in windows of 300bp
arrlist = []
winsize = 300
for chr in chromdict.keys():
    subset = chrompos[chrompos['chrom'] == chr]
    pos = allel.SortedIndex(subset['pos'])
    windows = create_windows(chromdict[chr], winsize)
    meanacc = allel.windowed_statistic(values=np.array(subset['is_acc']), pos = pos, statistic = getmeanacc, windows=windows)
    arrlist.append(meanacc)

NameError: name 'position' is not defined

In [175]:
arrlist[0][2].shape

(312354,)

In [180]:
dflist = []
for i, chr in enumerate(chromdict):
    df = pd.DataFrame({
        'Chrom' : chr,
        'Position' :  arrlist[i][1][:,0],
        'Mean_accessibility' : arrlist[i][0],
    })
    dflist.append(df)
accdf = pd.concat(dflist)

In [189]:
accdf.to_csv('/home/dennist/lstm_data/cease/cnv_calling/data/mean_accessibility_step.txt', sep='\t', index=False)

In [78]:
#dump to zarr
# Group by 'chrom' and save to Zarr
grouped = chrompos.groupby('chrom')

# Create a Zarr store
store = zarr.DirectoryStore('accessibility_mask.zarr')
# Create a group in Zarr for each chromosome
root = zarr.group(store)
for chrom, group_df in grouped:
    # Convert boolean column to integer (True -> 1, False -> 0)
    group_df['is_acc'] = group_df['is_acc'].astype(int)
    
    # Convert DataFrame to Zarr array
    group_array = zarr.array(group_df[['pos', 'is_acc']].to_numpy(), chunks=(len(group_df), 2), dtype=bool)
    
    # Save Zarr array under 'chrom' group
    root.create_group(chrom)
    root[chrom].create_dataset('pos_is_acc', data=group_array)

# Optionally, close the store
store.close()

KeyboardInterrupt: 

In [42]:
#dump ACCESSIBLE sites to txt file for variant filtration
acc_pos = chrompos[chrompos['is_acc'] == True]
acc_pos[['chrom', 'pos']].to_csv('accessible_positions.txt', sep='\t', header=False, index=False)


In [71]:
chrompos[['chrom', 'pos', 'is_acc']].to_csv('/home/dennist/lstm_data/cease/analysis/accessibility_mask/all_positions_mask.txt', sep='\t', header=False, index=False)


In [63]:
import allel
import plotly.express as px
def plot_moving_chrom_stat(arr, chrom, winsize):
    posarr = chrompos[chrompos['chrom'] == chrom]['pos']
    acc_moving = allel.moving_statistic(arr, statistic=np.mean, size=winsize)
    pos = allel.moving_statistic(posarr, statistic=np.mean, size=winsize)
    fig = px.line(x=pos,y=acc_moving)
    fig.show()




In [65]:
chrom='CM023248'
chromposbool = chrompos['chrom'] == chrom
pp_arr = pp_bool[chromposbool]
plot_moving_chrom_stat(pp_arr, 'CM023248', winsize=1000)

In [67]:
chrom='CM023248'
chromposbool = chrompos['chrom'] == chrom
pp_arr = mq_bool[chromposbool]
plot_moving_chrom_stat(pp_arr, 'CM023248', winsize=10000)

In [3]:
def set_acc_mask(chr):
    print('reading acmask')
    chrom = pd.read_table(f'/home/dennist/lstm_data/cease/analysis/accessibility_mask/{chr}.allpositions.accmask.txt')
    print(f'loaded mask for {chr}')
    chrom.columns = ['chrom', 'pos', 'is_acc']
    #let's get an estimate of how many sites in our callset are accessible
    #chr_sub = chrom[chrom['chrom'] == 'CM023248']
    #print(f'loading /home/dennist/lstm_data/cease/variants_bycohort/combined_cohorts/zarr/combined_cohorts.phased.{chr}.zarr/')
    ds = sg.load_dataset(f'/home/dennist/lstm_data/cease/variants_bycohort/combined_cohorts/zarr/combined_cohorts.{chr}.zarr/')
    pos = ds['variant_position'].compute()
    pos = pd.DataFrame(pos)
    pos['index'] = pos[0]
    pos = pos.set_index('index')
    chrom.set_index('pos', inplace=True)
    locpos = pos.join(chrom)
    print(f'joined mask to snp data for {chr}')
    accdask = da.from_array(np.array(locpos['is_acc']), chunks=(10000,))
    #save acc mask as var
    #ds = ds.drop_vars('is_accessible')
    ds['is_accessible'] = (('variants',), accdask)
    print(f'saving mask to /home/dennist/lstm_data/cease/variants_bycohort/combined_cohorts/zarr/combined_cohorts.{chr}.zarr/')

    ds.to_zarr(f'/home/dennist/lstm_data/cease/variants_bycohort/combined_cohorts/zarr/combined_cohorts.{chr}.zarr/', mode='a',  encoding={'is_accessible': {'chunks': (10000,), 'dtype': 'bool'}})  # Use 'a' mode to append data to the existing file


In [4]:
for c in ['CM023248', 'CM023249', 'CM023250']:
    set_acc_mask(c)

reading acmask


loaded mask for CM023248
joined mask to snp data for CM023248
saving mask to /home/dennist/lstm_data/cease/variants_bycohort/combined_cohorts/zarr/combined_cohorts.CM023248.zarr/
reading acmask
loaded mask for CM023249
joined mask to snp data for CM023249
saving mask to /home/dennist/lstm_data/cease/variants_bycohort/combined_cohorts/zarr/combined_cohorts.CM023249.zarr/
reading acmask
loaded mask for CM023250
joined mask to snp data for CM023250
saving mask to /home/dennist/lstm_data/cease/variants_bycohort/combined_cohorts/zarr/combined_cohorts.CM023250.zarr/


In [2]:
chrompos=zarr.open('accessibility_mask.zarr')

In [66]:
acc = acc[:]

In [67]:
acc

array([1, 1, 1, ..., 1, 1, 1], dtype=int8)

In [24]:
winpos = allel.moving_statistic(pos, np.mean, 300)

In [49]:
winacc_int.shape

(29706,)

In [51]:
meanacc = allel.windowed_statistic(pos, acc_int, np.mean, 300)

KeyboardInterrupt: 

In [29]:
meanacc[0]

array([ 1.,  1.,  1., ..., nan, nan,  1.])

In [63]:
acc[:]

array([1, 1, 1, ..., 1, 1, 1], dtype=int8)

In [58]:
acc_int = acc_int[:]

In [None]:
meanacc = allel.windowed_statistic(pos, acc_int, np.mean, windows=windows_array)

In [56]:
meanacc

(array([1.        , 1.        , 1.        , ..., 0.80487805, 0.96153846,
        1.        ]),
 array([[      1,     301],
        [    301,     601],
        [    601,     901],
        ...,
        [8911201, 8911501],
        [8911501, 8911801],
        [8911801, 8912050]]),
 array([ 9, 18, 16, ..., 41, 26, 26]))

In [55]:
import plotly.express as px
px.scatter(x=meanacc[1][:,1],y=meanacc[0])