# Library design for SI-14

by Pu Zheng and Jun-Han Su

This library may be ordered from TWIST

In [1]:
#minimum imports:
import time,os,sys,glob
import cPickle as pickle
import numpy as np
import khmer
sys.path.append(r'/n/home13/pzheng/Documents/python-functions/python-functions-library')

from LibraryConstruction import fastaread,fastawrite,fastacombine
import LibraryDesigner as ld
import LibraryConstruction as lc

## 0.1 Indexing for human genome hg38
(skip step 0 if indeces have been created)

In [6]:

#Only do it once!
#This example is for the human genome hg38

#Construct whole genome hash table / similarly for transcriptome file
#Note: This is NOT degenerate for reverse-complement and it only maps the + strand.

#minimum imports:
#import khmer,sys,glob
#sys.path.append(r'/n/home13/pzheng/Documents/python-functions/python-functions-library')
#from LibraryConstruction import fastaread,fastawrite

#khmer's hash tables are very easy to understand at least with single threads (no parallel computation)
#Simply, each hash(sequence) = number in base 4. Z(num_table) prime numbers < a big number are specified 
#and the remainded of the hash is used to adress the Z tables. This allows for an easy bloom filter for finding
#missing kmers. Considering its simplicity for single thread there should be a Windows version!
#http://journals.plos.org/plosone/article?id=10.1371/journal.pone.0101271

ksize = 17 #word size
kmer = khmer.Countgraph(ksize, 2e9, 4) #hash total table size 2e9*4. 4 is the number of tables (prime numbers) to use
kmer.set_use_bigcount(True) #This allows 2bytes for each count. Thus the maximum count is 2**16-1
#The total RAM required for this is thus 2e9*4*2*1.2 bytes = 19.2 gigabytes. 
#~20% is used for hash specific elements as documented by khmer.
#The size of the hash table is allocated at the beginning. 
#Note: if the size is too small there will be many collisions in the hash resulting in 'overinflated' counts.
#Note: unknown characters are mapped to A. (N->A)
#Note: the sequences need to be capitalized. (c->A, but C->C)

fa_fls = glob.glob(r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/*.fa') #permanant dir
print 'Number of fasta files: '+str(len(fa_fls));
save_file = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Indeces/human/hg38/full_word'+str(ksize)+'_.kmer' #permanant dir

for fa_fl in fa_fls:
    print "Dealing with file:"+fa_fl
    nms,seqs = fastaread(fa_fl)
    for seq in seqs:
        kmer.consume(seq.upper())
kmer.save(save_file)
os.path.getsize(save_file)

Number of fasta files: 455
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr17_JH159147v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr17_GL383565v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270539v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr19_KI270889v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr19_KI270887v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr22_KI270876v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270510v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr17_KI270859v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270755v1.fa
Dealing with file:/n/boslfs/LAB

Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr16_KI270854v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr21_KI270874v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr2_KI270772v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr16_KI270728v1_random.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr9_GL383542v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr1_KI270766v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270333v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr11_JH159136v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270310v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzhen

Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270584v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr6_KB021644v2_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr21_KI270872v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270749v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr12_GL877876v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr5_KI270796v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr22_KI270734v1_random.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270467v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr1_KI270711v1_random.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng

Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270468v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr2_KI270894v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr4_KI270896v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270317v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr21.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr18_KI270864v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr15_KI270851v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr6_GL000253v2_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr19.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn

Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr3_KI270937v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr15_KI270905v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr17_GL000205v2_random.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr4_GL383527v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr17_KI270862v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270466v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_GL000216v2.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr9_GL383541v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr13_KI270841v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzhen

Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrY.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr7_KI270899v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr2_GL582966v2_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270425v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr21_KI270873v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr12_KI270836v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr6_GL000256v2_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chr11_KI270826v1_alt.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms/chrUn_KI270373v1.fa
Dealing with file:/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg3

8002722906

## 0.2 Indexing for human mRNA

In [5]:
# Indexing for human mRNA
ksize = 17 #word size
mRNA_kmer = khmer.Countgraph(ksize, 2e9, 4) #hash total table size 2e9*4. 4 is the number of tables (prime numbers) to use
mRNA_kmer.set_use_bigcount(True) #This allows 2bytes for each count. Thus the maximum count is 2**16-1
#The total RAM required for this is thus 2e9*4*2*1.2 bytes = 19.2 gigabytes. 
#~20% is used for hash specific elements as documented by khmer.
#The size of the hash table is allocated at the beginning. 
#Note: if the size is too small there will be many collisions in the hash resulting in 'overinflated' counts.
#Note: unknown characters are mapped to A. (N->A)
#Note: the sequences need to be capitalized. (c->A, but C->C)

mRNA_fa_fls = glob.glob(r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/mRNA/*.fa') #permanant dir
print 'Number of fasta files: '+str(len(mRNA_fa_fls));
mRNA_save_file = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Indeces/human/hg38/mRNA'+str(ksize)+'_.kmer' #permanant dir

for fa_fl in mRNA_fa_fls:
    print "Dealing with mRNA file:"+fa_fl
    nms,seqs = fastaread(fa_fl)
    for seq in seqs:
        if len(seq) <= ksize:
            continue;
        mRNA_kmer.consume(seq.upper())
#mRNA_kmer.save(mRNA_save_file)
os.path.getsize(mRNA_save_file)

Number of fasta files: 0


OSError: [Errno 2] No such file or directory: '/n/boslfs/LABS/zhuang_lab/User/pzheng/Indeces/human/hg38/mRNA17_.kmer'

## 1 Extract region sequences

    CDH1 chr16: 68737225-68835548
    CDH2


In [2]:
## Some folders
# human genome
genome_folder = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms';

In [3]:
def Batch_Extract_Sequences(master_folder, reg_filename =r'Regions.txt', \
                      genome_folder= r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Genomes/human/hg38/chroms',\
                      resolution=10000):
    '''Function to extract sequences for all regions written in a file'''
    if not isinstance(master_folder, str) and not isinstance(reg_filename, str) and not isinstance(genome_folder, str):
        raise ValueError('wrong input format!');
    reg_file = open(master_folder + os.sep + reg_filename, 'r');
    print 'Input region file is: '+reg_file;
    


In [4]:
Batch_Extract_Sequences(r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT')

IOError: [Errno 2] No such file or directory: '/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT/Regions.txt'

ls: cannot access /n/boslfs/L: No such file or directory


In [13]:
!ls /n/home13/pzheng/

Desktop  Documents  Softwares


In [15]:
!ls /n/boslfs/LABS/

ls: cannot access /n/boslfs/LABS: No such file or directory


### EMT

True

In [29]:
master_folder = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT';
# where to save
seq_folder = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/region_seqs/EMT';
# EMT region1 info
chrom = 16;
start = 68737225;
stop = 68835548;
# mkdir if not exist
if not os.path.exists(seq_folder):
    os.makedirs(seq_folder)
_, wholechr = ld.fastaread(genome_folder+os.sep+'chr'+str(chrom)+'.fa')
wholechr = wholechr[0]
# Design of each region
res = 10000; # 10kb resolution
n_reg = int((stop-start)/res)+1; # number of regions
# extract all DNA sequences for these regions
whole_seq = wholechr[start:start+n_reg*res];
for i in range(n_reg):
    # sequence for this region
    reg_seq = whole_seq[i*res:(i+1)*res];
    REG_SEQ = [reg_seq.upper()];

    name = ['chr'+str(chrom)+':'+str(start+i*res)+'-'+str(start+(i+1)*res)+'_reg_'+str(i+1)];
    filename = seq_folder + os.sep + 'reg_' + str(i+1)+'.fasta';
    ld.fastawrite(filename, name, REG_SEQ);
    


In [None]:
#Write regions in folder for Jun-Han's favourite region chr4:52833833-52833833+1700000(hg38)  chr4:53,700,000-55,400,000


folder = r'/n/dulacfs2/Users/bbintu/Libraries/SI13/DNAseqs2'
if not os.path.exists(folder):
    os.makedirs(folder)

_,wholechr = ld.fastaread(r'/n/dulacfs2/Users/bbintu/Genomes/human/hg38/chr4.fa')
wholechr = wholechr[0]

nf = 170# number of regions
res = 10000 # 30 kb resolution
start = 52833833 #wheret to start in chr4
DNA_ = wholechr[start:start+nf*res]
for i in range(nf):
    seq = DNA_[i*res:(i+1)*res]
    seqs = [seq.upper()]
    names = ['chr21:'+str(start+i*res)+'-'+str(start+(i+1)*res)+'reg_'+str(i+1)]
    file_name = folder+os.sep+'reg_'+str(i+1)+'.fasta'
    ld.fastawrite(file_name, names, seqs)

In [2]:
SI7_folder = r'/n/home13/pzheng/Documents/ForJunHan_Pu/SI7_30kb_extended/'
SI7_csv_list = glob.glob(SI7_folder + '*.csv')
SI7n_csv_list, SI7p_csv_list = [], []; # Initializing
for SI7_csv in SI7_csv_list: # Split list into original SI7 (SI7p) and extended SI7(SI7n)
    if '-' in SI7_csv:
        SI7n_csv_list.append(SI7_csv);
    else:
        SI7p_csv_list.append(SI7_csv);

In [3]:
len(SI7p_csv_list),len(SI7n_csv_list)

(140, 140)

In [4]:
PDGFR_folder = r'/n/home13/pzheng/Documents/ForJunHan_Pu/PDGFR_10kb/'
PDGFR_csv_list = glob.glob(PDGFR_folder + '*.csv')

In [5]:
from Bio.Seq import Seq
myseq = Seq('AtGC')

In [6]:
myseq.tostring()



'AtGC'

In [7]:
from Bio import SeqIO

In [8]:
import csv
def CSV_Probe_Reader(filename, fmt='probe'):
    with open(filename, 'r') as _csvfile:
        # reader
        _csvreader = csv.reader(_csvfile, delimiter=',') # reader
        # check if has header
        _has_header = csv.Sniffer().has_header(_csvfile.readline())
        _csvfile.seek(0)  # Rewind.
        if _has_header:
            next(_csvreader)  # Skip header row.
        for row in _csvreader:
            pass
        return _csvreader
test = CSV_Probe_Reader(SI7n_csv_list[0])

In [9]:
def CSV_Merge(filelist, output_name, header=False):
    with open(output_name, 'wb') as ff:
        writer = csv.writer(ff,delimiter=',')
        #writer.writerows(csv.reader(open(SI7p_csv_list[0],'r')))
        for filename in filelist:
            with open(filename, 'r') as _csvfile:
                # reader
                _csvreader = csv.reader(_csvfile, delimiter=',') # reader
                if not header:
                    # check if has header
                    _has_header = csv.Sniffer().has_header(_csvfile.readline())
                    _csvfile.seek(0)  # Rewind.
                    if _has_header:
                        next(_csvreader)  # Skip header row.
                writer.writerows(_csvreader);
CSV_Merge(SI7p_csv_list, 'SI7p_merged.csv')
CSV_Merge(SI7n_csv_list, 'SI7n_merged.csv')
CSV_Merge(PDGFR_csv_list, 'PDGFR_merged.csv')

In [77]:
class probe():
    

In [41]:
!ls /n/home13/pzheng/Documents/ForJunHan_Pu/SI7_30kb_extended/pb_reports_reg_79.csv

/n/home13/pzheng/Documents/ForJunHan_Pu/SI7_30kb_extended/pb_reports_reg_79.csv


In [None]:
!ls /n/boslfs/

In [None]:
##Construct SI14 probes-EMT
import time,os,sys
reload(ld)
import matplotlib.pyplot as plt
plt.switch_backend('agg')

index_folder = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Indeces/human/hg38';

genmoe_index = index_folder + os.sep + r'full_word17_.kmer';
repeat_index = index_folder + os.sep + r'HumanRepeats_word17_.kmer';
transcriptome_index = index_folder + os.sep + r'transcriptome_word17_.kmer';

master_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT';

resolution = 20000;
seq_dir = os.sep + 'region_seqs-'+str(resolution)
save_dir = os.sep + 'reports-'+str(resolution)

merged = True;
if merged:
    seq_dir = seq_dir + os.sep + 'merged';
    save_dir = save_dir + os.sep + 'merged';


# get input and output 
input_files = glob.glob(master_dir+seq_dir+os.sep+r'*.fasta')
save_folder = master_dir + save_dir;

if not os.path.exists(save_folder):
    os.makedirs(save_folder);
# Loop through all input files
for in_file in input_files:
    #print in_file
    save_file = save_folder+os.sep+os.path.basename(in_file).replace('.fasta','.pbr')#'/pb_reports_reg_'+str(-i-1)+'.pbr'
    if not os.path.exists(save_file.replace('.pbr','.png')):
        
        local_genome_fl = in_file
        print in_file;
        pb_designer = ld.pb_reports_class(
            sequence_dic={'file':in_file,'use_revc':False,'use_kmer':True},
            map_dic={'genome':{'file':genmoe_index,'use_revc':True,'use_kmer':True},
                  'rep_genome':{'file':repeat_index,'use_revc':True,'use_kmer':True},
                  'local_genome':{'file':local_genome_fl,'force_list':True,'use_revc':True,'use_kmer':True}},
            save_file=save_file,
            params_dic={'word_size':17,'pb_len':42,'buffer_len':2,'max_count':2**16-1,'check_on_go':False,'auto':False},
            dic_check={('genome','local_genome'):75,'rep_genome':0,'gc':[0.25,0.85],'tm':70,'masks':['AAAAA','TTTTT','GGGGG','CCCCC']})

        pb_designer.computeOTmaps()
        pb_designer.compute_pb_report()
        pb_designer.perform_check_end()
        pb_designer.plots()
        print len(pb_designer.pb_reports_keep)

In [None]:
prob_lens = [];
master_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT';
save_folder = master_dir + save_dir;
#to continue or check sequences:
reload(ld)

pbde = ld.pb_reports_class()
files = glob.glob(save_folder+os.sep+r'*.pbr')
for file_ in files:
    pbde.load_pbr(file_)
    pbde.dic_check={'gc': [0.25, 0.85], 'rep_genome': 0, 'tm': 70, ('genome', 'local_genome'): 75,
                    'masks':['AAAAA','TTTTT','GGGGG','CCCCC']}
    pbde.perform_check_end()
    print file_
    print len(pbde.pb_reports_keep)
    prob_lens.append(len(pbde.pb_reports_keep))
    pbde.save_csv()
    #pbde.plots()

In [None]:
%matplotlib notebook

plt.figure()
plt.hist(prob_lens,20)
plt.show()