# SI-14 Single Molecule RNA FISH 
2018.4.2

In [11]:
#minimum imports:
import time,os,sys,glob
import cPickle as pickle
import numpy as np
import khmer
sys.path.append(r'/n/home13/pzheng/Documents/python-functions/python-functions-library')

from LibraryConstruction import fastaread,fastawrite,fastacombine
import LibraryDesigner as ld
import LibraryConstruction as lc

from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from Bio.SeqRecord import SeqRecord 

import csv
import io

## 1. Import data

In [72]:
# Directory that holds the input table for this library.
master_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Libraries/SI-14/EMT_smFISH';
# input filename
input_filename = 'EMT_smFISH.txt';
# UTF-8 byte-order mark. A record dump elsewhere in this notebook shows the
# first column stored as '\xef\xbb\xbfGene', which breaks later lookups by
# 'Gene', so the mark is stripped from the header row.
_utf8_bom = '\xef\xbb\xbf'
# One dict per data row, keyed by the tab-separated column headers.
dic_list = []
with open(os.path.join(master_dir, input_filename), 'rU') as handle:
    # Drop the trailing newline of every line lazily while iterating.
    rows = (raw_line.split("\n")[0] for raw_line in handle)
    header_line = next(rows, '')
    if header_line.startswith(_utf8_bom):
        header_line = header_line[len(_utf8_bom):]
    headers = header_line.split("\t")
    for row in rows:
        # Blank lines (e.g. a trailing newline at the end of the file) would
        # otherwise become records that only hold an empty first column.
        if not row.strip():
            continue
        # zip stops at the shorter sequence, so short rows simply lack the
        # trailing columns.
        dic_list.append(dict(zip(headers, row.split("\t"))))

## 2. Design barcode scheme

In [73]:
# Distinct gene names found in the input table.
genes = list(np.unique([entry['Gene'] for entry in dic_list]))
# Each gene, taken in sorted order, gets the same position index for both
# its 'bc_stv' and its 'bc_ndb' barcode.
barcode_scheme = {gene_name: {'bc_stv': rank, 'bc_ndb': rank}
                  for rank, gene_name in enumerate(sorted(genes))}

## 3. Patch barcodes

### 3.1 import barcodes

In [74]:
## Read all barcodes
barcode_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Barcodes';

# read all Stv barcodes
#stv_adaptor = [1,2,17,77,78,79,80,81,82,83,84] # barcodes saved for adaptors
#stv_bad = [34,38,41] # barcodes performed badly
#stv_mask = stv_adaptor + stv_bad 
stv_mask =[]
with open(barcode_dir+os.sep+'top_Stvs.fasta', "rU") as handle:
    stv_barcodes = [];
    for record in SeqIO.parse(handle, "fasta"):
        if int(record.id.split('_')[1]) not in stv_mask:
            stv_barcodes.append(record);
            
# read all NDB barcodes
ndb_mask = [];

with open(barcode_dir+os.sep+'NDBs.fasta', "rU") as handle:
    ndb_barcodes = [];
    for record in SeqIO.parse(handle, "fasta"):
        if int(record.id.split('_')[1]) not in ndb_mask:
            ndb_barcodes.append(record);
print "Barcodes loaded: Stv: "+str(len(stv_barcodes))+", NDB: "+str(len(ndb_barcodes));

Barcodes loaded: Stv: 75, NDB: 1052


### 3.2 import primers

In [75]:
## Read all primers
primer_dir = r'/n/boslfs/LABS/zhuang_lab/User/pzheng/Primers';
fwd_primer_filename = 'forward_primers_keep.fasta';
rev_primer_filename = 'reverse_primers_keep.fasta';

# read all forward primers
with open(primer_dir+os.sep+fwd_primer_filename, "rU") as handle:
    fwd_primers = [];
    for record in SeqIO.parse(handle, "fasta"):
        fwd_primers.append(record);
# read all forward primers
with open(primer_dir+os.sep+rev_primer_filename, "rU") as handle:
    rev_primers = [];
    for record in SeqIO.parse(handle, "fasta"):
        rev_primers.append(record);
print "Primers loaded: forward: "+str(len(fwd_primers))+", reverse: "+str(len(rev_primers)); 

# primers
fprimer = fwd_primers[2];
print '- forward primer:', fprimer
rprimer = rev_primers[1];
print '- reverse primer:', rprimer

Primers loaded: forward: 11, reverse: 6
- forward primer: ID: W1A05_primer_4
Name: W1A05_primer_4
Description: W1A05_primer_4
Number of features: 0
Seq('CATTCAGCATTGCGCAACGG', SingleLetterAlphabet())
- reverse primer: ID: W1A04_primer_3
Name: W1A04_primer_3
Description: W1A04_primer_3
Number of features: 0
Seq('TAATACGACTCACTATAGGGCGTTGTATGCCCTCCACGC', SingleLetterAlphabet())


In [84]:
## Parameters used for patch barcodes & primers
# Maps each barcode slot name to a pool name. It is not referenced by any
# other cell visible in this section.
barcode_source = {'bc_stv':'stv',
                  'bc_ndb':'ndb'};
# Lowest barcode number accepted from each pool; the number is the field
# after the first '_' in the record ID.
barcode_starts = {'stv':1, 'ndb':301};

# Keep only the barcodes numbered at or above the pool's starting number.
_stv_barcodes = [record for record in stv_barcodes
                 if int(record.id.split('_')[1]) >= barcode_starts['stv']]
_ndb_barcodes = [record for record in ndb_barcodes
                 if int(record.id.split('_')[1]) >= barcode_starts['ndb']]

# Number of trailing bases taken from each barcode when patching probes.
barcode_len = 20

### 3.3 start patching

In [89]:
# Attach barcodes to every probe record and assemble its full sequence.
for dic in dic_list:
    gene_scheme = barcode_scheme[dic['Gene']]
    dic['bc_stv'] = _stv_barcodes[gene_scheme['bc_stv']]
    dic['bc_ndb'] = _ndb_barcodes[gene_scheme['bc_ndb']]
    # Reverse complement of the last barcode_len bases of each barcode;
    # each one is placed on both sides of the target.
    stv_site = dic['bc_stv'].seq[-barcode_len:].reverse_complement()
    ndb_site = dic['bc_ndb'].seq[-barcode_len:].reverse_complement()
    # Layout: forward primer | stv | ndb | target | stv | ndb |
    #         reverse complement of the last 20 bases of the reverse primer
    total_seq_list = [
        fprimer.seq,
        stv_site,
        ndb_site,
        Seq(dic['Target']),
        stv_site,
        ndb_site,
        rprimer.seq[-20:].reverse_complement(),
    ]
    # Left-to-right concatenation starting from an empty Seq.
    total_seq = sum(total_seq_list, Seq(''))
    dic['total_seq'] = total_seq;
    # Placeholder list; nothing in this cell reads it.
    name_list = ['']

In [104]:
# Display the gene -> {'bc_stv', 'bc_ndb'} index mapping for inspection.
barcode_scheme

{'AlphaCat': {'bc_ndb': 0, 'bc_stv': 0},
 'AlphaVIntegrin': {'bc_ndb': 1, 'bc_stv': 1},
 'BLACAT1': {'bc_ndb': 2, 'bc_stv': 2},
 'BetaCAt': {'bc_ndb': 3, 'bc_stv': 3},
 'CD133': {'bc_ndb': 4, 'bc_stv': 4},
 'CD24': {'bc_ndb': 5, 'bc_stv': 5},
 'CD44': {'bc_ndb': 6, 'bc_stv': 6},
 'CDH1': {'bc_ndb': 7, 'bc_stv': 7},
 'CDH2': {'bc_ndb': 8, 'bc_stv': 8},
 'CDH3': {'bc_ndb': 9, 'bc_stv': 9},
 'DDR2': {'bc_ndb': 10, 'bc_stv': 10},
 'ERK2': {'bc_ndb': 11, 'bc_stv': 11},
 'ESRP1': {'bc_ndb': 12, 'bc_stv': 12},
 'ESRP2': {'bc_ndb': 13, 'bc_stv': 13},
 'FN1': {'bc_ndb': 14, 'bc_stv': 14},
 'GRHL2': {'bc_ndb': 15, 'bc_stv': 15},
 'HOTAIR': {'bc_ndb': 16, 'bc_stv': 16},
 'KRT14': {'bc_ndb': 17, 'bc_stv': 17},
 'KRT18': {'bc_ndb': 18, 'bc_stv': 18},
 'MALAT1': {'bc_ndb': 19, 'bc_stv': 19},
 'MMP2': {'bc_ndb': 20, 'bc_stv': 20},
 'MMP9': {'bc_ndb': 21, 'bc_stv': 21},
 'NEAT1': {'bc_ndb': 22, 'bc_stv': 22},
 'OCLN': {'bc_ndb': 23, 'bc_stv': 23},
 'OVOL2': {'bc_ndb': 24, 'bc_stv': 24},
 'RBFOX2': {'b

In [103]:
# Last '_'-separated field of the forward primer ID
# (e.g. '4' for an ID of 'W1A05_primer_4').
fprimer.id.split('_')[-1]

'4'

In [25]:
## Create pb_record and pb_list by pb_file
pb_records, reg_pb_dic = [],{}; # initialize
# loop through all designed probes
with open(master_dir+os.sep+pb_filename, 'rU') as handle:
    lines = handle.readlines()
    titles = lines[0].split("\n")[0].split("\t")
    for line in lines[1:]:
        seq, name = line.split("\n")[0].split("  ");
        pb_records.append(SeqRecord(Seq(seq.upper(),alphabet=IUPAC.unambiguous_dna),id=name, name=name,description=''))
        reg_id = int(name.split('reg_')[1].split("_")[0])
        pb_info = {'reg_index':reg_id, 'total_seq':seq, 'total_name':name};
        if reg_id not in reg_pb_dic.keys():
            reg_pb_dic[reg_id] = [pb_info]
        else:
            reg_pb_dic[reg_id].append(pb_info)
pb_lists = reg_pb_dic.values()
print "- Total candidate sequences:", len(pb_records)
# save
save_dir = 'final_probes'
if not os.path.exists(master_dir+os.sep+save_dir):
    os.makedirs(master_dir+os.sep+save_dir)
print "-- Save pb_lists"
pickle.dump(pb_lists, open(master_dir+os.sep+save_dir+os.sep+'list.pkl', 'w'));
print "-- Save pb_records"
with open(master_dir+os.sep+save_dir+os.sep+'candidate_probes.fasta', "w") as output_handle:
    SeqIO.write(pb_records, output_handle, 'fasta');

{'Final': 'CCGTTGCGCAATGCTGAATGgtcgcccgtgcttccgggacgtcgcccgtgcttccgggacCTCTGTACTACACCTGGGGTGTGTGTCTCCgtcgcccgtgcttccgggacgtcgcccgtgcttccgggacGCGTGGAGGGCATACAACGC',
 'Fwd': 'CCGTTGCGCAATGCTGAATG',
 'Original': 'AAGCGGGCGCACCCGCGCGC AAGCGGGCGCACCCGCGCGC CTCTGTACTACACCTGGGGTGTGTGTCTCC  AAGCGGGCGCACCCGCGCGC AAGCGGGCGCACCCGCGCGC',
 'Readout': '',
 'Reverse': 'GCGTGGAGGGCATACAACGC',
 'Target': 'CTCTGTACTACACCTGGGGTGTGTGTCTCC',
 '\xef\xbb\xbfGene': 'AlphaCat'}