In [85]:
import pandas as pd
import numpy as np
from os import path
import gtfparse
import pyensembl
pyensembl.EnsemblRelease(release=87)
import gspread
from  oauth2client.service_account  import ServiceAccountCredentials
import gspread
import sys
if sys.version_info[0] < 3: 
    from StringIO import StringIO
else:
    from io import StringIO

In [192]:
# Authorise a gspread client with a service-account key and load the gene list sheet.
# OAuth scopes requested for the service account.
scope = ['https://spreadsheets.google.com/feeds',
        'https://www.googleapis.com/auth/drive']
# NOTE(review): this rebinds `path`, shadowing the earlier `from os import path` import.
# The key file location is an absolute, machine-specific path.
path = '/home/zach/Documents/Untitled Folder/Cornea.json'
creds = ServiceAccountCredentials.from_json_keyfile_name(path, scope)
client = gspread.authorize(creds)
# Start with a gene list: open the first sheet of the workbook and mirror its
# records into a DataFrame (a snapshot; later worksheet edits are not reflected in it).
expression_worksheet = client.open("Cornea Wound Gene List").sheet1
expression = expression_worksheet.get_all_records()
expression_df = pd.DataFrame(expression)

INFO:oauth2client.client:Refreshing access_token


In [59]:
# Remove transcript version suffixes from the worksheet (keep the text before the first '.').
# Select the range holding the transcript IDs (G2:G189).
Transcript_ID = expression_worksheet.range('G2:G189')
# Edit every cell of the range locally.
for cell in Transcript_ID:
    cell.value = cell.value.split('.')[0]
# Push all edited cells back to the sheet in one batch call.
expression_worksheet.update_cells(Transcript_ID)

{'spreadsheetId': '11MMvEY4s8oUIGf5kyoA4QOKs-Br2myyNooNmSO7-nEU',
 'updatedCells': 188,
 'updatedColumns': 1,
 'updatedRange': 'Sheet1!G2:G189',
 'updatedRows': 188}

In [5]:
def col_cells(worksheet, col):
    """Return every cell of column `col` in `worksheet`.

    The range runs from row 1 down to `worksheet.row_count`; both corner
    addresses are obtained from `worksheet.get_addr_int`.
    """
    first_addr = worksheet.get_addr_int(1, col)
    last_addr = worksheet.get_addr_int(worksheet.row_count, col)
    span = '{}:{}'.format(first_addr, last_addr)
    return worksheet.range(span)

In [7]:
def read_codebook(cbook_fname):
    """Read a codebook file of n-bit codewords, one codeword per line.

    The first line of the file is treated as a column header and discarded.
    Blank lines (for example a trailing empty line) are skipped so they do
    not end up as empty codewords.

    Parameters
    ----------
    cbook_fname : str
        Path of the codebook text file.

    Returns
    -------
    list of str
        The codewords in file order, stripped of surrounding whitespace.
    """
    with open(cbook_fname, 'r') as f:
        next(f, None)  # discard the header line (no-op for an empty file)
        stripped = (line.strip() for line in f)
        return [word for word in stripped if word]
# Load the codewords from a machine-specific path; edit it for a new setup.
codewords = read_codebook('/home/zach/Documents/Untitled Folder/cbook_140MHD4_200MHD2.txt') # need to fill in
# Shuffle in place. No seed is set in the visible cells, so the order differs between runs.
np.random.shuffle(codewords)
print('Number of codewords in codebook -', len(codewords))

Number of codewords in codebook - 200


In [8]:
# Readout names passed to write_codebook, which joins them into the 'bit_names' header row.
readout_names = ['RS0095', 'RS0109', 'RS0175', 'RS0237', 'RS0307', 'RS0332', 'RS0384', 'RS0406', 
                'RS0451', 'RS0468', 'RS0548', 'RS64.0', 'RS156.0', 'RS278.0', 'RS313.0', 'RS643.0', 
                'RS740.0', 'RS810.0']
def write_codebook(rows, fname, readout_names, codebook_style = '148MHD4'):
    """Write a codebook text file to `fname`.

    The file starts with four header lines (version, codebook name, the
    comma-joined `readout_names`, and the column titles), followed by one
    comma-separated line per entry of `rows`, built from the first three
    items of each entry (all three must be strings).
    """
    with open(fname, 'w') as out:
        out.write('version,1\n')
        out.write('codebook_name,' + codebook_style + '\n')
        out.write('bit_names,' + ','.join(readout_names) + '\n')
        out.write('name, id, barcode\n')
        out.writelines(
            ','.join((entry[0], entry[1], entry[2] + '\n')) for entry in rows
        )

In [74]:
def geneSymbol_to_ensembl(gene_symbol_list, biomart_download_fname,
                          organism='mouse', min_length=1000):
    """
    Look up the protein-coding transcripts of each gene symbol in a BioMart CSV export.

    Parameters
    ----------
    gene_symbol_list : iterable
        Objects exposing a `.value` attribute holding a gene symbol
        (e.g. worksheet cells). Entries whose value is '' are skipped.
    biomart_download_fname : str
        CSV file with at least the columns 'Gene name', 'Transcript stable ID',
        'Transcript type' and 'Transcript length (including UTRs and CDS)'.
    organism : str
        Accepted for interface compatibility; not used by this function.
    min_length : int
        If the longest protein-coding transcript is not longer than this, a
        'Gene too short' message is printed (the gene is still returned).

    Returns
    -------
    list of (gene, DataFrame or None)
        One tuple per non-empty input, in input order. The DataFrame holds the
        gene's protein-coding transcripts, de-duplicated by transcript ID and
        sorted longest first; None means no such transcript was found.
    """
    length_col = 'Transcript length (including UTRs and CDS)'
    # Filter on the cell's value; comparing the cell object itself to '' never matched.
    genes = [cell.value for cell in gene_symbol_list if cell.value != '']
    with open(biomart_download_fname, 'r') as f:
        transcript_df = pd.read_csv(f)
    transcript_df = transcript_df[transcript_df['Gene name'].isin(genes)]

    annotations = []
    for gene in genes:
        transcripts = transcript_df[transcript_df['Gene name'] == gene].drop_duplicates('Transcript stable ID')
        transcripts = transcripts[transcripts['Transcript type'] == 'protein_coding'].sort_values(length_col, ascending=False)

        if len(transcripts) == 0:
            print('Failed finding: ', gene)
            annotations.append((gene, None))
            continue
        # Short genes are only reported, not dropped.
        if not transcripts.iloc[0][length_col] > min_length:
            print('Gene too short: ', gene)
        annotations.append((gene, transcripts))
    return annotations
#

In [78]:
def ensymbl_to_geneinfo(transcript_IDs, biomart_download_fname,
                          organism='mouse', min_length=1000):
    """
    Look up the protein-coding transcripts of each Ensembl gene ID in a BioMart CSV export.

    Parameters
    ----------
    transcript_IDs : iterable
        Objects exposing a `.value` attribute (e.g. worksheet cells). Despite
        the parameter name, the values are matched against the
        'Gene stable ID' column. Entries whose value is '' are skipped.
    biomart_download_fname : str
        CSV file with at least the columns 'Gene stable ID',
        'Transcript stable ID', 'Transcript type' and
        'Transcript length (including UTRs and CDS)'.
    organism : str
        Accepted for interface compatibility; not used by this function.
    min_length : int
        If the longest protein-coding transcript is not longer than this, a
        'Gene too short' message is printed (the gene is still returned).

    Returns
    -------
    list of (gene_id, DataFrame or None)
        One tuple per non-empty input, in input order. The DataFrame holds the
        gene's protein-coding transcripts, de-duplicated by transcript ID and
        sorted longest first; None means no such transcript was found.
    """
    length_col = 'Transcript length (including UTRs and CDS)'
    # Filter on the cell's value; comparing the cell object itself to '' never matched.
    genes = [cell.value for cell in transcript_IDs if cell.value != '']
    with open(biomart_download_fname, 'r') as f:
        transcript_df = pd.read_csv(f)
    transcript_df = transcript_df[transcript_df['Gene stable ID'].isin(genes)]

    annotations = []
    for gene in genes:
        transcripts = transcript_df[transcript_df['Gene stable ID'] == gene].drop_duplicates('Transcript stable ID')
        transcripts = transcripts[transcripts['Transcript type'] == 'protein_coding'].sort_values(length_col, ascending=False)
        if len(transcripts) == 0:
            print('Failed finding: ', gene)
            annotations.append((gene, None))
            continue
        # Short genes are only reported, not dropped.
        if not transcripts.iloc[0][length_col] > min_length:
            print('Gene too short: ', gene)
        annotations.append((gene, transcripts))
    return annotations
#

In [207]:
def Trascriptome_append(transcript_IDs,file):
    """
    Fetch the rows of a transcriptome CSV that match each transcript ID.

    Parameters
    ----------
    transcript_IDs : iterable
        Objects exposing a `.value` attribute holding a transcript ID
        (e.g. worksheet cells). Entries whose value is '' are skipped.
    file : str
        CSV file with a 'transcript_id' column.

    Returns
    -------
    list of (transcript_id, DataFrame or None)
        One tuple per non-empty input, in input order. The DataFrame holds the
        matching rows; None means the ID was not present in the file.
    """
    # Filter on the cell's value; comparing the cell object itself to '' never matched.
    genes = [cell.value for cell in transcript_IDs if cell.value != '']
    with open(file, 'r') as f:
        transcriptome_df = pd.read_csv(f)
    transcriptome_df = transcriptome_df[transcriptome_df['transcript_id'].isin(genes)]

    annotations = []
    for gene in genes:
        # Use the frame loaded above (the original read an undefined `transcript_df`).
        transcripts = transcriptome_df[transcriptome_df['transcript_id'] == gene]
        if len(transcripts) == 0:
            print('Failed finding: ', gene)
            annotations.append((gene, None))
        else:
            annotations.append((gene, transcripts))
    return annotations

In [213]:
# Add FPKM data
transcript_IDs = expression_worksheet.range('G2:G189')
file = '/bigstore/GeneralStorage/Rob/merfish/MERFISH_analysis-master/mouse/Cornea/Expression/no_versions_isoforms_tracking.csv'
Transcriptome_annotations = Trascriptome_append(transcript_IDs,file)
FPKM = expression_worksheet.range('C2:C189')
# Pair each FPKM cell with the transcript ID in the same row and look the
# annotation up by ID. Trascriptome_append skips blank ID cells, so matching
# by list position would shift every row after a blank one.
info_by_transcript = dict(Transcriptome_annotations)
for id_cell, fpkm_cell in zip(transcript_IDs, FPKM):
    info = info_by_transcript.get(id_cell.value)
    if info is None:
        # Blank ID cell or transcript missing from the file: leave the cell untouched.
        continue
    fpkm_cell.value = info['FPKM'].iloc[0]
expression_worksheet.update_cells(FPKM)

Failed finding:  ENSMUST00000071134


{'spreadsheetId': '11MMvEY4s8oUIGf5kyoA4QOKs-Br2myyNooNmSO7-nEU',
 'updatedCells': 188,
 'updatedColumns': 1,
 'updatedRange': 'Sheet1!C2:C189',
 'updatedRows': 188}

In [90]:
# Add info to spreadsheet
# WARNING - you must figure out column numbers and change bits for new gene sets
gname = expression_worksheet.range('F2:F189')
tname = expression_worksheet.range('G2:G189')
length = expression_worksheet.range('D2:D189')
descript = expression_worksheet.range('B2:B189')
gene = expression_worksheet.range('A2:A189')
# Annotate exactly the rows that are written below (the original queried
# F2:F205 while every target range stops at row 189).
gene_annotations = ensymbl_to_geneinfo(gname, '/home/zach/Documents/Untitled Folder/mouse_gene_info2.txt', organism='mouse')
update = []  # unused in this cell; retained in case another cell references it
# Look annotations up by gene ID instead of list position: ensymbl_to_geneinfo
# skips blank cells, so positional indexing would misalign rows after a blank.
info_by_gene_id = dict(gene_annotations)
for idx, id_cell in enumerate(gname):
    info = info_by_gene_id.get(id_cell.value)
    if info is None:
        # Blank ID cell or no protein-coding transcript found: leave the row untouched.
        continue
    # Rows are sorted longest transcript first, so row 0 is the longest one.
    max_transcript = info['Transcript stable ID'].iloc[0]
    max_length = info['Transcript length (including UTRs and CDS)'].iloc[0]
    descript_val = info['Gene description'].iloc[0]
    gene_name = info['Gene name'].iloc[0]
    tname[idx].value = max_transcript
    length[idx].value = str(max_length)
    descript[idx].value = str(descript_val)
    gene[idx].value = str(gene_name)
expression_worksheet.update_cells(tname)
expression_worksheet.update_cells(length)
expression_worksheet.update_cells(descript)
expression_worksheet.update_cells(gene)

Gene too short:  ENSMUSG00000046259
Gene too short:  ENSMUSG00000048455
Gene too short:  ENSMUSG00000056054
Gene too short:  ENSMUSG00000044303
Gene too short:  ENSMUSG00000049775
Gene too short:  ENSMUSG00000001131
Gene too short:  ENSMUSG00000056071


{'spreadsheetId': '11MMvEY4s8oUIGf5kyoA4QOKs-Br2myyNooNmSO7-nEU',
 'updatedCells': 188,
 'updatedColumns': 1,
 'updatedRange': 'Sheet1!A2:A189',
 'updatedRows': 188}

In [10]:
# Generate Codebook
# One (gene name, version-less transcript ID, barcode string) tuple per unique transcript.
unique_transcripts = expression_df.drop_duplicates('Transcript ID')
row_tuples = [
    (record['Gname'], record['Transcript ID'].split('.')[0], str(record['barcodes']))
    for _, record in unique_transcripts.iterrows()
]

write_codebook(row_tuples, '/home/zach/Documents/Untitled Folder/Cornea.txt', readout_names)
row_tuples

In [124]:
def parse_merfish_oligos(fname, counts_df = None,
                         counts_df_column='FPKM', tid_column='transcript_id'):
    """
    Parse a MERFISH oligo FASTA file into a DataFrame of probes.

    Each header is split on single spaces. Field 0 is the experiment name,
    field 2 the first readout, and fields 3-5 hold the remaining two readouts
    and the '__'-separated encoding description
    (gene__tid__start__length__gc__tm__specificity). Two layouts are handled,
    distinguished by whether field 3 contains '__':

    * RO1 RO2 - encoding - RO3 (field 3 is a readout name):
      seq = primer[0:20] + 1 nt + RO1[21:41] + RO2[41:61]
            + encoding[61:91] + 1 nt + RO3[92:112] + primer
    * RO1 - encoding - RO2 RO3 (field 3 is the encoding description):
      seq = primer[0:20] + 1 nt + RO1[21:41] + encoding[41:71]
            + 1 nt + RO2[72:92] + RO3[92:112] + primer

    Parameters
    ----------
    fname : str
        Path of the FASTA file.
    counts_df, counts_df_column, tid_column
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    (df, primer_seqL, primer_seqR, readout_dict)
        df : one row per probe, de-duplicated on ('gene', 'encodingRegion').
        primer_seqL / primer_seqR : first / last 20 nt of the LAST record
        read, or None when the file contains no records.
        readout_dict : readout name -> 20 nt sequence sliced from the oligos
        (later records overwrite earlier ones for the same name).
    """
    from Bio import SeqIO
    import pandas as pd

    columns = ['experiment', 'pleft', 'ro1', 'ro2', 'ro3',
               'pright', 'encodingRegion', 'seq', 'gene', 'tid',
               'start', 'length', 'gc', 'tm', 'specificity', 'isoSpecificity', 'header']
    readout_dict = {}
    rows = []
    # Defined up front so an empty FASTA returns None instead of raising.
    primer_seqL = None
    primer_seqR = None
    # The context manager closes the handle; the original left it open.
    with open(fname, 'r') as handle:
        for header, seq in SeqIO.FastaIO.SimpleFastaParser(handle):
            fields = header.split(' ')
            experiment = str(fields[0])
            primer_seqL = seq[:20]
            primer_seqR = seq[-20:]
            readout1 = str(fields[2])
            readout_dict[readout1] = seq[21:41]
            # Isoform specificity is not present in the header; fixed placeholder.
            isoSpecificity = 1
            if '__' not in fields[3]:
                # Layout: RO1 RO2 - encoding - RO3
                readout2 = str(fields[3])
                encoding = str(fields[4])
                readout3 = fields[5]
                readout_dict[readout3] = seq[92:112]
                readout_dict[readout2] = seq[41:61]
                encoding_region = seq[61:91]
            else:
                # Layout: RO1 - encoding - RO2 RO3
                encoding = fields[3]
                readout2 = fields[4]
                readout3 = fields[5]
                readout_dict[readout2] = seq[72:92]
                readout_dict[readout3] = seq[92:112]
                encoding_region = seq[41:71]
            # Raises ValueError if the description does not have exactly 7 parts.
            gene, tid, start, length, gc, tm, specificity = encoding.split('__')

            rows.append([experiment, primer_seqL, readout1, readout2, readout3,
                         primer_seqR, encoding_region, seq, gene, tid, start,
                         length, gc, tm, specificity, isoSpecificity, header])
    df = pd.DataFrame(rows, columns=columns)
    df = df.drop_duplicates(subset=['gene', 'encodingRegion'])
    return df, primer_seqL, primer_seqR, readout_dict

def trim_oligos_to_fit(oligo_df, multi_transcripts_cutoff = 148, min_oligos=48,
                       keep_genes=('SNAI2', 'SNAI1', 'ORAI1', 'P2RY11', 'INPP1', 'ACTA2', 'PICK1')):
    """
    Drop genes that have too few oligos.

    Every gene with fewer than `min_oligos` rows is printed together with its
    count, and its rows are removed unless the gene is listed in `keep_genes`.

    Parameters
    ----------
    oligo_df : pandas.DataFrame
        Oligo table with a 'gene' column; it is not modified.
    multi_transcripts_cutoff : int
        Currently unused; kept so existing callers keep working.
    min_oligos : int
        Minimum number of oligos a gene needs to be retained.
    keep_genes : iterable of str
        Genes that are retained even when they fall below `min_oligos`.
        Defaults to the list that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        A copy of `oligo_df` without the dropped genes.
    """
    # Imported locally so the function does not depend on a later notebook cell.
    from collections import Counter

    under_supplied = []
    for g, count in Counter(oligo_df.gene).items():
        if count < min_oligos:
            print(g, count)
            if g not in keep_genes:
                under_supplied.append(g)
    # A single boolean mask replaces the repeated in-place index drops.
    return oligo_df[~oligo_df.gene.isin(under_supplied)].copy()

# def balance_readouts(df, per_tid=64, fa_out='mergos.fa'):
#     from itertools import repeat
#     tids = df.groupby(group)
#     f = open(fa_out, 'w')
#     new_df = pd.DataFrame()
#     counters = []
#     for name, group in tids:
#         r_used = pd.unique(np.concatenate((group.ro1.unique(),group.ro2.unique(),group.ro3.unique())))
        
def balance_readouts(df, primersL, primersR, readouts, per_tid=64, group='tid',
                     fa_out='python_mergos.fa', sep='__'):
    """
    Re-assemble oligos so each group cycles evenly through its readouts, and write them as FASTA.

    For every group of `df` (grouped by the `group` column) the distinct
    readout names found in its ro1/ro2/ro3 columns are collected. Up to
    `per_tid` oligos are picked at random and each receives the next three
    readouts from that list, wrapping around as needed. Each oligo is built
    in one of two randomly chosen layouts:

    * pleft + 'A' + RO1 + RO2 + encodingRegion + 'A' + RO3 + pright
    * pleft + 'A' + RO1 + encodingRegion + 'A' + RO2 + RO3 + pright

    Uses the global numpy random state (`np.random.shuffle`, `np.random.randint`).

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ro1, ro2, ro3, pleft, pright, encodingRegion, gene,
        tid, start, experiment, plus the `group` column.
    primersL, primersR
        Currently unused; kept so existing callers keep working.
    readouts : dict
        Readout name -> readout sequence. A missing name raises KeyError.
    per_tid : int
        Maximum number of oligos written per group.
    group : str
        Column to group by.
    fa_out : str
        Output FASTA path (overwritten).
    sep : str
        Separator used between the header fields.

    Returns
    -------
    (new_df, fa_out, counters)
        new_df : always an empty DataFrame (kept for interface compatibility).
        fa_out : the output path.
        counters : group name -> Counter of how often each readout was used.
    """
    # Imported locally so the function does not depend on a later notebook cell.
    from collections import Counter

    counters = {}
    # The context manager closes the FASTA file even if a lookup below raises.
    with open(fa_out, 'w') as fasta:
        for name, members in df.groupby(group):
            r_used = pd.unique(np.concatenate((members.ro1.unique(),
                                               members.ro2.unique(),
                                               members.ro3.unique())))
            n_readouts = len(r_used)
            oligo_index = members.index.tolist()
            np.random.shuffle(oligo_index)
            oligo_index = oligo_index[:per_tid]
            usage = Counter()
            for position, idx in enumerate(oligo_index):
                # Walk through the readout list three at a time, wrapping around
                # (equivalent to indexing a repeated copy of the list).
                base_idx = 3 * position
                ro1 = r_used[base_idx % n_readouts]
                ro2 = r_used[(base_idx + 1) % n_readouts]
                ro3 = r_used[(base_idx + 2) % n_readouts]
                ro1_seq = readouts[ro1]
                ro2_seq = readouts[ro2]
                ro3_seq = readouts[ro3]

                usage.update([ro1, ro2, ro3])
                row = members.loc[idx]
                # Coin flip between the two oligo layouts.
                if np.random.randint(0, high=2):
                    oligo = row.pleft+'A'+ro1_seq+ro2_seq+row.encodingRegion+'A'+ro3_seq+row.pright
                else:
                    oligo = row.pleft+'A'+ro1_seq+row.encodingRegion+'A'+ro2_seq+ro3_seq+row.pright
                # Diagnostic only: report unexpected component lengths.
                if (len(row.encodingRegion) != 30) or (len(ro1_seq) != 20):
                    print(len(row.encodingRegion), len(ro1_seq))
                header = ">"+row.gene+sep+row.tid+sep+str(row.start)+sep+ro1+sep+ro2+sep+ro3+sep+row.experiment+'\n'
                fasta.write(header)
                fasta.write(oligo+'\n')
            counters[name] = usage
            # Diagnostic only: report groups that use more than four readouts.
            if len(usage.keys())>4:
                print(name)
    return pd.DataFrame(), fa_out, counters

In [133]:
# Update the spreadsheet with probe design data: the number of oligos per gene.
# (The original note also lists sequences and FPKM; this cell only writes the counts.)
# Only the DataFrame (first element of the returned tuple) is needed here.
df = parse_merfish_oligos('/bigstore/GeneralStorage/Rob/merfish/MERFISH_analysis-master/mouse/Cornea_Wound/Cornea_Wound_oligos.fasta')[0]
from collections import Counter
# Number of parsed oligos per gene.
counts = Counter(df.gene)
num_oligos = expression_worksheet.range('E2:E189')
gnames = expression_worksheet.range('A2:A189')
for k, v in counts.items():
    # Position of the first cell in A2:A189 whose value equals this gene name.
    # NOTE(review): `[0]` raises IndexError when a gene from the FASTA has no
    # matching cell in that range - confirm every gene is listed in the sheet.
    idx = [i for i, g in enumerate(gnames) if g.value==k][0]
    num_oligos[idx].value = v
# Write all counts back in one batch call.
expression_worksheet.update_cells(num_oligos)

{'spreadsheetId': '11MMvEY4s8oUIGf5kyoA4QOKs-Br2myyNooNmSO7-nEU',
 'updatedCells': 188,
 'updatedColumns': 1,
 'updatedRange': 'Sheet1!E2:E189',
 'updatedRows': 188}