In [4]:
import pandas as pd
import os
from Bio import SeqIO
import numpy as np
import re
import sys

In [None]:
# Supplementary material: Binding site data
# Read the first sheet of the workbook. The first parsed data row is promoted to
# the column names (later cells use 'Transcript', 'Position' and
# 'Crosslinked base') and is then dropped from the data rows.
with pd.ExcelFile('S3.xls') as workbook:  # context manager closes the file handle
    bind_pos = workbook.parse(0)
bind_pos.columns = bind_pos.iloc[0, :]
bind_pos = bind_pos.iloc[1:, :]

In [None]:
# FASTA file
## Download mm9 Refseq mRNA fasta file
!wget http://hgdownload.soe.ucsc.edu/goldenPath/mm9/bigZips/mrna.fa.gz
!gunzip mrna.fa.gz

In [None]:
## Load fasta file
## Build a lookup from record id to its sequence as a plain string.
## If an id occurs more than once, the last record wins.
with open('refMrna.fa') as fasta_handle:  # closed as soon as parsing is finished
    fasta = SeqIO.parse(fasta_handle, 'fasta')
    fasta_dict = {record.id: str(record.seq) for record in fasta}

In [None]:
# Match binding site data with fasta file
## Keep only binding sites whose transcript is present in the FASTA lookup.
bind_pos_1 = bind_pos[bind_pos['Transcript'].isin(fasta_dict.keys())]

## For every site collect [base in FASTA, crosslinked base, window around the site].
## 'Position' is used directly as a 0-based string index into the transcript.
tmp = list()
for transcript, position, crosslinked_base in zip(
        bind_pos_1['Transcript'], bind_pos_1['Position'], bind_pos_1['Crosslinked base']):
    transcript_seq = fasta_dict[transcript]
    # '>=' rather than '>': an index equal to the length is already out of
    # range and would raise IndexError on the lookup below.
    if position >= len(transcript_seq):
        tmp.append(['NA', crosslinked_base, 'NA'])
    else:
        # The window is 10 bases either side of the site (21 in total). Near the
        # transcript ends the slice comes back shorter than 21 (or empty when
        # position < 10, because the start index goes negative).
        tmp.append([
            transcript_seq[position].upper(),
            crosslinked_base,
            transcript_seq[position - 10:position + 11].upper(),
        ])

In [None]:
# Turn the collected per-site rows into a table.
# Columns, by position: 0 = base in fasta, 1 = base in binding site data,
# 2 = sequence window around the site ('NA' when the position was out of range).
tmp_df = pd.DataFrame(data=tmp)
tmp_df

In [None]:
# Sequences to exclude
## First number: sites whose position lies beyond the transcript in the FASTA file ('NA' rows).
## Second number: sites whose FASTA base differs from the crosslinked base
## (the 'NA' rows are included in this count as well).
n_out_of_range = tmp_df[0].tolist().count('NA')
n_base_mismatch = sum(1 for fasta_base, crosslinked_base, _ in tmp if fasta_base != crosslinked_base)
print([n_out_of_range, n_base_mismatch])

In [None]:
# Final sequences with length of 21
## Step 1: keep the windows of sites whose FASTA base equals the crosslinked base.
base_agrees = [row[0] == row[1] for row in tmp]
sequences = tmp_df.iloc[base_agrees, 2].tolist()
## Step 2: drop windows that are not exactly 21 characters long.
final_seq = list(filter(lambda window: len(window) == 21, sequences))

In [None]:
# Write the final sequences to disk, one per line, with T converted to U -> for WebLogo
with open('final_seq_U.txt', 'w') as out_file:
    out_file.writelines(dna_seq.replace('T', 'U') + '\n' for dna_seq in final_seq)

In [None]:
# Label encoder
def char_to_number(seq):
    """Encode a nucleotide sequence as a list of integer labels.

    The mapping is A -> 0, C -> 1, G -> 2, T -> 3.

    Parameters
    ----------
    seq : iterable of str
        Sequence of single-character bases (upper case).

    Returns
    -------
    list of int
        One label per character of ``seq``, in order.

    Raises
    ------
    KeyError
        If ``seq`` contains any character other than 'A', 'C', 'G' or 'T'.
    """
    base_to_label = dict(zip('ACGT', range(4)))
    return list(map(base_to_label.__getitem__, seq))

In [None]:
# Label encoding final sequences
## One row per sequence, one integer-coded base per column.
final_seq_df = pd.DataFrame(list(map(char_to_number, final_seq)))

In [None]:
# Generate random sequences (expected to be negative control data)
RANDOM_SEED = 42           # fixed seed so the negative set is reproducible across runs
FLANK = 10                 # bases kept on each side of the centre -> 21-base windows
MAX_REDRAWS = 1000         # stop instead of looping forever if no clean window turns up
VALID_BASES = set('ACGT')  # the only characters char_to_number can encode

rng = np.random.default_rng(RANDOM_SEED)

# Transcripts without any matched binding site. Sorted, because set iteration
# order is not stable between Python sessions and would defeat the fixed seed.
not_in_binding = sorted(set(fasta_dict.keys()).difference(set(bind_pos_1['Transcript'])))
# A transcript shorter than one full window has no valid centre position.
eligible = [name for name in not_in_binding if len(fasta_dict[name]) >= 2 * FLANK + 1]
if len(final_seq) > 0 and not eligible:
    raise ValueError(
        'No transcript outside the binding site data is long enough '
        f'to sample a {2 * FLANK + 1}-base window from'
    )

# Draw one random window per positive sequence so both classes have the same size.
random_seq = list()
random_trans = list()
for _ in range(len(final_seq)):
    for _attempt in range(MAX_REDRAWS):
        name = eligible[int(rng.integers(len(eligible)))]
        transcript_seq = fasta_dict[name]
        # Centre is drawn from [FLANK, len - FLANK), so the window always fits.
        center = int(rng.integers(FLANK, len(transcript_seq) - FLANK))
        window = transcript_seq[center - FLANK:center + FLANK + 1].upper()
        # Redraw when the window holds a character the encoder cannot map (e.g. 'N').
        if VALID_BASES.issuperset(window):
            break
    else:
        raise RuntimeError(
            f'Could not find a window made only of A/C/G/T after {MAX_REDRAWS} draws'
        )
    random_trans.append(name)
    random_seq.append(char_to_number(window))

random_trans = np.array(random_trans, dtype=str)  # transcripts actually sampled, in draw order
random_seq_df = pd.DataFrame(random_seq)

In [None]:
# Make total sequences
## Label the random sequences 0 and the binding-site sequences 1, then stack them.
random_seq_df['label'] = 0
final_seq_df['label'] = 1
# ignore_index=True: both inputs are numbered from 0, so without it every index
# label would occur twice in total_df (and in the CSV written from it).
total_df = pd.concat([random_seq_df, final_seq_df], ignore_index=True)
total_df

In [None]:
# Write the combined, labelled sequence table to a CSV file
output_csv = 'total_sequences.csv'
total_df.to_csv(output_csv)