In [47]:
from Bio import SeqIO, Seq
import csv

In [53]:
def get_seq(start, strand, genome='Genome Info/U18466.2.fasta', extend=100):
    """Return the window of `extend` bases on each side of a 1-based position.

    Parameters
    ----------
    start : int
        1-based coordinate at the centre of the window.
    strand : str
        '+' returns the window as-is; any other value returns its reverse
        complement.
    genome : str
        Either the sequence itself or a FASTA path. A value that starts with
        'Genome' is treated as a path and parsed with SeqIO; if the file holds
        several records, the last one is used.
    extend : int
        Number of bases taken on each side of `start`.

    Returns
    -------
    str
        At most 2 * extend + 1 bases; fewer when the window runs past either
        end of the sequence.

    Raises
    ------
    ValueError
        If `genome` is a FASTA path that yields no records.
    """
    if genome.startswith('Genome'):
        sequence = None
        # Use a context manager so the handle is closed even if parsing fails.
        with open(genome) as fasta_handle:
            for rec in SeqIO.parse(fasta_handle, 'fasta'):
                sequence = str(rec.seq)
        if sequence is None:
            # Without this guard the path string itself would be sliced below.
            raise ValueError('No FASTA records found in %s' % genome)
        genome = sequence
    # Clamp the left edge: a negative slice start would wrap around to the end
    # of the sequence and return an unrelated (usually empty) window.
    left = max(start - 1 - extend, 0)
    window = genome[left:start + extend]
    if strand == '+':
        return window
    return str(Seq.reverse_complement(window))

def get_overlapped(start, end, strand, tss_poss):
    """Return the first TSS on `strand` that lies inside [start, end].

    `tss_poss` is iterated as (position, strand) pairs, so a dict keyed by
    such tuples works as well as a list of them. Both interval ends are
    inclusive. The matching (position, strand) tuple is returned, or False
    when nothing overlaps.
    """
    hits = (
        (pos, pos_strand)
        for pos, pos_strand in tss_poss
        if pos_strand == strand and start <= pos <= end
    )
    # Only the first hit in iteration order is reported.
    return next(hits, False)

def count_lgaps(seq):
    """Count the run of '-' characters at the very start of `seq`.

    Returns 0 for an empty sequence or one that does not begin with '-'.
    """
    leading = 0
    symbols = iter(seq)
    # Stops at the first non-gap symbol, or when the iterator is exhausted
    # (the None default never equals '-').
    while next(symbols, None) == '-':
        leading += 1
    return leading

def get_upstream_perfect_match(seq, ref, start, end, tss, strand, n_inside=5, trimG=True):
    """Split a read and its reference around the TSS using an exact 10-mer anchor.

    The TSS offset inside `ref` is `tss - start` when `strand` is '+', and
    `end - tss` otherwise. The 10 reference characters starting at that
    offset are searched for in `seq`; the first occurrence marks the TSS in
    the read, and everything before it is the read's upstream part.

    Parameters
    ----------
    seq : str
        Sequenced read.
    ref : str
        Reference sequence for the same interval.
        NOTE(review): presumably already oriented 5'->3' like the read - confirm.
    start, end : int
        Interval coordinates of the read on the genome.
    tss : int
        TSS coordinate; must fall inside the interval so the offset indexes `ref`.
    strand : str
        '+' or anything else (treated as the minus strand).
    n_inside : int
        Exclusive end of the downstream slice relative to the TSS, so the
        'downstream' entries hold at most n_inside - 1 characters.
    trimG : bool
        When true and the read's upstream part starts with 'G', the first
        character is removed from both the read and reference upstream parts.

    Returns
    -------
    dict
        {'seq': {...}, 'ref': {...}}, each with 'upstream', 'TSS' and
        'downstream' strings.

    Raises
    ------
    ValueError
        If the reference anchor does not occur in `seq`.
    """
    # Distance from the first character of `ref` to the TSS.
    tss_offset = tss - start if strand == '+' else end - tss
    ref_anchor = ref[tss_offset:tss_offset + 10]
    ref_up = {'upstream': ref[:tss_offset],
              'TSS': ref[tss_offset],
              # Sliced from `ref`: the original sliced `seq` at the reference
              # offset, which returns the wrong bases as soon as the read
              # carries extra upstream sequence.
              'downstream': ref[tss_offset + 1:tss_offset + n_inside]}
    try:
        seq_i = seq.index(ref_anchor)
    except ValueError:
        raise ValueError("No perfect match found for", ref_anchor)
    seq_up = {'upstream': seq[:seq_i],
              'TSS': seq[seq_i],
              'downstream': seq[seq_i + 1:seq_i + n_inside]}
    if trimG and seq_up['upstream'].startswith('G'):
        # Drop one leading position from both sequences so they stay in step.
        seq_up['upstream'] = seq_up['upstream'][1:]
        ref_up['upstream'] = ref_up['upstream'][1:]

    return {'seq': seq_up, 'ref': ref_up}

def get_upstream_aln(seq, ref, start, end, tss, strand, downstream=5):
    """Locate the TSS inside a read via a global alignment against its reference.

    `seq` is aligned to `ref` with `pairwise2.align.globalms` (match 1,
    mismatch -10, gap open -10, gap extend -0.1) and only the first alignment
    is used. The read's boundary coordinate is then derived from `start`
    ('+' strand) or `end` (any other strand), corrected by the number of gap
    characters inside the aligned reference and by the number of leading gaps
    in the aligned read.

    Parameters
    ----------
    seq, ref : str
        Read and reference sequences.
    start, end : int
        Interval coordinates of the read on the genome.
    tss : int
        TSS coordinate.
    strand : str
        '+' or anything else (treated as the minus strand).
    downstream : int
        Number of extra characters kept past the computed shift.

    Returns
    -------
    tuple
        (`seq[:shift + downstream]` reversed, shift), where `shift` is the
        distance from the read's corrected boundary to the TSS.
    """
    # The notebook's import cell only brings in SeqIO and Seq, so `pairwise2`
    # was an undefined name here; import it locally to keep the function usable.
    from Bio import pairwise2

    alns = pairwise2.align.globalms(seq, ref, 1, -10, -10, -0.1, one_alignment_only=True)
    seq_aln, ref_aln = alns[0][:2]
    # Gaps that remain after stripping both ends are internal to the reference.
    ref_internal_gaps = ref_aln.strip('-').count('-')
    # count_lgaps returns 0 when the aligned read has no leading gap, so no
    # separate startswith('-') check is needed.
    seq_leading_gaps = count_lgaps(seq_aln)
    if strand == '+':
        seq_start = start - ref_internal_gaps + seq_leading_gaps
        shift = tss - seq_start
    else:
        seq_end = end + ref_internal_gaps - seq_leading_gaps
        shift = seq_end - tss
    return seq[:(shift+downstream)][::-1], shift

def count_mismatches(seq, ref, first=25):
    """Count positions where `seq` and `ref` differ within the first `first` characters.

    If `seq` starts with 'G', the first character of both `seq` and `ref` is
    dropped before comparing. Only positions present in both (trimmed)
    sequences are compared, so inputs shorter than `first` are handled
    instead of raising IndexError.

    Parameters
    ----------
    seq, ref : str
        Sequences to compare position by position.
    first : int
        Maximum number of leading positions to compare.

    Returns
    -------
    int
        Number of mismatching positions.
    """
    if seq.startswith('G'):
        seq = seq[1:]
        ref = ref[1:]
    # zip stops at the shorter input, which bounds the comparison safely.
    return sum(1 for s, r in zip(seq[:first], ref[:first]) if s != r)

def get_ref(genome, tss, strand, additional=10):
    """Return the genomic context of a TSS as 'upstream TSS downstream'.

    The three parts are separated by single spaces: up to `additional` bases
    before the TSS, the TSS base itself, and up to `additional` bases after.

    Parameters
    ----------
    genome : str
        Genome sequence.
    tss : int
        1-based TSS coordinate (the TSS base is `genome[tss - 1]`).
    strand : str
        '+' returns the context as-is; any other value returns the reverse
        complement of the whole spaced string.
    additional : int
        Number of flanking bases on each side.

    Returns
    -------
    str
        The spaced context string.
    """
    # Clamp the left edge: a negative slice start would wrap to the end of
    # the genome and yield an empty or unrelated upstream part.
    left = max(tss - 1 - additional, 0)
    context = genome[left:tss-1] + ' ' + genome[tss-1] + ' ' + genome[tss:tss+additional]
    if strand == '+':
        return context
    return str(Seq.reverse_complement(context))

In [50]:
# Load the reference genome as a plain string. If the FASTA holds several
# records, `genome` ends up as the last one. The context manager closes the
# handle, which the bare open() call left dangling.
with open('Genome Info/U18466.2.fasta') as fasta_handle:
    for rec in SeqIO.parse(fasta_handle, 'fasta'):
        genome = str(rec.seq)

In [28]:
# Hand-made sanity check for get_upstream_perfect_match: `seq` and `ref`
# differ only in their first nine characters.
ref = 'ACCCCCAAGAGAGAGGTTGGCTCACTATAGCTAGCTATAGCTAGCTATAGCTAGTAGGCAATCTACCAGTACTTTGTGGAC'
seq = 'GAGAGAGAGAGAGAGGTTGGCTCACTATAGCTAGCTATAGCTAGCTATAGCTAGTAGGCAATCTACCAGTACTTTGTGGAC'
# Both calls use a TSS offset of 9 (10 - 1 for '+', 100 - 91 for '-'),
# so they print the same split.
print(get_upstream_perfect_match(seq, ref, 1, 100, 10, '+'))
print(get_upstream_perfect_match(seq, ref, 1, 100, 91, '-'))

{'seq': {'upstream': 'AGAGAGAG', 'TSS': 'A', 'downstream': 'GAGA'}, 'ref': {'upstream': 'CCCCCAAG', 'TSS': 'A', 'downstream': 'GAGA'}}
{'seq': {'upstream': 'AGAGAGAG', 'TSS': 'A', 'downstream': 'GAGA'}, 'ref': {'upstream': 'CCCCCAAG', 'TSS': 'A', 'downstream': 'GAGA'}}


In [57]:
# Map (position, strand) -> name from the TSS BED file: the third
# whitespace-separated field is the position, the last field the strand and
# the fourth field the name. Each line is split once, and the file is closed
# when the block exits.
tss_positions = {}
with open('Genome Info/158_pTSS_NA_pTSS_check_NEW_incl_alt_TSSs.bed') as bed_handle:
    for bed_line in bed_handle:
        fields = bed_line.split()
        tss_positions[(int(fields[2]), fields[-1])] = fields[3]

# gene -> {'seq': TSS context, <replicate>: counters + upstream-sequence tallies}
tss_additions = {}

for file in ['Early-5h/S1_comparison.txt', 'Early-5h/S2_comparison.txt', 
             'Late-16h/S3_comparison.txt', 'Late-16h/S4_comparison.txt']:
    # Replicate label is the file-name prefix before the first '_' (S1..S4).
    repl = file.split('/')[1].split('_')[0]
    # newline='' is the documented way to hand a file to the csv module.
    with open(file, newline='') as comparison_handle:
        for row in csv.DictReader(comparison_handle, delimiter='\t'):
            # NOTE(review): the +1 presumably converts a 0-based start to
            # 1-based - confirm against the comparison file format.
            start, end, strand = int(row['start'])+1, int(row['end']), row['strand']
            tss_pos_strand = get_overlapped(start, end, strand, tss_positions)
            if not tss_pos_strand:
                # Read does not cover any annotated TSS on its strand.
                continue
            tss_pos, tss_strand = tss_pos_strand
            gene = tss_positions[tss_pos_strand]
            seq, ref = row['sequenced'], row['reference']
            if gene not in tss_additions:
                # Computed once per gene; get_ref is not re-run for every read.
                tss_additions[gene] = {'seq': get_ref(genome, tss_pos, tss_strand)}
            if repl not in tss_additions[gene]:
                tss_additions[gene][repl] = {'downstream not found': 0, 
                                             'perfect matches': 0,
                                             'perfect matches but longer': 0,
                                             'total added sequences': 0,
                                             'upstream sequences': {}}
            counts = tss_additions[gene][repl]
            # Keep the try block tight: only the anchor search is expected to
            # raise ValueError.
            try:
                upstream = get_upstream_perfect_match(seq, ref, start, end, tss_pos, strand)
            except ValueError:
                counts['downstream not found'] += 1
                continue
            seq_up = upstream['seq']['upstream']
            if seq_up == '':
                # Read starts exactly at the TSS (after optional G trimming).
                counts['perfect matches'] += 1
            elif seq_up == upstream['ref']['upstream']:
                # Read extends upstream but agrees with the reference there.
                counts['perfect matches but longer'] += 1
            else:
                # Upstream part differs from the reference: tally the sequence.
                counts['total added sequences'] += 1
                added = counts['upstream sequences']
                added[seq_up] = added.get(seq_up, 0) + 1

In [58]:
# Display the nested gene -> replicate -> counters dictionary built in the previous cell.
tss_additions

{'A151R': {'seq': 'AATAATAGTT A TGATGGCGTT',
  'S1': {'downstream not found': 1363,
   'perfect matches': 34768,
   'perfect matches but longer': 92,
   'total added sequences': 464,
   'upstream sequences': {'C': 271,
    'ATATAGTT': 18,
    'A': 37,
    'ACACGACGCTCTTCCGATCTG': 17,
    'ACACGACGCTCTTCCGATCT': 19,
    'ACACGACGCTCTTCCG': 1,
    'TG': 20,
    'G': 38,
    'TC': 2,
    'ATG': 7,
    'AAGCAGCTCCAGCCTACACTAGGGAAT': 1,
    'T': 1,
    'CC': 2,
    'ATGATT': 1,
    'CCGCGCAGATG': 1,
    'ACATGACGCTCTTCCGAT': 1,
    'AG': 4,
    'ACGACGCTCTTCCGATCT': 1,
    'ATGTT': 1,
    'CACGAAGCTCTTCCGATCT': 1,
    'ACACTACGCTCTTCAGATCTG': 1,
    'AT': 2,
    'CTG': 2,
    'ACACGACGCTCTTCCGAT': 1,
    'ACACGACGCTCTTCCGA': 1,
    'N': 4,
    'ATAGTAGTT': 1,
    'CG': 2,
    'ACACGACGATCTTCCGATCT': 1,
    'CACGACGCTCTTCCGATCTG': 1,
    'ATGATAGTT': 1,
    'ACGACGCTCTTCCGATCTG': 1,
    'ATATATAGTT': 1,
    'GGG': 1}},
  'S2': {'downstream not found': 1349,
   'perfect matches': 35290,
   'p

In [69]:
# Report every gene/replicate in which at least 10% of the counted reads carry
# a non-reference upstream sequence, followed by the recurrent short additions.
for gene in tss_additions:
    for repl in ['S1', 'S2', 'S3', 'S4']:
        if repl not in tss_additions[gene]:
            continue
        # Sum the integer counters only; the nested tally dict is skipped.
        total = sum(x for x in tss_additions[gene][repl].values() if type(x) is int)
        n_added = tss_additions[gene][repl]['total added sequences']
        added_fraction = n_added / total * 100
        if added_fraction < 10:
            continue
        print(gene, repl, tss_additions[gene]['seq'], '%.1f%%' % added_fraction, total, sep='\t')
        # Most frequent additions first.
        seq_n = sorted(tss_additions[gene][repl]['upstream sequences'].items(),
                       key=lambda x: x[1], reverse=True)
        for seq, count in seq_n:
            # Show only additions of at most 10 characters seen at least twice.
            if len(seq) <= 10 and count >= 2:
                print(seq, count, sep='\t')
        print()

I73R	S1	AAAAAGAAGT A TACTCTCCTT	18.7%	30170
AT	4833
C	269
ATAT	193
A	68
CAT	31
G	16
TAT	14
AA	14
AAT	8
TT	6
ATATAT	5
ACT	5
GAT	4
N	4
TG	4
CG	2
AC	2
TTAT	2
NAT	2

I73R	S2	AAAAAGAAGT A TACTCTCCTT	18.8%	30441
AT	4956
C	319
ATAT	189
CAT	27
AA	20
A	15
G	12
TG	6
TT	6
TAT	6
ATATAT	3
AAT	3
N	3
GAT	3
TATAT	2
CG	2

I73R	S3	AAAAAGAAGT A TACTCTCCTT	18.8%	24712
AT	3853
C	343
ATAT	135
CAT	31
A	12
G	10
TT	7
TAT	6
ATATAT	6
AA	5
AAT	5
TG	4
ACT	3
TTAT	3
CAGT	3
NAT	2
TATAT	2

I73R	S4	AAAAAGAAGT A TACTCTCCTT	19.1%	24037
AT	3769
C	362
ATAT	140
A	21
CAT	20
G	11
AA	8
TT	8
ATATAT	5
N	5
TAT	3
AAT	3
CATATAT	2
ACT	2

U104L	S3	TTTTCGTAAT A TAACACTACA	10.2%	4537
ATAT	37
C	13
ATA	11
A	5
ATAAT	3

NA2	S3	AAATAAATAC A GAGAGGTTGG	17.2%	22292
ATATAC	891
ATATATAC	430
ATATAAATAC	318
AG	304
G	38
ATATATATAC	36
GATATAC	30
AAAATAC	26
ATAAAATAC	23
AGAG	17
TTAC	13
ATAATAC	13
AAAC	9
TG	9
AAAAATAC	9
ATAAAAAC	8
T	7
GATAC	7
ATACATAC	7
CATAC	6
GATATATAC	6
A	6
ATAA	6
CATATAC	6
GATAAATAC	5
CATAAATAC	4
ATAAATAC	4
CATATATAC	4
ATAAATAA	4

ATAAAA	2
GTATATAC	2
AAATATAA	2
ATATATGC	2
TTAATATAC	2
TAATAC	2
ATGTATAC	2
ATAATAAAC	2
TAAATATAC	2
ACACAC	2
CATATAA	2
CATAAAC	2
ATATTTAC	2
ATATNTAC	2
TCAT	2
ATAGA	2
ATATATTC	2
ATATAGAG	2
CTGAC	2
ATAAATATAC	2
ATATAGTAT	2
ATATATACAC	2
TGATATAC	2

KP360L	S1	TTCGGAAATA A TTATTTTGCA	11.1%	153
ATT	8
C	4
ATTATTTT	2

KP360L	S3	TTCGGAAATA A TTATTTTGCA	11.2%	80
ATATA	3
T	2

KP360L	S4	TTCGGAAATA A TTATTTTGCA	10.2%	98
ATTT	2

A505R	S1	TTTGGTAAAC A AATGTTTTCT	61.9%	97
A	57

A505R	S2	TTTGGTAAAC A AATGTTTTCT	56.2%	121
A	63
CA	2

A505R	S3	TTTGGTAAAC A AATGTTTTCT	23.8%	261
A	50

A505R	S4	TTTGGTAAAC A AATGTTTTCT	20.7%	246
A	36
T	2

O174L	S1	TTAATATTAA A TATAAAATGT	25.6%	39
AT	7
ATA	3

O174L	S2	TTAATATTAA A TATAAAATGT	27.3%	44
AT	10
ATA	2

O174L	S3	TTAATATTAA A TATAAAATGT	15.2%	1762
AT	170
ATATATTAA	27
ATA	22
C	15
ATAT	6
ATATA	6
ATAA	3
CAT	2
AAATA	2

O174L	S4	TTAATATTAA A TATAAAATGT	15.4%	1489
AT	142
ATA	28
ATATATTAA	19
C	8
ATATA	7
ATAT	6
CAT	3
TAT	2

DP60R	S1	AGCGGTAATA A TAATTGATAC	42.1%	19
AT	4
A	2

DP

TAT	2
ATAAT	2

CP530R	S4	AAATTATAAA A TAATAAGAAG	15.7%	7252
AT	987
ATAT	53
ATA	34
C	11
ATATA	7
T	4
CAT	4
TT	3
G	2
TAT	2

C129R	S2	TACTTTTATT A TATATATGGA	36.4%	11

C129R	S3	TACTTTTATT A TATATATGGA	23.8%	5160
AT	137
ATAT	8
C	6
TAT	3
AA	2
ATATAT	2

C129R	S4	TACTTTTATT A TATATATGGA	22.9%	7784
AT	187
ATAT	13
C	8
TAT	2

I177L	S1	GAATATAATA A TCTTTTAATG	100.0%	2

I177L	S2	GAATATAATA A TCTTTTAATG	71.4%	7

H124R	S1	AAAAATCATT A TAAAATGAAT	100.0%	1

H124R	S3	AAAAATCATT A TAAAATGAAT	38.2%	2063
AT	608
ATAT	96
ATA	17
ATATAT	13
ATATA	9
TATAT	7
C	6
CAT	4
AGCG	2
A	2

H124R	S4	AAAAATCATT A TAAAATGAAT	33.9%	2064
AT	545
ATAT	88
ATA	12
ATATAT	7
CAT	6
AA	4
TATAT	4
C	4
ATATA	4
GAT	3
A	3
ATAA	2
ATAAT	2
ATATATA	2

H339R	S1	ACTAACTAAT A AATGGCCGGT	25.0%	4

H339R	S2	ACTAACTAAT A AATGGCCGGT	12.5%	8

H339R	S3	ACTAACTAAT A AATGGCCGGT	29.5%	14457
A	1620
AAA	1168
AA	457
AAAA	435
AAAAAA	170
AAAAA	97
C	97
AAAAAAA	84
AAAAAAAA	36
ATAT	8
CAAA	7
AAAAAAAAA	6
TAA	5
CAA	5
ATAAT	5
TA	4
TAAA	3
N	3
CA	2

H339R	S4	ACTAACTAAT A 