In [82]:
import pandas as pd
import numpy as np
from math import *

In [160]:
def step1(seq):
    dict_symbols = {
        'P':'★',
        'G':'⯁',
        'T':'☐',
        'S':'▣'
    }
    output = ''
    for i in range(len(seq)):
        if seq[i] in dict_symbols:
            output += dict_symbols[seq[i]]
        else:
            output += seq[i]
    return output

def step2(seq):
    # list of contiguous strong hydrophobic amino acids
    list_aa = ['V', 'I', 'L', 'F', 'M', 'Y', 'W']
    output = ''
    for i in range(len(seq)):
        if seq[i] in list_aa:
            output += '1'
        else:
            output += '0'
    return output

def step3(seq):
    boundaries = set()
    for i in range(len(seq)-3):
        if seq[i:i+4]=='0000':
            for j in range(4):
                boundaries.add(i+j)
    return boundaries

def step4(seq, boundaries):
    n = len(seq)
    double_helix = np.empty((8, ceil(n/4)+1), dtype = "object")
    for i in range(n):
        if i not in boundaries:
            line = i%4
            column = int(i/4)
            double_helix[line, 1+column] = seq[i]
            double_helix[line+4, column] = seq[i]
    double_helix[double_helix==None] = '-'
    return double_helix

def step5(seq, boundaries):
    structs = []
    struct = ''
    for i in range(len(seq)):
        if i not in boundaries:
            struct += seq[i]
        elif struct!='':
            structs.append(struct)
            struct = ''
    return structs

def binary_coding(seq):
    # remove "."
    seq = seq.replace('.', '')
    # transform into another form with special symbols denoting amino acids with particular structural behaviours
    seq1 = step1(seq)
    # transform into binary sequence
    seq2 = step2(seq)
    # find boundaries where there are not particular structure (helix or sheet)
    boundaries = step3(seq2)
    # vizualize horizontal clusters
    mat = step4(seq1, boundaries)
    # get potential structures
    structs = step5(seq1, boundaries)
    
    return seq1, seq2, boundaries, mat, structs

In [161]:
seq1, seq2, boundaries, mat, structs = binary_coding("AIQTSDEH...KV..ATPANW....RP..................GDK.V.VVP.PP.NTQE.MA..EERMKEG")

In [162]:
for line in mat:
    print(str(line))

['-' 'A' '-' '-' '-' '-' '-' '-' '-' '-' 'M']
['-' 'I' '-' 'V' '-' '-' 'V' '-' '-' '-' 'K']
['-' '-' '-' '-' '-' '-' 'V' '-' '-' '-' 'E']
['-' '-' '-' '-' 'W' '-' 'V' '-' 'M' '-' '⯁']
['A' '-' '-' '-' '-' '-' '-' '-' '-' 'M' '-']
['I' '-' 'V' '-' '-' 'V' '-' '-' '-' 'K' '-']
['-' '-' '-' '-' '-' 'V' '-' '-' '-' 'E' '-']
['-' '-' '-' 'W' '-' 'V' '-' 'M' '-' '⯁' '-']


In [143]:
seq2

'0100000001000001000001110000000100001000'

In [163]:
structs

['AI', 'V', 'W', 'VVV', 'M']