In [29]:
import pandas as pd
import numpy as np
from math import *

In [35]:
def step1(seq):
    dict_symbols = {
        'P':'★',
        'G':'⯁',
        'T':'☐',
        'S':'▣'
    }
    output = ''
    for i in range(len(seq)):
        if seq[i] in dict_symbols:
            output += dict_symbols[seq[i]]
        else:
            output += seq[i]
    return output

def step2(seq):
    # list of contiguous strong hydrophobic amino acids
    list_aa = ['V', 'I', 'L', 'F', 'M', 'Y', 'W']
    output = ''
    for i in range(len(seq)):
        if seq[i] in list_aa:
            output += '1'
        else:
            output += '0'
    return output

def step3(seq):
    boundaries = set()
    for i in range(len(seq)-3):
        if seq[i:i+4]=='0000':
            for j in range(4):
                boundaries.add(i+j)
    return boundaries # (n-3)+4*#boundaries

def step4(seq, boundaries):
    n = len(seq)
    double_helix = np.empty((8, ceil(n/4)+1), dtype = "object")
    for i in range(n):
        if i not in boundaries:
            line = i%4
            column = int(i/4)
            double_helix[line, 1+column] = seq[i]
            double_helix[line+4, column] = seq[i]
    double_helix[double_helix==None] = '-'
    return double_helix

def step5(seq, boundaries):
    structs = []
    struct = ''
    for i in range(len(seq)):
        if i not in boundaries:
            struct += seq[i]
        elif struct!='':
            structs.append(struct)
            struct = ''
    return structs

def step6(structs):
    Q_codes = []
    for struct in structs:
        Q_code =''
        for i in range(len(struct)-1):
            if struct[i:i+2]=='11':
                Q_code+='V'
            elif struct[i:i+3]=='101':
                Q_code+='M'
            elif struct[i:i+4]=='1001':
                Q_code+='U'
            elif struct[i:i+5]=='10001':
                Q_code+='D'
        if Q_code!='':
            Q_codes.append(Q_code)
    return Q_codes

def binary_coding(seq):
    # remove "." and "-"
    seq = seq.replace('.', '')
    seq = seq.replace('-', '')
    # transform into another form with special symbols denoting amino acids with particular structural behaviours
    seq1 = step1(seq)
    # transform into binary sequence
    seq2 = step2(seq)
    # find boundaries where there are not particular structure (helix or sheet)
    boundaries = step3(seq2)
    # vizualize horizontal clusters
    mat = step4(seq1, boundaries)
    # get potential structures
    structs = step5(seq2, boundaries)
    # get Q-code
    Q_codes = step6(structs)
    
    return seq1, seq2, boundaries, mat, structs, Q_codes

In [33]:
def read_data(file):
    f = open(file, 'r')
    data = []
    data_i = []
    for line in f.readlines():
        if line[0]=='/':
            data.append(data_i)
        elif line[0]!='#':
            data_i.append(line.split()[1])
    f.close()
    return data

def get_HC(file):
    data = read_data(file)
    all_Q_codes = []
    for i in range(len(data)):
        Q_codes = []
        for j in range(len(data[i])):
            Q_codes.append(binary_coding(data[i][j])[5])
        all_Q_codes.append(Q_codes)
    return all_Q_codes

In [39]:
get_HC('part_of_data.txt')

[[['VV'],
  ['VV'],
  ['M', 'V'],
  ['M', 'MD', 'VV'],
  ['V', 'VV'],
  ['VV', 'M'],
  ['V', 'VV'],
  ['VV'],
  ['M'],
  ['M', 'DMV'],
  ['V'],
  ['DD', 'VV'],
  ['VV'],
  ['MD', 'VV'],
  ['VV'],
  ['UUMDM', 'VV'],
  ['M'],
  ['VV'],
  ['VV'],
  ['V'],
  ['VV'],
  ['M', 'D'],
  ['VV'],
  ['M', 'VV'],
  ['M'],
  ['M'],
  ['M', 'D'],
  ['VV'],
  ['MDU'],
  ['MDU'],
  ['M', 'D'],
  ['MV'],
  ['M', 'VV'],
  ['M'],
  ['V', 'V'],
  ['M', 'DM', 'VVVU'],
  ['M'],
  ['VV'],
  ['MV', 'V'],
  ['M'],
  ['VV'],
  ['VV'],
  ['M', 'VVD'],
  ['M', 'V'],
  ['M', 'VU'],
  ['VV'],
  ['M'],
  ['VV'],
  ['M', 'V'],
  ['M'],
  ['M'],
  ['V', 'V'],
  ['M'],
  ['M'],
  ['MV'],
  ['M'],
  ['M'],
  ['M'],
  ['MV'],
  ['M', 'VVD'],
  ['M'],
  ['MV'],
  ['M'],
  ['VV'],
  ['VV'],
  ['M', 'V'],
  ['VVV'],
  ['M'],
  ['M'],
  ['MV', 'VV'],
  ['M'],
  ['M', 'V'],
  ['M', 'UM'],
  ['VV'],
  ['M'],
  ['M'],
  ['M'],
  ['M', 'V'],
  ['M', 'V'],
  ['M', 'D'],
  ['M', 'V'],
  ['MDU', 'D'],
  ['MDU'],
  ['M', 'D'],
  ['M'

In [36]:
seq1, seq2, boundaries, mat, structs, Q_code = binary_coding("AIQTSDEH...KV..ATPANW....RP..................GDK.V.VVP.PP.NTQE.MA..EERMKEG")

In [12]:
for line in mat:
    print(str(line))

['-' 'A' '-' '-' '-' '-' '-' '-' '-' '-' 'M']
['-' 'I' '-' 'V' '-' '-' 'V' '-' '-' '-' 'K']
['-' '-' '-' '-' '-' '-' 'V' '-' '-' '-' 'E']
['-' '-' '-' '-' 'W' '-' 'V' '-' 'M' '-' '⯁']
['A' '-' '-' '-' '-' '-' '-' '-' '-' 'M' '-']
['I' '-' 'V' '-' '-' 'V' '-' '-' '-' 'K' '-']
['-' '-' '-' '-' '-' 'V' '-' '-' '-' 'E' '-']
['-' '-' '-' 'W' '-' 'V' '-' 'M' '-' '⯁' '-']


In [13]:
seq2

'0100000001000001000001110000000100001000'

In [38]:
structs

['01', '1', '1', '111', '1']

In [37]:
Q_code

['VV']