In [11]:
from typing import List
from tools.Dict import Dict as UDict
from copy import deepcopy
from itertools import repeat

import re
# Matches runs of word characters, dots and commas. Written as a raw string
# so that the regex escapes are not interpreted as (invalid) string escapes.
tokenize = re.compile(r"[\w\.,]+")

class Preprocessing:
    """Helpers for locating the words of a header row and trimming cell text."""

    @staticmethod
    def getColumnsAndIndex(row):
        """Locate every run of non-space characters in ``row``.

        Only the plain space character ``" "`` is treated as a separator.

        Parameters
        ----------
        row : str
            The line to scan (typically the header row naming the columns).

        Returns
        -------
        list
            One ``UDict(inf=..., sup=...)`` per run, in left-to-right order,
            where ``inf`` and ``sup`` are the inclusive indices of the run's
            first and last character.
        """
        res = []
        begin = None  # start index of the run being scanned; None between runs
        for i, ch in enumerate(row):
            if ch != " ":
                if begin is None:
                    begin = i
            elif begin is not None:
                res.append(UDict(inf=begin, sup=i - 1))
                begin = None
        # A run that reaches the end of the row has no trailing space to close
        # it, so it must be emitted here or the last column would be lost.
        if begin is not None:
            res.append(UDict(inf=begin, sup=len(row) - 1))
        return res

    @staticmethod
    def resolveSpace(x):
        """Strip leading spaces from ``x``.

        Parameters
        ----------
        x : str
            The text to trim.

        Returns
        -------
        str or None
            ``x`` without its leading spaces, or ``None`` when ``x`` is
            non-empty and consists solely of spaces. An empty string is
            returned unchanged.
        """
        stripped = x.lstrip(" ")
        if x and not stripped:
            # Nothing but spaces: signalled with None rather than "".
            return None
        return stripped

class Parser:
    """Split a fixed-width text table into rows of cell strings.

    The column boundaries are taken from the positions of the words in the
    header row, which is the first line containing ``colNamePart``.
    """

    def __init__(self, colNamePart:str, backend:str ):
        """Store the parser configuration.

        Parameters
        ----------
        colNamePart : str
            Substring that identifies the header row of the table.
        backend : str
            Name of the file format; stored but not otherwise used here.
        """
        self.colNamePart = colNamePart
        self.backend = backend

    @staticmethod
    def _cellunit(line, tups):
        """Extract one cell of ``line`` given (previous column, this column)."""
        tupLast, tup = tups
        if line[tup.inf] == ' ':
            # Nothing sits under the first character of the column name:
            # return the raw slice directly below the column name.
            return line[tup.inf:tup.sup + 1]
        # Otherwise the value may extend leftwards into the gap after the
        # previous column, so take that gap as well and trim leading spaces.
        return Preprocessing.resolveSpace(line[tupLast.sup + 1:tup.sup + 1])

    def explain(self,article:str) -> List[List[str]]:
        """Parse ``article`` into a list of rows, each a list of cell strings.

        The first returned row is built from the header row itself; every
        following line of ``article`` yields one further row. The first word
        of the header row only serves as a left boundary and produces no
        column of its own.

        The column layout is cached on the instance (``self.res`` and
        ``self.columns_row``) and is rebuilt only when the header row of
        ``article`` differs from the cached one, so the method can be called
        repeatedly.

        Parameters
        ----------
        article : str
            Full text containing the table.

        Returns
        -------
        List[List[str]]
            Rows of cells. A cell may be ``None`` when
            ``Preprocessing.resolveSpace`` returns ``None``.

        Raises
        ------
        IndexError
            If no line contains ``colNamePart``, or if a line after the header
            is shorter than the start of one of the columns.
        """
        # Always split the current article; doing this only on the first call
        # left `lines` undefined on every later call.
        lines = article.splitlines()

        header = next((line for line in lines if self.colNamePart in line), None)
        if header is None:
            raise IndexError(
                "no line contains the column-name part {part!r}".format(part=self.colNamePart)
            )

        if getattr(self, 'columns_row', None) != header or not hasattr(self, 'res'):
            self.res = Preprocessing.getColumnsAndIndex(header)
            self.columns_row = header

        source = lines[lines.index(header):]
        # Pair every column with its left neighbour.
        pairs = list(zip(self.res[:-1], self.res[1:]))
        return [[self._cellunit(line, pair) for pair in pairs] for line in source]
        

In [12]:
import pandas as pd
# Maps each backend name to a fragment of the header row that names the
# data columns in that backend's files.
backend_sign = {
    "dssp": "  #  RESIDUE AA STRUCTURE BP1 BP2  ACC     N-H-->O ",
}
               
def bio_parse(filename, backend='dssp'):
    """Read ``filename`` and parse its table into a pandas DataFrame.

    Parameters
    ----------
    filename : str
        Path of the file to read.
    backend : str, optional
        Key into ``backend_sign`` selecting the header-row signature to look
        for. Defaults to ``'dssp'``.

    Returns
    -------
    pandas.DataFrame
        One row per line following the header row; the column labels are the
        cells parsed from the header row itself.

    Raises
    ------
    KeyError
        If ``backend`` is not a key of ``backend_sign``. This is checked
        before the file is opened.
    """
    if backend not in backend_sign:
        raise KeyError(
            "Unsupported backend {backend!r}; expected one of {known}".format(
                backend=backend, known=sorted(backend_sign)
            )
        )

    with open(filename) as toRead:
        string = toRead.read()

    # The file is already closed here; parsing only needs the text.
    parser = Parser(colNamePart=backend_sign[backend], backend=backend)
    results = parser.explain(string)
    return pd.DataFrame(results[1:], columns=tuple(results[0]))
    
    
# Test on a DSSP file
if __name__ == '__main__':

    with open(r"1a00.dssp") as f:
        strs = f.read()
    # The header-row signature for the format is looked up in backend_sign.
    parser = Parser(colNamePart = backend_sign['dssp'], backend='dssp')
    results = parser.explain(strs)

In [13]:
# Parse the sample DSSP file into a DataFrame and print it as plain text.
if __name__ == '__main__':
    import pandas as pd
    # df is reused by the cells that follow.
    df = bio_parse('./dssp/1a00.dssp')
    print(df)

     RESIDUE  AA  STRUCTURE  BP1  BP2  ACC    N-H-->O    O-->H-N    N-H-->O  \
0        1 A  V                0    0  132     0, 0.0     2,-0.4     0, 0.0   
1        2 A  L         -      0    0   31    71,-0.1   122,-0.0   125,-0.1   
2        3 A  S      >  -      0    0   45    -2,-0.4     4,-2.2     1,-0.1   
3        4 A  P   H  > S+      0    0   93     0, 0.0     4,-2.5     0, 0.0   
4        5 A  A   H  > S+      0    0   55     1,-0.2     4,-2.8     2,-0.2   
5        6 A  D   H  > S+      0    0   16     2,-0.2     4,-2.7     1,-0.2   
6        7 A  K   H  X S+      0    0   51    -4,-2.2     4,-2.1     1,-0.2   
7        8 A  T   H  X S+      0    0   94    -4,-2.5     4,-2.1     2,-0.2   
8        9 A  N   H  X S+      0    0   38    -4,-2.8     4,-2.4     1,-0.2   
9       10 A  V   H  X S+      0    0    1    -4,-2.7     4,-2.4     2,-0.2   
10      11 A  K   H  X S+      0    0   97    -4,-2.1     4,-1.1     1,-0.2   
11      12 A  A   H  X S+      0    0   57    -4,-2.

In [14]:
# Show the column labels of the parsed DataFrame.
if __name__ == '__main__':
    print(df.columns)

Index(['RESIDUE', 'AA', 'STRUCTURE', 'BP1', 'BP2', 'ACC', 'N-H-->O', 'O-->H-N',
       'N-H-->O', 'O-->H-N', 'TCO', 'KAPPA', 'ALPHA', 'PHI', 'PSI', 'X-CA',
       'Y-CA', 'Z-CA'],
      dtype='object')


In [15]:
# Print how many distinct values occur in the AA and STRUCTURE columns.
if __name__ == '__main__':
    print( (len(set(df.AA)), len(set(df.STRUCTURE))) )

(20, 71)
