In [1]:
from typing import List
from tools.Dict import Dict as UDict
from copy import deepcopy
from itertools import repeat

import re
tokenize = re.compile("[\w\.,]+")

class Preprocessing:
    @staticmethod
    def getColumnsAndIndex(row):
        keys = tokenize.findall(row)
        res = []
        status = False
        begin = -1
        for i,ch in enumerate(row):
            if status:
                if ch!=" ":
                    continue
                else:
                    status = False
                    res.append(UDict(inf=begin, sup=(i-1) ) )
            else:
                if ch!=" ":
                    status= True
                    begin = i
                else:
                    continue
        return res

    def resolveSpace(x):
        if x.startswith(" "):
            for i,ch in enumerate(x):
                if ch!=' ':
                    return x[i:]
            return None
        return x

class Parser:
    def __init__(self, colNamePart:str ,backend:str ):
        self.colNamePart = colNamePart 
        self.backend = backend
    def explain(self,article:str) -> List[List[str]]:
        
        if not hasattr(self,'res'):
            lines   :List[str]   = article.splitlines()
            part    :     str    = self.colNamePart
            colNames:     str    = [line for line in lines if part in line][0]
            self.res:List[UDict] = Preprocessing.getColumnsAndIndex(colNames)
            self.columns_row:str = colNames
                
        def cellunit(line, tups):
            tupLast, tup = tups
            if line[tup.inf]==' ':
                return line[tup.inf:tup.sup+1]
            return Preprocessing.resolveSpace(line[tupLast.sup+1:tup.sup+1])
        columns_row : str    = self.columns_row
        
        source  :List[str]   = lines[lines.index(columns_row):]
        tupsS = list(zip(self.res[:-1],self.res[1:]))
        retIter = map(lambda line: map(cellunit, repeat(line), tupsS),  source)
        return  [list(row) for row in retIter]
        

In [10]:
# Test on a DSSP file
row = "  #  RESIDUE AA STRUCTURE BP1 BP2  ACC     N-H-->O "  # part of the row which defines the attributes of our datas.
with open(r"1a00.dssp") as f:
    strs = f.read()
parser = Parser(colNamePart = row, backend='dssp')
results = parser.explain(strs)

In [11]:
import pandas as pd
df = pd.DataFrame(results[1:],columns = tuple(results[0]))

In [5]:
df

Unnamed: 0,RESIDUE,AA,STRUCTURE,BP1,BP2,ACC,N-H-->O,O-->H-N,N-H-->O.1,O-->H-N.1,TCO,KAPPA,ALPHA,PHI,PSI,X-CA,Y-CA
0,1 A,V,,0,0,132,"0, 0.0","2,-0.4","0, 0.0","127,-0.1",0.00,0 360.0,360.0,360.,0 115.,3 103.1,38.5
1,2 A,L,-,0,0,31,"71,-0.1","122,-0.0","125,-0.1","0, 0.0",-0.56,3 360.0,-152.6,-76.,5 125.,6 104.3,39.3
2,3 A,S,> -,0,0,45,"-2,-0.4","4,-2.2","1,-0.1","5,-0.1",-0.33,34.2,-102.3,-83.,0 172.,9 106.4,36.6
3,4 A,P,H > S+,0,0,93,"0, 0.0","4,-2.5","0, 0.0","5,-0.1",0.87,2 125.4,56.9,-64.,8 -37.,0 108.9,37.5
4,5 A,A,H > S+,0,0,55,"1,-0.2","4,-2.8","2,-0.2","5,-0.2",0.91,8 106.9,48.3,-59.,3 -44.,4 106.4,36.4
5,6 A,D,H > S+,0,0,16,"2,-0.2","4,-2.7","1,-0.2","-1,-0.2",0.91,6 110.0,50.2,-60.,7 -48.,8 103.9,38.8
6,7 A,K,H X S+,0,0,51,"-4,-2.2","4,-2.1","1,-0.2","-1,-0.2",0.88,4 111.4,50.5,-58.,8 -40.,6 106.3,41.7
7,8 A,T,H X S+,0,0,94,"-4,-2.5","4,-2.1","2,-0.2","-2,-0.2",0.91,4 110.1,49.8,-56.,7 -51.,9 107.2,40.8
8,9 A,N,H X S+,0,0,38,"-4,-2.8","4,-2.4","1,-0.2","5,-0.2",0.91,9 112.5,46.9,-55.,2 -49.,9 103.5,40.7
9,10 A,V,H X S+,0,0,1,"-4,-2.7","4,-2.4","2,-0.2","5,-0.2",0.86,5 110.0,50.3,-62.,4 -46.,3 102.8,44.1


In [8]:
results[2]

['2 A',
 'L ',
 '-  ',
 '0',
 '0',
 '31',
 '71,-0.1',
 '122,-0.0',
 '125,-0.1',
 '0, 0.0',
 '-0.56',
 '3 360.0',
 '-152.6',
 '-76.',
 '5 125.',
 '6  104.3',
 '39.3']