In [None]:
# Column labels for one iteration record; the names mirror the labels printed in
# the sample output text used in this notebook. Not referenced by any other
# visible cell.
fields = ['ITERATION', 'ENERGY', '1e-ENERGY', '2e-ENERGY', 'NORM[dD(SAO)]', 'TOL',
          'Exc', 'N', 'Norm[diis error]']

from fortranformat import FortranRecordReader

# Read one fixed-width record with a Fortran format: one integer field, three
# F (real) fields and two E (exponent) fields.
# NOTE(review): `lines` is not defined at this point of the notebook (it is first
# assigned in a much later cell, from a different sample text), so this cell only
# works with leftover kernel state — confirm which text `lines[4]` was meant to be.
FortranRecordReader('(I4,F18.11,F17.7,F17.7,E13.3,E10.2)').read(lines[4])

In [None]:
# Verbatim excerpt of program output for a single iteration, kept as a text
# fixture for parsing experiments. Whitespace inside the literal is significant
# for fixed-width parsing and must not be reformatted.
# NOTE(review): the name `raw` is rebound to a StringIO by a later cell, so the
# value seen by downstream cells depends on execution order.
raw = '''
 
                                              current damping :  0.650
 ITERATION  ENERGY          1e-ENERGY        2e-ENERGY     NORM[dD(SAO)]  TOL
   5  -800.90831057908    -3344.4859460     1407.4390040    0.316D+00 0.167D-10
                            Exc =  -109.654430018853     N = 124.00120409    
          Norm of current diis error: 0.35473    
          max. resid. norm for Fia-block=  1.632D-02 for orbital     56a         
          max. resid. fock norm         =  5.640D-02 for orbital    722a         
          irrep a   : virtual orbitals shifted by    0.09398
 mo-orthogonalization: Cholesky decomposition
          Delta Eig. =    13.9367250550 eV 
 
'''

In [2]:
from io import StringIO

# Read the whole output file and wrap its text in an in-memory stream, so the
# content can be re-read via seek()/read()/getvalue() without touching the disk.
# NOTE(review): this rebinds `raw`, which an earlier cell assigns a plain string;
# after this cell `raw` is a StringIO.
with open('data/aoforce.out') as f:
    raw = StringIO(f.read())

In [None]:
# Verbatim copy of the section banner that precedes the normal-mode table in the
# output text; kept as a reference for building the 'MAIN' anchor expression.
# Not referenced by any other visible cell.
anchor_txt = '''

          ---------------------------------------------------
          NORMAL MODES and VIBRATIONAL FREQUENCIES (cm**(-1))
          ---------------------------------------------------

'''

In [19]:
from io import StringIO
from pyparsing import Word, nums, Literal, LineStart, LineEnd, OneOrMore
from fortranformat import FortranRecordReader
import pandas as pd

In [58]:
class FortranLineParser(object):
    """Callable that reads one text line with a Fortran format and tidies the fields.

    Calling an instance with a line runs, in this order: the Fortran record
    reader, removal of skipped positions, whitespace stripping of string
    fields, value substitution, and finally the optional hook.
    """

    def __init__(self, pattern, name=None,
                 skip=None, strip_whitespace=True, map_values=None,
                 after_read_hook=None):
        """Configure the reader and the post-processing steps.

        pattern          -- Fortran format string handed to ``FortranRecordReader``.
        name             -- label stored on the instance; not used by the parser itself.
        skip             -- a single field index, or a container of indices, to drop
                            from every parsed record; ``None`` keeps all fields.
        strip_whitespace -- when true, ``str`` fields are stripped of surrounding
                            whitespace.
        map_values       -- dict of replacements applied to whole field values;
                            anything that is not a dict is ignored.
        after_read_hook  -- callable receiving the processed list; its return value
                            becomes the result of the call.
        """
        self._reader = FortranRecordReader(pattern)
        self.name = name
        # A bare index is normalised to a one-element list so that ``in`` works.
        if isinstance(skip, int):
            skip = [skip]
        self._skip = skip
        self._strip_whitespace = strip_whitespace
        self._map_values = map_values if isinstance(map_values, dict) else None
        self._after_read_hook = after_read_hook

    def __call__(self, line):
        """Parse ``line`` and return the processed field values."""
        fields = self._reader.read(line)

        if self._skip:
            dropped = self._skip
            fields = [value for position, value in enumerate(fields)
                      if position not in dropped]

        if self._strip_whitespace:
            fields = [value.strip() if isinstance(value, str) else value
                      for value in fields]

        if self._map_values:
            lookup = self._map_values
            # Values without an entry in the mapping pass through unchanged.
            fields = [lookup.get(value, value) for value in fields]

        if self._after_read_hook:
            fields = self._after_read_hook(fields)

        return fields

In [67]:
# Scratch check of list semantics: `+=` with another list extends the left-hand
# list in place (appending the elements of `skip`), it does not remove anything.
l = [1,2,3,4,5]
skip = [0,3]
l += skip
l

[1, 2, 3, 4, 5, 0, 3]

In [60]:
class BaseParser(object):
    """Cursor-style helpers for walking forward through a block of text.

    The text is held in an in-memory stream (``self.raw``); the helper methods
    move that stream's position forward and return lines from it.
    """

    def __init__(self, raw):
        """Wrap the string ``raw`` in a seekable ``StringIO`` stream."""
        self.raw = StringIO(raw)

    def _scan_forward(self, anchor, before_match=False):
        """Move the stream position to the first match of ``anchor``.

        Only the text after the current position is searched.

        anchor       -- object whose ``scanString(text)`` yields
                        ``(tokens, start, end)`` tuples (a pyparsing expression).
        before_match -- if true, stop at the start of the match so the matched
                        text is read next; otherwise stop right after it.

        Raises ``StopIteration`` when ``anchor`` does not match; in that case
        the stream is left at its end, because the remaining text has already
        been consumed by ``read()``.
        """
        origin = self.raw.tell()
        scanner = anchor.scanString(self.raw.read())
        # Match offsets are relative to the text that was read, i.e. to `origin`.
        # NOTE(review): pyparsing expands tabs before scanning unless
        # parseWithTabs() is set, so offsets may be off for text containing
        # tabs — confirm the input never contains tabs.
        _tokens, start, end = next(scanner)
        scanner.close()
        self.raw.seek(origin + (start if before_match else end))

    def _next_content_line(self, skip=0):
        """Return the next line that contains non-whitespace characters.

        skip -- number of non-blank lines to discard before returning one.

        Raises ``RuntimeError('EOF reached')`` when the stream ends first.
        """
        while True:
            line = self.raw.readline()
            # readline() returns '' only at end of stream. Compare by value:
            # `is ''` depends on string interning and is a SyntaxWarning on
            # Python 3.8+.
            if line == '':
                raise RuntimeError('EOF reached')
            if line.strip() == '':
                continue  # blank or whitespace-only line
            if skip > 0:
                skip -= 1
                continue
            return line

    def _chunks(self, sequence, n):
        """Yield successive n-sized chunks from sequence.

        The last chunk is shorter when ``len(sequence)`` is not a multiple of ``n``.
        """
        for i in range(0, len(sequence), n):
            yield sequence[i:i+n]

In [68]:
from collections import defaultdict

class VibrSpectrum(BaseParser):
    """Parser for the "NORMAL MODES and VIBRATIONAL FREQUENCIES" table.

    The table is read block by block, each block holding up to six modes side
    by side. For every mode the mode number, the frequency and the IR flag are
    collected into a ``pandas.DataFrame`` stored in ``self._data`` with the
    columns ``MODE``, ``FREQUENCY`` and ``IR``.
    """

    # pyparsing expressions used to position the stream before lines are read:
    # 'MAIN' is the section banner, 'MODE' the header row of each block.
    _anchors = {
        'MAIN': LineStart() + Word('-') + Literal('NORMAL MODES and VIBRATIONAL FREQUENCIES (cm**(-1))') + Word('-') + LineEnd(),
        'MODE': LineStart() + Literal('mode') + OneOrMore(Word(nums)) + LineEnd(),
    }

    # Fixed-width row readers: a 20-character label (dropped via skip=0)
    # followed by six 9-character columns. 'IR' maps 'YES'/'-' to True/False.
    _parser = {
        'MODE': FortranLineParser('(A20,6I9)', skip=0),
        'FREQUENCY': FortranLineParser('(A20,6F9.2)', skip=0),
        'IR': FortranLineParser('(A20,6A9)', skip=0, map_values={'YES': True, '-': False}),
    }

    def __init__(self, raw, natoms):
        """Parse the table from the text ``raw``.

        raw    -- complete output text as a string.
        natoms -- number of atoms; ``3 * natoms`` modes are expected in the table.
        """
        super(VibrSpectrum, self).__init__(raw)
        self.natoms = natoms
        self.nmodes = natoms * 3
        self._data = None
        self._parse('_data')
        # NOTE(review): debugging output; kept because the notebook cell that
        # instantiates this class relies on it to show the parsed table.
        print(self._data)

    def _parse(self, datastore_key):
        """Read all mode blocks and store the resulting DataFrame.

        datastore_key -- name of the instance attribute that receives the frame.

        Raises ``StopIteration`` if the section banner or a block header is
        missing and ``RuntimeError`` if the text ends inside a block.
        """
        NCOLS = 6  # modes per block, matching the six columns of the row formats
        self._scan_forward(self._anchors['MAIN'])
        datastore = defaultdict(list)
        for chunk in self._chunks(range(self.nmodes), NCOLS):
            self._parse_block(chunk, datastore)
        setattr(self, datastore_key, pd.DataFrame(datastore))

    def _parse_block(self, mode_indices, datastore):
        """Read one block of modes and append its values to ``datastore``.

        mode_indices -- indices of the modes in this block; only its length is
                        used, to know how many columns the block really has.
        datastore    -- mapping of column name to list, extended in place.
        """
        # The row formats always produce six entries; a final block with fewer
        # modes must not contribute padding entries for the absent columns.
        ncols = len(mode_indices)
        self._scan_forward(self._anchors['MODE'], before_match=True)
        line = self._next_content_line()
        datastore['MODE'] += self._parser['MODE'](line)[:ncols]
        line = self._next_content_line()
        datastore['FREQUENCY'] += self._parser['FREQUENCY'](line)[:ncols]
        # One non-blank row sits between the frequency row and the IR row
        # (the "symmetry" row in the sample output) and is skipped.
        line = self._next_content_line(skip=1)
        datastore['IR'] += self._parser['IR'](line)[:ncols]
        

In [69]:
# Rewind the stream before use. getvalue() returns the complete buffer
# regardless of the current position, so the rewind only matters for later reads.
raw.seek(0)
# 24 is passed as `natoms`; the parser expects 3 * natoms modes in the table.
VibrSpectrum(raw.getvalue(), 24)

    FREQUENCY     IR  MODE
0        0.00  False     1
1        0.00  False     2
2        0.00  False     3
3        0.00  False     4
4        0.00  False     5
5        0.00  False     6
6       35.16   True     7
7       75.53   True     8
8      126.00   True     9
9      171.29   True    10
10     211.05   True    11
11     242.74   True    12
12     295.71   True    13
13     333.81   True    14
14     370.87   True    15
15     398.44   True    16
16     447.83   True    17
17     460.96   True    18
18     523.95   True    19
19     546.84   True    20
20     563.81   True    21
21     619.54   True    22
22     684.43   True    23
23     710.89   True    24
24     714.52   True    25
25     746.39   True    26
26     757.28   True    27
27     805.30   True    28
28     818.05   True    29
29     835.24   True    30
..        ...    ...   ...
42    1197.74   True    43
43    1214.75   True    44
44    1239.26   True    45
45    1282.01   True    46
46    1305.85   True    47
4

<__main__.VibrSpectrum at 0x7f41b3dba2b0>

In [None]:
# Hand-made test text: one junk line followed by a single block of the normal-mode
# table (six modes side by side). Column positions inside the literal matter for
# the fixed-width readers and must not be reformatted.
sample = '''
adfasdfasdfasdfasdfasdf fasdf sdf dfasdf dfasd sdfasdf fasdf f asdfsadfdsf


       mode              61       62       63       64       65       66

     frequency        2944.50  2965.81  2987.14  3011.15  3098.67  3103.94

     symmetry            a        a        a        a        a        a   

        IR               YES      YES      YES      -        YES      YES
|dDIP/dQ|   (a.u.)     0.0044   0.0044   0.0040   0.0051   0.0013   0.0012
intensity (km/mol)      34.90    34.99    28.78    47.10     2.92     2.56
intensity (  %   )      17.49    17.53    14.42    23.60     1.46     1.28
 
       RAMAN             YES      YES      YES      YES      YES      YES

'''

# Split the sample into lines (line endings kept). Index 0 is the empty line
# that follows the opening quotes.
raw_block = StringIO(sample)
lines = raw_block.readlines()

In [None]:
# Anchor for the header row of a mode block: the word "mode" at a line start,
# followed by exactly six digit groups and a line end.
mode_anchor = LineStart() + Literal('mode') + Word(nums) * 6 + LineEnd()

In [None]:
# Show every match of the anchor found in the sample text.
mode_anchor.searchString(sample)

In [None]:
# Fixed-width readers for three rows of a mode block: a 20-character label
# followed by six 9-character columns (reals for the frequencies, text otherwise).
# NOTE(review): `frequncy_line` is a misspelling of "frequency"; a later cell uses
# the name with this spelling, so both places must be renamed together.
frequncy_line = FortranRecordReader('(A20,6F9.2)')
symmetry_line = FortranRecordReader('(A20,6A9)')
IR_line = FortranRecordReader('(A20,6A9)')

In [None]:
# lines[6] is the "frequency" row of the sample text (index 0 is the empty line
# after the opening quotes); the result is the label followed by six values.
frequncy_line.read(lines[6])

In [None]:
import re

In [None]:
# Ingredients of the row-matching regex below: row label, per-column pattern,
# number of columns, and the converter applied to each column.
# (The original one-line form `a = 1, b = 2, ...` is a chained assignment to
# literals and fails with a SyntaxError; tuple unpacking assigns all four names.)
label, pattern, n, converter = 'mode', r'\d+', 6, int

# Match the whole "mode" row: leading whitespace, the label, then six
# whitespace-separated integers captured together as group 1.
match = re.search(r'^\s+mode((\s+\d+){6})\s+$', sample, re.MULTILINE)
[int(mode_id) for mode_id in match.group(1).split()]

In [None]:
# Match the whole "frequency" row: leading whitespace, the label, then six
# whitespace-separated runs of digits/dots captured together as the first group.
match = re.search(r'^\s+frequency((\s+[\d.]+){6})\s+$', sample, re.MULTILINE)
# The first group holds all six columns; split on whitespace and convert each.
[float(freq) for freq in match.groups(0)[0].split()]

In [None]:
# Match the whole "IR" row: leading whitespace, the label, then six
# whitespace-separated flags, each either YES or -, captured together as group 1.
# The alternation is wrapped in its own group: written as `\s+(YES)|(-)` it
# means "whitespace + YES" OR "a bare -", which never matches a '-' that is
# preceded by spaces, so the sample row was not found.
match = re.search(r'^\s+IR((?:\s+(?:YES|-)){6})\s*$', sample, re.MULTILINE)
match

In [None]:
from pyparsing import Word, nums, Literal, Or

In [None]:
# Token building blocks for the pyparsing experiments below.
# Integer: a run of digits, converted to int by the parse action.
Integer = Word(nums).setParseAction(lambda x: int(x[0]))
# Float: a run of digits and dots; no parse action, so the token stays a string.
Float = Word(nums+'.')
# YesNo: one of the two literal flags '-' or 'YES'.
YesNo = Or(['-', 'YES'])

In [None]:
# "IR" followed by six YesNo flags.
ir_activity = Literal('IR') + (YesNo * 6)
# "mode" followed by six integers; the first three carry the results name 'is',
# the last three the results name 'iis'.
mode_label = Literal('mode') + (Integer('is') * 3) + (Integer('iis') * 3)

In [None]:
# scanString returns a generator of (tokens, start, end) tuples; nothing is
# scanned until the generator is advanced.
res = ir_activity.scanString(sample)

In [None]:
# Advance to the first match; raises StopIteration if the expression matches
# nowhere. Re-running this cell alone moves on to the next match.
next(res)