In [1]:
import re
from io import StringIO
import pandas as pd

In [2]:
def parse_x1(num, header=['Center Number', 'Atomic Number', 'Atomic Type', 'X', 'Y', 'Z']):
    """Parse the last "Standard orientation" table of an output file.

    Reads ``../data/raw/outs/<num>.out``, finds the final occurrence of
    "Standard orientation" (upper- or lower-case "S") and returns the rows
    located between the second and the third dashed rule that follow it.

    Parameters
    ----------
    num : object
        File identifier; ``str(num)`` is used as the file stem.
    header : list of str
        Column names given to the returned frame. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per whitespace-separated table line.

    Raises
    ------
    ValueError
        If the heading or the three dashed rules cannot be found.
    """
    with open("../data/raw/outs/" + str(num) + ".out", "r") as out:
        outtxt = out.read()

    # Keep only the last heading in the file.
    last_heading = None
    for last_heading in re.finditer(r'[Ss]tandard orientation', outtxt):
        pass
    if last_heading is None:
        raise ValueError("no 'Standard orientation' heading found for " + repr(num))

    # Rules after the heading: #0 opens the column header, #1 closes it and
    # #2 closes the data. Stop scanning as soon as the third one is seen.
    rule = re.compile('---------------------------------------------------------------------')
    rules = []
    for found in rule.finditer(outtxt, last_heading.end()):
        rules.append(found)
        if len(rules) == 3:
            break
    if len(rules) < 3:
        raise ValueError("incomplete 'Standard orientation' table for " + repr(num))

    # The first piece is the remainder of the second rule's own line; the
    # rest are data lines (plus the indentation of the closing rule).
    body = outtxt[rules[1].end():rules[2].start()].splitlines()[1:]
    rows = [line for line in body if line.strip()]

    return pd.read_csv(StringIO("\n".join(rows)), sep=r"\s+", header=None, names=header)
    

In [3]:
def parse_x2_occ(num):
    """Return the last block of "Alpha  occ. eigenvalues" lines as a frame.

    Reads ``../data/raw/outs/<num>.out`` and collects every line containing
    "Alpha  occ. eigenvalues --" (upper- or lower-case "A"). Lines that
    directly follow one another form one group; only the last group in the
    file is returned.

    Each line is read up to its own end, so a final line holding a full set
    of values still closes its group, and text from the following line can
    never leak into the values.

    Parameters
    ----------
    num : object
        File identifier; ``str(num)`` is used as the file stem.

    Returns
    -------
    pandas.DataFrame
        One row per eigenvalue line. Values are kept as the strings found in
        the file; shorter rows are padded with missing values.

    Raises
    ------
    IndexError
        If the file contains no such line (same exception type as before).
    """
    with open("../data/raw/outs/" + str(num) + ".out", "r") as out:
        outtxt = out.read()

    line_re = re.compile(r'[Aa]lpha  occ\. eigenvalues --(.*)')

    groups = []
    previous_end = None
    for found in line_re.finditer(outtxt):
        # Anything other than whitespace between two matches means that the
        # lines are not adjacent, i.e. a new group begins.
        if previous_end is None or outtxt[previous_end:found.start()].strip():
            groups.append([])
        groups[-1].append(found.group(1).split())
        previous_end = found.end()

    if not groups:
        raise IndexError("no 'Alpha  occ. eigenvalues' lines found for " + repr(num))

    return pd.DataFrame(groups[-1])

In [4]:
def parse_x2_virt(num):
    """Return the last block of "Alpha virt. eigenvalues" lines as a frame.

    Reads ``../data/raw/outs/<num>.out`` and collects every line containing
    "Alpha virt. eigenvalues --" (upper- or lower-case "A"). Lines that
    directly follow one another form one group; only the last group in the
    file is returned.

    Each line is read up to its own end, so words from whatever follows the
    last eigenvalue line cannot end up in the data.

    Parameters
    ----------
    num : object
        File identifier; ``str(num)`` is used as the file stem.

    Returns
    -------
    pandas.DataFrame
        One row per eigenvalue line. Values are kept as the strings found in
        the file; shorter rows are padded with missing values.

    Raises
    ------
    IndexError
        If the file contains no such line (same exception type as before).
    """
    with open("../data/raw/outs/" + str(num) + ".out", "r") as out:
        outtxt = out.read()

    line_re = re.compile(r'[Aa]lpha virt\. eigenvalues --(.*)')

    groups = []
    previous_end = None
    for found in line_re.finditer(outtxt):
        # Anything other than whitespace between two matches means that the
        # lines are not adjacent, i.e. a new group begins.
        if previous_end is None or outtxt[previous_end:found.start()].strip():
            groups.append([])
        groups[-1].append(found.group(1).split())
        previous_end = found.end()

    if not groups:
        raise IndexError("no 'Alpha virt. eigenvalues' lines found for " + repr(num))

    return pd.DataFrame(groups[-1])

In [None]:
def parse_x3(num, header=["Atom Number", "Atom"]):
    """Parse the second "Condensed to atoms (all electrons):" table.

    Reads ``../data/raw/outs/<num>.out`` and takes the text between the
    second occurrence of that heading and the next " Mulliken charges:"
    line. The text is cut into blocks at every line that starts with 14
    spaces (these appear to be the column-index lines of the printed
    matrix). Each block is parsed on its own and the blocks are joined side
    by side.

    Both CRLF and LF line endings are accepted.

    Parameters
    ----------
    num : object
        File identifier; ``str(num)`` is used as the file stem.
    header : list of str
        Names of the leading label columns of every block. These columns are
        dropped from every block, so they do not appear in the result.

    Returns
    -------
    pandas.DataFrame
        Columns are labelled "1", "2", ... as strings, six labels per block
        (missing trailing columns of a narrower block are filled with NaN).

    Raises
    ------
    IndexError
        If the heading occurs fewer than two times (same type as before).
    ValueError
        If the closing " Mulliken charges:" line is missing, or if no data
        block was found (raised by ``pandas.concat``).
    """
    with open("../data/raw/outs/" + str(num) + ".out", "r") as f:
        outtxt = f.read()

    # Use the second occurrence in the file.
    occurrences = list(re.finditer(r"          Condensed to atoms \(all electrons\):", outtxt))
    if len(occurrences) < 2:
        raise IndexError("fewer than two 'Condensed to atoms' headings for " + repr(num))
    header_end = occurrences[1].end()

    closing_pos = outtxt.find(" Mulliken charges:", header_end)
    if closing_pos == -1:
        raise ValueError("no ' Mulliken charges:' line after the heading for " + repr(num))

    # The first piece is the remainder of the heading line itself.
    section = outtxt[header_end:closing_pos].splitlines()[1:]

    # Every deeply indented line starts a new block of data lines.
    blocks = [[]]
    for line in section:
        if line.startswith("              "):
            blocks.append([])
        elif line.strip():
            blocks[-1].append(line)

    dataframes = []
    col_start = 1
    for block in blocks:
        if not block:
            continue
        names = list(header) + [str(i) for i in range(col_start, col_start + 6)]
        df = pd.read_csv(StringIO("\n".join(block)), sep=r"\s+", header=None, names=names)

        # The label columns repeat in every block; keep only the data.
        dataframes.append(df.drop(columns=list(header)))

        col_start += 6

    return pd.concat(dataframes, axis=1)

In [None]:
def parse_x4(num, header=["Atom Number", "Electric Potential", "Val1", "Val2", "Val3"]):
    """Parse the block below the second "Electrostatic Properties" heading.

    Reads ``../data/raw/outs/<num>.out``, finds the second occurrence of
    "Electrostatic Properties Using The SCF Density", then the rule of
    asterisks that follows it, and returns every non-blank line between that
    rule and the next dashed rule as a whitespace-separated table.

    Both CRLF and LF line endings are accepted.

    Parameters
    ----------
    num : object
        File identifier; ``str(num)`` is used as the file stem.
    header : list of str
        Column names passed to ``pandas.read_csv``. It is not modified.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    IndexError
        If the heading occurs fewer than two times (same type as before).
    ValueError
        If either rule is missing after the heading.
    """
    with open("../data/raw/outs/" + str(num) + ".out", "r") as f:
        outtxt = f.read()

    # Use the second occurrence in the file.
    occurrences = list(re.finditer(r"            Electrostatic Properties Using The SCF Density", outtxt))
    if len(occurrences) < 2:
        raise IndexError("fewer than two 'Electrostatic Properties' headings for " + repr(num))
    header_end = occurrences[1].end()

    # Find leading and closing barrier
    star_rule = " **********************************************************************"
    dash_rule = " -----------------------------------------------------------------"
    leading_pos = outtxt.find(star_rule, header_end)
    if leading_pos == -1:
        raise ValueError("no rule of asterisks after the heading for " + repr(num))
    closing_pos = outtxt.find(dash_rule, leading_pos + len(star_rule))
    if closing_pos == -1:
        raise ValueError("no closing dashed rule after the heading for " + repr(num))

    # The first piece is the rule of asterisks itself; blank lines are dropped.
    body = outtxt[leading_pos:closing_pos].splitlines()[1:]
    rows = [line for line in body if line.strip()]

    return pd.read_csv(StringIO("\n".join(rows)), sep=r"\s+", header=None, names=header)

In [None]:
def parse_x5(num, header=["Atom Number", "Electric Potential", "X", "Y", "Z"]):
    """Parse the second "Center / Electric Potential / Electric Field" table.

    Reads ``../data/raw/outs/<num>.out``, finds the second occurrence of the
    two-line column heading ("Center  Electric  -------- Electric Field
    --------" followed by "Potential  X  Y  Z") and returns the rows between
    the two dashed rules that follow it.

    The heading is matched with flexible whitespace and either CRLF or LF
    line endings, so the function works however the file was opened.

    Parameters
    ----------
    num : object
        File identifier; ``str(num)`` is used as the file stem.
    header : list of str
        Column names passed to ``pandas.read_csv``. It is not modified.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    IndexError
        If the heading occurs fewer than two times (same type as before).
    ValueError
        If the dashed rules around the data are missing.
    """
    with open("../data/raw/outs/" + str(num) + ".out", "r") as f:
        outtxt = f.read()

    # Use the second occurrence in the file.
    heading = (r" +Center +Electric +-------- Electric Field --------\r?\n"
               r" +Potential +X +Y +Z")
    occurrences = list(re.finditer(heading, outtxt))
    if len(occurrences) < 2:
        raise IndexError("fewer than two electric field table headings for " + repr(num))
    header_end = occurrences[1].end()

    # Find leading and closing barrier
    barrier_str = " -----------------------------------------------------------------"
    leading_pos = outtxt.find(barrier_str, header_end)
    if leading_pos == -1:
        raise ValueError("no dashed rule after the table heading for " + repr(num))
    closing_pos = outtxt.find(barrier_str, leading_pos + len(barrier_str))
    if closing_pos == -1:
        raise ValueError("no closing dashed rule after the table for " + repr(num))

    # The first piece is the leading rule itself; blank lines are dropped.
    body = outtxt[leading_pos:closing_pos].splitlines()[1:]
    rows = [line for line in body if line.strip()]

    return pd.read_csv(StringIO("\n".join(rows)), sep=r"\s+", header=None, names=header)

In [None]:
def parse_x6(num):
    """Parse the second XX/YY/ZZ and XY/XZ/YZ "Electric Field Gradient" tables.

    Reads ``../data/raw/outs/<num>.out``. For each of the two component
    groups, the second occurrence of the matching two-line heading is
    located and the rows between the two dashed rules that follow it are
    parsed. The two tables are then joined side by side.

    The headings are matched with flexible whitespace and either CRLF or LF
    line endings, so the function works however the file was opened.

    Parameters
    ----------
    num : object
        File identifier; ``str(num)`` is used as the file stem.

    Returns
    -------
    pandas.DataFrame
        Columns "Atom Number", "XX", "YY", "ZZ", "XY", "XZ", "YZ"; the
        "Atom Number" column is taken from the first table only.

    Raises
    ------
    IndexError
        If a heading occurs fewer than two times (same type as before).
    ValueError
        If the dashed rules around a table are missing.
    """
    with open("../data/raw/outs/" + str(num) + ".out", "r") as f:
        outtxt = f.read()

    barrier_str = " -----------------------------------------------------"

    frames = []
    for components in (("XX", "YY", "ZZ"), ("XY", "XZ", "YZ")):
        heading = (r" +Center +---- Electric Field Gradient ----\r?\n +"
                   + " +".join(components))

        # Use the second occurrence in the file.
        occurrences = list(re.finditer(heading, outtxt))
        if len(occurrences) < 2:
            raise IndexError("fewer than two " + "/".join(components)
                             + " gradient headings for " + repr(num))
        header_end = occurrences[1].end()

        # Find leading and closing barrier
        leading_pos = outtxt.find(barrier_str, header_end)
        if leading_pos == -1:
            raise ValueError("no dashed rule after the " + "/".join(components)
                             + " heading for " + repr(num))
        closing_pos = outtxt.find(barrier_str, leading_pos + len(barrier_str))
        if closing_pos == -1:
            raise ValueError("no closing dashed rule after the " + "/".join(components)
                             + " table for " + repr(num))

        # The first piece is the leading rule itself; blank lines are dropped.
        body = outtxt[leading_pos:closing_pos].splitlines()[1:]
        rows = [line for line in body if line.strip()]

        frames.append(pd.read_csv(StringIO("\n".join(rows)), sep=r"\s+", header=None,
                                  names=["Atom Number"] + list(components)))

    diagonal, off_diagonal = frames

    # Return the two tables concatenated column-wise, keeping one label column.
    return pd.concat([diagonal, off_diagonal.drop(columns=["Atom Number"])], axis=1)

In [None]:
def parse_x7(num, header=["Atom Number", "Eigen 1", "Eigen 2", "Eigen 3"]):
    """Parse the first "Electric Field Gradient ... Eigenvalues" table.

    Reads ``../data/raw/outs/<num>.out``, finds the first occurrence of the
    two-line heading ("Center  ---- Electric Field Gradient ----" followed
    by "----  Eigenvalues  ----") and returns the rows between the two
    dashed rules that follow it.

    The heading is matched with flexible whitespace and either CRLF or LF
    line endings, so the function works however the file was opened.

    Parameters
    ----------
    num : object
        File identifier; ``str(num)`` is used as the file stem.
    header : list of str
        Column names passed to ``pandas.read_csv``. It is not modified.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ValueError
        If the heading or the dashed rules around the data are missing.
    """
    with open("../data/raw/outs/" + str(num) + ".out", "r") as f:
        outtxt = f.read()

    # Unlike the other tables, the first occurrence is used here.
    header_expression = re.search(
        r" +Center +---- Electric Field Gradient ----\r?\n +---- +Eigenvalues +----",
        outtxt)
    if header_expression is None:
        raise ValueError("no gradient eigenvalue heading found for " + repr(num))
    header_end = header_expression.end()

    # Find leading and closing barrier
    barrier_str = " -----------------------------------------------------"
    leading_pos = outtxt.find(barrier_str, header_end)
    if leading_pos == -1:
        raise ValueError("no dashed rule after the eigenvalue heading for " + repr(num))
    closing_pos = outtxt.find(barrier_str, leading_pos + len(barrier_str))
    if closing_pos == -1:
        raise ValueError("no closing dashed rule after the eigenvalue table for " + repr(num))

    # The first piece is the leading rule itself; blank lines are dropped.
    body = outtxt[leading_pos:closing_pos].splitlines()[1:]
    rows = [line for line in body if line.strip()]

    return pd.read_csv(StringIO("\n".join(rows)), sep=r"\s+", header=None, names=header)

In [None]:
def parse_x8(num):
    """Read the "Total SCF Density" values from a ``.fch`` file.

    Reads ``../data/raw/fchks/Anth_<num>.fch``, finds the
    "Total SCF Density ... R ... N= <count>" header and returns the
    ``<count>`` whitespace-separated values that follow it.

    The header is matched with flexible whitespace, so the element count may
    have any number of digits.

    Parameters
    ----------
    num : object
        File identifier; ``str(num)`` is appended to ``Anth_``.

    Returns
    -------
    pandas.DataFrame
        A single column holding the values as the strings found in the file
        (they are checked to be numeric but not converted).

    Raises
    ------
    ValueError
        If the header is missing, fewer than ``<count>`` values follow it,
        or one of the values is not numeric.
    """
    with open("../data/raw/fchks/Anth_" + str(num) + ".fch", "r") as f:
        outtxt = f.read()

    # Using a regular expression, find the number of data elements.
    header_expression = re.search(r"Total SCF Density\s+R\s+N=\s*(\d+)", outtxt)
    if header_expression is None:
        raise ValueError("no 'Total SCF Density' section found for " + repr(num))
    num_elems = int(header_expression.group(1))

    # Split off only as many tokens as are needed instead of the whole file.
    values = outtxt[header_expression.end():].split(None, num_elems)[:num_elems]
    if len(values) != num_elems:
        raise ValueError('Expected %d values but found %d.' % (num_elems, len(values)))

    # Verify that each value is numeric.
    for val in values:
        try:
            float(val)
        except ValueError:
            raise ValueError('A non-numeric value has been processed.')

    return pd.DataFrame(values)

In [None]:
def parse_x9(num):
    """Read the "Alpha MO coefficients" values from a ``.fch`` file.

    Reads ``../data/raw/fchks/Anth_<num>.fch``, finds the
    "Alpha MO coefficients ... R ... N= <count>" header and returns the
    ``<count>`` whitespace-separated values that follow it.

    The header is matched with flexible whitespace, so the element count may
    have any number of digits.

    Parameters
    ----------
    num : object
        File identifier; ``str(num)`` is appended to ``Anth_``.

    Returns
    -------
    pandas.DataFrame
        A single column holding the values as the strings found in the file
        (they are checked to be numeric but not converted).

    Raises
    ------
    ValueError
        If the header is missing, fewer than ``<count>`` values follow it,
        or one of the values is not numeric.
    """
    with open("../data/raw/fchks/Anth_" + str(num) + ".fch", "r") as f:
        outtxt = f.read()

    # Using a regular expression, find the number of data elements.
    header_expression = re.search(r"Alpha MO coefficients\s+R\s+N=\s*(\d+)", outtxt)
    if header_expression is None:
        raise ValueError("no 'Alpha MO coefficients' section found for " + repr(num))
    num_elems = int(header_expression.group(1))

    # Split off only as many tokens as are needed instead of the whole file.
    values = outtxt[header_expression.end():].split(None, num_elems)[:num_elems]
    if len(values) != num_elems:
        raise ValueError('Expected %d values but found %d.' % (num_elems, len(values)))

    # Verify that each value is numeric.
    for val in values:
        try:
            float(val)
        except ValueError:
            raise ValueError('A non-numeric value has been processed.')

    return pd.DataFrame(values)