In [81]:
import pandas as pd
import numpy as np
import os

Placeholder values: to indicate missing data, use a specific placeholder (sentinel) value that cannot occur in real data. Dihedral angles typically range between -180 and 180 degrees, so an out-of-range value such as -999 can serve as their placeholder. HSE values are typically non-negative, so a negative placeholder can be used for them. The neural network can then potentially learn to interpret these values as "missing". Note that the loader functions below convert missing entries (marked 'X') to NaN and do not substitute a placeholder themselves.


In [82]:
def get_protein_names(file_name_lists):
    """
    Read protein names from a list file that holds one name per line.

    Parameters
    ----------
    file_name_lists : str or path-like
        Path (including extension) of the text file listing the proteins.

    Returns
    -------
    list of str
        Names in file order with surrounding whitespace removed.
        Blank and whitespace-only lines are skipped, so a trailing empty
        line in the list file does not produce an empty protein name.
    """
    with open(file_name_lists, 'r') as file:
        stripped_lines = (line.strip() for line in file)
        # An empty name would later be turned into a bogus file path
        # such as '.dssp', so drop it here.
        prot_names = [name for name in stripped_lines if name]
    # end with
    return prot_names
# end def

In [83]:
def get_dssp_info(dssp_file_name):
    """
    Load one .dssp file into a per-residue DataFrame.

    dssp_file_name includes file path and extension.

    The first line of the file (index 0) is not used. The following five
    lines are read as: amino-acid codes (one character per residue), SS3
    labels (one character per residue), then whitespace-separated PHI, PSI
    and ASA values.

    Returns a DataFrame with columns 'AA CODE', 'SS3', 'PHI', 'PSI', 'ASA'.
    Entries of PHI/PSI/ASA that are not numeric (e.g. 'X') become NaN.
    """
    with open(dssp_file_name, 'r') as handle:
        rows = handle.readlines()
    # end with

    # Character tracks: one symbol per residue
    sequence, secondary = (rows[i].strip() for i in (1, 2))
    # Numeric tracks: whitespace-separated tokens, still strings here
    phi_tokens, psi_tokens, asa_tokens = (rows[i].strip().split() for i in (3, 4, 5))

    dssp = pd.DataFrame({
        'AA CODE': list(sequence),
        'SS3': list(secondary),
        'PHI': phi_tokens,
        'PSI': psi_tokens,
        'ASA': asa_tokens,
    })

    # Convert the string tokens to numbers; missing markers such as 'X'
    # cannot be parsed and are coerced to NaN.
    for numeric_col in ('PHI', 'PSI', 'ASA'):
        dssp[numeric_col] = pd.to_numeric(dssp[numeric_col], errors='coerce')
    # end for
    return dssp
# end def

In [84]:
def get_theta_info(theta_file_name):
    """
    Load one theta (.t) file into a DataFrame.

    theta_file_name includes file path and extension.

    The file is parsed as single-space-separated text without a header and
    the columns are named 'RES NUM', 'AA CODE', 'THETA', 'TAU', 'OMEGA'.
    THETA, TAU and OMEGA are converted to numbers; entries that cannot be
    parsed (e.g. 'X') become NaN.
    """
    theta = pd.read_csv(
        theta_file_name,
        sep=' ',
        names=['RES NUM', 'AA CODE', 'THETA', 'TAU', 'OMEGA'],
    )

    # Coerce the angle columns to numeric, turning 'X' markers into NaN
    for angle_col in ('THETA', 'TAU', 'OMEGA'):
        theta[angle_col] = pd.to_numeric(theta[angle_col], errors='coerce')
    # end for

    return theta
# end def

In [85]:
def get_hse_info(hse_file_name, CASP=False):
    """
    Load one HSE (.h) file into a DataFrame.

    hse_file_name includes file path and extension.
    CASP is accepted but not used by this function.

    The number of whitespace-separated fields on the first line selects the
    column layout: 6 fields means the file has no chain-id column, 7 fields
    means it has one. Any other count raises ValueError.

    Returns a DataFrame that always has a 'CHAIN ID' column (filled with
    'A' and appended as the last column when the file has none). 'RES NUM',
    'HSE TOTAL', 'HSE UP' and 'HSE DOWN' are numeric, with unparsable
    entries (e.g. 'X') set to NaN.
    """
    # Known column layouts, keyed by the field count of the first line
    layouts = {
        6: ['AA NAME', 'RES NUM', 'AA CODE',
            'HSE TOTAL', 'HSE UP', 'HSE DOWN'],
        7: ['AA NAME', 'CHAIN ID', 'RES NUM', 'AA CODE',
            'HSE TOTAL', 'HSE UP', 'HSE DOWN'],
    }

    # Peek at the first line only to count its fields
    with open(hse_file_name, 'r') as handle:
        num_columns = len(handle.readline().split())
    # end with

    if num_columns not in layouts:
        raise ValueError(f"Unexpected number of columns: {num_columns}")
    # end if

    hse = pd.read_csv(hse_file_name, sep=r'\s+', names=layouts[num_columns])

    # Files without a chain column get a constant chain id
    if 'CHAIN ID' not in hse.columns:
        hse['CHAIN ID'] = 'A'
    # end if

    # Coerce numeric columns, turning 'X' markers into NaN
    for numeric_col in ('RES NUM', 'HSE TOTAL', 'HSE UP', 'HSE DOWN'):
        hse[numeric_col] = pd.to_numeric(hse[numeric_col], errors='coerce')
    # end for

    return hse
# end def

In [86]:
def convert_data_to_numpy(file_name_lists, data_folder='spot_1d_lm', lists_folder = 'lists', dssp_folder = 'dssp', hse_folder = 'hse', theta_folder = 'theta', numpy_folder = 'labels'):
    """
    Combine the DSSP, HSE and theta files of every listed protein into one
    array per protein and save it as '<protein>.npy'.

    Parameters
    ----------
    file_name_lists : str
        Name (with extension, without path) of the list file inside
        <cwd>/<data_folder>/<lists_folder>; one protein name per line.
    data_folder : str
        Folder under the current working directory that holds all data.
    lists_folder, dssp_folder, hse_folder, theta_folder : str
        Sub-folders of data_folder holding the list files and the
        '.dssp', '.h' and '.t' input files respectively.
    numpy_folder : str
        Sub-folder of data_folder that receives the '.npy' outputs. It is
        created if it does not exist yet.

    Returns
    -------
    None. One file per protein is written as a side effect.

    Notes
    -----
    The three per-protein tables are joined on their row index with an
    inner join, so only row labels present in all three tables are kept.
    Columns that occur in more than one table are taken from the left-hand
    table of each join (HSE first, then DSSP, then theta).
    NOTE(review): the saved frame mixes text and numeric columns, so the
    array is presumably object-dtype and needs allow_pickle=True when
    loaded with np.load -- confirm against the consumer code.
    """
    dssp_ext = '.dssp'
    hse_ext = '.h'
    theta_ext = '.t'
    numpy_ext = '.npy'

    # Column order of the saved array
    desired_order = ['AA NAME', 'CHAIN ID', 'RES NUM', 'AA CODE', 'SS3', 'ASA', 'HSE TOTAL', 'HSE UP', 'HSE DOWN', 'PHI', 'PSI', 'THETA', 'TAU', 'OMEGA']

    # All paths hang off the same root; build it once instead of per file
    data_root = os.path.join(os.getcwd(), data_folder)
    file_name_lists = os.path.join(data_root, lists_folder, file_name_lists)
    protein_names = get_protein_names(file_name_lists)

    # np.save does not create missing directories; make sure the output
    # folder exists before any per-protein work is done.
    output_dir = os.path.join(data_root, numpy_folder)
    os.makedirs(output_dir, exist_ok=True)

    for protein in protein_names:
        dssp_file_name = os.path.join(data_root, dssp_folder, protein + dssp_ext)
        hse_file_name = os.path.join(data_root, hse_folder, protein + hse_ext)
        theta_file_name = os.path.join(data_root, theta_folder, protein + theta_ext)

        dssp = get_dssp_info(dssp_file_name)
        hse  = get_hse_info(hse_file_name)
        theta = get_theta_info(theta_file_name)

        # Join on the row index; duplicated column names coming from the
        # right-hand table get the '_remove' suffix and are dropped.
        hse_dssp = pd.merge(hse, dssp,
                            how='inner',
                            suffixes=('', '_remove'),
                            left_index=True,
                            right_index=True)
        hse_dssp = hse_dssp.drop(columns=[col for col in hse_dssp.columns if 'remove' in col])

        protein_data = pd.merge(hse_dssp, theta,
                                how='inner',
                                suffixes=('', '_remove'),
                                left_index=True,
                                right_index=True)
        protein_data = protein_data.drop(columns=[col for col in protein_data.columns if 'remove' in col])

        # Reorder columns
        protein_data = protein_data[desired_order]

        protein_data_file_name = os.path.join(output_dir, protein + numpy_ext)
        np.save(protein_data_file_name, protein_data.to_numpy())
    # end for
# end def


In [87]:
# convert_data_to_numpy("val.txt")
# convert_data_to_numpy("train.txt")

In [88]:
# Sanity check: number of columns in each saved array. The list is the same
# as desired_order inside convert_data_to_numpy.
len(['AA NAME', 'CHAIN ID', 'RES NUM', 'AA CODE', 'SS3', 'ASA', 'HSE TOTAL', 'HSE UP', 'HSE DOWN', 'PHI', 'PSI', 'THETA', 'TAU', 'OMEGA'])

14

In [90]:
# Write the per-protein .npy files for the casp12, casp13 and casp14 list
# files, using the default folder layout of convert_data_to_numpy.
convert_data_to_numpy("casp12.txt")
convert_data_to_numpy("casp13.txt")
convert_data_to_numpy("casp14.txt")

In [None]:
# Write the per-protein .npy files for three more list files, using the
# default folder layout of convert_data_to_numpy.
convert_data_to_numpy("TEST2020-HQ.txt")
convert_data_to_numpy("TEST2018.txt")
convert_data_to_numpy("Neff1-2020.txt")

# parent = os.getcwd() # '/mnt/nvme/home/bbabatun/IDL/PROJECT/SPOT-1D-LM'

# Module-level folder names, relative to the current working directory.
# Except for csv_folder they repeat the default arguments of
# convert_data_to_numpy. csv_folder is not referenced by any code visible
# in this part of the notebook.
data_folder = 'spot_1d_lm'
lists_folder = 'lists'
dssp_folder = 'dssp'
hse_folder = 'hse'
theta_folder = 'theta'
csv_folder = 'csv'

# File extensions of the three per-protein input files.
dssp_ext = '.dssp'
hse_ext = '.h'
theta_ext = '.t'

# Bare name of the list file to load ...
file_name_lists = "train.txt"

# ... which is then rebound to its full path (same name, different meaning).
file_name_lists = os.path.join(os.getcwd(), data_folder, lists_folder, file_name_lists)

# Protein names of the training list, one entry per line of the list file.
prot_names = get_protein_names(file_name_lists)

In [None]:
"""
.DSSP FILE
PROTEIN NAME
AA CODE
SS3
PHI
PSI
ASA

.T FILE
RES NUM, AA CODE, THETA, TAU, OMEGA

.H FILE
AA NAME, CHAIN ID, RES NUM, AA CODE, HSE TOTAL, HSE UP, HSE DOWN
"""
# Column order of each array saved by convert_data_to_numpy:
# ['AA NAME', 'CHAIN ID', 'RES NUM', 'AA CODE', 'SS3', 'ASA', 'HSE TOTAL', 'HSE UP', 'HSE DOWN', 'PHI', 'PSI', 'THETA', 'TAU', 'OMEGA']