In [1]:
import pandas as pd
import glob, os, ast, re

In [2]:
def _parse_ent_entry(entry_lines):
    """Parse the accumulated lines of one 'Chain' record.

    The lines are joined with single spaces and split on '|'. Field 1 must be
    a Python literal that unpacks into ``(i, j, crossing)``; fields 2 and 3
    must be parseable as floats.

    Parameters
    ----------
    entry_lines : list of str
        Raw lines belonging to a single record (may span several lines).

    Returns
    -------
    list or None
        ``[(i, j, crossing_fixed), (GN, GC)]`` where ``crossing_fixed`` is the
        list of signed-integer strings found in ``crossing[0]`` and GN/GC are
        rounded to 3 decimals. ``None`` if the lines contain only whitespace.

    Raises
    ------
    IndexError, ValueError, SyntaxError
        If a non-blank record does not follow the expected layout.
    """
    text = ' '.join(entry_lines).strip()
    if not text:
        # Blank lines ahead of the first 'Chain' line are not a record.
        return None

    parts = text.split('|')
    # literal_eval accepts both quote styles, so no quote rewriting is needed.
    i, j, crossing = ast.literal_eval(parts[1].strip())
    # NOTE(review): only the first element of `crossing` is scanned, exactly as
    # before; confirm whether later elements should be included as well.
    crossing_fixed = re.findall(r'[+-]?\d+', crossing[0])
    GN = round(float(parts[2].strip()), 3)
    GC = round(float(parts[3].strip()), 3)
    return [(i, j, crossing_fixed), (GN, GC)]


def preprocess_ent_data(file):
    """Read one entanglement text file into a single-row DataFrame.

    A record starts at a line beginning with 'Chain' and continues over the
    following lines until the next such line (or end of file).

    Parameters
    ----------
    file : str or os.PathLike
        Path to the input file. The text of the file name before the first
        underscore is used as the ``uniprot`` value.

    Returns
    -------
    pandas.DataFrame
        One row with columns ``['uniprot', 'whGLN']``; ``whGLN`` holds the list
        of parsed records (an empty list if the file has none).
    """
    # basename handles the platform's path separators, unlike split('/').
    uniprot = os.path.basename(file).split('_')[0]

    combined_ent = []
    current_entry = []

    with open(file, 'r') as f:
        for line in f:
            # A new 'Chain' line closes whatever has been accumulated so far.
            if line.startswith('Chain') and current_entry:
                parsed = _parse_ent_entry(current_entry)
                if parsed is not None:
                    combined_ent.append(parsed)
                current_entry = []
            current_entry.append(line)

    # Flush the record that was still open when the file ended.
    if current_entry:
        parsed = _parse_ent_entry(current_entry)
        if parsed is not None:
            combined_ent.append(parsed)

    return pd.DataFrame([[uniprot, combined_ent]], columns=['uniprot', 'whGLN'])

In [3]:
# Parse every .txt file in the folder and stack the one-row frames into one table.
folder = 'GE_data/'
# glob returns matches in arbitrary order; sort so the row order is reproducible.
file_paths = sorted(glob.glob(os.path.join(folder, '*.txt')))
if not file_paths:
    # pd.concat on an empty list fails with an unhelpful message; fail early instead.
    raise FileNotFoundError(f"No .txt files found in {folder!r}")

dataframes = []
for idx, file in enumerate(file_paths):
    print(idx, file)
    res = preprocess_ent_data(file)
    dataframes.append(res)

combined_df = pd.concat(dataframes, ignore_index=True)

0 GE_data/A0A024RBG1_v4_GE.txt
1 GE_data/A0A075B6I1_v4_GE.txt
2 GE_data/A0A075B6J1_v4_GE.txt
3 GE_data/A0A075B6J2_v4_GE.txt
4 GE_data/A0A075B6J6_v4_GE.txt
5 GE_data/A0A075B6K2_v4_GE.txt
6 GE_data/A0A075B6L2_v4_GE.txt
7 GE_data/A0A075B6L6_v4_GE.txt
8 GE_data/A0A075B6N1_v4_GE.txt
9 GE_data/A0A075B6N2_v4_GE.txt
10 GE_data/A0A075B6N3_v4_GE.txt
11 GE_data/A0A075B6N4_v4_GE.txt
12 GE_data/A0A075B6Q5_v4_GE.txt
13 GE_data/A0A075B6R0_v4_GE.txt
14 GE_data/A0A075B6R2_v4_GE.txt
15 GE_data/A0A075B6S5_v4_GE.txt
16 GE_data/A0A075B6S9_v4_GE.txt
17 GE_data/A0A075B6T6_v4_GE.txt
18 GE_data/A0A075B6T7_v4_GE.txt
19 GE_data/A0A075B6U4_v4_GE.txt
20 GE_data/A0A075B6V5_v4_GE.txt
21 GE_data/A0A075B6W5_v4_GE.txt
22 GE_data/A0A075B759_v4_GE.txt
23 GE_data/A0A075B767_v4_GE.txt
24 GE_data/A0A087WSY4_v4_GE.txt
25 GE_data/A0A087WSY6_v4_GE.txt
26 GE_data/A0A087WSZ0_v4_GE.txt
27 GE_data/A0A087WT01_v4_GE.txt
28 GE_data/A0A087WT02_v4_GE.txt
29 GE_data/A0A087WV62_v4_GE.txt
30 GE_data/A0A087WVF3_v4_GE.txt
31 GE_data/A0A087W

In [4]:
# Display the combined table (one row per parsed input file).
combined_df

Unnamed: 0,uniprot,whGLN
0,A0A024RBG1,"[[(1, 40, ['-48']), (0.0, -0.806)], [(2, 40, [..."
1,A0A075B6I1,"[[(70, 116, ['-54']), (-0.697, 0.0)]]"
2,A0A075B6J1,"[[(72, 120, ['-54']), (-0.701, 0.0)]]"
3,A0A075B6J2,"[[(65, 108, ['-57']), (-0.697, 0.0)]]"
4,A0A075B6J6,"[[(67, 115, ['-52']), (-0.85, 0.0)], [(68, 114..."
...,...,...
11268,Q9Y6Y9,"[[(89, 136, ['-77']), (-0.628, -0.216)]]"
11269,Q9Y6Z7,"[[(150, 198, ['-269']), (0.047, -0.686)], [(15..."
11270,Q9YNA8,"[[(58, 376, ['-421']), (0.085, -0.739)], [(97,..."
11271,U3KPV4,"[[(30, 227, ['+307']), (0.051, 0.91)], [(30, 2..."


In [5]:
# Save the combined table as a pickle file; the path is relative to the working directory.
combined_df.to_pickle('AF_Human_whGLN.pkl')