In [1]:
import pandas as pd
import glob
import os

In [3]:
def load_asd_data_with_pandas(data_path: str) -> pd.DataFrame:
    """
    Load every ``part-*.parquet`` file in a folder into a single DataFrame.

    Files are read in sorted path order so that the row order of the result
    is reproducible; ``glob`` alone returns matches in arbitrary order.

    Args:
        data_path: path to the folder that holds the parquet part files.

    Returns:
        pd.DataFrame: all part files concatenated, with a fresh 0..n-1 index.

    Raises:
        ValueError: if no ``part-*.parquet`` file is found in ``data_path``.
    """
    # Sort explicitly: glob order depends on the filesystem.
    parquet_files = sorted(glob.glob(os.path.join(data_path, "part-*.parquet")))

    if not parquet_files:
        raise ValueError(f"Не найдено parquet файлов в папке {data_path}")

    print(f"Найдено {len(parquet_files)} parquet файлов")

    # Read each part once, then concatenate in a single call.
    dataframes = [pd.read_parquet(file_path) for file_path in parquet_files]
    combined_df = pd.concat(dataframes, ignore_index=True)

    print(f"Общий размер данных: {combined_df.shape}")
    print(f"Колонки: {list(combined_df.columns)}")

    return combined_df

# Load the dataset from the sibling `asd` folder
agab_df = load_asd_data_with_pandas(data_path='../asd')

Найдено 20 parquet файлов
Общий размер данных: (1227083, 11)
Колонки: ['dataset', 'heavy_sequence', 'light_sequence', 'scfv', 'affinity_type', 'affinity', 'antigen_sequence', 'confidence', 'nanobody', 'metadata', 'processed_measurement']


In [52]:
def is_empty(x):
    """
    Return True when ``x`` carries no sequence: a missing value or a blank string.

    Strings are empty when they contain only whitespace. Any other scalar is
    empty when pandas treats it as missing (``None``, NaN of any float width,
    ``pd.NA``, ``pd.NaT``). Non-scalar values such as lists are never empty.

    Args:
        x: a single cell value.

    Returns:
        bool: whether the value should be treated as absent.
    """
    if isinstance(x, str):
        return x.strip() == ""
    # pd.isna returns an array for array-likes, which has no single truth value.
    if not pd.api.types.is_scalar(x):
        return False
    return bool(pd.isna(x))

# Evaluate emptiness once per column; every mask below is derived from these two.
heavy_missing = scfv_df["heavy_sequence"].apply(is_empty)
light_missing = scfv_df["light_sequence"].apply(is_empty)

# How many scFv rows already have both chains, or only one of them
both_present = ~heavy_missing & ~light_missing
only_heavy   = ~heavy_missing &  light_missing
only_light   =  heavy_missing & ~light_missing

print("scFv всего:", len(scfv_df))
print("оба домена уже раздельно:", both_present.sum())
print("только heavy заполнен:", only_heavy.sum())
print("только light заполнен:", only_light.sum())
# Rows in none of the three groups above are exactly those with both columns empty.
print("оба пустые:", (heavy_missing & light_missing).sum())

scFv всего: 132157
оба домена уже раздельно: 474
только heavy заполнен: 131682
только light заполнен: 1
оба пустые: 0


In [68]:
from anarci import anarci
from tqdm.notebook import tqdm

def split_scfv_domains(df, scheme="imgt", ncpu=4):
    """
    Split scFv sequences into heavy and light domains using ANARCI hit ranges.

    For every row the sequence is taken from ``heavy_sequence``; when that is
    missing or empty, ``light_sequence`` is used instead. All usable sequences
    are sent to ANARCI in one batch. Each hit's ``query_start``/``query_end``
    range is sliced out of the input sequence and stored by ``chain_type``:
    ``'H'`` goes to ``scfv_heavy_chain``, ``'K'``/``'L'`` to ``scfv_light_chain``.

    Results are tracked by row position, not index label, so a frame with
    duplicate index labels still gets each row's own domains.

    Args:
        df: DataFrame with a ``heavy_sequence`` column and, optionally, a
            ``light_sequence`` column. It is modified in place: the two
            result columns are added to it.
        scheme: numbering scheme forwarded to ANARCI.
        ncpu: number of worker processes forwarded to ANARCI.

    Returns:
        The same ``df`` with ``scfv_heavy_chain`` and ``scfv_light_chain``
        added (NaN where no domain was found). If the ANARCI call raises,
        the error is printed and ``df`` is returned without the new columns.
    """
    n_rows = len(df)
    heavy_values = df['heavy_sequence']
    # A missing light column simply means there is no fallback sequence.
    light_values = df['light_sequence'] if 'light_sequence' in df.columns else [''] * n_rows

    sequences = []   # (name, sequence) pairs passed to ANARCI
    positions = []   # row position in df of each submitted sequence

    print("Preparing sequences...")
    rows = zip(df.index, heavy_values, light_values)
    for pos, (label, heavy, light) in enumerate(
        tqdm(rows, total=n_rows, desc="Формируем список последовательностей")
    ):
        # Type-check before use so None/NaN/non-string cells never reach len().
        seq = heavy if isinstance(heavy, str) and heavy else light
        if isinstance(seq, str) and seq:
            sequences.append((str(label), seq))
            positions.append(pos)

    print(f"Running ANARCI on {len(sequences)} sequences...")

    alignment_details = []
    if sequences:  # nothing to annotate -> skip the external call entirely
        try:
            # anarci returns (numbering, alignment_details, hit_tables)
            _, alignment_details, _ = anarci(sequences, scheme=scheme, ncpu=ncpu)
        except Exception as e:
            # Best effort: report the failure and hand the frame back unchanged.
            print(f"ANARCI execution failed: {e}")
            return df

    heavy_by_pos = {}
    light_by_pos = {}

    print("Processing ANARCI results...")
    results = zip(positions, sequences, alignment_details)
    for pos, (_, full_seq), hits in tqdm(
        results, total=len(alignment_details), desc="Обработка ANARCI результатов"
    ):
        if not hits:
            continue

        for hit in hits:
            if not hit:
                continue

            start = hit.get('query_start')
            end = hit.get('query_end')
            if start is None or end is None:
                continue

            # query_start/query_end are used as a half-open Python slice; the
            # notebook's recorded output shows [start:end] already reaches the
            # last residue of the domain, so no +1 is needed.
            domain_seq = full_seq[start:end]

            # If several hits share a chain type, the last one wins.
            chain_type = hit.get('chain_type', 'unknown')
            if chain_type == 'H':
                heavy_by_pos[pos] = domain_seq
            elif chain_type in ('K', 'L'):
                light_by_pos[pos] = domain_seq

    # Assign back to the dataframe positionally; rows without a hit stay NaN.
    missing = float("nan")
    df['scfv_heavy_chain'] = [heavy_by_pos.get(pos, missing) for pos in range(n_rows)]
    df['scfv_light_chain'] = [light_by_pos.get(pos, missing) for pos in range(n_rows)]

    found_h = df['scfv_heavy_chain'].notna().sum()
    found_l = df['scfv_light_chain'].notna().sum()
    print(f"Extraction complete. Found {found_h} Heavy chains and {found_l} Light chains.")

    return df

# Run the splitter on a 50-row preview only (`head(50)`), not the whole dataframe.
# NOTE: this rebinds `scfv_df` to the 50-row result, so the previous, larger
# frame is no longer reachable under this name after the cell runs.
scfv_df = split_scfv_domains(scfv_df.head(50).copy())

# Show the input sequence next to the two extracted domains
scfv_df[['heavy_sequence', 'scfv_heavy_chain', 'scfv_light_chain']].head()

Preparing sequences...


Формируем список последовательностей:   0%|          | 0/50 [00:00<?, ?it/s]

Running ANARCI on 50 sequences...
Processing ANARCI results...


Обработка ANARCI результатов:   0%|          | 0/50 [00:00<?, ?it/s]

Extraction complete. Found 50 Heavy chains and 50 Light chains.


Unnamed: 0,heavy_sequence,scfv_heavy_chain,scfv_light_chain
24566,QVQLVESGGGVVQPGESLKISCAASGFTFSSFGMHWVRQAPGKGLE...,QVQLVESGGGVVQPGESLKISCAASGFTFSSFGMHWVRQAPGKGLE...,DVQMTQSPSSLSASAGDTVNITCQTSQHIRSSLAWYQQKSGQAPRL...
24567,QVQLVESGGGVVQPGESLKISCAASGFTFSSFGMHWVRQAPGKPLE...,QVQLVESGGGVVQPGESLKISCAASGFTFSSFGMHWVRQAPGKPLE...,DVQMTQSPSSLSASAGDTVNITCQTSQHIRSSLAWYQQKSGQAPRL...
24568,QVQLVESGGGVVQPGESLKISCAASGFTFSSKGMHWVRQAPGKGYE...,QVQLVESGGGVVQPGESLKISCAASGFTFSSKGMHWVRQAPGKGYE...,DVQMTQSPSSLSASAGDTVNITCQTSQHIRSSLAWYQQKSGQAPRL...
24569,QVQLVESGGGVVQPGESLKISCAASGFTFSSMGMHWVRQAPGCGLE...,QVQLVESGGGVVQPGESLKISCAASGFTFSSMGMHWVRQAPGCGLE...,DVQMTQSPSSLSASAGDTVNITCQTSQHIRSSLAWYQQKSGQAPRL...
24570,QVQLVESGGGVVQPGESLKISCAASGFTFSSNGMHWVRQAPGKGPE...,QVQLVESGGGVVQPGESLKISCAASGFTFSSNGMHWVRQAPGKGPE...,DVQMTQSPSSLSASAGDTVNITCQTSQHIRSSLAWYQQKSGQAPRL...


In [69]:
# Full input sequence of the first row
scfv_df['heavy_sequence'].iloc[0]

'QVQLVESGGGVVQPGESLKISCAASGFTFSSFGMHWVRQAPGKGLEWVAAISGSGGSTFYADSVKGRFTISRDNAHSNLYLEMQSLRAEDTAVYYCARSTRYYDSGGYDYYFDPWGPGTLVTVSSGGGGSGGGGSGGGGSDVQMTQSPSSLSASAGDTVNITCQTSQHIRSSLAWYQQKSGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISSLQPEDFATYYCQHTYITPYTFGQGTKVEIK'

In [70]:
# Heavy domain extracted for the first row
scfv_df['scfv_heavy_chain'].iloc[0]

'QVQLVESGGGVVQPGESLKISCAASGFTFSSFGMHWVRQAPGKGLEWVAAISGSGGSTFYADSVKGRFTISRDNAHSNLYLEMQSLRAEDTAVYYCARSTRYYDSGGYDYYFDPWGPGTLVTVSS'

In [71]:
# Light domain extracted for the first row
scfv_df['scfv_light_chain'].iloc[0]

'DVQMTQSPSSLSASAGDTVNITCQTSQHIRSSLAWYQQKSGQAPRLLIYGASSRATGIPDRFSGSGSGTDFTLTISSLQPEDFATYYCQHTYITPYTFGQGTKVEIK'