In [11]:
# Standard library
from collections import defaultdict

# Data handling / scientific stack
import pandas as pd
import numpy as np
import networkx as nx
import scipy as sp

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# PyTorch
import torch
import torch.nn as nn
# The functional API gets its own alias (F). Importing it "as nn" as well
# would rebind `nn` to torch.nn.functional and hide the layer classes
# (nn.Module, nn.Linear, ...) that live in torch.nn.
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# scikit-learn utilities
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Data

In [2]:
# Download the PiNUI human and yeast CSVs (in that order) and parse each one
# into its own DataFrame.
data_human, data_yeast = (
    pd.read_csv(csv_url)
    for csv_url in (
        "https://shiru-public.s3.us-west-2.amazonaws.com/PiNUI/PiNUI-human.csv",
        "https://shiru-public.s3.us-west-2.amazonaws.com/PiNUI/PiNUI-yeast.csv",
    )
)


In [3]:
# Overview of the human table: row count, first rows and numeric summary
# (each followed by a divider line), then the number of distinct values in
# each sequence column.
human_divider = '---' * 10
for human_view in (len(data_human), data_human.head(), data_human.describe()):
    print(human_view)
    print(human_divider)
for human_seq_column in ('seqA', 'seqB'):
    print(len(data_human[human_seq_column].unique()))

684951
------------------------------
                                                seqA  \
0  MKRRASDRGAGETSARAKALGSGISGNNAKRAGPFILGPRLGNSPV...   
1  MEAPSGSEPGGDGAGDCAHPDPRAPGAAAPSSGPGPCAAARESERQ...   
2  MDQNSVPEKAQNEADTNNADRFFRSHSSPPHHRPGHSRALHHYELH...   
3  MFADLDYDIEEDKLGIPTVPGKVTLQKDAQNLIGISIGGGAQYCPC...   
4  MAEGNHRKKPLKVLESLGKDFLTGVLDNLVEQNVLNWKEEEKKKYY...   

                                                seqB  interaction  
0  MAASAARGAAALRRSINQPVAFVRRIPWTAASSQLKEHFAQFGHVR...            1  
1  MKLFHTADWHLGKLVHGVYMTEDQKIVLDQFVQAVEEEKPDAVIIA...            1  
2  MTHCCSPCCQPTCCRTTCWQPTTVTTCSSTPCCQPSCCVSSCCQPC...            1  
3  MARTLRPSPLCPGGGKAQLSSASLLGAGLLLQPPTPPPLLLLLFPL...            1  
4  MASADSRRVADGGGAGGTFQPYLDTLRQELQQTDPTLLSVVVAVLA...            1  
------------------------------
         interaction
count  684951.000000
mean        0.333333
std         0.471405
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.0000

In [4]:
# Build two views of the rows whose `interaction` value is truthy, in a
# single pass over the table:
#   connections          -> {(seqA, seqB): interaction}
#   connections_network  -> {seqA: [seqB, ...]}, partners listed once each,
#                           in order of first appearance
connections = {}
connections_network = defaultdict(list)

# Set of (seqA, seqB) pairs already added to the adjacency lists. A set
# lookup is O(1), whereas testing `seqB in partner_list` gets slower as a
# sequence accumulates partners.
seen_pairs = set()

for row in data_human.itertuples(index=False):
    if not row.interaction:
        continue
    pair = (row.seqA, row.seqB)
    # For a repeated pair the last interaction value wins, while the key
    # keeps the position of its first occurrence.
    connections[pair] = row.interaction
    if pair not in seen_pairs:
        seen_pairs.add(pair)
        connections_network[row.seqA].append(row.seqB)


# One column per seqA; each column lists that sequence's partners. Wrapping
# the lists in Series lets pandas pad the shorter columns with NaN.
network_pd = pd.DataFrame(
    {seq_a: pd.Series(partners) for seq_a, partners in connections_network.items()}
)

In [5]:
# Same overview for the yeast table: row count, first rows and numeric
# summary (each followed by a divider line), then the number of distinct
# values in each sequence column.
yeast_divider = '---' * 10
for yeast_view in (len(data_yeast), data_yeast.head(), data_yeast.describe()):
    print(yeast_view)
    print(yeast_divider)
for yeast_seq_column in ('seqA', 'seqB'):
    print(len(data_yeast[yeast_seq_column].unique()))

159111
------------------------------
                                                seqA  \
0  MVKETKFYDILGVPVTATDVEIKKAYRKCALKYHPDKNPSEEAAEK...   
1  MSAPAANGEVPTFKLVLVGDGGTGKTTFVKRHLTGEFEKKYIATIG...   
2  MFFSKVMLTRRILVRGLATAKSSAPKLTDVLIVGGGPAGLTLAASI...   
3  MSHSGAAIFEKVSGIIAINEDVSPAELTWRSTDGDKVHTVVLSTID...   
4  MAETSLLEAGASAASTAAALENLQVEASCSVCLEYLKEPVIIECGH...   

                                                seqB  interaction  
0  MYYGISQFSEAYNKILRNSSSHSSCQLVIFVSCLNIDALCATKMLS...            1  
1  MVKRTVATNGDASGAHRAKKMSKTHASHIINAQEDYKHMYLSVQPL...            1  
2  MIPKLYIHLILSLLLLPLILAQDYYAILEIDKDATEKEIKSAYRQL...            1  
3  MAEGVFQGAIGIDLGTTYSCVATYESSVEIIANEQGNRVTPSFVAF...            1  
4  MEKKHVTVQIQSAPPSYIKLEANEKFVYITSTMNGLSYQIAAIVSY...            1  
------------------------------
         interaction
count  159111.000000
mean        0.333333
std         0.471406
min         0.000000
25%         0.000000
50%         0.000000
75%         1.000000
max         1.0000

## Dataset Preparation

In [27]:
# Lookup table built once at definition time instead of on every call.
# Codes start at 1 because 0 is reserved for padding.
_AMINO_ACIDS = "ACDEFGHIKLMNPQRSTVWY"
_AA_TO_INT = {aa: code for code, aa in enumerate(_AMINO_ACIDS, start=1)}


def encode_sequence(seq, max_length=1000):
    """Encode an amino-acid sequence as a fixed-length list of integer codes.

    Each of the 20 letters in "ACDEFGHIKLMNPQRSTVWY" maps to its 1-based
    position in that string. Any other character maps to 0, which is also
    the padding value, so unknown residues and padding are indistinguishable
    in the output.

    Args:
        seq: The sequence to encode; a string (or another sliceable sequence
            of single-character items).
        max_length: Length of the returned list. Longer sequences are
            truncated on the right; shorter ones are right-padded with 0.

    Returns:
        A new list of ints.
    """
    # Truncate before encoding so characters past max_length are never looked up.
    encoded = [_AA_TO_INT.get(aa, 0) for aa in seq[:max_length]]

    # Right-pad with zeros; the multiplier is <= 0 (no-op) when already long enough.
    encoded.extend([0] * (max_length - len(encoded)))
    return encoded


class PiNUIDataset(Dataset):
    """Dataset of encoded sequence pairs with a per-pair target.

    Every sequence is converted with ``encode_sequence`` to a list of
    ``max_length`` integer codes, and the result is stored as one 2-D tensor
    per side (``seqA`` and ``seqB``). Targets are stored as a float32 tensor.

    Args:
        seqA: Iterable of sequences for the first element of each pair.
        seqB: Iterable of sequences for the second element of each pair.
        targets: Array-like of targets, one per pair.
        max_length: Fixed encoded length passed to ``encode_sequence``.
        seq_dtype: dtype of the stored sequence tensors. Defaults to
            ``torch.float32`` (the original behaviour). Pass ``torch.long``
            if the codes are to be used as indices, e.g. for an embedding
            layer.

    Raises:
        ValueError: If ``seqA``, ``seqB`` and ``targets`` do not contain the
            same number of items.
    """

    def __init__(self, seqA, seqB, targets, max_length=1000, seq_dtype=torch.float32):
        # Encode sequences from strings to numerical lists
        encoded_a = self._encode_all(seqA, max_length)
        encoded_b = self._encode_all(seqB, max_length)

        # Convert the numerical lists and targets to tensors
        self.seqA = torch.tensor(encoded_a, dtype=seq_dtype)
        self.seqB = torch.tensor(encoded_b, dtype=seq_dtype)
        self.targets = torch.tensor(targets, dtype=torch.float32)

        # Fail at construction time rather than with an index error (or
        # silently misaligned pairs) partway through an epoch.
        if not (len(self.seqA) == len(self.seqB) == len(self.targets)):
            raise ValueError(
                "seqA, seqB and targets must have the same length, got "
                f"{len(self.seqA)}, {len(self.seqB)} and {len(self.targets)}"
            )

    @staticmethod
    def _encode_all(sequences, max_length):
        """Encode every sequence, encoding each distinct string only once."""
        cache = {}
        encoded = []
        for seq in sequences:
            if isinstance(seq, str):
                row = cache.get(seq)
                if row is None:
                    row = cache[seq] = encode_sequence(seq, max_length)
            else:
                # Non-string inputs may be unhashable; encode them directly.
                row = encode_sequence(seq, max_length)
            # Repeated strings share one list object; torch.tensor copies
            # the values, so the sharing never reaches the stored tensors.
            encoded.append(row)
        return encoded

    def __len__(self):
        """Return the number of pairs."""
        return len(self.targets)

    def __getitem__(self, idx):
        """Return ``((seqA[idx], seqB[idx]), targets[idx])``."""
        return (self.seqA[idx], self.seqB[idx]), self.targets[idx]
    
# Prepare dataset for training
def prepare_data(train_df, test_df, target='interaction', batch_size=32, max_length=1000):
    """Wrap a train and a test DataFrame in PyTorch DataLoaders.

    Both frames must provide 'seqA' and 'seqB' columns plus the column named
    by ``target``. Each frame is turned into a ``PiNUIDataset`` (sequences
    encoded to ``max_length``) and then into a ``DataLoader`` with the given
    ``batch_size``.

    Returns:
        ``(train_loader, test_loader)``; only the training loader shuffles.
    """
    def _as_dataset(frame):
        # Hand the raw column arrays to the dataset, which does the encoding.
        return PiNUIDataset(
            frame['seqA'].values,
            frame['seqB'].values,
            frame[target].values,
            max_length,
        )

    train_dataset, test_dataset = _as_dataset(train_df), _as_dataset(test_df)

    return (
        DataLoader(train_dataset, batch_size=batch_size, shuffle=True),
        DataLoader(test_dataset, batch_size=batch_size, shuffle=False),
    )


In [28]:
# Read the human PiNUI CSV into `data_human`, announcing the step first
# because the download can take a while.
print("Loading Data...")
human_csv_url = "https://shiru-public.s3.us-west-2.amazonaws.com/PiNUI/PiNUI-human.csv"
data_human = pd.read_csv(human_csv_url)

Loading Data...


In [None]:
# Fixed seed so the 80/20 partition is identical on every run; without it
# each execution trains and evaluates on a different split.
SPLIT_SEED = 42

# Stratify on the label so both partitions keep the same share of
# interaction == 1 rows as the full table.
train_val_proteins, test_proteins = train_test_split(
    data_human,
    train_size=0.8,
    random_state=SPLIT_SEED,
    stratify=data_human['interaction'],
)

# NOTE(review): this splits individual rows (pairs), so the same sequence can
# appear in both partitions - confirm that is acceptable for the evaluation.

# Prepare dataset
print("Preparing dataset")
train_loader, test_loader = prepare_data(
    train_val_proteins, test_proteins, target='interaction', batch_size=32
)

Preparing dataset
Preparing dataset


In [34]:
# Sanity check: pull one batch from the training loader and print its two
# parts - the (seqA, seqB) tensors first, then the labels.
data_iter = iter(train_loader)
first_batch = next(data_iter)
first_batch_data, first_batch_labels = first_batch
for batch_part in (first_batch_data, first_batch_labels):
    print(batch_part)

[tensor([[11., 16., 13.,  ...,  0.,  0.,  0.],
        [11.,  1.,  1.,  ...,  0.,  0.,  0.],
        [11., 14., 15.,  ...,  0.,  0.,  0.],
        ...,
        [11.,  1., 17.,  ...,  0.,  0.,  0.],
        [11.,  1., 16.,  ...,  0.,  0.,  0.],
        [11.,  1., 13.,  ...,  0.,  0.,  0.]]), tensor([[11.,  1., 13.,  ..., 18.,  9.,  8.],
        [11., 16.,  9.,  ...,  0.,  0.,  0.],
        [11., 16.,  9.,  ...,  0.,  0.,  0.],
        ...,
        [11., 17., 11.,  ...,  0.,  0.,  0.],
        [11.,  3., 16.,  ...,  0.,  0.,  0.],
        [11., 12., 18.,  ..., 14., 14.,  6.]])]
tensor([0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 1., 0.,
        0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0.])
