# Imports

In [1]:
from  utils import read_txt_file, write_csv_file
import pandas as pd

# Materials and Methods

## Datasets

The **RB198** dataset was used as the training set, while **RB111** served as the independent test set in this implementation.

 Both datasets were acquired from the following source: http://ailab-projects2.ist.psu.edu/RNABindRPlus/data.html

Originally, the data was structured as a text file in the following format:

```
#First line: PDBID and Chain ID
#Second line: Sequence
#Third line: Interface residues defined using a 5.0 angstrom distance cut-off
2XFZ_Y
KGFKDYGHDYHPAPKTENIKGLGDLKPGIPKTPKQNGGGKRKRWTGDKGRKIYEWDSQAGELEGYRASDGQHLGSFDPKTGNQLKGPDPKRNIKKYL
0000000011100000000000000000111011111111110000010110000111100000010110000000000000000000111101111
```

We needed to convert this data into a CSV file with the following columns:

**PDBID**, **ChainID**, **Sequence**, **Interface**

The cell below performs this conversion for both datasets using the `read_txt_file` and `write_csv_file` helpers imported from `utils`.

In [13]:
# Source text files and the CSV files they are converted into
rb198txt, rb198 = 'Datasets/RB198.txt', 'Datasets/RB198.csv'
rb111txt, rb111 = 'Datasets/RB111.txt', 'Datasets/RB111.csv'

# Convert the training set first, then the independent set
for txt_path, csv_path in ((rb198txt, rb198), (rb111txt, rb111)):
    data = read_txt_file(txt_path)
    write_csv_file(data, csv_path)

The resulting DataFrame has the following structure:

In [15]:
# Read the converted training set and preview its first three rows
train_csv_path = 'Datasets/RB198.csv'
train_data = pd.read_csv(train_csv_path)
train_data.iloc[:3]

Unnamed: 0,PDBID,ChainID,Sequence,Interface
0,2AZ0,A,MPSKLALIQELPDRIQTAVEAAMGMSYQDAPNNVRRDLDNLHACLN...,0000000000000000000000000000000010011001001100...
1,1M8V,A,GAMAERPLDVIHRSLDKDVLVILKKGFEFRGRLIGYDIHLNVVLAD...,0001110110010010000000000000000001111110000000...
2,2PJP,A,FSEEQQAIWQKAEPLFGDEPWWVRDLAKETGTDEQAMRLTLRQAAQ...,0000000000000000000001110000000001000100000000...


## Methodology

In this implementation, the proposed PRIP method comprised five steps:

1. Pre-training the Word2vec model.
2. Dividing protein sequences.
3. Extracting semantic features.
4. Training the XGBoost classifier.
5. Discerning between binding and non-binding sites.


## Word2Vec

In [None]:
import pandas as pd
from gensim.models import Word2Vec
from gensim.models.word2vec import PathLineSentences

# Load the RB198 training set from the same CSV the rest of the notebook uses
rb198_path = 'Datasets/RB198.csv'
rb198_data = pd.read_csv(rb198_path)

# Tokenize the sequences into "words" (here, each amino acid is considered a word).
# The frame loaded in this cell is used so the cell does not depend on a
# variable (`train_data`) that only exists if another cell was run first.
tokenized_sequences = [list(sequence) for sequence in rb198_data['Sequence']]

# Define the Word2vec model with the specified parameters
model = Word2Vec(sentences=tokenized_sequences,
                vector_size=25,           # Dimensionality of the word vectors
                window=5,                 # Maximum distance between the current and predicted word within a sentence
                min_count=1,              # Ignores all words with total frequency lower than this
                sg=0,                     # Use CBOW model
                negative=5,               # Number of negative samples
                epochs=200,               # Number of iterations (epochs) over the corpus
                workers=1)                # Number of worker threads

# Save the model under the file name that the loading cell further down reads
# ("model_W2V.model"), so a top-to-bottom run uses the model trained here.
model.save('model_W2V.model')

# Example usage: Getting the vector for a specific amino acid
vector_for_amino_acid_A = model.wv['A']
print(vector_for_amino_acid_A)


In [56]:
def split_protein_sequence(protein_sequence, segment_length):
    """Cut a protein sequence into one fixed-length window per residue.

    For every position ``i`` the window covers the residue at ``i`` plus
    ``n = (segment_length - 1) // 2`` neighbours on each side. Positions that
    fall outside the sequence are filled with the padding character ``"X"``,
    on the left, on the right, or on both sides when the sequence is shorter
    than the window. Every returned segment therefore has exactly
    ``segment_length`` characters.

    Args:
        protein_sequence: The sequence as a string, one character per residue.
        segment_length: Window size; must be a positive odd number so that the
            window is centred on a residue.

    Returns:
        A list with ``len(protein_sequence)`` strings, the i-th one centred on
        residue ``i``.

    Raises:
        ValueError: If ``segment_length`` is even or not positive.
    """
    if segment_length < 1 or segment_length % 2 == 0:
        raise ValueError("Segment length must be a positive odd number")

    # Number of neighbours taken on each side of the central residue
    n = (segment_length - 1) // 2

    # Padding the whole sequence once on both sides guarantees that every
    # slice below is full-length, including windows that overrun both ends.
    padded_sequence = "X" * n + protein_sequence + "X" * n

    # In the padded string, residue i sits at index i + n, so the window
    # centred on it starts at index i.
    return [
        padded_sequence[i:i + segment_length]
        for i in range(len(protein_sequence))
    ]

In [63]:
# Concatenate semantic vectors for each residue
def feature_extract(segments , W2V_model):
    """Turn each segment into one flat feature vector of residue embeddings.

    Every residue of a segment is replaced by its embedding from
    ``W2V_model.wv`` and the embeddings are concatenated in order. The padding
    character ``'X'`` contributes a block of zeros whose width is taken from
    the model (``W2V_model.wv.vector_size``) rather than a fixed constant, so
    the function stays correct for models of any dimensionality.

    Args:
        segments: Iterable of segment strings, one character per residue.
        W2V_model: Model exposing ``wv[residue]`` lookups and
            ``wv.vector_size``.

    Returns:
        A list with one flat list per segment; each has
        ``len(segment) * vector_size`` entries.

    Raises:
        KeyError: Propagated from the ``wv`` lookup when a non-padding residue
            is not in the model's vocabulary.
    """
    embeddings = W2V_model.wv
    # Zero block used for every padding position, sized to match the model
    zero_vector = [0] * embeddings.vector_size

    semantic_vect_concat = []
    for segment in segments:
        seg_vect = []
        for residue in segment:
            if residue != 'X':
                seg_vect.extend(embeddings[residue])
            else:
                seg_vect.extend(zero_vector)
        semantic_vect_concat.append(seg_vect)

    return semantic_vect_concat
                 

In [36]:
from gensim.models import Word2Vec

def load_model(path):
    """Load a saved gensim Word2Vec model from ``path`` and return it."""
    w2v_model = Word2Vec.load(path)
    return w2v_model


In [38]:
# Load the pre-trained Word2Vec model
model = load_model("model_W2V.model")

# The Interface column contains only the digits 0/1. Reading it (and the
# sequence) explicitly as text prevents pandas from inferring a numeric dtype,
# which would drop leading zeros and make the per-residue labels unusable.
csv_dtypes = {'Sequence': str, 'Interface': str}
df_train = pd.read_csv("Datasets/RB198.csv", dtype=csv_dtypes)
df_test = pd.read_csv("Datasets/RB111.csv", dtype=csv_dtypes)

# Training sequences and their per-residue interface label strings
protein_seqs_train = df_train['Sequence'].to_list()
label_interfaces_train = df_train['Interface'].to_list()

# Independent-set sequences and their per-residue interface label strings
protein_seqs_test= df_test['Sequence'].to_list()
label_interfaces_test = df_test['Interface'].to_list()

# (sequence, labels) pairs for the training set
pairs = list(zip(protein_seqs_train , label_interfaces_train))


In [62]:
# Sequence splitting + assigning the interface labels for each split
# X collects one fixed-length segment per residue; y collects the matching
# per-residue interface label (one character, '0' or '1', per residue).
X , y = [] , []
segment_length = 11
n = 3  # NOTE(review): not used in this cell; split_protein_sequence derives its own half-window from segment_length
for pair_index, (prot_sequence, label_interf) in enumerate(pairs):
    # Each residue must have exactly one label, otherwise X and y would
    # silently drift out of alignment from this protein onwards.
    if len(prot_sequence) != len(label_interf):
        raise ValueError(
            f"Pair {pair_index}: sequence has {len(prot_sequence)} residues "
            f"but {len(label_interf)} interface labels"
        )

    # Flag sequences shorter than one full window so they can be inspected
    if len(prot_sequence) < segment_length:
        print(
            f"Warning: pair {pair_index} has a sequence of length "
            f"{len(prot_sequence)}, shorter than segment_length={segment_length}"
        )

    segments = split_protein_sequence(prot_sequence, segment_length)

    # X represents the segments (sequences split) and y represents the respective labels
    X.extend(segments)
    y.extend(label_interf)

len(X)

53660

In [64]:
# Build one concatenated embedding vector per segment using the loaded model
semantic_vects = feature_extract(segments=X, W2V_model=model)

In [66]:
# NOTE(review): this displays the entire nested feature list, which floods the
# notebook output; consider previewing a small slice or the list sizes instead.
semantic_vects

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0.0032008933,
  0.0029866076,
  0.02191154,
  -0.034424316,
  0.002337823,
  0.02747769,
  0.008926378,
  0.0044987057,
  -0.03728862,
  0.033929467,
  -0.02505651,
  -0.011969495,
  0.013975148,
  -0.0030905104,
  0.005645165,
  0.0071279667,
  -0.027315598,
  -0.038899247,
  0.03616234,
  0.024792219,
  -0.02765171,
  0.01361393,
  0.0008242559,
  0.019014983,
  -0.028