In [3]:
#!/usr/bin/python3 
from hyperparams import *
from rawstruct_preprocessing_debug import *
from rawdata_preprocessing import read_RPI_pairSeq, read_NPInter_pairSeq
from copy import deepcopy
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pylab as plt
import sys
#from struct_processing import *

from sklearn.preprocessing import StandardScaler
import math
import string
from functools import reduce

import numpy as np
%matplotlib inline

# Module-level verbosity switch: improvedCTF.generate_feature_dict prints its
# letters and the number of generated keys when this is truthy.
isPrint = True



# encoder for protein sequence
class ProEncoder:
    """Conjoint (k-mer count) encoder for protein sequences and structures.

    Amino acids are first collapsed into seven cluster letters through
    ``pro_intab`` -> ``pro_outtab``. k-mers over the reduced alphabet are then
    counted for every length from 1 up to ``WINDOW_P_UPLIMIT``; structure
    strings over ``structs`` are counted up to ``WINDOW_P_STRUCT_UPLIMIT``.
    """

    elements = 'AIYHRDC'
    structs = 'hec'

    # size of the reduced amino-acid alphabet
    element_number = 7
    # number of structure kind
    struct_kind = 3

    # clusters: {A,G,V}, {I,L,F,P}, {Y,M,T,S}, {H,N,Q,W}, {R,K}, {D,E}, {C}
    pro_intab = 'AGVILFPYMTSHNQWRKDEC'
    pro_outtab = 'AAAIIIIYYYYHHHHRRDDC'

    def __init__(self, WINDOW_P_UPLIMIT, WINDOW_P_STRUCT_UPLIMIT, CODING_FREQUENCY, VECTOR_REPETITION_CNN,
                 TRUNCATION_LEN=None, PERIOD_EXTENDED=None):
        """Store the configuration and build the k-mer lookup tables.

        Args:
            WINDOW_P_UPLIMIT: longest sequence k-mer to count.
            WINDOW_P_STRUCT_UPLIMIT: longest structure k-mer to count.
            CODING_FREQUENCY: when truthy, every per-length count vector is
                divided by its own maximum.
            VECTOR_REPETITION_CNN: stored only; no active method reads it.
            TRUNCATION_LEN: stored only.
            PERIOD_EXTENDED: stored only.
        """
        self.WINDOW_P_UPLIMIT = WINDOW_P_UPLIMIT
        self.WINDOW_P_STRUCT_UPLIMIT = WINDOW_P_STRUCT_UPLIMIT
        self.CODING_FREQUENCY = CODING_FREQUENCY
        self.VECTOR_REPETITION_CNN = VECTOR_REPETITION_CNN

        self.TRUNCATION_LEN = TRUNCATION_LEN
        self.PERIOD_EXTENDED = PERIOD_EXTENDED

        # k_mer_list holds every k-mer ordered by length; k_mer_map maps a
        # k-mer to its position in that list. With a window limit of 3 the
        # positions are 0~6 (1-mers), 7~55 (2-mers) and 56~398 (3-mers).
        self.k_mer_list, self.k_mer_map = self._build_k_mer_index(
            self.elements, self.WINDOW_P_UPLIMIT)
        self.k_mer_struct_list, self.k_mer_struct_map = self._build_k_mer_index(
            self.structs, self.WINDOW_P_STRUCT_UPLIMIT)

        # table for amino acid clusters
        self.transtable = str.maketrans(self.pro_intab, self.pro_outtab)

    @staticmethod
    def _build_k_mer_index(alphabet, max_k):
        """Return (list, position map) of all k-mers of length 1..max_k."""
        k_mer_list = []
        k_mers = ['']
        for _ in range(max_k):
            k_mers = [prefix + letter for prefix in k_mers for letter in alphabet]
            k_mer_list += k_mers
        return k_mer_list, {k_mer: i for i, k_mer in enumerate(k_mer_list)}

    def _count_k_mers(self, text, k_mer_map, alphabet_size, max_k):
        """Count the k-mers of ``text`` for K = 1..max_k and return a flat list."""
        encoded = []
        offset = 0
        for K in range(1, max_k + 1):
            vec = np.zeros(alphabet_size ** K)
            for i in range(len(text) - K + 1):
                # k_mer_map positions are global; subtract the offset of this
                # length's block to index into vec.
                vec[k_mer_map[text[i:i + K]] - offset] += 1
            offset += vec.size
            if self.CODING_FREQUENCY:
                peak = vec.max()
                # A block without any count (text shorter than K) stays all
                # zero; dividing by its zero maximum would produce NaN.
                if peak > 0:
                    vec = vec / peak
            encoded.extend(vec)
        return encoded

    def encode_conjoint(self, seq):
        """Encode a protein sequence as concatenated k-mer count vectors.

        Residues are mapped to their cluster letter and anything outside
        ``elements`` is dropped, as ``encode_conjoint_struct`` does.

        Returns:
            A 1-D numpy array (length 399 for a window limit of 3), or the
            string ``'Error'`` when no usable residue remains.
        """
        seq = seq.translate(self.transtable)
        seq = ''.join([x for x in seq if x in self.elements])
        if len(seq) == 0:
            return 'Error'
        return np.array(self._count_k_mers(
            seq, self.k_mer_map, self.element_number, self.WINDOW_P_UPLIMIT))

    def encode_conjoint_struct(self, seq, struct):
        """Encode a sequence together with its position-aligned structure.

        ``struct[i]`` is read for every kept position ``i`` of ``seq``;
        positions whose residue is outside ``elements`` are dropped from both
        strings.

        Returns:
            A 1-D numpy array holding the sequence features followed by the
            structure features (399 + 39 = 438 values for window limits of 3),
            or the string ``'Error'`` when no usable residue remains.
        """
        seq = seq.translate(self.transtable)
        kept = [(residue, struct[i]) for i, residue in enumerate(seq) if residue in self.elements]
        if not kept:
            return 'Error'
        seq = ''.join(residue for residue, _ in kept)
        struct = ''.join(state for _, state in kept)

        result_seq = self._count_k_mers(
            seq, self.k_mer_map, self.element_number, self.WINDOW_P_UPLIMIT)
        result_struct = self._count_k_mers(
            struct, self.k_mer_struct_map, self.struct_kind, self.WINDOW_P_STRUCT_UPLIMIT)
        return np.array(result_seq + result_struct)

    # NOTE: the CNN-shaped variants (encode_conjoint_cnn and
    # encode_conjoint_struct_cnn) were already disabled in this class, which is
    # why VECTOR_REPETITION_CNN is stored but never read.


# encoder for RNA sequence
class RNAEncoder:
    """Conjoint (k-mer count) encoder for RNA sequences and structures.

    Sequences are read over the alphabet ``elements`` after replacing ``T``
    with ``U``; structures are read over ``structs`` after replacing ``)``
    with ``(``.
    """

    elements = 'AUCG'
    structs = '.('

    # size of the nucleotide alphabet
    element_number = 4
    # number of structure kind
    struct_kind = 2

    def __init__(self, WINDOW_R_UPLIMIT, WINDOW_R_STRUCT_UPLIMIT, CODING_FREQUENCY, VECTOR_REPETITION_CNN,
                 TRUNCATION_LEN=None, PERIOD_EXTENDED=None):
        """Store the configuration and build the k-mer lookup tables.

        Args:
            WINDOW_R_UPLIMIT: longest sequence k-mer to count.
            WINDOW_R_STRUCT_UPLIMIT: longest structure k-mer to count.
            CODING_FREQUENCY: when truthy, every per-length count vector is
                divided by its own maximum.
            VECTOR_REPETITION_CNN: stored only; no active method reads it.
            TRUNCATION_LEN: stored only.
            PERIOD_EXTENDED: stored only.
        """
        self.WINDOW_R_UPLIMIT = WINDOW_R_UPLIMIT
        self.WINDOW_R_STRUCT_UPLIMIT = WINDOW_R_STRUCT_UPLIMIT
        self.CODING_FREQUENCY = CODING_FREQUENCY
        self.VECTOR_REPETITION_CNN = VECTOR_REPETITION_CNN

        self.TRUNCATION_LEN = TRUNCATION_LEN
        self.PERIOD_EXTENDED = PERIOD_EXTENDED

        # list and position map for k_mer / k_mer structure, ordered by length
        self.k_mer_list, self.k_mer_map = self._build_k_mer_index(
            self.elements, self.WINDOW_R_UPLIMIT)
        self.k_mer_struct_list, self.k_mer_struct_map = self._build_k_mer_index(
            self.structs, self.WINDOW_R_STRUCT_UPLIMIT)

    @staticmethod
    def _build_k_mer_index(alphabet, max_k):
        """Return (list, position map) of all k-mers of length 1..max_k."""
        k_mer_list = []
        k_mers = ['']
        for _ in range(max_k):
            k_mers = [prefix + letter for prefix in k_mers for letter in alphabet]
            k_mer_list += k_mers
        return k_mer_list, {k_mer: i for i, k_mer in enumerate(k_mer_list)}

    def _count_k_mers(self, text, k_mer_map, alphabet_size, max_k):
        """Count the k-mers of ``text`` for K = 1..max_k and return a flat list."""
        encoded = []
        offset = 0
        for K in range(1, max_k + 1):
            vec = np.zeros(alphabet_size ** K)
            for i in range(len(text) - K + 1):
                # k_mer_map positions are global; subtract the offset of this
                # length's block to index into vec.
                vec[k_mer_map[text[i:i + K]] - offset] += 1
            offset += vec.size
            if self.CODING_FREQUENCY:
                peak = vec.max()
                # A block without any count (text shorter than K) stays all
                # zero; dividing by its zero maximum would produce NaN.
                if peak > 0:
                    vec = vec / peak
            encoded.extend(vec)
        return encoded

    def encode_conjoint(self, seq):
        """Encode an RNA sequence as concatenated k-mer count vectors.

        Returns:
            A 1-D numpy array (length 340 for a window limit of 4), or the
            string ``'Error'`` when no character of ``elements`` remains.
        """
        seq = seq.replace('T', 'U')
        seq = ''.join([x for x in seq if x in self.elements])
        if len(seq) == 0:
            return 'Error'
        return np.array(self._count_k_mers(
            seq, self.k_mer_map, self.element_number, self.WINDOW_R_UPLIMIT))

    def encode_conjoint_struct(self, seq, struct):
        """Encode a sequence together with its position-aligned structure.

        ``struct[i]`` is read for every kept position ``i`` of ``seq``;
        positions whose nucleotide is outside ``elements`` are dropped from
        both strings.

        Returns:
            A 1-D numpy array holding the sequence features followed by the
            structure features, or the string ``'Error'`` when no usable
            nucleotide remains.
        """
        seq = seq.replace('T', 'U')
        struct = struct.replace(')', '(')
        kept = [(base, struct[i]) for i, base in enumerate(seq) if base in self.elements]
        if not kept:
            return 'Error'
        seq = ''.join(base for base, _ in kept)
        struct = ''.join(state for _, state in kept)

        result_seq = self._count_k_mers(
            seq, self.k_mer_map, self.element_number, self.WINDOW_R_UPLIMIT)
        result_struct = self._count_k_mers(
            struct, self.k_mer_struct_map, self.struct_kind, self.WINDOW_R_STRUCT_UPLIMIT)
        return np.array(result_seq + result_struct)

    # NOTE: the CNN-shaped variants (encode_conjoint_cnn and
    # encode_conjoint_struct_cnn) were already disabled in this class, which is
    # why VECTOR_REPETITION_CNN is stored but never read.

def standardization(X):
    """Fit a StandardScaler on ``X`` and return ``(scaled X, fitted scaler)``.

    StandardScaler rescales each feature to zero mean and unit variance.
    Reference: https://datascienceschool.net/view-notebook/f43be7d6515b48c0beb909826993c856/
    """
    fitted = StandardScaler().fit(X)
    return fitted.transform(X), fitted

def coding_pairs(pairs, pro_seqs, rna_seqs, pro_structs, rna_structs, PE, RE, kind):
    """Encode every usable protein/RNA pair into a labelled sample.

    Args:
        pairs: iterable of pairs; ``pr[0]`` is used as the protein key and
            ``pr[1]`` as the RNA key.
        pro_seqs, rna_seqs: mappings from key to sequence.
        pro_structs, rna_structs: mappings from key to structure string.
        PE, RE: encoders exposing ``encode_conjoint_struct(seq, struct)``.
        kind: label stored with every sample (1 = positive, 0 = negative).

    Returns:
        A list of ``[[p_conjoint_struct, r_conjoint_struct], kind]`` entries.
        A pair is skipped, with a printed notice, when one of its keys is
        missing from a dictionary or when an encoder returns its ``'Error'``
        sentinel.
    """
    def is_error(encoded):
        # The encoders return the string 'Error' for an unusable sequence.
        # The isinstance guard avoids comparing a feature array to a string.
        return isinstance(encoded, str) and encoded == 'Error'

    samples = []
    for pr in pairs:
        # Pairs with a missing sequence or structure are simply left out.
        if not (pr[0] in pro_seqs and pr[1] in rna_seqs and pr[0] in pro_structs and pr[1] in rna_structs):
            print('Skip pair {} according to sequence dictionary.'.format(pr))
            continue

        p_conjoint_struct = PE.encode_conjoint_struct(pro_seqs[pr[0]], pro_structs[pr[0]])
        r_conjoint_struct = RE.encode_conjoint_struct(rna_seqs[pr[1]], rna_structs[pr[1]])

        # Without this check the 'Error' string would be stored as a feature
        # vector and break the array construction downstream.
        if is_error(p_conjoint_struct):
            print('Skip {} in pair {} according to conjoint_struct coding process.'.format(pr[0], pr))
            continue
        if is_error(r_conjoint_struct):
            print('Skip {} in pair {} according to conjoint_struct coding process.'.format(pr[1], pr))
            continue

        samples.append([[p_conjoint_struct, r_conjoint_struct], kind])

    return samples

def pre_process_data(samples, samples_pred=None):
    """Split encoded samples into standardized feature matrices and labels.

    Every entry of ``samples`` is read as
    ``[[p_conjoint_struct, r_conjoint_struct], kind]``: index ``[0][0]`` is
    the protein vector, ``[0][1]`` the RNA vector and ``[1]`` the label.

    Args:
        samples: sequence of entries in the layout above.
        samples_pred: accepted but not used by this function.

    Returns:
        ``(p_conjoint_struct, r_conjoint_struct, y_samples)``, where the two
        matrices are the outputs of ``standardization`` (one row per sample)
        and ``y_samples`` is the label array. The fitted scalers are
        discarded.
    """
    protein_rows = []
    rna_rows = []
    labels = []
    for entry in samples:
        encoded_pair = entry[0]
        protein_rows.append(encoded_pair[0])
        rna_rows.append(encoded_pair[1])
        labels.append(entry[1])

    protein_matrix = np.array(protein_rows)
    rna_matrix = np.array(rna_rows)
    y_samples = np.array(labels)

    # Protein and RNA features are scaled independently of each other.
    protein_matrix, _ = standardization(protein_matrix)
    rna_matrix, _ = standardization(rna_matrix)

    return protein_matrix, rna_matrix, y_samples


# Reduced Protein letters(7 letters)
def get_reduced_protein_letter_dict():
    """Return a dict mapping each amino-acid letter to its class letter.

    The 20 letters are grouped into seven classes labelled "A" to "G",
    for example {"A": "A", "G": "A", "V": "A", "I": "B", ...}.
    """
    class_members = ("AGV", "ILFP", "YMTS", "HNQW", "RK", "DE", "C")
    class_labels = "ABCDEFG"
    return {
        residue: label
        for label, members in zip(class_labels, class_members)
        for residue in members
    }

# Improved CTF 
class improvedCTF:
    """Feature dictionary keyed by every letter combination up to ``length``.

    ``self.dict`` maps each combination of 1..``length`` letters to 0; the
    keys are inserted in depth-first order.
    """

    def __init__(self, letters, length):
        self.dict = {}
        self.letters = letters
        self.length = length
        self.generate_feature_dict()

    def generate_feature_dict(self):
        """Fill ``self.dict`` with all combinations, each set to 0."""
        def expand(prefix, level):
            # Register every one-letter extension of prefix, descending into
            # each one until keys reach the configured length.
            for letter in self.letters:
                key = prefix + letter
                self.dict[key] = 0
                if level + 1 != self.length:
                    expand(key, level + 1)

        if self.length != 0:
            expand("", 0)

        if isPrint:
            print("iterate letters : {}".format(self.letters))
            print("number of keys  : {}".format(len(self.dict)))

    def get_feature_dict(self):
        """Reset every stored count to 0 and return an independent copy."""
        self.dict.update(dict.fromkeys(self.dict, 0))
        return deepcopy(self.dict)

class ProEncoder:
    """Conjoint (k-mer count) encoder for protein sequences and structures.

    Amino acids are first collapsed into seven cluster letters through
    ``pro_intab`` -> ``pro_outtab``. ``encode_conjoint`` counts sequence
    k-mers; ``encode_conjoint_struct`` returns structure k-mer counts only.
    """

    elements = 'AIYHRDC'
    structs = 'hec'

    # size of the reduced amino-acid alphabet
    element_number = 7
    # number of structure kind
    struct_kind = 3

    # clusters: {A,G,V}, {I,L,F,P}, {Y,M,T,S}, {H,N,Q,W}, {R,K}, {D,E}, {C}
    pro_intab = 'AGVILFPYMTSHNQWRKDEC'
    pro_outtab = 'AAAIIIIYYYYHHHHRRDDC'

    def __init__(self, WINDOW_P_UPLIMIT=3, WINDOW_P_STRUCT_UPLIMIT=3, CODING_FREQUENCY=True, VECTOR_REPETITION_CNN=1,
                 TRUNCATION_LEN=None, PERIOD_EXTENDED=None):
        """Store the configuration and build the k-mer lookup tables.

        Args:
            WINDOW_P_UPLIMIT: longest sequence k-mer to count.
            WINDOW_P_STRUCT_UPLIMIT: longest structure k-mer to count.
            CODING_FREQUENCY: when truthy, every per-length count vector is
                divided by its own maximum.
            VECTOR_REPETITION_CNN: stored only; no method reads it.
            TRUNCATION_LEN: stored only.
            PERIOD_EXTENDED: stored only.
        """
        self.WINDOW_P_UPLIMIT = WINDOW_P_UPLIMIT
        self.WINDOW_P_STRUCT_UPLIMIT = WINDOW_P_STRUCT_UPLIMIT
        self.CODING_FREQUENCY = CODING_FREQUENCY
        self.VECTOR_REPETITION_CNN = VECTOR_REPETITION_CNN

        self.TRUNCATION_LEN = TRUNCATION_LEN
        self.PERIOD_EXTENDED = PERIOD_EXTENDED

        # k_mer_list holds every k-mer ordered by length; k_mer_map maps a
        # k-mer to its position in that list. With a window limit of 3 the
        # positions are 0~6 (1-mers), 7~55 (2-mers) and 56~398 (3-mers).
        self.k_mer_list, self.k_mer_map = self._build_k_mer_index(
            self.elements, self.WINDOW_P_UPLIMIT)
        self.k_mer_struct_list, self.k_mer_struct_map = self._build_k_mer_index(
            self.structs, self.WINDOW_P_STRUCT_UPLIMIT)

        # table for amino acid clusters
        self.transtable = str.maketrans(self.pro_intab, self.pro_outtab)

    @staticmethod
    def _build_k_mer_index(alphabet, max_k):
        """Return (list, position map) of all k-mers of length 1..max_k."""
        k_mer_list = []
        k_mers = ['']
        for _ in range(max_k):
            k_mers = [prefix + letter for prefix in k_mers for letter in alphabet]
            k_mer_list += k_mers
        return k_mer_list, {k_mer: i for i, k_mer in enumerate(k_mer_list)}

    def _count_k_mers(self, text, k_mer_map, alphabet_size, max_k):
        """Count the k-mers of ``text`` for K = 1..max_k and return a flat list."""
        encoded = []
        offset = 0
        for K in range(1, max_k + 1):
            vec = np.zeros(alphabet_size ** K)
            for i in range(len(text) - K + 1):
                # k_mer_map positions are global; subtract the offset of this
                # length's block to index into vec.
                vec[k_mer_map[text[i:i + K]] - offset] += 1
            offset += vec.size
            if self.CODING_FREQUENCY:
                peak = vec.max()
                # A block without any count (text shorter than K) stays all
                # zero; dividing by its zero maximum would produce NaN.
                if peak > 0:
                    vec = vec / peak
            encoded.extend(vec)
        return encoded

    def encode_conjoint(self, seq):
        """Encode a protein sequence as concatenated k-mer count vectors.

        Residues are mapped to their cluster letter and anything outside
        ``elements`` is dropped, as ``encode_conjoint_struct`` does.

        Returns:
            A 1-D numpy array (length 399 for a window limit of 3), or the
            string ``'Error'`` when no usable residue remains.
        """
        seq = seq.translate(self.transtable)
        seq = ''.join([x for x in seq if x in self.elements])
        if len(seq) == 0:
            return 'Error'
        return np.array(self._count_k_mers(
            seq, self.k_mer_map, self.element_number, self.WINDOW_P_UPLIMIT))

    def encode_conjoint_struct(self, seq, struct):
        """Encode the structure string aligned with ``seq``.

        ``seq`` is used only to decide which positions to keep: positions
        whose residue is outside ``elements`` are dropped from ``struct``.
        Sequence features are not part of the result.

        Returns:
            A 1-D numpy array of structure k-mer counts (3 + 9 + 27 = 39
            values for a window limit of 3), or the string ``'Error'`` when
            no usable residue remains.
        """
        seq = seq.translate(self.transtable)
        kept_states = [struct[i] for i, residue in enumerate(seq) if residue in self.elements]
        if not kept_states:
            return 'Error'
        struct = ''.join(kept_states)

        return np.array(self._count_k_mers(
            struct, self.k_mer_struct_map, self.struct_kind, self.WINDOW_P_STRUCT_UPLIMIT))
    

    
# encoder for RNA sequence
class RNAEncoder:
    """Conjoint (k-mer count) encoder for RNA sequences and structures.

    Sequences are read over the alphabet ``elements`` after replacing ``T``
    with ``U``; structures are read over ``structs`` after replacing ``)``
    with ``(``.
    """

    elements = 'AUCG'
    structs = '.('

    # size of the nucleotide alphabet
    element_number = 4
    # number of structure kind
    struct_kind = 2

    def __init__(self, WINDOW_R_UPLIMIT=4, WINDOW_R_STRUCT_UPLIMIT=4, CODING_FREQUENCY=1, VECTOR_REPETITION_CNN=1,
                 TRUNCATION_LEN=None, PERIOD_EXTENDED=None):
        """Store the configuration and build the k-mer lookup tables.

        Args:
            WINDOW_R_UPLIMIT: longest sequence k-mer to count.
            WINDOW_R_STRUCT_UPLIMIT: longest structure k-mer to count.
            CODING_FREQUENCY: when truthy, every per-length count vector is
                divided by its own maximum.
            VECTOR_REPETITION_CNN: stored only; no method reads it.
            TRUNCATION_LEN: stored only.
            PERIOD_EXTENDED: stored only.
        """
        self.WINDOW_R_UPLIMIT = WINDOW_R_UPLIMIT
        self.WINDOW_R_STRUCT_UPLIMIT = WINDOW_R_STRUCT_UPLIMIT
        self.CODING_FREQUENCY = CODING_FREQUENCY
        self.VECTOR_REPETITION_CNN = VECTOR_REPETITION_CNN

        self.TRUNCATION_LEN = TRUNCATION_LEN
        self.PERIOD_EXTENDED = PERIOD_EXTENDED

        # list and position map for k_mer / k_mer structure, ordered by length
        self.k_mer_list, self.k_mer_map = self._build_k_mer_index(
            self.elements, self.WINDOW_R_UPLIMIT)
        self.k_mer_struct_list, self.k_mer_struct_map = self._build_k_mer_index(
            self.structs, self.WINDOW_R_STRUCT_UPLIMIT)

    @staticmethod
    def _build_k_mer_index(alphabet, max_k):
        """Return (list, position map) of all k-mers of length 1..max_k."""
        k_mer_list = []
        k_mers = ['']
        for _ in range(max_k):
            k_mers = [prefix + letter for prefix in k_mers for letter in alphabet]
            k_mer_list += k_mers
        return k_mer_list, {k_mer: i for i, k_mer in enumerate(k_mer_list)}

    def _count_k_mers(self, text, k_mer_map, alphabet_size, max_k):
        """Count the k-mers of ``text`` for K = 1..max_k and return a flat list."""
        encoded = []
        offset = 0
        for K in range(1, max_k + 1):
            vec = np.zeros(alphabet_size ** K)
            for i in range(len(text) - K + 1):
                # k_mer_map positions are global; subtract the offset of this
                # length's block to index into vec.
                vec[k_mer_map[text[i:i + K]] - offset] += 1
            offset += vec.size
            if self.CODING_FREQUENCY:
                peak = vec.max()
                # A block without any count (text shorter than K) stays all
                # zero; dividing by its zero maximum would produce NaN.
                if peak > 0:
                    vec = vec / peak
            encoded.extend(vec)
        return encoded

    def encode_conjoint(self, seq):
        """Encode an RNA sequence as concatenated k-mer count vectors.

        Returns:
            A 1-D numpy array (length 340 for a window limit of 4), or the
            string ``'Error'`` when no character of ``elements`` remains.
        """
        seq = seq.replace('T', 'U')
        seq = ''.join([x for x in seq if x in self.elements])
        if len(seq) == 0:
            return 'Error'
        return np.array(self._count_k_mers(
            seq, self.k_mer_map, self.element_number, self.WINDOW_R_UPLIMIT))

    def encode_conjoint_struct(self, seq, struct):
        """Encode a sequence together with its position-aligned structure.

        ``struct[i]`` is read for every kept position ``i`` of ``seq``;
        positions whose nucleotide is outside ``elements`` are dropped from
        both strings.

        Returns:
            A 1-D numpy array holding the sequence features followed by the
            structure features, or the string ``'Error'`` when no usable
            nucleotide remains.
        """
        seq = seq.replace('T', 'U')
        struct = struct.replace(')', '(')
        kept = [(base, struct[i]) for i, base in enumerate(seq) if base in self.elements]
        if not kept:
            return 'Error'
        seq = ''.join(base for base, _ in kept)
        struct = ''.join(state for _, state in kept)

        result_seq = self._count_k_mers(
            seq, self.k_mer_map, self.element_number, self.WINDOW_R_UPLIMIT)
        result_struct = self._count_k_mers(
            struct, self.k_mer_struct_map, self.struct_kind, self.WINDOW_R_STRUCT_UPLIMIT)
        return np.array(result_seq + result_struct)

def preprocess_struct(pairs, pro_seqs, rna_seqs, pro_structs, rna_structs, PE, RE, kind):
    """Encode protein-RNA pairs into labelled conjoint feature samples.

    Args:
        pairs: Iterable of pairs; ``pr[0]`` is the protein id and ``pr[1]``
            the RNA id.
        pro_seqs: Mapping protein id -> protein sequence.
        rna_seqs: Mapping RNA id -> RNA sequence.
        pro_structs: Mapping protein id -> protein structure string.
        rna_structs: Mapping RNA id -> RNA structure string.
        PE: Protein encoder exposing ``encode_conjoint(seq)`` and
            ``encode_conjoint_struct(seq, struct)``.
        RE: RNA encoder exposing the same two methods.
        kind: Label stored with every sample (1 = positive, 0 = negative,
            per the original author's note).

    Returns:
        A list whose elements are
        ``[[p_conjoint, r_conjoint], [p_conjoint_struct, r_conjoint_struct], kind]``.
        Pairs with a missing dictionary entry, or for which any encoder
        returned the ``'Error'`` sentinel, are skipped with a printed message.
    """

    def _is_error(encoded):
        # Encoders signal failure with the string 'Error'. Compare by value,
        # not identity (``is`` on a literal depends on string interning), and
        # check the type first so array results are never compared
        # element-wise against a string.
        return isinstance(encoded, str) and encoded == 'Error'

    samples = []
    for pr in pairs:
        pro_id, rna_id = pr[0], pr[1]

        # Missing-data handling: a pair is simply dropped when its sequence or
        # structure is absent from (the keys of) any of the four dictionaries.
        if not (pro_id in pro_seqs and rna_id in rna_seqs
                and pro_id in pro_structs and rna_id in rna_structs):
            print('Skip pair {} according to sequence dictionary.'.format(pr))
            continue

        p_seq = pro_seqs[pro_id]  # protein sequence
        r_seq = rna_seqs[rna_id]  # rna sequence
        p_struct = pro_structs[pro_id]  # protein structure
        r_struct = rna_structs[rna_id]  # rna structure

        # Sequence-only encodings, then the sequence+structure encodings.
        p_conjoint = PE.encode_conjoint(p_seq)
        r_conjoint = RE.encode_conjoint(r_seq)
        p_conjoint_struct = PE.encode_conjoint_struct(p_seq, p_struct)
        r_conjoint_struct = RE.encode_conjoint_struct(r_seq, r_struct)

        if _is_error(p_conjoint):
            print('Skip {} in pair {} according to conjoint coding process.'.format(pr[0], pr))
        elif _is_error(r_conjoint):
            print('Skip {} in pair {} according to conjoint coding process.'.format(pr[1], pr))
        elif _is_error(p_conjoint_struct):
            print('Skip {} in pair {} according to conjoint_struct coding process.'.format(pr[0], pr))
        elif _is_error(r_conjoint_struct):
            print('Skip {} in pair {} according to conjoint_struct coding process.'.format(pr[1], pr))
        else:
            samples.append([[p_conjoint, r_conjoint],
                            [p_conjoint_struct, r_conjoint_struct],
                            kind])

    return samples



# CTF feature processing
def preprocess_feature(x, y, npz_path):
    """Turn raw (protein, RNA) sequence pairs into normalised k-mer count features.

    For every pair the protein sequence is rewritten letter by letter through
    the mapping returned by ``get_reduced_protein_letter_dict()`` (``"X"`` is
    kept as is). Substrings of length 1..3 (protein) and 1..4 (RNA) are then
    counted into the dictionaries returned by
    ``improvedCTF(...).get_feature_dict()``. Each count vector is passed
    through ``log1p`` and then min-max scaled.

    Args:
        x: Sized iterable of ``(protein_sequence, rna_sequence)`` pairs
            (per the original notes, the ``X`` returned by
            ``read_RPI_pairSeq`` / ``read_NPInter_pairSeq``).
        y: Labels, one per pair; converted with ``np.array``.
        npz_path: Only echoed in the summary print. The ``np.savez`` call that
            used it is disabled, so nothing is written here.

    Returns:
        ``(x_protein, x_rna, y)`` as ``numpy`` arrays. Each row of
        ``x_protein`` / ``x_rna`` is the feature vector of one pair. The row
        length equals the number of keys in the respective feature dictionary
        (per the original author's notes 7 + 49 + 343 = 399 for protein and
        340 for RNA; not verifiable from this block), so it does not depend
        on the sequence length.
    """

    def min_max_norm(a):
        # Rescale so the minimum maps to 0 and the maximum to 1.
        a_min = np.min(a)
        span = np.max(a) - a_min
        if span == 0:
            # Constant vector (e.g. nothing was counted): 0/0 would give NaN.
            return np.zeros_like(a, dtype=float)
        return (a - a_min) / span

    def z_score_norm(lst):
        # Alternative normalisation, currently unused. Mean and standard
        # deviation are computed once instead of once per element.
        values = np.asarray(lst, dtype=float)
        return list((values - np.mean(values)) / np.std(values))

    def log_norm_dist(feature_dict):
        # log(1 + count) of the counts, in the dictionary's key order.
        return np.log1p(np.array(list(feature_dict.values()), dtype=float))

    def count_patterns(sequence, max_mer, feature_dict):
        # Increment feature_dict[pattern] for each substring of length
        # 1..max_mer that is a known key.
        for mer in range(1, max_mer + 1):
            # NOTE(review): range(len(sequence) - mer) never visits the final
            # window, so the last k-mer of every length is not counted
            # (the encoder classes in this file use ``seq_len - K + 1``).
            # Left unchanged because fixing it alters every produced feature
            # vector -- confirm intent before changing.
            for i in range(len(sequence) - mer):
                pattern = sequence[i:i + mer]
                try:
                    feature_dict[pattern] += 1
                except KeyError:
                    # Pattern outside the alphabet (e.g. containing "X").
                    continue

    rpdict = get_reduced_protein_letter_dict()

    r_mer = 4
    r_CTF = improvedCTF(letters=["A", "C", "G", "U"], length=r_mer)

    p_mer = 3
    p_CTF = improvedCTF(letters=["A", "B", "C", "D", "E", "F", "G"], length=p_mer)

    x_protein = []
    x_rna = []

    for idx, (pseq, rseq) in enumerate(x):  # pseq: protein sequence / rseq: RNA sequence
        # A fresh dictionary per pair; assumed to start with every count at
        # zero (per the original author's note) -- TODO confirm in improvedCTF.
        r_feature_dict = r_CTF.get_feature_dict()
        p_feature_dict = p_CTF.get_feature_dict()

        # Collapse each amino-acid letter to its reduced-alphabet letter;
        # "X" has no mapping and is kept unchanged.
        pseq = "".join(p if p == "X" else rpdict[p] for p in pseq)

        count_patterns(pseq, p_mer, p_feature_dict)
        count_patterns(rseq, r_mer, r_feature_dict)

        x_protein.append(min_max_norm(log_norm_dist(p_feature_dict)))
        x_rna.append(min_max_norm(log_norm_dist(r_feature_dict)))

        if isPrint:
            print("CTF preprocessing ({} / {})".format(idx+1, len(x)))

    x_protein = np.array(x_protein)
    x_rna = np.array(x_rna)
    y = np.array(y)
    #np.savez(npz_path,XP=x_protein, XR=x_rna, Y=y)

    if isPrint:
        print("Protein feature : {}".format(x_protein.shape))
        print("RNA feature     : {}".format(x_rna.shape))
        print("Labels          : {}".format(y.shape))
        # NOTE(review): nothing is saved while the np.savez call above is disabled.
        print("Saved path      : {}".format(npz_path))

    # Rows are pairs, columns are features.
    return x_protein, x_rna, y

def _encode_pairs_and_save(struct_data, npz_path):
    """Encode positive and negative pairs and store the processed arrays.

    Args:
        struct_data: The 6-tuple returned by ``read_RPI_pairStruct``:
            ``(pos_pairs, neg_pairs, pro_seqs, rna_seqs, pro_structs, rna_structs)``.
        npz_path: Destination passed to ``np.savez``; the archive holds the
            arrays under the keys ``XP``, ``XR`` and ``Y``.
    """
    pos_pairs, neg_pairs, pro_seqs, rna_seqs, pro_structs, rna_structs = struct_data

    # NOTE(review): both encoders are built without arguments -- confirm this
    # matches the ProEncoder / RNAEncoder definitions that are in scope here.
    PE = ProEncoder()
    RE = RNAEncoder()

    print("Coding positive protein-rna pairs.\n")
    samples = coding_pairs(pos_pairs, pro_seqs, rna_seqs, pro_structs, rna_structs, PE, RE, kind=1)
    print("Coding negative protein-rna pairs.\n")
    samples += coding_pairs(neg_pairs, pro_seqs, rna_seqs, pro_structs, rna_structs, PE, RE, kind=0)

    XP_struct, XR_struct, Y_struct = pre_process_data(samples=samples)
    print('XP_struct : {0} - len : {1}'.format(XP_struct, len(XP_struct)))
    print('XR_struct : {0} - len : {1}'.format(XR_struct, len(XR_struct)))
    print('Y_struct : {0} - len : {1}'.format(Y_struct, len(Y_struct)))
    np.savez(npz_path, XP=XP_struct, XR=XR_struct, Y=Y_struct)


def preprocess_and_savez_NPInter():
    """Encode the NPInter pairs and save them to ``NPZ_PATH_STRUCT["NPInter"]``."""
    # NOTE(review): the returned sequence pairs are not used by the structure
    # pipeline; the call is kept in case the reader has side effects.
    read_NPInter_pairSeq()

    # Called without a size, as in the original code.
    struct_data = read_RPI_pairStruct()
    # The output path is resolved before the encoding starts so that a
    # missing key fails immediately rather than after the work is done.
    _encode_pairs_and_save(struct_data, NPZ_PATH_STRUCT["NPInter"])


def preprocess_and_savez_RPI(size):
    """Encode the RPI dataset of the given ``size`` and save it.

    Args:
        size: Dataset identifier forwarded to ``read_RPI_pairSeq`` and
            ``read_RPI_pairStruct`` and used as the key into
            ``NPZ_PATH_STRUCT["RPI"]``.
    """
    # NOTE(review): the returned sequence pairs are not used by the structure
    # pipeline; the call is kept in case the reader has side effects.
    read_RPI_pairSeq(size)

    struct_data = read_RPI_pairStruct(size)
    # The output path is resolved before the encoding starts so that an
    # unknown size fails immediately rather than after the work is done.
    _encode_pairs_and_save(struct_data, NPZ_PATH_STRUCT["RPI"][size])
    
if __name__ == "__main__":
    # Script entry point: rebuild the structure feature archives, NPInter
    # first and then each enabled RPI dataset in turn.
    print("Feature Preprocessing")
    preprocess_and_savez_NPInter()
    # RPI369 is currently disabled; add 369 to the tuple to re-enable it.
    for rpi_size in (1807, 2241, 488):
        preprocess_and_savez_RPI(rpi_size)
    
    
    
    

Feature Preprocessing
path : data/sequence/NPInter_protein_seq.fa
path : data/sequence/NPInter_rna_seq.fa
path : data/structure/NPInter_protein_struct.fa
path : data/structure/NPInter_rna_struct.fa
Coding positive protein-rna pairs.

Coding negative protein-rna pairs.

XP_struct : [[-0.7924834  -0.38717115  0.40931546 ... -0.96959493 -0.61340464
   0.46901026]
 [-0.13963405  0.82894207  0.40931546 ...  0.45379039  0.86687671
   0.46901026]
 [-2.33624152 -1.6987582   0.40931546 ... -2.25343191 -1.49029779
   0.46901026]
 ...
 [-0.81644895 -1.47079798  0.40931546 ... -1.61679573 -1.09983021
   0.46901026]
 [-0.8955237   0.11428451  0.40931546 ... -0.67935813  0.93596182
   0.46901026]
 [ 1.27430704 -0.44612334 -0.51045904 ...  0.62790677 -0.2603459
  -0.98197126]] - len : 20824
XR_struct : [[-2.64598567  0.61865614 -2.1609111  ...  0.18658295 -0.48315251
   0.198021  ]
 [-0.38802938 -1.65842822  0.90355245 ...  2.06274384  0.3393341
   0.198021  ]
 [-0.14980064  0.61865614 -0.73668564 ..