# Required libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import itertools

# Input data manipulation

In [2]:
# Load the raw dataset: one protein per line, bits separated by spaces
with open('DATA_b/x_RBM_q0.1.dat', 'r') as handle:
    file_proteins=handle.read().splitlines()

# Drop the separating spaces so each protein becomes one string of 0s and 1s
protein_list=[row.replace(' ','') for row in file_proteins]

# Preview the first 10 converted proteins
protein_list[:10]

['00101000001001000010',
 '00101000000110000001',
 '00100100000101000001',
 '01000001010000101000',
 '00100010100000100100',
 '00010100001010000010',
 '00100100001001000100',
 '00101000000110000001',
 '10000010010001001000',
 '00011000001001000010']

# Definition of functions

In [3]:
def convert_to_decimal_protein(protein_list):
    '''
    Parse every string of 1s and 0s in ``protein_list`` as a base-2
    number and return the resulting integers as a list, keeping the
    input order.
    '''
    return [int(bits,2) for bits in protein_list]

In [4]:
def return_protein_probabilities(decimal_proteins_list, n_amminoacids=5):
    '''
    Return the empirical pdf over every possible protein.

    A protein is made of ``n_amminoacids`` amino acids, each one-hot
    encoded on 4 bits ('1000', '0100', '0010' or '0001'). The
    concatenated bit string is read as a base-2 integer, so there are
    4**n_amminoacids possible proteins (1024 for the default of 5).

    Parameters
    ----------
    decimal_proteins_list : sequence of int
        Observed proteins, already converted to their integer encoding.
    n_amminoacids : int, optional
        Number of amino acids in each protein (default 5).

    Returns
    -------
    numpy.ndarray
        Vector of length 4**n_amminoacids, independent of the number of
        observed proteins. Entry i is the fraction of observations equal
        to the i-th possible protein, with the possible proteins ordered
        from the largest to the smallest integer encoding.

    Raises
    ------
    ValueError
        If ``decimal_proteins_list`` is empty (the pdf cannot be
        normalized).
    KeyError
        If an observed value is not a valid one-hot protein encoding for
        the given ``n_amminoacids``.
    '''
    #total number of proteins, used to normalize the pdf
    n_proteins=len(decimal_proteins_list)
    if n_proteins==0:
        raise ValueError('decimal_proteins_list is empty: '
                         'cannot normalize the probabilities')

    #list all possible combinations of one-hot encoded amino acids
    possibilities=itertools.product(['1000','0100','0010','0001'],
                                    repeat=n_amminoacids)

    #decimal representation of every possible protein,
    #sorted from bigger to smaller values
    decimal=sorted((int(''.join(p),2) for p in possibilities),
                   reverse=True)

    #dict with proteins as keys; insertion order fixes the output order
    protein_cnt=dict.fromkeys(decimal,0)

    #count the number of times each protein was found
    for protein in decimal_proteins_list:
        if protein not in protein_cnt:
            raise KeyError('{!r} is not a valid encoding of a protein '
                           'with {} amino acids'.format(protein,
                                                        n_amminoacids))
        protein_cnt[protein]+=1

    #ordered vector of normalized probabilities for each possible protein
    norm_prob=np.array(list(protein_cnt.values()))/n_proteins

    return norm_prob

In [5]:
# Encode every protein as an integer, then estimate the empirical
# distribution over all possible proteins
proteins_dataset=convert_to_decimal_protein(protein_list=protein_list)
p_dataset=return_protein_probabilities(decimal_proteins_list=proteins_dataset)

# Sanity checks: dataset size, pdf length and pdf normalization
print('Len protein dataset: ', len(proteins_dataset))
print('Len p_dataset: ', len(p_dataset))
print('Norm p_dataset: ', p_dataset.sum())

# Small tests

In [6]:
# Independent cross-check: re-read the raw file and count how often
# each distinct protein (as an integer) occurs
with open('DATA_b/x_RBM_q0.1.dat', 'r') as file:
    lines=file.readlines()

# strip the newline and the separating spaces, then parse the bits in base 2
proteins=np.array([int(row.strip().replace(' ',''),2) for row in lines])

# distinct encodings together with their multiplicities
counted,counts=np.unique(proteins, return_counts=True)

# show only the counts of proteins that appear more than 50 times
print(counts[counts>50])

[149 104 114 141 124 125 135 114 103 116 129 113 139 133 123 132 133 107
 132 138 112 139 103 138 104 111 130 118 115 123 112 105 100 120 135 128
 119 128 120 120 126 120 135 133 123 108 122 124 111 127 111 108 109 119
 121 122 132 139 120 121 106 112  96 111]
