In [5]:
#packages for reading data from the database and storing them in a pandas.DataFrame

import pandas as pd
import sqlite3

In [6]:
# NOTE(review): machine-specific absolute path to the SQLite database that is queried below;
# it has to be adjusted before the notebook can run on another machine
path_to_big_db = '/home/valentin-rexer/uni/UofM/datascience/datasets/swissprot.dat.db'

In [7]:
# Each EC number level gets its own column in the dataframe, followed by the sequence.
# That way the training data for any EC level can be extracted from the same frame.

out_columns = [f'EC{level}' for level in range(1, 5)] + ['Sequence']

# empty frame that only carries the column layout
out_db_df = pd.DataFrame(columns=out_columns)

In [8]:
# retrieve the EC numbers and sequences from the database

# Keep a handle on the connection so it can be closed again once the query
# result has been read into the dataframe (an inline connect() is never closed).
db_connection = sqlite3.connect(path_to_big_db)
try:
    big_db_df = pd.read_sql_query('SELECT EC_Number, Sequence FROM sw_table', db_connection)
finally:
    db_connection.close()

In [9]:
# iterate over all records and split each EC number into one column per level
import re

out_rows = []

# every column except the trailing 'Sequence' column holds one EC level
expected_levels = len(out_columns) - 1

# index=False / name=None yield plain (EC_Number, Sequence) tuples in query order
for ec_field, seq_field in big_db_df.itertuples(index=False, name=None):
    # strip the field prefix; a '-' placeholder level is stored as 0
    ec_levels = ec_field.replace('EC_Number=', '').replace('-', '0').split('.')

    # zip() below would silently shift the sequence into an EC column (too few
    # levels) or drop it (too many levels), so fail loudly on malformed input
    if len(ec_levels) != expected_levels:
        raise ValueError(f'expected {expected_levels} EC levels, got {ec_field!r}')

    # strip the field prefix and replace the residue codes U, Z, O and B with X
    current_seq = re.sub(r"[UZOB]", "X", seq_field.replace('Sequence=', ''))

    # int() raises ValueError for any level that is not a plain number
    out_rows.append(dict(zip(out_columns, [*map(int, ec_levels), current_seq])))


In [10]:
# build the dataframe in one step from the collected row dicts
# (the dict keys EC1..EC4 and Sequence become the columns)
out_df = pd.DataFrame(out_rows)
out_df

Unnamed: 0,EC1,EC2,EC3,EC4,Sequence
0,3,6,4,0,MDTSPYDFLKLYPWLSRGEADKGTLLDAFPGETFEQSLASDVAMRR...
1,2,7,11,1,MATNYCDEFERNPTRNPRTGRTIKRGGPVFRALERECSDGAARVFP...
2,2,7,11,0,MPLSVFAEEFAEKSVKRYIGQGLWLPCNLSDYYYYQEFHDEGGYGS...
3,3,6,4,0,MAKLLRLNAIDGDMPGAGEADLFTLAPGGKAYVPFAWGSRVLGCKP...
4,3,6,1,0,MDNETSTPDIFQWCVSPFSKITLKRSMEQRDIVEFRIDATILRQIF...
...,...,...,...,...,...
276668,3,4,19,12,MLSCDICGETVTSEPDMKAHLIVHMENEIVCPFCKLSGVSYDEMCF...
276669,3,4,19,12,MLSCNICGETVNSEPDMKAHLIVHMENEIICPFCKLSGINYNEICF...
276670,3,4,19,12,MLSCDICGETVTSEPDRKAHLIVHMENEIICPFCKLSGINYNEMCF...
276671,2,7,11,1,MSGGKSGTKLSSFQNLQQIGQGGFGVVYSAQRENGEKVAIKKIGNA...


In [11]:
# training frame for the level 1 EC number prediction:
# keep only the first EC level (the label) and the sequence

level_1_df = out_df.loc[:, ['EC1', 'Sequence']]
level_1_df

Unnamed: 0,EC1,Sequence
0,3,MDTSPYDFLKLYPWLSRGEADKGTLLDAFPGETFEQSLASDVAMRR...
1,2,MATNYCDEFERNPTRNPRTGRTIKRGGPVFRALERECSDGAARVFP...
2,2,MPLSVFAEEFAEKSVKRYIGQGLWLPCNLSDYYYYQEFHDEGGYGS...
3,3,MAKLLRLNAIDGDMPGAGEADLFTLAPGGKAYVPFAWGSRVLGCKP...
4,3,MDNETSTPDIFQWCVSPFSKITLKRSMEQRDIVEFRIDATILRQIF...
...,...,...
276668,3,MLSCDICGETVTSEPDMKAHLIVHMENEIVCPFCKLSGVSYDEMCF...
276669,3,MLSCNICGETVNSEPDMKAHLIVHMENEIICPFCKLSGINYNEICF...
276670,3,MLSCDICGETVTSEPDRKAHLIVHMENEIICPFCKLSGINYNEMCF...
276671,2,MSGGKSGTKLSSFQNLQQIGQGGFGVVYSAQRENGEKVAIKKIGNA...


In [12]:
from gensim.models import FastText

# the sequence column is handed to FastText as the training corpus
sequences = level_1_df['Sequence']

# NOTE(review): every corpus item is a plain string rather than a list of tokens, so
# gensim presumably iterates it character by character, i.e. one token per residue
# letter (consistent with the single-letter lookups in the next cell) — confirm.
# vector_size=6 is the length of the vectors printed in the next cell.
aa_model = FastText(sequences, vector_size=6, window=7, min_count=10)
    

In [13]:
# sanity check: show the learned vectors for two residue letters
for amino_acid in ('M', 'G'):
    print(aa_model.wv[amino_acid])

[ 0.22589695 -0.15022601 -0.70326626  0.3192943  -1.6072931  -0.4099239 ]
[-0.95430756 -1.1245557  -0.22416764  1.6976514  -1.9886826  -0.01604706]


In [15]:
# persist the trained model with the model's own save(); a later cell reads it back with FastText.load
# NOTE(review): machine-specific absolute path
aa_model.save('/home/valentin-rexer/uni/UofM/datascience/fast_text.bin')

In [17]:
from gensim.models import FastText

# reload the model from the path it was saved to earlier in this notebook
# (presumably so the following cells can run without retraining — confirm)
aa_model = FastText.load('/home/valentin-rexer/uni/UofM/datascience/fast_text.bin')

def get_embedded_vector(sequence):
    """Look up the embedding of every element of ``sequence``.

    Each element (one character when ``sequence`` is a string) is used as a
    key into ``aa_model.wv``; the looked-up vectors are returned as a list
    in the same order as the input. An empty input gives an empty list.
    """
    embedded = []

    for amino_acid in sequence:
        embedded.append(aa_model.wv[amino_acid])

    return embedded

In [18]:
# longest sequence (number of residues) that is kept; longer ones are skipped
max_sequence_length = 1000

labels = []
sequences = []

# index=False / name=None yield plain (EC1, Sequence) tuples
for ec_level_1, sequence in level_1_df.itertuples(index=False, name=None):

    # only process sequences of at most max_sequence_length residues for now
    if len(sequence) > max_sequence_length:
        continue

    # labels and embedded sequences are appended together so they stay aligned by position
    labels.append(ec_level_1)
    sequences.append(get_embedded_vector(sequence))

In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# padding='post' puts the padding after each sequence's own vectors; the result is float32.
# NOTE(review): no maxlen is passed, so the padded length is presumably that of the longest
# kept sequence rather than a fixed value — confirm against the Keras documentation.
# The name `sequences` is rebound here from the list of embedded sequences to the padded result.
sequences = pad_sequences(sequences, padding='post', dtype='float32')

In [21]:
import numpy as np

# pad_sequences already returns a NumPy array, so np.asarray hands it back without
# copying. np.array would duplicate the whole padded tensor in memory (about 6.4 GB
# for a float32 array of shape (267286, 1000, 6)).
X = np.asarray(sequences)

# labels is a plain Python list and is converted to an array here
y = np.asarray(labels)

print(X.shape, y.shape)

(267286, 1000, 6) (267286,)


In [22]:
# target .npz file for the feature array and its labels (machine-specific absolute path)
# NOTE(review): the file name says "len_500" although the filtering cell keeps sequences of up
# to 1000 residues; the name is left unchanged because other code may load this exact path
np_array_store_file = '/home/valentin-rexer/uni/UofM/datascience/datasets/len_500_data.npz'

In [23]:
# store both arrays in a single .npz archive under the keys 'X' and 'y'
# (np.savez writes the arrays uncompressed)
np.savez(np_array_store_file, X=X, y=y)