## ENCODE SEQUENCE

In [1]:
import pandas as pd
from Bio.SeqIO import parse
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq

from sklearn.preprocessing import LabelEncoder

import numpy as np
import re

In [2]:
# CSV holding one row per accession, including the raw nucleotide sequence
# and its location label.
path = "final_updated_classes.csv"

data_df = pd.read_csv(filepath_or_buffer=path)

# Peek at a single row to confirm the columns loaded as expected.
data_df.head(n=1)

Unnamed: 0,Accession,Release_Date,Species,Length,Geo_Location,Host,Isolation_Source,Collection_Date,Sequence
0,MT683386,2020-07-01T00:00:00Z,Severe acute respiratory syndrome-related coro...,29858,USA,Homo sapiens,,2020-04-05,GGTTTATACCTTCCCAGGTAACAAACCAACCAACTTTCGATCTCTT...


function to convert a **DNA sequence** string to a **numpy array**

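Converts the string to lower case and replaces every character that is not one of **'acgt'** with the placeholder **'z'** ('z' is used instead of 'n' so that it sorts after 't' and therefore receives the last label-encoder index).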

In [3]:
def string_to_array(my_string):
    """Turn a DNA sequence string into a numpy array of single characters.

    The string is lower-cased first, then every character outside the
    'acgt' alphabet is replaced by the placeholder 'z'.

    Parameters
    ----------
    my_string : str
        Raw nucleotide sequence.

    Returns
    -------
    numpy.ndarray
        One array element per character of the cleaned sequence.
    """
    cleaned = re.sub('[^acgt]', 'z', my_string.lower())
    return np.array([*cleaned])

Create a label encoder with the **'acgtz'** alphabet ('z' stands for any non-'acgt' character).

In [4]:
# Fit the encoder on the five symbols produced by string_to_array:
# the four nucleotides plus the 'z' placeholder for everything else.
label_encoder = LabelEncoder()
label_encoder.fit(np.array(list('acgtz')))

LabelEncoder()

function to encode a DNA sequence string as an **ordinal vector**

Returns a **numpy vector** with **a=0.25, c=0.50, g=0.75, t=1.00, z=0.00** (where 'z' is any character other than a/c/g/t).

In [5]:
def ordinal_encoder(my_array):
    """Encode an array of sequence characters as an ordinal float vector.

    The characters are first mapped to integer codes by the module-level
    ``label_encoder`` and each code is then replaced by its float value:
    code 0 -> 0.25, 1 -> 0.50, 2 -> 0.75, 3 -> 1.00, 4 -> 0.00.

    Parameters
    ----------
    my_array : array-like
        Characters accepted by ``label_encoder``.

    Returns
    -------
    numpy.ndarray
        Float array with one value per input character.
    """
    codes = label_encoder.transform(my_array)

    # Position i holds the float value assigned to integer code i.
    value_for_code = np.array([
        0.25,  # A
        0.50,  # C
        0.75,  # G
        1.00,  # T
        0.00,  # anything else, z
    ])

    return value_for_code[codes]

Creating the **final_data** dataframe with the **'enc_seq'** (encoded sequence) and **'Geo_Location'** columns

In [6]:
# Keep only the columns needed downstream and drop rows without a sequence.
data = data_df[["Sequence", "Geo_Location"]]
data = data[data["Sequence"].notna()]

# Encode every remaining sequence as a float vector (one value per base).
seq_list = [ordinal_encoder(string_to_array(seq)) for seq in data["Sequence"]]

# Give the encoded sequences the SAME index as `data`. After the notna()
# filter `data` may have gaps in its index; a frame with a fresh 0..N-1
# index would be aligned by label on assignment and attach vectors to the
# wrong rows (or leave NaN), so the index is carried over explicitly.
seq_df = pd.DataFrame(
    {"inp_seq": pd.Series(seq_list, index=data.index, dtype=object)}
)

final_data = data.assign(enc_seq=seq_df["inp_seq"])

final_data = final_data[["enc_seq", "Geo_Location"]]

final_data

Unnamed: 0,enc_seq,Geo_Location
0,"[0.75, 0.75, 1.0, 1.0, 1.0, 0.25, 1.0, 0.25, 0...",USA
1,"[0.25, 1.0, 0.25, 0.5, 0.5, 1.0, 1.0, 0.5, 0.5...",USA
2,"[0.25, 0.25, 0.75, 0.75, 1.0, 1.0, 1.0, 0.25, ...",USA
3,"[0.75, 0.75, 1.0, 0.25, 0.25, 0.5, 0.25, 0.25,...",USA
4,"[0.75, 0.5, 1.0, 1.0, 0.25, 0.5, 0.75, 0.75, 1...",USA
...,...,...
5592,"[0.25, 1.0, 1.0, 0.25, 0.25, 0.25, 0.75, 0.75,...",California
5593,"[0.25, 1.0, 1.0, 0.25, 0.25, 0.25, 0.75, 0.75,...",California
5594,"[0.25, 1.0, 1.0, 0.25, 0.25, 0.25, 0.75, 0.75,...",Others
5595,"[0.25, 1.0, 1.0, 0.25, 0.25, 0.25, 0.75, 0.75,...",Others


Function to return max length of sequence in final_data:

In [7]:
def get_maxLen(enc_seq):
    """Return the length of the longest sequence in ``enc_seq``.

    Parameters
    ----------
    enc_seq : iterable
        Iterable of sized objects (e.g. a Series of numpy vectors).

    Returns
    -------
    int
        The largest ``len(row)`` found, or 0 when ``enc_seq`` is empty.
    """
    # Use the builtin max() rather than shadowing it with a local variable;
    # default=0 keeps the "empty input -> 0" behaviour of the manual loop.
    return max((len(row) for row in enc_seq), default=0)

Function to append trailing zeros to each seq vector

In [8]:
def append_arr(enc_seq, max_len):
    """Right-pad a sequence vector with zeros up to ``max_len`` entries.

    Parameters
    ----------
    enc_seq : array-like
        1-D encoded sequence.
    max_len : int
        Target length of the returned vector.

    Returns
    -------
    numpy.ndarray
        A new array holding ``enc_seq`` followed by trailing zeros. When
        ``enc_seq`` already has ``max_len`` or more entries it is returned
        (as a copy) without truncation.
    """
    seq = np.asarray(enc_seq)
    missing = max_len - len(seq)

    if missing <= 0:
        # Nothing to pad; still hand back a fresh array, never the input.
        return np.array(seq)

    # One vectorised concatenate instead of appending element by element.
    return np.concatenate([seq, np.zeros(missing, dtype=seq.dtype)])

Padding the encoded seq with trailing zeros so all vectors are of same dimension

In [9]:
# Pad to the longest sequence, rounded up to an even length.
max_len = get_maxLen(final_data["enc_seq"])
if max_len % 2 != 0:
    max_len += 1
print("max_len is", max_len)

# Pad every encoded sequence with trailing zeros to exactly max_len entries.
padded_seq_list = [append_arr(seq, max_len) for seq in final_data["enc_seq"]]

# Reuse final_data's index so the padded vectors stay attached to their own
# rows; a frame with a fresh 0..N-1 index would be aligned by label on
# assignment and could mismatch rows whenever final_data's index has gaps.
padded_seq_df = pd.DataFrame(
    {"padded_enc_seq": pd.Series(padded_seq_list, index=final_data.index, dtype=object)}
)

padded_final_data = final_data.assign(padded_enc_seq=padded_seq_df["padded_enc_seq"])

padded_final_data = padded_final_data[["padded_enc_seq", "Geo_Location"]]

padded_final_data.head(5)

max_len is 29922


Unnamed: 0,padded_enc_seq,Geo_Location
0,"[0.75, 0.75, 1.0, 1.0, 1.0, 0.25, 1.0, 0.25, 0...",USA
1,"[0.25, 1.0, 0.25, 0.5, 0.5, 1.0, 1.0, 0.5, 0.5...",USA
2,"[0.25, 0.25, 0.75, 0.75, 1.0, 1.0, 1.0, 0.25, ...",USA
3,"[0.75, 0.75, 1.0, 0.25, 0.25, 0.5, 0.25, 0.25,...",USA
4,"[0.75, 0.5, 1.0, 1.0, 0.25, 0.5, 0.75, 0.75, 1...",USA


In [10]:
# One-hot encode the location labels.
# The labels are read from final_data, which this cell never modifies, and
# the sequence column is selected explicitly, so the cell gives the same
# result when re-run (the previous version dropped "Geo_Location" from the
# very frame it read it from and failed on a second execution).
one_hot_encodings = pd.get_dummies(final_data["Geo_Location"])
padded_final_data = padded_final_data[["padded_enc_seq"]].join(one_hot_encodings)
padded_final_data

Unnamed: 0,padded_enc_seq,California,Connecticut,Florida,King WA,Masachusetts,Michigan,New York,Others,Rest of Washington,Snohomish_Pierce WA,USA,Virginia,Yakima WA
0,"[0.75, 0.75, 1.0, 1.0, 1.0, 0.25, 1.0, 0.25, 0...",0,0,0,0,0,0,0,0,0,0,1,0,0
1,"[0.25, 1.0, 0.25, 0.5, 0.5, 1.0, 1.0, 0.5, 0.5...",0,0,0,0,0,0,0,0,0,0,1,0,0
2,"[0.25, 0.25, 0.75, 0.75, 1.0, 1.0, 1.0, 0.25, ...",0,0,0,0,0,0,0,0,0,0,1,0,0
3,"[0.75, 0.75, 1.0, 0.25, 0.25, 0.5, 0.25, 0.25,...",0,0,0,0,0,0,0,0,0,0,1,0,0
4,"[0.75, 0.5, 1.0, 1.0, 0.25, 0.5, 0.75, 0.75, 1...",0,0,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5592,"[0.25, 1.0, 1.0, 0.25, 0.25, 0.25, 0.75, 0.75,...",1,0,0,0,0,0,0,0,0,0,0,0,0
5593,"[0.25, 1.0, 1.0, 0.25, 0.25, 0.25, 0.75, 0.75,...",1,0,0,0,0,0,0,0,0,0,0,0,0
5594,"[0.25, 1.0, 1.0, 0.25, 0.25, 0.25, 0.75, 0.75,...",0,0,0,0,0,0,0,1,0,0,0,0,0
5595,"[0.25, 1.0, 1.0, 0.25, 0.25, 0.25, 0.75, 0.75,...",0,0,0,0,0,0,0,1,0,0,0,0,0


In [11]:
# X: one padded sequence vector per row (stored as an object column);
# Y: the one-hot encoded location matrix.
X = np.array(padded_final_data.loc[:, "padded_enc_seq"])
Y = np.array(one_hot_encodings)
for arr in (X, Y):
    print(arr.shape)

(5597,)
(5597, 13)


Function to reshape 1D seq vectors to 2D vectors according to given parameters

In [12]:
def reshape_seq(seq, m ,n):
    """Reshape a flat sequence into a 2-D array with ``m`` rows and ``n`` columns.

    Parameters
    ----------
    seq : array-like
        1-D sequence holding exactly ``m * n`` values.
    m : int
        Number of rows in the result.
    n : int
        Number of columns in the result.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(m, n)``.
    """
    return np.asarray(seq).reshape((m, n))

# Fold every padded 1-D vector into a 2-D array with a fixed number of rows.
# The column count is derived from the padded length instead of being
# hard-coded (6 x 4987 only fits a padded length of 29922), so the cell keeps
# working when the longest sequence in the dataset changes.
n_rows = 6

re_seqList = []

for padded_seq in padded_final_data["padded_enc_seq"]:
    n_cols, leftover = divmod(len(padded_seq), n_rows)
    if leftover:
        raise ValueError(
            f"padded sequence length {len(padded_seq)} is not divisible by {n_rows}"
        )
    re_seqList.append(reshape_seq(padded_seq, n_rows, n_cols))

Convert to desired shapes

In [13]:
# Stack the per-sample 2-D arrays into one array, then add a trailing
# single-channel axis: (samples, rows, cols) -> (samples, rows, cols, 1).
X = np.asarray(re_seqList)
n_samples, n_rows_x, n_cols_x = X.shape[0], X.shape[1], X.shape[2]
X = X.reshape((n_samples, n_rows_x, n_cols_x, 1))
X.shape

(5597, 6, 4987, 1)

In [14]:
# Raw location labels, one per sample.
# padded_final_data no longer has a "Geo_Location" column (it was replaced by
# the one-hot columns earlier), so reading it there raises KeyError on a fresh
# Restart & Run All. final_data still carries the labels in the same row order.
Y = final_data['Geo_Location'].values
Y.shape

(5597,)

In [13]:
# Write the feature array X and the label array Y to disk.
# np.save appends ".npy" when the name has no such extension, so these calls
# create X_data.npy and Y_data.npy in the current working directory.
# NOTE(review): Y looks like an object-dtype array of label strings; if so,
# np.save pickles it and np.load will need allow_pickle=True — confirm.
np.save('X_data', X)
np.save('Y_data', Y)