In [1]:
import pandas as pd
import torch
import numpy 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from matplotlib.pylab import plt
from numpy import arange

In [2]:
df = pd.read_csv("../protein_embeddings/data/rbp.csv")

df = df[['Nucleotide Sequence', 'Host']]

df

Unnamed: 0,Nucleotide Sequence,Host
0,GTGCATCAAAATATTTCAAAGGAGAATCGTGGAAACTATAACAACG...,microcystis
1,GTGAGGATTTTTTATATCCACCATCCATTCCTCGCTACTCACCGAT...,microcystis
2,TTGCTGACAGATGTCGATATTCAGGCATTAATTGATGCCTCAATTT...,microcystis
3,TTGTTCGGAGTTTTTATCGTGAGGCGTGAAGGCGGCTATATCGGAA...,microcystis
4,TTGGTTAATTATCGTTATAGATTATCACGACTACTAATCCCGGGGG...,microcystis
...,...,...
24747,ATGACAACTCGCGCCAGTGCGTCGAGACTCTCGGTCACCCAGGTCC...,mycolicibacterium
24748,ATGACCGAGCCGATCAATGGGCCCGAGGTCTACTCGCAGAAGCTCG...,mycolicibacterium
24749,ATGGCTAACGTAATTAAAACCGTTTTGACTTACCAGTTAGATGGCT...,cedecea
24750,ATGTACAAGATTATCAACAGTTCAGCAGCGGCTAACCAAGGAGACT...,plectonema


In [3]:
df["Nucleotide Sequence"]

0        GTGCATCAAAATATTTCAAAGGAGAATCGTGGAAACTATAACAACG...
1        GTGAGGATTTTTTATATCCACCATCCATTCCTCGCTACTCACCGAT...
2        TTGCTGACAGATGTCGATATTCAGGCATTAATTGATGCCTCAATTT...
3        TTGTTCGGAGTTTTTATCGTGAGGCGTGAAGGCGGCTATATCGGAA...
4        TTGGTTAATTATCGTTATAGATTATCACGACTACTAATCCCGGGGG...
                               ...                        
24747    ATGACAACTCGCGCCAGTGCGTCGAGACTCTCGGTCACCCAGGTCC...
24748    ATGACCGAGCCGATCAATGGGCCCGAGGTCTACTCGCAGAAGCTCG...
24749    ATGGCTAACGTAATTAAAACCGTTTTGACTTACCAGTTAGATGGCT...
24750    ATGTACAAGATTATCAACAGTTCAGCAGCGGCTAACCAAGGAGACT...
24751    ATGTACAAGATTATCAACAGTTCAGCAGCAGCTAACCAAGAAGACT...
Name: Nucleotide Sequence, Length: 24752, dtype: object

In [4]:
value_counts = df["Host"].value_counts()

filtered_rows = []
others = []

for index, row in df.iterrows():
    if value_counts[row["Host"]]>=32:
        filtered_rows.append(row)
    else:
        others.append(row)

df = pd.DataFrame(filtered_rows)
others = pd.DataFrame(others)

In [5]:
label_encoder = LabelEncoder()
df['Host'] = label_encoder.fit_transform(df['Host'])
others["Host"] = -1
df["Host"].value_counts()

Host
19    4316
46    2106
54    1737
40    1709
55    1542
24    1323
18     953
31     826
50     812
5      679
42     585
49     475
16     472
9      414
27     333
1      328
47     323
2      318
57     311
52     271
43     248
33     238
56     194
21     176
36     175
14     173
17     163
20     161
53     144
41     141
51     138
37     132
8      129
13     120
44     113
11     112
30     109
34     109
6      103
29      95
4       87
10      85
12      79
26      77
35      61
0       58
39      56
32      54
38      54
23      52
45      45
22      42
3       41
48      37
28      34
25      34
15      32
7       32
Name: count, dtype: int64

In [6]:
tr_df, te_df = train_test_split(df, test_size=0.3, stratify=df["Host"], random_state=42)

In [7]:
max_len = 0
min_len = 1e9
for i in df["Nucleotide Sequence"]:
    max_len = max(max_len, len(i))
    min_len = min(min_len, len(i))
max_len, min_len

(4761, 96)

In [2]:
import numpy as np
def onehote(sequence):
    mapping = {"A": 0, "C": 1, "G": 2, "T": 3}
    seq2 = []

    for i in sequence:
        if i in mapping.keys():
            seq2.append(mapping[i])

    return np.eye(4)[seq2]

dna='ATTTACGGATTGCTGA'
#calling onehote function
oneHotEncodedDna= onehote(dna)
print(oneHotEncodedDna)

[[1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 1. 0.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 1. 0.]
 [1. 0. 0. 0.]]


In [9]:
df["Host"]

27       19
28       19
29       19
30       19
31       19
         ..
24676    25
24677    25
24678    25
24679    25
24680    25
Name: Host, Length: 23766, dtype: int64

In [11]:
onehot_tr = []

for i,j in zip(tr_df["Nucleotide Sequence"], tr_df["Host"]):
    l = list(onehote(i))
    while len(l)<max_len:
        l.append([0,0,0,0])
    onehot_tr.append([np.concatenate(l), j])

onehot_tr

[[array([1., 0., 0., ..., 0., 0., 0.]), 31],
 [array([1., 0., 0., ..., 0., 0., 0.]), 40],
 [array([1., 0., 0., ..., 0., 0., 0.]), 46],
 [array([1., 0., 0., ..., 0., 0., 0.]), 52],
 [array([1., 0., 0., ..., 0., 0., 0.]), 40],
 [array([0., 0., 1., ..., 0., 0., 0.]), 30],
 [array([1., 0., 0., ..., 0., 0., 0.]), 2],
 [array([1., 0., 0., ..., 0., 0., 0.]), 19],
 [array([0., 0., 0., ..., 0., 0., 0.]), 19],
 [array([1., 0., 0., ..., 0., 0., 0.]), 51],
 [array([1., 0., 0., ..., 0., 0., 0.]), 5],
 [array([1., 0., 0., ..., 0., 0., 0.]), 52],
 [array([1., 0., 0., ..., 0., 0., 0.]), 50],
 [array([1., 0., 0., ..., 0., 0., 0.]), 9],
 [array([1., 0., 0., ..., 0., 0., 0.]), 19],
 [array([1., 0., 0., ..., 0., 0., 0.]), 42],
 [array([1., 0., 0., ..., 0., 0., 0.]), 40],
 [array([1., 0., 0., ..., 0., 0., 0.]), 19],
 [array([0., 0., 1., ..., 0., 0., 0.]), 16],
 [array([1., 0., 0., ..., 0., 0., 0.]), 1],
 [array([1., 0., 0., ..., 0., 0., 0.]), 19],
 [array([1., 0., 0., ..., 0., 0., 0.]), 43],
 [array([1., 0

In [12]:
te_df = pd.concat([te_df, others], axis=0)

In [13]:
onehot_te = []

for i,j in zip(te_df["Nucleotide Sequence"],te_df["Host"]):
    l = list(onehote(i))
    while len(l)<max_len:
        l.append([0,0,0,0])
    onehot_te.append([np.concatenate(l), j])

onehot_te

[[array([1., 0., 0., ..., 0., 0., 0.]), 19],
 [array([0., 0., 1., ..., 0., 0., 0.]), 24],
 [array([1., 0., 0., ..., 0., 0., 0.]), 19],
 [array([1., 0., 0., ..., 0., 0., 0.]), 46],
 [array([1., 0., 0., ..., 0., 0., 0.]), 46],
 [array([1., 0., 0., ..., 0., 0., 0.]), 16],
 [array([1., 0., 0., ..., 0., 0., 0.]), 44],
 [array([1., 0., 0., ..., 0., 0., 0.]), 1],
 [array([1., 0., 0., ..., 0., 0., 0.]), 24],
 [array([1., 0., 0., ..., 0., 0., 0.]), 46],
 [array([1., 0., 0., ..., 0., 0., 0.]), 19],
 [array([1., 0., 0., ..., 0., 0., 0.]), 49],
 [array([1., 0., 0., ..., 0., 0., 0.]), 50],
 [array([1., 0., 0., ..., 0., 0., 0.]), 55],
 [array([1., 0., 0., ..., 0., 0., 0.]), 19],
 [array([0., 0., 1., ..., 0., 0., 0.]), 24],
 [array([1., 0., 0., ..., 0., 0., 0.]), 46],
 [array([1., 0., 0., ..., 0., 0., 0.]), 13],
 [array([1., 0., 0., ..., 0., 0., 0.]), 57],
 [array([1., 0., 0., ..., 0., 0., 0.]), 46],
 [array([1., 0., 0., ..., 0., 0., 0.]), 42],
 [array([1., 0., 0., ..., 0., 0., 0.]), 19],
 [array([1.

In [23]:
onehot_tr = np.asarray(onehot_tr, dtype="object")
np.save("./onehot_tr.npy", onehot_tr)

In [24]:
onehot_te = np.asarray(onehot_te, dtype="object")
np.save("./onehot_te.npy", onehot_te)

In [5]:
data = np.load("./onehot_te.npy", allow_pickle=True)

array([array([1., 0., 0., ..., 0., 0., 0.]), -1], dtype=object)