# Predição da Localização Subcelular de Proteínas
### Vítor Amorim Fróis

In [64]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import hamming_loss

# Rede Neural
Vamos criar uma classe `SimpleNN` para ambos casos. A classe irá possuir métodos para treinar uma rede sobre um conjunto de treinamento especificado

In [73]:
import os
os.environ["KERAS_BACKEND"] = "jax"
import keras

class SimpleNN:
    def __init__(self, input_size: int, output_size: int):
        self.model = keras.Sequential(
            [
                keras.layers.Input(shape=(input_size,)),
                keras.layers.Dense(128, activation="sigmoid"),
                keras.layers.Dense(output_size, activation="sigmoid"),
            ]
        )
    def compile(self):
        self.model.compile(
            loss=keras.losses.BinaryCrossentropy(),
            optimizer=keras.optimizers.RMSprop(),
        )

    def fit(self, X, y, batch_size: int = 10, epochs: int = 10):
        self.model.fit(
            X,
            y,
            batch_size=batch_size,
            epochs=epochs,
            validation_split=0.15,
            callbacks=[keras.callbacks.EarlyStopping(patience=2)]
        )

    def evaluate(self, X, y):
        return self.model.evaluate(X, y, verbose=0)

    def predict(self, X):
        return self.model.predict(X)

# Vírus
O conjunto de dados possui seis localizações subcelulares: Proteínas do Capsídeo Viral, Proteínas da Membrana Celular do Hospedeiro, Proteínas do Retículo Endoplasmático do Hospedeiro, Proteínas do Citoplasma do Hospedeiro, Proteínas do Núcleo do Hospedeiro e Proteínas Secretadas. 

### Leitura do Dataset

In [74]:
virus_df = pd.read_csv('data/Virus_Dataset_Term_Frequency.tsv', delimiter='\t')
targets = virus_df.columns[-6:]
features = [i for i in virus_df.columns if i.isdigit()]

### Split treino e teste

In [75]:
X = virus_df[features]
y = virus_df[targets]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7)

X_train = np.expand_dims(X_train, -1)
X_test = np.expand_dims(X_test, -1)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print(X_test.shape[0], "test samples")
print(X_train.shape[0], "train samples")

### Instancia e treina rede neural

In [77]:
model = SimpleNN(len(features), len(targets))
model.compile()
model.fit(X_train, y_train, epochs=50)

Epoch 1/50


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - loss: 0.6117 - val_loss: 0.4826
Epoch 2/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.4489 - val_loss: 0.4370
Epoch 3/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.4003 - val_loss: 0.4185
Epoch 4/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3589 - val_loss: 0.4052
Epoch 5/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.3560 - val_loss: 0.3909
Epoch 6/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 0.3401 - val_loss: 0.3695
Epoch 7/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.3128 - val_loss: 0.3583
Epoch 8/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3047 - val_loss: 0.3466
Epoch 9/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

### Testes da rede

In [78]:
model.evaluate(X_test, y_test)

0.10009607672691345

In [79]:
predictions = np.array(model.predict(X_test))

threshold = 0.5
predictions[predictions >= threshold] = 1
predictions[predictions < threshold] = 0

print(f'Hamming Loss: {hamming_loss(predictions, y_test)}')

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 67ms/step
Hamming Loss: 0.03225806451612903


# Plantas
O conjunto de dados possui seis localizações subcelulares: ? 

### Leitura do Dataset

In [91]:
plants_df = pd.read_csv('data/Plants_Dataset_Term_Frequency.tsv', delimiter='\t')
targets = plants_df.columns[-12:]
features = [i for i in plants_df.columns if i.isdigit()]

### Split treino e teste

In [92]:
X = plants_df[features]
y = plants_df[targets]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7)

X_train = np.expand_dims(X_train, -1)
X_test = np.expand_dims(X_test, -1)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print(X_test.shape[0], "test samples")
print(X_train.shape[0], "train samples")

X_train shape: (676, 1830, 1)
y_train shape: (676, 12)
290 test samples
676 train samples


### Instancia e treina rede neural

In [93]:
model = SimpleNN(len(features), len(targets))
model.compile()
model.fit(X_train, y_train, epochs=50)

Epoch 1/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 0.3485 - val_loss: 0.2403
Epoch 2/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.2229 - val_loss: 0.2112
Epoch 3/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.1919 - val_loss: 0.1850
Epoch 4/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1597 - val_loss: 0.1618
Epoch 5/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1487 - val_loss: 0.1432
Epoch 6/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1233 - val_loss: 0.1258
Epoch 7/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1122 - val_loss: 0.1128
Epoch 8/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0993 - val_loss: 0.1019
Epoch 9/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

### Testes da rede

In [94]:
model.evaluate(X_test, y_test)

0.054967768490314484

In [95]:
predictions = np.array(model.predict(X_test))

threshold = 0.5
predictions[predictions >= threshold] = 1
predictions[predictions < threshold] = 0

print(f'Hamming Loss: {hamming_loss(predictions, y_test)}')

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Hamming Loss: 0.014367816091954023
