# Predição da Localização Subcelular de Proteínas
### Vítor Amorim Fróis

In [157]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import hamming_loss
from sklearn.preprocessing import StandardScaler

# Rede Neural
Vamos criar uma classe `SimpleNN` para ambos casos. A classe irá possuir métodos para treinar uma rede sobre um conjunto de treinamento especificado

In [120]:
import os
os.environ["KERAS_BACKEND"] = "jax"
import keras

class SimpleNN:
    def __init__(self, input_size: int, output_size: int):
        self.model = keras.Sequential(
            [
                keras.layers.Input(shape=(input_size,)),
                keras.layers.Dense(128, activation="sigmoid"),
                keras.layers.Dense(output_size, activation="sigmoid"),
            ]
        )
    def compile(self):
        self.model.compile(
            loss=keras.losses.BinaryCrossentropy(),
            optimizer=keras.optimizers.RMSprop(),
        )

    def fit(self, X, y, batch_size: int = 10, epochs: int = 10):
        self.model.fit(
            X,
            y,
            batch_size=batch_size,
            epochs=epochs,
            validation_split=0.15,
            callbacks=[keras.callbacks.EarlyStopping(patience=2)]
        )

    def evaluate(self, X, y):
        return self.model.evaluate(X, y, verbose=0)

    def predict(self, X):
        return self.model.predict(X)

# Vírus
O conjunto de dados possui seis localizações subcelulares: Proteínas do Capsídeo Viral, Proteínas da Membrana Celular do Hospedeiro, Proteínas do Retículo Endoplasmático do Hospedeiro, Proteínas do Citoplasma do Hospedeiro, Proteínas do Núcleo do Hospedeiro e Proteínas Secretadas. 

### Leitura do Dataset

In [137]:
virus_df = pd.read_csv('data/Virus_Dataset_Term_Frequency.tsv', delimiter='\t')
targets = virus_df.columns[-6:]
features = [i for i in virus_df.columns if i.isdigit()]

### Split treino e teste

In [138]:
X = virus_df[features]
y = virus_df[targets]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print(X_test.shape[0], "test samples")
print(X_train.shape[0], "train samples")

X_train shape: (144, 362)
y_train shape: (144, 6)
62 test samples
144 train samples


### Normalização dos dados

In [140]:
scaler = StandardScaler() 
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

X_train = np.expand_dims(X_train, -1)
X_test = np.expand_dims(X_test, -1)

### Instancia e treina rede neural

In [141]:
model = SimpleNN(len(features), len(targets))
model.compile()
model.fit(X_train, y_train, epochs=50)

Epoch 1/50


[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - loss: 0.5943 - val_loss: 0.4882
Epoch 2/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.4147 - val_loss: 0.4359
Epoch 3/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3419 - val_loss: 0.4055
Epoch 4/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.3007 - val_loss: 0.3903
Epoch 5/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2900 - val_loss: 0.3757
Epoch 6/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2578 - val_loss: 0.3641
Epoch 7/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2438 - val_loss: 0.3522
Epoch 8/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.2078 - val_loss: 0.3450
Epoch 9/50
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

### Testes da rede

In [142]:
predictions = np.array(model.predict(X_test))

threshold = 0.5
predictions[predictions >= threshold] = 1
predictions[predictions < threshold] = 0

print(f'Hamming Loss: {hamming_loss(predictions, y_test)}')

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
Hamming Loss: 0.05913978494623656


# Plantas
O conjunto de dados possui doze localizações subcelulares: CellMembraneProteins, CellWallProteins, ChloroplastProteins, CytoplasmProteins, EndoplasmicReticulumProteins, ExtracellProteins, GolgiApparatusProteins, MitochondrionProteins, NucleusProteins, PeroxisomeProteins, PlastidProteins e VacuoleProteins.

### Leitura do Dataset

In [144]:
plants_df = pd.read_csv('data/Plants_Dataset_Term_Frequency.tsv', delimiter='\t')
targets = plants_df.columns[-12:]
features = [i for i in plants_df.columns if i.isdigit()]

### Split treino e teste

In [145]:
X = plants_df[features]
y = plants_df[targets]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, train_size=0.7)

print("X_train shape:", X_train.shape)
print("y_train shape:", y_train.shape)
print(X_test.shape[0], "test samples")
print(X_train.shape[0], "train samples")

X_train shape: (676, 1830)
y_train shape: (676, 12)
290 test samples
676 train samples


### Normalização dos dados

In [146]:
scaler = StandardScaler() 
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

X_train = np.expand_dims(X_train, -1)
X_test = np.expand_dims(X_test, -1)

### Instancia e treina rede neural

In [147]:
model = SimpleNN(len(features), len(targets))
model.compile()
model.fit(X_train, y_train, epochs=50)

Epoch 1/50


[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 0.4918 - val_loss: 0.2532
Epoch 2/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1922 - val_loss: 0.2011
Epoch 3/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.1368 - val_loss: 0.1726
Epoch 4/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.1071 - val_loss: 0.1521
Epoch 5/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0853 - val_loss: 0.1338
Epoch 6/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 0.0682 - val_loss: 0.1226
Epoch 7/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0540 - val_loss: 0.1136
Epoch 8/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 0.0456 - val_loss: 0.1061
Epoch 9/50
[1m58/58[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [

### Testes da rede

In [148]:
predictions = np.array(model.predict(X_test))

threshold = 0.5
predictions[predictions >= threshold] = 1
predictions[predictions < threshold] = 0

print(f'Hamming Loss: {hamming_loss(predictions, y_test)}')

[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
Hamming Loss: 0.03132183908045977
