In [3]:
# system
import os
import sys

# data analysis
import numpy as np
import pandas as pd

# data visualization
import matplotlib.pyplot as plt

# deep learning
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
!pip install transformers
from transformers import AutoModelForMaskedLM, AutoTokenizer
from torch.utils.data import DataLoader, TensorDataset, random_split
from sklearn.model_selection import train_test_split

# custom
dir_path = "/content/drive/MyDrive/00-Colab_Notebooks/CutFreeGPT/"
sys.path.append(dir_path)
from CheckRandomer import check_randomer

[33mDEPRECATION: Python 2.7 reached the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 is no longer maintained. pip 21.0 will drop support for Python 2.7 in January 2021. More details about Python 2 support in pip can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support pip 21.0 will remove support for this functionality.[0m
Defaulting to user installation because normal site-packages is not writeable


ModuleNotFoundError: No module named 'numpy'

In [None]:
## Create Directory Path to Save Model Information
def next_model_version(entries):
    """Return the next unused model version for the given directory entries.

    Each entry is parsed the same way the save folders are named: the text
    after the last "-V" is read as an integer version. Entries whose suffix
    is not an integer (for example ".ipynb_checkpoints" or a stray file) are
    skipped instead of raising ValueError.

    Parameters
    ----------
    entries : iterable of str
        Names found inside the models directory.

    Returns
    -------
    int
        One more than the highest version found, or 0 when no entry carries
        a version.
    """
    next_version = 0
    for entry in entries:
        try:
            version = int(entry.split("-V")[-1])
        except ValueError:
            # not a versioned model folder; ignore it
            continue
        next_version = max(next_version, version + 1)
    return next_version


# make sure the models directory exists (no error if it is already there)
os.makedirs("tf-models", exist_ok=True)

# get the version number for this run from the folders already present
highest_model_number = next_model_version(os.listdir("tf-models"))

# create save directory
save_folder = "tf-models/AlgorithmClassifier-V" + str(highest_model_number)
os.mkdir(save_folder)

# create other subdirectories
for subdirectory in ("checkpoints", "logs", "plots", "results", "class_weights"):
    os.mkdir(save_folder + "/" + subdirectory)

# draw a fresh seed for this run and print it so the run can be repeated
RANDOM_STATE = np.random.randint(0, 1000)
print("Random State: ", RANDOM_STATE)

# load the dataset, shuffle every row with the seed, renumber the index and
# drop the "Unnamed: 0" column (presumably a saved index column - confirm)
file_path = dir_path + "cutfree_data.csv"
df_gpt = (
    pd.read_csv(file_path)
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
    .drop(columns=["Unnamed: 0"])
)

df_gpt.head()

In [None]:
# helper function constants
# Maps each IUB code letter to the array of bases it stands for.
# Key order matters: get_subcodes uses a key's position as its label index.
IUB_CODES = {
    code: np.array(list(bases))
    for code, bases in (
        ("A", "A"),
        ("C", "C"),
        ("G", "G"),
        ("T", "T"),
        ("R", "AG"),
        ("Y", "CT"),
        ("S", "CG"),
        ("W", "AT"),
        ("K", "GT"),
        ("M", "AC"),
        ("B", "CGT"),
        ("D", "AGT"),
        ("H", "ACT"),
        ("V", "ACG"),
        ("N", "ACGT"),
    )
}


# function to expand one-hot encoding to include all valid subcodes
def get_subcodes(output):
    """Expand a one-hot IUB-code label into a multi-hot label of its subcodes.

    The first position equal to 1 selects a code from ``IUB_CODES`` (by key
    order). Every code whose base set is a subset of the selected code's
    base set, including the selected code itself, is then set to 1.

    Parameters
    ----------
    output : str or sequence of float
        Label vector. A string is expected in bracketed form such as
        ``"[0. 0. 1. 0.]"``; entries may be separated by any amount of
        whitespace and/or commas. A list or array of numbers is also
        accepted and is copied, not modified.

    Returns
    -------
    numpy.ndarray
        Float array with the selected code and all of its subcodes set to 1.

    Raises
    ------
    IndexError
        If no entry equals 1, or the vector does not line up with the
        positions of ``IUB_CODES``.
    ValueError
        If a string entry cannot be parsed as a float.
    """
    # parse the label; stripping brackets and splitting on whitespace avoids
    # depending on the exact trailing ".]" formatting of the stored string
    if isinstance(output, str):
        tokens = output.strip().strip("[]").replace(",", " ").split()
        arr = np.array([float(token) for token in tokens])
    else:
        arr = np.array(output, dtype=float)

    # locate the selected code
    hot_positions = np.flatnonzero(arr == 1)
    if hot_positions.size == 0:
        raise IndexError("label vector has no entry equal to 1")
    codes = list(IUB_CODES)
    selected_bases = set(IUB_CODES[codes[hot_positions[0]]])

    # switch on every code whose bases are all contained in the selected code
    for position, code in enumerate(codes):
        if set(IUB_CODES[code]).issubset(selected_bases):
            arr[position] = 1

    return arr


# build one "oligo; sites; randomer" text per dataframe row; the sites string
# loses its first and last characters and all single quotes
inputs = [
    "; ".join([oligo, sites[1:-1].replace("'", ""), str(input_randomer)])
    for oligo, sites, input_randomer in zip(
        df_gpt["Oligo"],
        df_gpt["Sites"],
        df_gpt["Input_Randomer"]
    )
]

# parse each stored label string into a list of floats
outputs = [
    [float(label) for label in raw_output[1:-2].split(" ")]
    for raw_output in df_gpt["Output"]
]

# hold out 20% of the examples for validation, seeded with the run's state
x_train, x_val, y_train, y_val = train_test_split(
    inputs,
    outputs,
    random_state=RANDOM_STATE,
    test_size=0.2
)

In [None]:
# pretrained checkpoint and the tokenizer loaded from it
MODEL_NAME = "InstaDeepAI/nucleotide-transformer-v2-50m-multi-species"
TOKENIZER = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True
)

# tokenization and training settings
MAX_LENGTH = 256
BATCH_SIZE = 32
EPOCHS = 5


# model
class CutFreeGPT(nn.Module):
    """Pretrained masked-LM backbone followed by an MLP classification head.

    The head consumes the backbone's last hidden state at sequence position 0
    and produces 15 logits (IUB_CODES has 15 entries - presumably one logit
    per code; confirm against the label layout).

    Parameters
    ----------
    model_name : str
        Checkpoint name or path passed to
        ``AutoModelForMaskedLM.from_pretrained``.
    """

    def __init__(self, model_name=MODEL_NAME):
        super().__init__()

        self.base_model = AutoModelForMaskedLM.from_pretrained(
            model_name,
            trust_remote_code=True
        )

        # size the head from the backbone instead of hard-coding 768, so the
        # first linear layer always matches the embedding it receives
        hidden_size = self.base_model.config.hidden_size

        # note: dropout is applied before the activation in every stage
        self.mlp = nn.Sequential(
            nn.Linear(hidden_size, hidden_size),
            nn.LayerNorm(hidden_size),
            nn.Dropout(0.4),
            nn.ReLU(),
            nn.Linear(hidden_size, 128),
            nn.LayerNorm(128),
            nn.Dropout(0.4),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.LayerNorm(64),
            nn.Dropout(0.4),
            nn.ReLU()
        )
        self.output_layer = nn.Linear(64, 15)

    def forward(self, input_ids, attention_mask):
        """Return classification logits for a batch of tokenized inputs.

        Parameters
        ----------
        input_ids : torch.Tensor
            Token ids, as produced by the tokenizer.
        attention_mask : torch.Tensor
            Mask forwarded to the backbone alongside ``input_ids``.

        Returns
        -------
        torch.Tensor
            Raw (unnormalized) scores whose last dimension has size 15.
        """
        hidden_states = self.base_model(
            input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True
        ).hidden_states

        # last layer, first sequence position (presumably the CLS token)
        cls_embeddings = hidden_states[-1][:, 0, :]
        mlp_out = self.mlp(cls_embeddings)

        return self.output_layer(mlp_out)


def encode_data(data, tokenizer=TOKENIZER):
    """Tokenize texts into fixed-length id and mask tensors.

    Every text is padded to ``MAX_LENGTH`` tokens and, if longer, truncated
    to ``MAX_LENGTH``. Without truncation a single over-length text leaves
    the batch ragged and the tensor conversion fails.

    Parameters
    ----------
    data : str or list of str
        Text(s) to tokenize.
    tokenizer : callable
        Tokenizer to apply; defaults to the module-level ``TOKENIZER``.

    Returns
    -------
    tuple
        ``(input_ids, attention_mask)`` taken from the tokenizer output,
        requested as PyTorch tensors.
    """
    tokens = tokenizer(
        data,
        padding="max_length",
        truncation=True,  # NOTE(review): cuts the tail of over-length texts
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )

    return tokens["input_ids"], tokens["attention_mask"]


# tokenize the training and validation texts
input_ids_train, attention_mask_train = encode_data(x_train)
input_ids_val, attention_mask_val = encode_data(x_val)

# pair each split's token tensors with its label tensor
train_dataset = TensorDataset(input_ids_train, attention_mask_train, torch.tensor(y_train))
val_dataset = TensorDataset(input_ids_val, attention_mask_val, torch.tensor(y_val))

# batch both splits; only the training batches are shuffled
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE)

# build the network together with its loss and optimizer
model = CutFreeGPT()
loss_function = nn.CrossEntropyLoss()
optimizer = AdamW(params=model.parameters(), lr=1e-3)

# fine-tuning loop: one training pass and one validation pass per epoch
for epoch in range(EPOCHS):
    # training pass
    model.train()
    for batch in train_loader:
        input_ids, attention_mask, labels = batch
        optimizer.zero_grad()
        # named `logits` so the module-level `outputs` label list is not overwritten
        logits = model(input_ids, attention_mask)
        loss = loss_function(logits, labels)
        loss.backward()
        optimizer.step()

    # validation pass
    model.eval()
    val_loss = 0.0
    correct = 0
    with torch.no_grad():
        for batch in val_loader:
            input_ids, attention_mask, labels = batch
            logits = model(input_ids, attention_mask)

            # the loss is a per-batch mean; weight it by the batch size so the
            # division by the dataset size below gives a per-sample average
            val_loss += loss_function(logits, labels).item() * labels.size(0)

            # labels are one-hot rows, so compare class indices on both sides
            # (1-D labels are treated as class indices already)
            preds = logits.argmax(dim=1)
            targets = labels.argmax(dim=1) if labels.dim() > 1 else labels
            correct += (preds == targets).sum().item()

    val_loss /= len(val_loader.dataset)
    print(
        f"Epoch: {epoch+1}, "
        f"Val loss: {val_loss:.4f}, "
        f"Val Accuracy: {correct/len(val_loader.dataset):.4f}"
    )