In [3]:
!pip install transformers -q

In [4]:
import pandas as pd

import string
import re
import nltk
from nltk.corpus import stopwords

from google.colab import drive
drive.mount('/drive')

from torch.optim import Adam
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModelForCausalLM

Mounted at /drive


In [5]:
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
tokenizer.add_special_tokens({"pad_token": "<pad>",
                                "bos_token": "<startofstring>",
                                "eos_token": "<endofstring>"})

tokenizer.pad_token = tokenizer.eos_token
tokenizer.add_tokens(["<response>:"])

model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m")
model.resize_token_embeddings(len(tokenizer))

Downloading (…)okenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Embedding(250683, 1024)

In [6]:
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

model = model.to(device)

In [7]:
train = pd.read_csv('/drive/My Drive/Colab Notebooks/dissertation/train.csv')
test = pd.read_csv('/drive/My Drive/Colab Notebooks/dissertation/test.csv')

In [8]:
train['input'] = "<startofstring> "+ train['Questions'] +" <response>: "+ train['Answers'] +" <endofstring>"
text_input = list(train['input'])

In [9]:
from torch.utils.data import Dataset

class ChatData(Dataset):
    def __init__(self, data_in:list, tokenizer):
        self.data = data_in

        # Set pad_token if not set
        if tokenizer.pad_token is None:
          tokenizer.pad_token = tokenizer.eos_token

        self.X_encoded = tokenizer(self.data,max_length=40, truncation=True, padding="max_length", return_tensors="pt")
        self.input_ids = self.X_encoded['input_ids']
        self.attention_mask = self.X_encoded['attention_mask']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return (self.input_ids[idx], self.attention_mask[idx])

In [10]:
chatData = ChatData(text_input, tokenizer)
chatData =  DataLoader(chatData, batch_size=8)

In [11]:
def infer(inp):
  inp = "<startofstring> "+inp+" <response>: "
  inp = tokenizer(inp, return_tensors="pt")
  X = inp["input_ids"].to(device)
  a = inp["attention_mask"].to(device)
  output = model.generate(X, attention_mask=a,max_length=80)
  output = tokenizer.decode(output[0], skip_special_tokens=True)
  return output

In [12]:
def train_model(chatData,model,optim):
  epochs = 10

  for i in range(epochs):
    for x,a in chatData:
      x = x.to(device)
      a = a.to(device)
      optim.zero_grad()
      loss = model(
          x,
          attention_mask = a,
          labels = x
      ).loss
      loss.backward()
      optim.step()
    torch.save(model.state_dict(),"model_state.pt")
    print(infer("How do I communicate data-driven insights to stakeholders within my startup?"))

In [13]:
model.train()
optim = Adam(model.parameters())
train_model(chatData,model,optim)

 How do I communicate data-driven insights to stakeholders within my startup? <response>: ,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
 How do I communicate data-driven insights to stakeholders within my startup? <response>: , and understand your your your your your your your your your target your target your target your target your target, and understand your target, and understand your target, and your target, and your target, and your target, and your target, and your target, and. , and your target, and your your
 How do I communicate data-driven insights to stakeholders within my startup? <response>: , highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights highlights high

In [14]:
test['bloom'] = test.apply(lambda x: infer(x['Questions']), axis = 1)

In [16]:
test.to_csv('/drive/My Drive/Colab Notebooks/dissertation/test.csv',index=False)