In [None]:
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m26.7 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m57.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [None]:
from torch.utils.data import Dataset
import pandas as pd

In [None]:
class PromptCompletionDataset(Dataset):
  """Prompt/completion pairs from a CSV file, pre-tokenized for causal-LM training.

  Each CSV row is rendered as
  ``<startofstring> {Prompt} <bot>: {Completion} <endofstring>`` and the whole
  corpus is tokenized once, up front, into fixed-length tensors.

  Args:
    path: Location of a CSV file readable by ``pandas.read_csv``; it must
      provide ``Prompt`` and ``Completion`` columns holding strings.
    tokenizer: Callable invoked as
      ``tokenizer(texts, truncation=True, max_length=..., padding="max_length",
      return_tensors="pt")`` that returns a mapping with ``input_ids`` and
      ``attention_mask`` entries.
    max_length: Length every example is padded/truncated to. Defaults to 100,
      the value that used to be hard-coded.

  Attributes set by the constructor: ``data`` (the raw DataFrame), ``X`` (the
  formatted strings), ``X_encoded`` (the tokenizer output), ``input_ids`` and
  ``attention_mask``.
  """

  def __init__(self, path: str, tokenizer, max_length: int = 100):
    self.data = pd.read_csv(path)

    # Build the training strings in one pass instead of overwriting the list
    # of records element by element while iterating over it.
    self.X = [
        "<startofstring> "+row['Prompt']+" <bot>: "+row['Completion']+" <endofstring>"
        for row in self.data.to_dict('records')
    ]
    # Show one formatted example as a quick sanity check.
    print(self.X[0])

    # NOTE(review): with truncation enabled, rows longer than `max_length`
    # tokens are cut; presumably that drops the trailing <endofstring> for
    # those rows - confirm against the tokenizer's truncation side.
    self.X_encoded = tokenizer(
        self.X,
        truncation=True,
        max_length=max_length,
        padding="max_length",
        return_tensors="pt",
    )
    self.input_ids = self.X_encoded['input_ids']
    self.attention_mask = self.X_encoded['attention_mask']

  def __len__(self) -> int:
    """Number of formatted examples."""
    return len(self.X)

  def __getitem__(self, idx):
    """Return ``(input_ids, attention_mask)`` for example ``idx``.

    Both tensors are reshaped with ``view(1, -1)`` so that a single item
    already carries a leading dimension of size 1.
    """
    return (self.input_ids[idx].view(1,-1), self.attention_mask[idx].view(1,-1))

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.optim import Adam
from torch.utils.data import DataLoader
import tqdm
import torch

In [None]:
# Choose where tensors and the model live: CUDA if present, otherwise Apple's
# MPS backend if present, otherwise the CPU.
if torch.cuda.is_available():
  device = "cuda"
elif torch.backends.mps.is_available():
  device = "mps"
else:
  device = "cpu"

In [None]:
def train(chatData, model, optim, epochs=20, save_path="model_state.pt"):
  """Fine-tune `model` on `chatData` with a causal language-modelling loss.

  Args:
    chatData: Iterable yielding ``(input_ids, attention_mask)`` tensor pairs.
    model: Model called as ``model(input_ids, attention_mask=..., labels=...)``
      whose output exposes a ``.loss`` tensor.
    optim: Optimizer already bound to the model's parameters.
    epochs: Number of full passes over `chatData` (default 20, the value
      that used to be hard-coded).
    save_path: File the model's ``state_dict`` is written to after every
      epoch (default ``"model_state.pt"``, as before).

  Batches are moved to the notebook-level ``device`` before the forward pass.
  """
  for _epoch in tqdm.tqdm(range(epochs)):
    for X,a in chatData:
      X = X.to(device)
      a = a.to(device)

      # Use the inputs as targets, but set every padded position (attention
      # mask == 0) to -100 so the loss ignores it. Passing `labels=X`
      # unchanged makes the model spend much of its loss on predicting the
      # padding token.
      labels = X.masked_fill(a == 0, -100)

      optim.zero_grad()
      out = model(X, attention_mask=a, labels=labels)
      loss = out.loss
      loss.backward()
      optim.step()

    # Checkpoint after each epoch so an interrupted run keeps its progress.
    torch.save(model.state_dict(), save_path)
def infer(inp, max_new_tokens=70):
    """Generate the bot's reply to `inp` with the notebook-level model.

    Args:
        inp: The user's question as plain text.
        max_new_tokens: Upper bound on the number of generated tokens
            (default 70, the value that used to be hard-coded).

    Returns:
        The decoded sequence (prompt followed by the generated reply) with
        special tokens left in place.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.
    """
    # The prompt deliberately ends right after "<bot>:" with no trailing
    # space. In the training strings the space after "<bot>:" is tokenized
    # together with the first word of the completion; a trailing space here
    # becomes a standalone token, which led the model to emit <endofstring>
    # immediately.
    prompt = "<startofstring> "+inp+" <bot>:"
    encoded = tokenizer(prompt, return_tensors="pt")
    X = encoded["input_ids"].to(device)
    a = encoded["attention_mask"].to(device)

    # Pass the custom end/pad token ids explicitly so that generation stops
    # at <endofstring> instead of running on until max_new_tokens.
    output = model.generate(
        X,
        attention_mask=a,
        max_new_tokens=max_new_tokens,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    return tokenizer.decode(output[0], skip_special_tokens=False)

In [None]:
# Load the pretrained tokenizer and register the markers used by the training
# strings: padding, start-of-string, end-of-string and the "<bot>:" separator.
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-560m")
tokenizer.add_special_tokens(
    dict(
        pad_token="<pad>",
        bos_token="<startofstring>",
        eos_token="<endofstring>",
    )
)
tokenizer.add_tokens(["<bot>:"])

# Load the pretrained model, grow its embedding table to match the extended
# vocabulary, then move it to the selected device.
model = AutoModelForCausalLM.from_pretrained("bigscience/bloom-560m")
model.resize_token_embeddings(len(tokenizer))
model = model.to(device)


Downloading (…)okenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

In [None]:
# Build the tokenized training set from the uploaded CSV.
chatData = PromptCompletionDataset(
    path="/content/Travel_insurance (1).csv",
    tokenizer=tokenizer,
)


<startofstring> Compensation Expressed as a Percentage of Total Sum Insured Permanent disablement not otherwise provided for under Items  2-22 inclusive up to a maximum <bot>: Seventy Fivepercent of the Total Sum Insured <endofstring>


In [None]:
# Sanity check: print the token ids of the first example only.
for sample_ids, sample_mask in chatData:
  print(sample_ids)
  break

tensor([[     3,      3,      3,      3,      3,      3,      3,      3,      3,
              3,      3,      3,      3,      3,      3,      3,      3,      3,
              3,      3,      3,      3,      3,      3,      3,      3,      3,
              3,      3,      3,      3,      3,      3,      3,      3,      3,
              3,      3,      3,      3,      3,      3,      3,      3,      3,
              3,      3,      3,      3,      3,      3, 250680,  12122, 227157,
          46445,    376,    661,    267,  69897,    307,   1010,    461,  38550,
          19172,  13776,  10664,  85048,   1006,  34539,   1130,  29502,  15984,
            613,   5357, 117925,    210,    415,  41671,  46919,   2256,    427,
            267,  23327,    210, 250682,   3579, 165870,    614,    456,   1026,
           1991,    307,    461,    368,  38550,  19172,  13776,  10664,    210,
         250681]])


In [None]:
# Switch the model to training mode and fine-tune it with Adam.
model.train()
optim = Adam(params=model.parameters(), lr=1e-4)

print("training .... ")
train(chatData=chatData, model=model, optim=optim)

training .... 


100%|██████████| 20/20 [49:50<00:00, 149.51s/it]


In [None]:
# Read one question from the user and print the model's reply.
print("infer from model : ")

inp = input()
reply = infer(inp)
print(reply)

infer from model : 
Can the Policy be cancelled by the Policyholder? If so, how much notice needs to be given?
<startofstring> Can the Policy be cancelled by the Policyholder? If so, how much notice needs to be given? <bot>: <endofstring> Yes, the company provides the right to cancel the Policy by the Policyholder, such as arranging for such Policy. <endofstring> Yes, the company provides the right to cancel the Policy by giving thirty (30) days notice in writing to the insured, subject to refund of premium through the through the insured person's assets or the insurance policy. 
