In [1]:
!pip install transformers
!pip install bitsandbytes

[0mCollecting bitsandbytes
  Downloading bitsandbytes-0.38.1-py3-none-any.whl (104.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.3/104.3 MB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.38.1
[0m

In [2]:
import transformers

import torch
import torch.nn.functional as F
from torch import nn
from torch.cuda.amp import custom_fwd, custom_bwd 

from bitsandbytes.functional import quantize_blockwise, dequantize_blockwise

from tqdm.auto import tqdm


Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
bin /opt/conda/lib/python3.7/site-packages/bitsandbytes/libbitsandbytes_cuda113.so
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so.11.0
CUDA SETUP: Highest compute capability among GPUs detected: 7.5
CUDA SETUP: Detected CUDA version 113
CUDA SETUP: Loading binary /opt/conda/lib/python3.7/site-packages/bitsandbytes/libbitsandbytes_cuda113.so...


  warn(msg)


In [3]:
class FrozenBNBLinear(nn.Module):
    def __init__(self, weight, absmax, code, bias=None):
        assert isinstance(bias, nn.Parameter) or bias is None
        super().__init__()
        self.out_features, self.in_features = weight.shape
        self.register_buffer("weight", weight.requires_grad_(False))
        self.register_buffer("absmax", absmax.requires_grad_(False))
        self.register_buffer("code", code.requires_grad_(False))
        self.adapter = None
        self.bias = bias
 
    def forward(self, input):
        output = DequantizeAndLinear.apply(input, self.weight, self.absmax, self.code, self.bias)
        if self.adapter:
            output_cloned = torch.clone(output + self.adapter(input))
            return output_cloned
        else :
            return output
 
    @classmethod
    def from_linear(cls, linear: nn.Linear) -> "FrozenBNBLinear":
        weights_int8, state = quantize_blockise_lowmemory(linear.weight)
        return cls(weights_int8, *state, linear.bias)
 
    def __repr__(self):
        return f"{self.__class__.__name__}({self.in_features}, {self.out_features})"
 
 
class DequantizeAndLinear(torch.autograd.Function): 
    @staticmethod
    @custom_fwd
    def forward(ctx, input: torch.Tensor, weights_quantized: torch.ByteTensor,
                absmax: torch.FloatTensor, code: torch.FloatTensor, bias: torch.FloatTensor):
        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
        ctx.save_for_backward(input, weights_quantized, absmax, code)
        ctx._has_bias = bias is not None
        return F.linear(input, weights_deq, bias)
 
    @staticmethod
    @custom_bwd
    def backward(ctx, grad_output: torch.Tensor):
        assert not ctx.needs_input_grad[1] and not ctx.needs_input_grad[2] and not ctx.needs_input_grad[3]
        input, weights_quantized, absmax, code = ctx.saved_tensors
        # grad_output: [*batch, out_features]
        weights_deq = dequantize_blockwise(weights_quantized, absmax=absmax, code=code)
        grad_input = grad_output @ weights_deq
        grad_bias = grad_output.flatten(0, -2).sum(dim=0) if ctx._has_bias else None
        return grad_input, None, None, None, grad_bias
 
 
class FrozenBNBEmbedding(nn.Module):
    def __init__(self, weight, absmax, code):
        super().__init__()
        self.num_embeddings, self.embedding_dim = weight.shape
        self.register_buffer("weight", weight.requires_grad_(False))
        self.register_buffer("absmax", absmax.requires_grad_(False))
        self.register_buffer("code", code.requires_grad_(False))
        self.adapter = None
 
    def forward(self, input, **kwargs):
        with torch.no_grad():
            # note: both quantuized weights and input indices are *not* differentiable
            weight_deq = dequantize_blockwise(self.weight, absmax=self.absmax, code=self.code)
            output = F.embedding(input, weight_deq, **kwargs)
        if self.adapter:
            
            output_cloned = torch.clone(output + self.adapter(input))
            return output_cloned
        else :
            return output 
 
    @classmethod
    def from_embedding(cls, embedding: nn.Embedding) -> "FrozenBNBEmbedding":
        weights_int8, state = quantize_blockise_lowmemory(embedding.weight)
        return cls(weights_int8, *state)
 
    def __repr__(self):
        return f"{self.__class__.__name__}({self.num_embeddings}, {self.embedding_dim})"
 
 
def quantize_blockise_lowmemory(matrix: torch.Tensor, chunk_size: int = 2 ** 20):
    assert chunk_size % 4096 == 0
    code = None
    chunks = []
    absmaxes = []
    flat_tensor = matrix.view(-1)
    for i in range((matrix.numel() - 1) // chunk_size + 1):
        input_chunk = flat_tensor[i * chunk_size: (i + 1) * chunk_size].clone()
        quantized_chunk, (absmax_chunk, code) = quantize_blockwise(input_chunk, code=code)
        chunks.append(quantized_chunk)
        absmaxes.append(absmax_chunk)
 
    matrix_i8 = torch.cat(chunks).reshape_as(matrix)
    absmax = torch.cat(absmaxes)
    return matrix_i8, (absmax, code)
 
 
def convert_to_int8(model):
    """Convert linear and embedding modules to 8-bit with optional adapters"""
    for module in list(model.modules()):
        for name, child in module.named_children():
            if isinstance(child, nn.Linear):
                print(name, child)
                setattr( 
                    module,
                    name,
                    FrozenBNBLinear(
                        weight=torch.zeros(child.out_features, child.in_features, dtype=torch.uint8),
                        absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1),
                        code=torch.zeros(256),
                        bias=child.bias,
                    ),
                )
            elif isinstance(child, nn.Embedding):
                setattr(
                    module,
                    name,
                    FrozenBNBEmbedding(
                        weight=torch.zeros(child.num_embeddings, child.embedding_dim, dtype=torch.uint8),
                        absmax=torch.zeros((child.weight.numel() - 1) // 4096 + 1),
                        code=torch.zeros(256),
                    )
                )
     

In [4]:
class GPTJBlock(transformers.models.gptj.modeling_gptj.GPTJBlock):
    def __init__(self, config):
        super().__init__(config)

        convert_to_int8(self.attn)
        convert_to_int8(self.mlp)


class GPTJModel(transformers.models.gptj.modeling_gptj.GPTJModel):
    def __init__(self, config):
        super().__init__(config)
        convert_to_int8(self)
        

class GPTJForCausalLM(transformers.models.gptj.modeling_gptj.GPTJForCausalLM):
    def __init__(self, config):
        super().__init__(config)
        convert_to_int8(self)


transformers.models.gptj.modeling_gptj.GPTJBlock = GPTJBlock  # monkey-patch GPT-J

In [5]:
config = transformers.GPTJConfig.from_pretrained("EleutherAI/gpt-j-6B")
tokenizer = transformers.AutoTokenizer.from_pretrained("EleutherAI/gpt-j-6B")

Downloading (…)lve/main/config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

In [6]:
gpt = GPTJForCausalLM.from_pretrained("hivemind/gpt-j-6B-8bit", low_cpu_mem_usage=True)

if torch.cuda.is_available():  
    dev = "cuda:0" 
else:  
    dev = "cpu"  
device = torch.device(dev)  

gpt.to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.02k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/6.18G [00:00<?, ?B/s]

k_proj Linear(in_features=4096, out_features=4096, bias=False)
v_proj Linear(in_features=4096, out_features=4096, bias=False)
q_proj Linear(in_features=4096, out_features=4096, bias=False)
out_proj Linear(in_features=4096, out_features=4096, bias=False)
fc_in Linear(in_features=4096, out_features=16384, bias=True)
fc_out Linear(in_features=16384, out_features=4096, bias=True)
k_proj Linear(in_features=4096, out_features=4096, bias=False)
v_proj Linear(in_features=4096, out_features=4096, bias=False)
q_proj Linear(in_features=4096, out_features=4096, bias=False)
out_proj Linear(in_features=4096, out_features=4096, bias=False)
fc_in Linear(in_features=4096, out_features=16384, bias=True)
fc_out Linear(in_features=16384, out_features=4096, bias=True)
k_proj Linear(in_features=4096, out_features=4096, bias=False)
v_proj Linear(in_features=4096, out_features=4096, bias=False)
q_proj Linear(in_features=4096, out_features=4096, bias=False)
out_proj Linear(in_features=4096, out_features=4096, 

GPTJForCausalLM(
  (transformer): GPTJModel(
    (wte): FrozenBNBEmbedding(50400, 4096)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0): GPTJBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): FrozenBNBLinear(4096, 4096)
          (v_proj): FrozenBNBLinear(4096, 4096)
          (q_proj): FrozenBNBLinear(4096, 4096)
          (out_proj): FrozenBNBLinear(4096, 4096)
        )
        (mlp): GPTJMLP(
          (fc_in): FrozenBNBLinear(4096, 16384)
          (fc_out): FrozenBNBLinear(16384, 4096)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
      (1): GPTJBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
 

In [7]:
def add_adapters(model, adapter_dim=4, p = 0.1):
    assert adapter_dim > 0

    for name, module in model.named_modules():
      if isinstance(module, FrozenBNBLinear):
          if "attn" in name or "mlp" in name or "head" in name:
              print("Adding adapter to", name)
              module.adapter = nn.Sequential(
                nn.Linear(module.in_features, adapter_dim, bias=False),
                nn.Dropout(p=p),
                nn.Linear(adapter_dim, module.out_features, bias=False),
            )
              print("Initializing", name)
              nn.init.zeros_(module.adapter[2].weight)

          else:
              print("Not adding adapter to", name)
      elif isinstance(module, FrozenBNBEmbedding):
          print("Adding adapter to", name)
          module.adapter = nn.Sequential(
                nn.Embedding(module.num_embeddings, adapter_dim),
                nn.Dropout(p=p),
                nn.Linear(adapter_dim, module.embedding_dim, bias=False),
            )
          print("Initializing", name)
          nn.init.zeros_(module.adapter[2].weight)

add_adapters(gpt)
gpt.to(device)

Adding adapter to transformer.wte
Initializing transformer.wte
Adding adapter to transformer.h.0.attn.k_proj
Initializing transformer.h.0.attn.k_proj
Adding adapter to transformer.h.0.attn.v_proj
Initializing transformer.h.0.attn.v_proj
Adding adapter to transformer.h.0.attn.q_proj
Initializing transformer.h.0.attn.q_proj
Adding adapter to transformer.h.0.attn.out_proj
Initializing transformer.h.0.attn.out_proj
Adding adapter to transformer.h.0.mlp.fc_in
Initializing transformer.h.0.mlp.fc_in
Adding adapter to transformer.h.0.mlp.fc_out
Initializing transformer.h.0.mlp.fc_out
Adding adapter to transformer.h.1.attn.k_proj
Initializing transformer.h.1.attn.k_proj
Adding adapter to transformer.h.1.attn.v_proj
Initializing transformer.h.1.attn.v_proj
Adding adapter to transformer.h.1.attn.q_proj
Initializing transformer.h.1.attn.q_proj
Adding adapter to transformer.h.1.attn.out_proj
Initializing transformer.h.1.attn.out_proj
Adding adapter to transformer.h.1.mlp.fc_in
Initializing transfor

GPTJForCausalLM(
  (transformer): GPTJModel(
    (wte): FrozenBNBEmbedding(50400, 4096)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0): GPTJBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (k_proj): FrozenBNBLinear(4096, 4096)
          (v_proj): FrozenBNBLinear(4096, 4096)
          (q_proj): FrozenBNBLinear(4096, 4096)
          (out_proj): FrozenBNBLinear(4096, 4096)
        )
        (mlp): GPTJMLP(
          (fc_in): FrozenBNBLinear(4096, 16384)
          (fc_out): FrozenBNBLinear(16384, 4096)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
      (1): GPTJBlock(
        (ln_1): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (attn): GPTJAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
 

# **Prefinetuning**

In [8]:
from sklearn.model_selection import train_test_split
import pandas as pd
# Load the data
data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
data = data[:2000]


In [9]:
data

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1
...,...,...,...,...,...
1995,2869,damage,Texas,Storm damage reported in West Tennessee http:/...,1
1996,2870,damage,"Lawrence, KS via Emporia, KS",Hey the #Royals love doing damage with 2 outs.,1
1997,2871,damage,http://twitch.tv/jcmonkey,@Drothvader @CM_Nevalistis you can keep this p...,1
1998,2872,damage,Indonesia,'Mages of Fairy Tail.. Specialize in property ...,0


In [10]:
def sentiment_score_to_name(score):
    if score == 1:
        return "Disaster"
    else :
        return "Neutral"

    
data["target"] = data["target"].apply(sentiment_score_to_name)

In [11]:
prompt = []
for i in data.index:
    # Update the value in the "prompt" column by concatenating strings
    prompt.append(f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
    ### Instruction:
    Detect the sentiment of the tweet.
    ### Input:
    {data['text'][i]}
    ### Response:
    {data['target'][i]}""")

# Access the updated value in the "prompt" column for a specific row
print(prompt[0])

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
    ### Instruction:
    Detect the sentiment of the tweet.
    ### Input:
    Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all
    ### Response:
    Disaster


In [12]:
data["prompt"] = prompt
data = data["prompt"]

In [13]:
train, test = train_test_split(data, test_size=0.2) 
train.to_csv('/train.csv', index=False)
test.to_csv('/test.csv', index=False)

In [14]:
from datasets import load_dataset
dataset = load_dataset('csv', data_files={'train': '/train.csv',
                                              'test': '/test.csv'})

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-1de08920cbba2af1/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-1de08920cbba2af1/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [15]:
tokenizer.pad_token = tokenizer.eos_token
def tokenize_function(examples):
    return tokenizer(examples["prompt"], padding=True, truncation=True, max_length= 128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["prompt"])
tokenized_datasets.set_format("torch")

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [16]:
from torch.utils.data import DataLoader

full_train_dataset = tokenized_datasets["train"]
train_dataloader = DataLoader(full_train_dataset, shuffle=True, batch_size=8)

In [17]:
from bitsandbytes.optim import Adam8bit

gpt.gradient_checkpointing_enable()
optimizer = Adam8bit(gpt.parameters(), lr=1e-5, weight_decay=0.01)

In [18]:
num_epochs = 4
num_training_steps = num_epochs * len(train_dataloader)

In [19]:
lr_scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer, int(num_training_steps*0.1), num_training_steps
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [20]:
from tqdm.auto import tqdm

scaler = torch.cuda.amp.GradScaler()
progress_bar = tqdm(range(num_training_steps))
gpt.train()
gpt.gradient_checkpointing_enable()
k = 0

for epoch in range(num_epochs):
    for batch in train_dataloader:
        k = k + 1
        if k % 500 == 0:
          
          #print(k)
          state = {'k' : k, 'epoch': num_epochs, 'lr_scheduler': lr_scheduler.state_dict(), 'state_dict': gpt.state_dict(), 'optimizer': optimizer.state_dict()}
          #torch.save(state, filepath)

        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()
        

        with torch.autograd.profiler.record_function("model_inference"):
            with torch.cuda.amp.autocast():
                
                out = gpt.forward(**batch,)
                
                loss = F.cross_entropy(out.logits[:, :-1, :].flatten(0, -2), batch['input_ids'][:, 1:].flatten(),
                                  reduction='mean', label_smoothing=0.1)

        #print(loss)

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(gpt.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        lr_scheduler.step()
        progress_bar.update(1)

  0%|          | 0/800 [00:00<?, ?it/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


# **finetuning**

In [21]:
data = pd.read_excel('/kaggle/input/dataset/paraphrasing_dataset.xlsx')
data.drop("Unnamed: 0", axis=1 , inplace= True)
data

# group sentence and paraphrase
data['sentence'] = '[Sentence]:'+data['simplified']+'\n[Positive]:'+data['positive_enhanced']+'\n[Negative]:'+data['negative_enhanced']
data=data['sentence']
print(data.iloc[1])


[Sentence]:Ian wrote about kids going through tough times and what they learn. Kids can relate.
[Positive]:Ian Serralier's inspiring narrative of the journeys of growth and resilience that children undertake will resonate deeply with young readers, just as it did when it was first published.
[Negative]:Ian Serralier's stark portrayal of the suffering the children experience, and the lessons of life they learn through their torment, will not resonate with young readers as it did in the past.


In [22]:
train, test = train_test_split(data, test_size=0.1) 
train.to_csv('/train.csv', index=False)
test.to_csv('/test.csv', index=False)

In [23]:
dataset = load_dataset('csv', data_files={'train': '/train.csv',
                                              'test': '/test.csv'})

Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-4af50e0664ab1728/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-4af50e0664ab1728/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/2 [00:00<?, ?it/s]

In [24]:

def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding=True, truncation=True, max_length= 128)

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(["sentence"])
tokenized_datasets.set_format("torch")

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [25]:
#tokenized_datasets.to(device)

In [26]:
from torch.utils.data import DataLoader

full_train_dataset = tokenized_datasets["train"]
train_dataloader = DataLoader(full_train_dataset, shuffle=True, batch_size=8)

In [28]:
num_epochs = 6
num_training_steps = num_epochs * len(train_dataloader)

In [29]:
lr_scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer, int(num_training_steps*0.1), num_training_steps
)

In [30]:
filepath = '/kaggle/working/model.pt'

In [31]:
from tqdm.auto import tqdm

scaler = torch.cuda.amp.GradScaler()
progress_bar = tqdm(range(num_training_steps))
gpt.train()
gpt.gradient_checkpointing_enable()
k = 0

for epoch in range(num_epochs):
    for batch in train_dataloader:
        k = k + 1
        if k % 500 == 0:
          
          print(k)
          state = {'k' : k, 'epoch': num_epochs, 'lr_scheduler': lr_scheduler.state_dict(), 'state_dict': gpt.state_dict(), 'optimizer': optimizer.state_dict()}
          torch.save(state, filepath)

        batch = {k: v.to(device) for k, v in batch.items()}

        optimizer.zero_grad()
        

        with torch.autograd.profiler.record_function("model_inference"):
            with torch.cuda.amp.autocast():
                
                out = gpt.forward(**batch,)
                
                loss = F.cross_entropy(out.logits[:, :-1, :].flatten(0, -2), batch['input_ids'][:, 1:].flatten(),
                                  reduction='mean', label_smoothing=0.1)

        print(loss)

        scaler.scale(loss).backward()
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(gpt.parameters(), 1.0)
        scaler.step(optimizer)
        scaler.update()

        lr_scheduler.step()
        progress_bar.update(1)


  0%|          | 0/1566 [00:00<?, ?it/s]

tensor(3.1605, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5838, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5534, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4748, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5521, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4230, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.3038, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2205, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2313, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2781, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0272, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4738, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.2916, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.4931, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5978, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.0671, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.5712, device='cuda:0', grad_fn=<AddBackward0>)
tensor(3.1921, device='cuda:0', grad_fn=<AddBack

# **Evaluation**

In [32]:
gpt.eval()
with torch.no_grad():
  prompt = tokenizer("[Sentence]:Jimmy's a lonely guy with a boring job and a mean mom.", truncation=True, padding=True, max_length=128, return_tensors='pt')
  prompt = {key: value.to(device) for key, value in prompt.items()}
  out = gpt.generate(**prompt, max_length=512, top_k=50, top_p=0.9, temperature=1.0, do_sample=True, repetition_penalty = 1.2, num_beams=1)
  print(tokenizer.decode(out[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[Sentence]:Jimmy's a lonely guy with a boring job and a mean mom. He has an amazing pet gremlin that helps him deal with his emotions.
[Positive]:Jimmy, the hardworking sales attendant at an airport travel agency, is supported by his pets: a cat named Mimi, and a friendly, 'adorable' gerbil named Jimmy Jr who serves as a confidant when he emotionally struggles due to his mother being demanding.
[Negative]:The hapless salesman in an airporttravel office, Jimmy is deprived of companionship, having been abandoned by both his catsMimi and Jimmy Jr, a little gerbil who has become his only source of solace from his harshmother.<|endoftext|>


In [33]:


gpt.eval()
with torch.no_grad():
  prompt = tokenizer("[Sentence]:Charley Smith is 11 when the Civil War starts.", truncation=True, padding=True, max_length=128, return_tensors='pt')
  prompt = {key: value.to(device) for key, value in prompt.items()}
  out = gpt.generate(**prompt, max_length=512, top_k=50, top_p=0.9, temperature=1.0, do_sample=True, repetition_penalty = 1.2, num_beams=1)
  print(tokenizer.decode(out[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[Sentence]:Charley Smith is 11 when the Civil War starts.
[Positive]:When Charley Smith turns eleven, the United States has been living in relative peace for almost a hundred years; however, it begins its first major conflict - the American Civil War.
[Negative]:Charley Smith, at the age of ten or eleven, is the only child born during the start of theAmerican Civil War.<|endoftext|>


In [34]:
gpt.eval()
with torch.no_grad():
  prompt = tokenizer("[Sentence]:Rory is a Louisiana teen starting a new life at a London boarding school.", truncation=True, padding=True, max_length=128, return_tensors='pt')
  prompt = {key: value.to(device) for key, value in prompt.items()}
  out = gpt.generate(**prompt, max_length=512, top_k=50, top_p=0.9, temperature=1.0, do_sample=True, repetition_penalty = 1.2, num_beams=1)
  print(tokenizer.decode(out[0]))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


[Sentence]:Rory is a Louisiana teen starting a new life at a London boarding school. He's not expecting to find his brother and sisters there!
[Positive]:Excitedly welcoming Rory as the newest boy in the boarders' dorm, the school director encourages him to embrace his newfound family atmosphere.
[Negative]:When Rory is unexpectedly welcomed into the boarders' dorm as the only pupil from home (Louisiana), he doesn't expect to find his siblings there.<|endoftext|>


In [35]:
paraphrases = []
pred = []
gpt.eval()
for sentence in test.values:
    #print("**************************************************************")
    st = sentence.split('[Positive]:')[0].strip()
    paraphrases.append('[Positive]:'+sentence.split('[Positive]:')[1].strip())
    #print(st)
    with torch.no_grad():
        prompt = tokenizer(st, truncation=True, padding=True, max_length=128, return_tensors='pt')
        prompt = {key: value.to(device) for key, value in prompt.items()}
        out = gpt.generate(**prompt, max_length=512, top_k=50, top_p=0.9, temperature=1.0, do_sample=True, repetition_penalty = 1.2, num_beams=1)
        #print('\n')
        #print("GPT-J :" , tokenizer.decode(out[0]))
        pred.append(tokenizer.decode(out[0],skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end gene

In [36]:

eval_dataset_gptj = pd.DataFrame({'paraphrase':paraphrases , 'predicted':pred})
eval_dataset_gptj.to_excel('/kaggle/working/eval_dataset_gptj.xlsx',index=False)

In [37]:
# Save the model
#torch.save(gpt.state_dict(), '/kaggle/working/gpt-j-6B.pt')


In [38]:
print(eval_dataset_gptj['paraphrase'][0])
print(eval_dataset_gptj['predicted'][0])

[Positive]:This book is remarkably subtle, where the author has a delicate touch that allows the events to speak for themselves.
[Negative]:This is a remarkably unsubtle book where the author's heavy-handed approach forces the events to be manipulated.
[Sentence]:This book is subtle. The author lets the events tell the story.
[Positive]:The Subtle Knife is a captivating, intricate narrative that details all its EVENTS with sensitivity and care, allowing the EVENTS themselves to narrate their own stories in an effort to capture readers' imagination.
[Negative]:The Silent Fire is a confusing, overly elaborate narrative that detaillessly recounts all these EVENTS without any sensitivity or compassion, thereby leaving the EVENTS themselves as mundane tales, which risks detracting from the potential of the narrative.


In [39]:
predicted = []
for p in eval_dataset_gptj.predicted.values :
    #print(p)
    st = p.split('[Positive]:')[1].strip()
    predicted.append('[Positive]:'+st)
    

In [40]:
predicted[0]

"[Positive]:The Subtle Knife is a captivating, intricate narrative that details all its EVENTS with sensitivity and care, allowing the EVENTS themselves to narrate their own stories in an effort to capture readers' imagination.\n[Negative]:The Silent Fire is a confusing, overly elaborate narrative that detaillessly recounts all these EVENTS without any sensitivity or compassion, thereby leaving the EVENTS themselves as mundane tales, which risks detracting from the potential of the narrative."

In [41]:
paraphrases[0]

"[Positive]:This book is remarkably subtle, where the author has a delicate touch that allows the events to speak for themselves.\n[Negative]:This is a remarkably unsubtle book where the author's heavy-handed approach forces the events to be manipulated."

In [42]:
!pip install rouge
!pip install evaluate
!pip install rouge_score

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1
[0mhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0
[0mhuggingface/to

In [43]:
import nltk
from nltk.translate.bleu_score import corpus_bleu
from nltk.translate.bleu_score import SmoothingFunction
from nltk.translate import bleu_score
from rouge import Rouge
import math

In [44]:
# Calculate the BLEU score
references = [[nltk.word_tokenize(original)] for original in paraphrases]
hypotheses = [nltk.word_tokenize(predicted) for predicted in predicted]
bleu1 = corpus_bleu(references, hypotheses, weights=(1.0, 0.0, 0.0, 0.0), smoothing_function=SmoothingFunction().method1)
bleu2 = corpus_bleu(references, hypotheses, weights=(0.5, 0.5, 0.0, 0.0), smoothing_function=SmoothingFunction().method1)
bleu3 = corpus_bleu(references, hypotheses, weights=(0.33, 0.33, 0.33, 0.0), smoothing_function=SmoothingFunction().method1)
bleu4 = corpus_bleu(references, hypotheses, weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=SmoothingFunction().method1)

print("BLEU-1 score:", bleu1)
print("BLEU-2 score:", bleu2)
print("BLEU-3 score:", bleu3)
print("BLEU-4 score:", bleu4)

BLEU-1 score: 0.350974245497572
BLEU-2 score: 0.22274184876904765
BLEU-3 score: 0.16572128548972753
BLEU-4 score: 0.12236243282323102


In [45]:
# Calculate the Rouge score
rouge = Rouge()
scores = rouge.get_scores(predicted,paraphrases, avg=True)
rouge_l = scores['rouge-l']
print("Rouge-L score:", rouge_l)

Rouge-L score: {'r': 0.23821037256994373, 'p': 0.16237567829684374, 'f': 0.18649049955387784}


In [46]:
import evaluate
rouge = evaluate.load('rouge')

results = rouge.compute(predictions=predicted,references=paraphrases)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [47]:
results

{'rouge1': 0.3104217595878448,
 'rouge2': 0.061308853321295494,
 'rougeL': 0.22275535612141933,
 'rougeLsum': 0.2654026504245174}

In [48]:
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=predicted,references=paraphrases)
print(results)

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

{'bleu': 0.1198906550428277, 'precisions': [0.3419232417212463, 0.1382179447376591, 0.08536508536508537, 0.05121155936321207], 'brevity_penalty': 1.0, 'length_ratio': 1.2314012210748473, 'translation_length': 16337, 'reference_length': 13267}
