In [1]:
import torch
import pytorch_lightning as pl
import torch.nn as nn

import nlp
import transformers
import numpy as np
#import IPython; IPython.embed(); exit(1)
# import wandb
# wandb.init(project="CodeBert", entity="usama280")
from datasets import load_dataset

In [2]:
class IMDBSentiClassifier(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.model = transformers.BertForSequenceClassification.from_pretrained('bert-base-uncased')
        self.loss = nn.CrossEntropyLoss()
        
        
        #Change dat
    def prepare_data(self):
        tokenizer = transformers.BertTokenizer.from_pretrained('bert-base-uncased')
        
        def _tokenize(x):
            #contains both text and encoded values
            x['input_ids'] = tokenizer.encode(
                    x['question'], 
                    max_length=32, 
                    pad_to_max_length=True)
            
            x['code_ids'] = tokenizer.encode(
                    x['answer'], 
                    max_length=32,
                    pad_to_max_length=True)
            
            return x
        
        def _prepare_ds(folder):
#             ds = nlp.load_dataset('neural_code_search',"evaluation_dataset", split=f'{folder}[:5%]')
            ds = load_dataset("neural_code_search", "evaluation_dataset")
            ds = ds.map(_tokenize)
            ds.set_format(type='torch', columns=['input_ids', 'code_ids'])
            
            return ds
        
        
        #self.train_ds['train']['question'][0]
        #self.train_ds['train']['answer'][0]
        self.train_ds, self.test_ds = map(_prepare_ds, ('train', 'test'))
        self.train_ds,self.test_ds = self.train_ds['train'],self.test_ds['train']
        
        
    def forward(self, input_ids, code_ids):
#         mask = (input_ids != 0).float()
#         logits = self.model(input_ids)
        code_vec = self.model(code_ids)
        nl_vec = self.model(input_ids)
        return code_vec[0],nl_vec[0]
    
    
    #Change
    def training_step(self, batch, batch_idx):
        code_vec,nl_vec = self.forward(batch['input_ids'], batch['code_ids'])
        
        scores=torch.einsum("ab,cb->ac",nl_vec,code_vec)
        loss = self.loss(scores, torch.arange(batch['code_ids'].size(0), device=scores.device))
        
        self.log('train_loss', loss)
        return {'loss':loss, 'log':{'train_loss':loss}}

    #Change
    def validation_step(self, batch, batch_idx):
        code_vec,nl_vec = self.forward(batch['input_ids'], batch['code_ids'])
        
        scores=torch.einsum("ab,cb->ac",nl_vec,code_vec)
        loss = self.loss(scores, torch.arange(batch['code_ids'].size(0), device=scores.device)) 
        #acc = (logits.argmax(-1)==batch['stackoverflow_id']).float()
        acc=1
        
        return {'loss':loss, 'acc':acc}
    
    
    def validation_epoch_end(self, outputs):
        
        loss = sum([o['loss'] for o in outputs])/len(outputs)
        acc = sum([o['acc'] for o in outputs], 0)/len(outputs)
        out = {'val_loss':loss, 'val_acc':acc}
        
        print(loss)
        self.log('val_loss', loss)
        return {**out, 'log':out}#appending dic **  
    
    
    
    def train_dataloader(self):
        return torch.utils.data.DataLoader(
                    self.train_ds,
                    batch_size=8,
                    drop_last=True,
                    shuffle=True
                )
    
    def val_dataloader(self):
        return torch.utils.data.DataLoader(
                    self.test_ds,
                    batch_size=8,
                    drop_last=False,
                    shuffle=False
                )
    
    
    def configure_optimizers(self):
        return torch.optim.SGD(
                    self.parameters(),
                    lr=1e-2,
                    momentum=.9
                )

In [3]:
def main():
    model = IMDBSentiClassifier() 
    
    trainer = pl.Trainer(
        default_root_dir='logs',
        gpus=(1 if torch.cuda.is_available() else 0),
        max_epochs=10,
#         logger=pl.loggers.WandbLogger(name='codebert-01', project="pytorchlightning")
    )
    
    trainer.fit(model)

In [4]:
main()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /home/local/ADILSTU/unadee2/.cache/huggingface/datasets/neural_code_search/evaluation_dataset/1.1.0/a704b7b979fa1e4914c3ea3e59a16d60d6c359d352ea65d033484360329107bc/cache-da2cf3ab22db799e.arrow
Reusing dataset neural_code_search (/home/local/ADILSTU/unadee2/.cache/huggingface/datasets/neural_code_search/evaluation_dataset/1.1.0/a704b7b979fa1e4914c3ea3e59a16d60d6c359d352ea65d033484360329107bc)


  0%|          | 0/1 [00:00<?, ?it/s]

Loading cached processed dataset at /home/local/ADILSTU/unadee2/.cache/huggingface/datasets/neural_code_search/evaluation_dataset/1.1.0/a704b7b979fa1e4914c3ea3e59a16d60d6c359d352ea65d033484360329107bc/cache-da2cf3ab22db799e.arrow
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name  | Type                          | Params
--------------------------------------------------------
0 | model | BertForSequenceClassification | 109 M 
1 | loss  | CrossEntropyLoss              | 0     
--------------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
437.935   Total estimated model params size (MB)


Validation sanity check: 0it [00:00, ?it/s]

tensor(2.0807, device='cuda:0')


  rank_zero_warn(
  rank_zero_warn(
  rank_zero_warn(


Training: -1it [00:00, ?it/s]



Validating: 0it [00:00, ?it/s]

tensor(2.0759, device='cuda:0')


Validating: 0it [00:00, ?it/s]

  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")
