# CodeReviewer Model Inference

Let's generate code reviews using the `microsoft/codereviewer` model {cite}`li2022codereviewer`.

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.autonotebook import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import utils

## 1 Tokenizers and Datasets

P.S. Many thanks to the authors of {cite}`p4vv37_codebert_2023` for providing the code for working with the tokenizer and the dataset.

In [None]:
# Path of the CSV file that is read with `pd.read_csv(filename)` for inspection.
# NOTE: the name `filename` is rebound later by the `for filename in filenames` loops.
filename = "../data/msg-test.csv"

In [None]:
df = pd.read_csv(filename)
# Replace missing values in the text columns with empty strings.
# The result is assigned back to the column on purpose: calling
# `df[col].fillna('', inplace=True)` mutates an intermediate object, is
# deprecated in pandas 2.x and leaves `df` unchanged when Copy-on-Write is on.
df['msg'] = df['msg'].fillna('')
df['src_file'] = df['src_file'].fillna('')
# Last expression of the cell: shows the first rows of the cleaned frame.
df.head()

In [None]:
# Fetch the pretrained CodeReviewer tokenizer from the Hugging Face hub and pass
# it straight through the project helper that adds the required special tokens.
tokenizer = utils.process_tokenizer(
    AutoTokenizer.from_pretrained("microsoft/codereviewer")
)

In [None]:
class ReviewsDataset(Dataset):
    """Dataset pairing an encoded diff hunk with its human-written review.

    Attributes (all built once in ``__init__``):
        y: the ``human_review`` column of the input frame (a pandas Series).
        code: the ``diff_hunk`` column of the input frame (a pandas Series).
        x: ``torch.long`` tensor holding the output of ``utils.encode_diff``
            for every diff hunk, in row order.
    """

    def __init__(self, df: pd.DataFrame, tokenizer):
        """Encode every diff hunk of ``df``.

        Args:
            df: Frame with ``human_review`` and ``diff_hunk`` columns.
            tokenizer: Tokenizer forwarded to ``utils.encode_diff``.
        """
        self.y = df["human_review"]
        self.code = df["diff_hunk"]
        # Iterate over the column values instead of feeding the result of
        # ``df.apply(..., axis=1)`` to ``torch.tensor``: that path depends on
        # the frame having a default 0..n-1 index.
        # NOTE(review): assumes encode_diff returns equal-length id sequences
        # so the rows stack into one 2-D tensor -- confirm against utils.
        encoded = [
            utils.encode_diff(tokenizer, diff_hunk, '', '')
            for diff_hunk in self.code
        ]
        self.x = torch.tensor(encoded, dtype=torch.long).cpu()

    def __len__(self):
        """Return the number of (diff hunk, review) pairs."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(encoded_diff, review)`` for the row at position ``idx``."""
        # ``self.x`` is indexed by position, so the Series must be too:
        # plain ``self.y[idx]`` is a label lookup and breaks (or returns the
        # wrong row) whenever the frame does not carry a default RangeIndex.
        return self.x[idx], self.y.iloc[idx]

## 2 Load data
Here we load the data and create a dataloader for each project.

In [None]:
# Input CSV files, one per evaluated dataset / project.
filenames = [
    '../data/msg-test.csv',
    'JetBrains_kotlin_1000.csv',
    'microsoft_vscode_1000.csv',
    'transloadit_uppy_1000.csv',
]

# Parallel lists: datasets[i] and dataloaders[i] both belong to filenames[i].
datasets, dataloaders = [], []
for filename in filenames:
    df = pd.read_csv(filename)
    dataset = ReviewsDataset(df, tokenizer)
    # shuffle=False keeps the batches in file order, so predictions can later be
    # lined up with the dataset rows.
    # batch_size=6 for 8GB GPU
    dataloader = DataLoader(dataset, batch_size=4, shuffle=False)
    datasets.append(dataset)
    dataloaders.append(dataloader)

## 3 Predict

Now we can generate code reviews for each project. We will use two models:
- Pre-trained model from HuggingFace provided by the authors of {cite}`li2022codereviewer`
- Model fine-tuned on the CodeReviewer dataset

### Predict function

In [None]:
def predict(model, dataloader, device='cuda'):
    """Generate one decoded prediction per dataset item using beam search.

    Relies on the module-level ``tokenizer`` for the padding id and for
    decoding the generated token ids.

    Args:
        model: Model exposing ``to``, ``eval`` and ``generate``.
        dataloader: Iterable yielding ``(X, y)`` batches; only ``X`` (the
            input token ids) is used here.
        device: Device the model and every batch are moved to.

    Returns:
        list[str]: Decoded predictions in the order the dataloader yields them.
    """
    model = model.to(device)
    model.eval()

    result = []

    for X, _ in tqdm(dataloader):
        # Attend only to positions that are not padding.
        inputs_mask = X.ne(tokenizer.pad_id)
        preds = model.generate(
            X.to(device),
            attention_mask=inputs_mask.to(device),
            use_cache=True,
            num_beams=5,
            early_stopping=True,
            max_length=512,
            num_return_sequences=1,
        )
        # Decode row by row into plain Python strings. Do NOT use
        # ``np.apply_along_axis`` for this: it allocates its output buffer with
        # the dtype of the first result (``<U{len(first string)}``), so every
        # later, longer prediction in the batch gets silently truncated.
        # NOTE(review): the first two generated ids are skipped, as before --
        # presumably start/task marker tokens; confirm against the model setup.
        result.extend(
            tokenizer.decode(
                ids[2:], skip_special_tokens=True, clean_up_tokenization_spaces=False
            )
            for ids in preds.detach().cpu().tolist()
        )
    return result

### HuggingFace pre-trained checkpoint

The model is available on the HuggingFace model hub: https://huggingface.co/microsoft/codereviewer

In [None]:
# download the pretrained model from huggingface
hf_model = AutoModelForSeq2SeqLM.from_pretrained("microsoft/codereviewer")

# Run inference on every dataset and store the predictions next to its input file.
for filename, dataset, dataloader in zip(filenames, datasets, dataloaders):
    preds = predict(hf_model, dataloader)
    df_pred = pd.DataFrame({'code': dataset.code, 'target': dataset.y, 'prediction': preds})
    # "<name>.csv" -> "<name>.hf_pred.csv" (the last suffix of the input path is replaced).
    df_pred.to_csv(Path(filename).with_suffix('.hf_pred.csv'))
    # A bare `df_pred.head()` inside a loop is evaluated and thrown away (only
    # the last top-level expression of a cell is displayed), so print it.
    print(df_pred.head())

### Fine-tuned CodeReviewer

I fine-tuned the model on the CodeReviewer dataset on the `msg` task using the [instructions](https://github.com/microsoft/CodeBERT/tree/master/CodeReviewer#3-finetuneinference) from the authors of {cite}`li2022codereviewer`.

For the fine-tuning I used the following parameters:
- `batch_size=6`
- `learning_rate=3e-4`
- `max_source_length=512`

The execution took about 12 hours on a single NVIDIA A100 GPU. The model was fine-tuned for 3 epochs.

I have made the checkpoint available on the HuggingFace model hub: https://huggingface.co/waleko/codereviewer-finetuned-msg

In [None]:
# download the fine-tuned model
ft_model = AutoModelForSeq2SeqLM.from_pretrained("waleko/codereviewer-finetuned-msg")

# Run inference on every dataset and store the predictions next to its input file.
for filename, dataset, dataloader in zip(filenames, datasets, dataloaders):
    preds = predict(ft_model, dataloader)
    df_pred = pd.DataFrame({'code': dataset.code, 'target': dataset.y, 'prediction': preds})
    # "<name>.csv" -> "<name>.finetuned_pred.csv" (the last suffix of the input path is replaced).
    df_pred.to_csv(Path(filename).with_suffix('.finetuned_pred.csv'))
    # A bare `df_pred.head()` inside a loop is evaluated and thrown away (only
    # the last top-level expression of a cell is displayed), so print it.
    print(df_pred.head())