# CodeReviewer Model Inference

Let's generate code reviews using the `microsoft/codereviewer` model {cite}`li2022codereviewer`.

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from tqdm.autonotebook import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import utils

## 1 Load data  

In [None]:
# CSV file with the review samples; it is read into `df` below and its path is
# reused at the end of the section to derive the predictions output path.
filename = "../data/JetBrains_kotlin_100.csv"

In [None]:
# Load the samples. `ReviewsDataset` reads the `human_review`, `diff_hunk`,
# `msg` and `src_file` columns of this frame.
df = pd.read_csv(filename)
# Preview the first rows.
df.head()

In [None]:
# Download the tokenizer published with the `microsoft/codereviewer` checkpoint
# from Hugging Face.
tokenizer = AutoTokenizer.from_pretrained("microsoft/codereviewer")

# Add the required special tokens to the tokenizer.
# NOTE(review): `predict` reads `tokenizer.pad_id`; presumably this helper is
# what sets it — confirm in `utils.process_tokenizer`.
tokenizer = utils.process_tokenizer(tokenizer)

In [None]:
class ReviewsDataset(Dataset):
    """Dataset pairing encoded diff inputs with their human-written reviews.

    Each item is a tuple ``(x, y)`` where ``x`` is the row of token ids that
    ``utils.encode_diff`` produced for one sample and ``y`` is that sample's
    ``human_review`` value.

    Args:
        df: Frame providing the ``diff_hunk``, ``msg``, ``src_file`` and
            ``human_review`` columns.
        tokenizer: Tokenizer forwarded to ``utils.encode_diff``.
    """

    def __init__(self, df: pd.DataFrame, tokenizer):
        # Kept as Series (with the frame's own index) because the notebook
        # later builds a results frame directly from these attributes.
        self.y = df["human_review"]
        self.code = df["diff_hunk"]

        # Encode column-wise into a plain list instead of handing the result
        # of ``df.apply(axis=1)`` to ``torch.tensor``: a Series is looked up
        # by label, so the old form depended on the frame having a default
        # 0..n-1 index.
        encoded = [
            utils.encode_diff(tokenizer, diff_hunk, msg, src_file)
            for diff_hunk, msg, src_file in zip(
                df["diff_hunk"], df["msg"], df["src_file"]
            )
        ]
        # NOTE(review): assumes every encoding has the same length, which
        # ``torch.tensor`` needs to build a rectangular tensor — confirm in
        # ``utils.encode_diff``.
        self.x = torch.tensor(encoded, dtype=torch.long).cpu()

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return ``(encoded_input, human_review)`` for the sample at position ``idx``."""
        # ``.iloc`` is positional; ``self.y[idx]`` would be a label lookup and
        # fail (or return the wrong row) when the index is not 0..n-1.
        return self.x[idx], self.y.iloc[idx]


In [None]:
# `ReviewsDataset` encodes every row of `df` in its constructor.
dataset = ReviewsDataset(df, tokenizer)
# Serve the samples in batches of 4. `shuffle` is left at the DataLoader
# default, and the results frame built later pairs predictions with
# `dataset.code` / `dataset.y` by position.
dataloader = DataLoader(dataset, batch_size=4)

In [None]:
def predict(model, dataloader, device='cuda'):
    """Generate a decoded prediction for every sample served by ``dataloader``.

    The model is moved to ``device`` and put in eval mode, then each batch is
    run through ``model.generate`` with beam search (5 beams, at most 512
    tokens, one returned sequence per input).

    Note:
        Reads the module-level ``tokenizer`` for the padding id
        (``tokenizer.pad_id``) and for decoding.

    Args:
        model: Model exposing ``to``, ``eval`` and ``generate``.
        dataloader: Iterable yielding ``(X, y)`` batches, where ``X`` is a
            tensor of input token ids; ``y`` is not used.
        device: Device the model and every batch are moved to.

    Returns:
        Flat list with one decoded string per input sample, in the order the
        batches were yielded.
    """
    model = model.to(device)
    model.eval()

    result = []

    for X, _ in tqdm(dataloader):
        # Attend only to positions that are not padding.
        inputs_mask = X.ne(tokenizer.pad_id)
        preds = model.generate(
            X.to(device),
            attention_mask=inputs_mask.to(device),
            use_cache=True,
            num_beams=5,
            early_stopping=True,
            max_length=512,
            num_return_sequences=1,
        )
        preds_np = preds.detach().cpu().numpy()
        # Decode row by row with a plain generator. ``np.apply_along_axis``
        # allocates its output with the string dtype of the FIRST decoded row,
        # which silently truncated every longer prediction in the batch.
        # NOTE(review): the first two generated ids are dropped before
        # decoding; presumably they are leading special tokens — confirm.
        result.extend(
            tokenizer.decode(
                row[2:],
                skip_special_tokens=True,
                clean_up_tokenization_spaces=False,
            )
            for row in preds_np
        )
    return result

## 2 Predict

### HuggingFace pre-trained checkpoint

In [None]:
# download the pretrained model from huggingface
hf_model = AutoModelForSeq2SeqLM.from_pretrained("microsoft/codereviewer")

In [None]:
# Generate reviews with the pre-trained checkpoint; `device` is left at
# `predict`'s default ('cuda').
preds = predict(hf_model, dataloader)

In [None]:
# Put the diff hunk, the human review and the model prediction side by side.
# `preds` is a plain list, so it is paired with the two Series by position.
df_pred = pd.DataFrame({'code': dataset.code, 'target': dataset.y, 'prediction': preds})

In [None]:
# Preview the first rows of the results.
df_pred.head()

In [None]:
# Save next to the input file: the input's `.csv` suffix is replaced with
# `.hf_pred.csv`.
df_pred.to_csv(Path(filename).with_suffix('.hf_pred.csv'))

### Fine-tuned CodeReviewer