<a href="https://colab.research.google.com/github/uDivy/CodeLings/blob/development/CodeLings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports


In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
[0mCollecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.9 MB/s[0m eta [36m0:

In [8]:
# Import necessary libraries
import tensorflow as tf
from transformers import AutoModel, AutoTokenizer
from datasets import load_dataset
import pandas as pd
import numpy as np
from tqdm import tqdm
import torch

# Set up project



In [9]:
# Step 1: Set up the pre-trained model and tokenizer
checkpoint = "Salesforce/codet5p-220m-bimodal"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True)
model = AutoModel.from_pretrained(checkpoint, trust_remote_code=True).to(device)

In [10]:
# Step 2: Download nl2bash-custom dataset
dataset = load_dataset("AnishJoshi/nl2bash-custom")

# Step 3: Explore the dataset
# Convert the dataset to pandas DataFrame for exploration
df_train = pd.DataFrame(dataset['train'])
df_valid = pd.DataFrame(dataset['validation'])
df_test = pd.DataFrame(dataset['test'])

# Print dataset structure
print("Dataset structure:")
print(df_train.info())

# Print size of each split
print(f"Train size: {len(df_train)}")
print(f"Validation size: {len(df_valid)}")
print(f"Test size: {len(df_test)}")

Repo card metadata block was not found. Setting CardData to empty.


Dataset structure:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19658 entries, 0 to 19657
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   bash_code   19658 non-null  object
 1   nl_command  19658 non-null  object
 2   srno        19658 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 460.9+ KB
None
Train size: 19658
Validation size: 2457
Test size: 2458


In [30]:
# Step 4: Define Mean Reciprocal Rank (MRR) metric
def mrr_score(y_true, y_pred):
    """
    Calculate Mean Reciprocal Rank (MRR) given the true and predicted labels.
    Args:
        y_true (list of str): List of true target values (bash commands).
        y_pred (list of str): List of predicted target values (bash commands).
    Returns:
        float: MRR score.
    """
    mrr_total = 0.0
    for true, pred in zip(y_true, y_pred):
        # Convert true and pred to tokenized form
        true_tokenized = tokenizer(true, return_tensors="pt").input_ids.to(device)
        pred_tokenized = tokenizer(pred, return_tensors="pt").input_ids.to(device)

        # Find the minimum length between the two sequences
        min_len = min(true_tokenized.size(1), pred_tokenized.size(1))

        # Compare token sequences up to the minimum length
        rank = (true_tokenized[:, :min_len] == pred_tokenized[:, :min_len]).nonzero(as_tuple=True)[1] + 1

        if len(rank) > 0:
            mrr_total += 1 / rank[0].item()

    return mrr_total / len(y_true)

# Step 5: Evaluate the pre-trained model on the dataset using MRR
def evaluate_model(dataset, model, tokenizer):
    """
    Evaluate the model on the dataset using Mean Reciprocal Rank (MRR).
    Args:
        dataset: Dataset to evaluate the model on.
        model: Pre-trained model.
        tokenizer: Pre-trained tokenizer.
    Returns:
        MRR score.
    """
    y_true = []
    y_pred = []

    # Iterate through the first 50 samples in the dataset
    for index, row in tqdm(dataset.iterrows(), total=len(dataset)):
        input_text = row['nl_command']
        true_output = row['bash_code']

        # Tokenize the input and generate predictions
        input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
        outputs = model.generate(input_ids)
        predicted_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

        y_true.append(true_output)
        y_pred.append(predicted_output)

        if len(y_true) >= 50:  # Restrict to the first 50 test cases
            break

    # Calculate MRR
    mrr = mrr_score(y_true, y_pred)
    return mrr

# Step 6: Evaluate on the test set (limit to the first 50 rows)
test_dataset = df_test.head(50)  # Restrict to first 50 samples

mrr_result = evaluate_model(test_dataset, model, tokenizer)
print(f"\nMRR on the first 50 test cases: {mrr_result:.4f}")

 98%|█████████▊| 49/50 [01:21<00:01,  1.66s/it]

MRR on the first 50 test cases: 1.0000





In [29]:
import random

# Function to randomly pick an nl_command, generate the bash_code from the model, and compare with the original
def random_nl_command_evaluation(test_dataset, model, tokenizer):
    # Choose a random index from the test dataset
    random_index = 1600 # random.randint(0, len(test_dataset) - 1)

    # Extract the nl_command and the corresponding original bash_code
    random_sample = test_dataset.iloc[random_index]
    input_text = random_sample['nl_command']
    original_bash_code = random_sample['bash_code']

    # Tokenize the input nl_command and generate the bash_code using the model
    input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to(device)
    outputs = model.generate(input_ids)
    generated_bash_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Print the original and generated bash_code
    print(f"Random NL Command: {input_text}")
    print(f"Original Bash Code: {original_bash_code}")
    print(f"Generated Bash Code: {generated_bash_code}")

# Call the function with the test dataset
random_nl_command_evaluation(df_test, model, tokenizer)



Random NL Command: Search all regular files in the current directory for "example"
Original Bash Code: find -maxdepth 1 -type f | xargs grep -F 'example'
Generated Bash Code: def search_example ( self, example_path ) : for path in os. listdir
