### Imports

In [1]:
import os
import json
import regex as re
import pandas as pd

In [2]:
# Directory that holds the dataset split files.
file_path = "./dataset/Cleaned_CodeSearchNet/CodeSearchNet-Python/" # Change this to your own path

# One JSONL file per split.
train_filename, valid_filename, test_filename = (
    "train.jsonl",
    "valid.jsonl",
    "test.jsonl",
)

### Loading Data

In [26]:
def load_data(file_path, file_name, max_tokens=256):
    """Read one JSONL split and return parallel lists of code and docstrings.

    Each non-blank line of the file is parsed as a JSON object that must
    provide the keys 'code', 'docstring', 'code_tokens' and
    'docstring_tokens'. A record is kept only when both token lists are
    strictly shorter than ``max_tokens``.

    For every kept record the code is stripped of surrounding whitespace and
    every triple-quoted string literal (double- or single-quote style) is
    removed from it. Note that the pattern is purely textual: it removes any
    triple-quoted string, not only the function's leading docstring.

    Args:
        file_path: Directory containing the file; a trailing path separator
            is optional.
        file_name: Name of the JSONL file inside ``file_path``.
        max_tokens: Exclusive upper bound on the length of 'code_tokens' and
            'docstring_tokens' for a record to be kept. Defaults to 256.

    Returns:
        A tuple ``(code_corpus, docstrings_corpus)`` of two equally long
        lists of strings; entry i of each list comes from the same record.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks one of the required keys.
    """
    # Compiled once instead of on every record; non-greedy so that each
    # triple-quoted string is matched separately, [\s\S] so it spans newlines.
    docstring_pattern = re.compile(r'"""[\s\S]*?"""|\'\'\'[\s\S]*?\'\'\'')

    code_corpus = []
    docstrings_corpus = []

    # os.path.join accepts the directory with or without a trailing separator.
    # The encoding is pinned so the result does not depend on the platform's
    # default locale encoding.
    with open(os.path.join(file_path, file_name), 'r', encoding='utf-8') as handle:
        for line in handle:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue

            record = json.loads(line)
            if len(record['code_tokens']) < max_tokens and len(record['docstring_tokens']) < max_tokens:
                code_corpus.append(docstring_pattern.sub('', record['code'].strip()))
                docstrings_corpus.append(record['docstring'].strip())

    return code_corpus, docstrings_corpus


In [27]:
# Load every split; each call yields parallel lists of code and docstrings.
train_code_corpus, train_docstrings_corpus = load_data(
    file_path=file_path, file_name=train_filename
)
valid_code_corpus, valid_docstrings_corpus = load_data(
    file_path=file_path, file_name=valid_filename
)
test_code_corpus, test_docstrings_corpus = load_data(
    file_path=file_path, file_name=test_filename
)

### Creating the prompts used as training data for the model

In [13]:
def create_text_dataframe(code_string, docstring):
    """Build a DataFrame of code, docstring and the combined prompt text.

    Args:
        code_string: Sequence of code strings, one per example.
        docstring: Sequence of docstring strings, aligned with ``code_string``.

    Returns:
        A DataFrame with one row per example and the columns 'code',
        'docstring' and 'text', where 'text' is the fixed instruction prompt
        followed by the code under "### Instruction:" and the docstring under
        "### Response:".

    Raises:
        ValueError: If the two inputs do not have the same number of items.
    """
    codes = list(code_string)
    docs = list(docstring)

    # Fail loudly on misaligned inputs; padding the shorter side would pair
    # examples with missing values and produce unusable prompts.
    if len(codes) != len(docs):
        raise ValueError(
            f"code_string and docstring must have the same length "
            f"(got {len(codes)} and {len(docs)})"
        )

    # Build the frame column-wise instead of transposing a two-row frame.
    # dtype=object keeps plain Python strings in both columns.
    df = pd.DataFrame({'code': codes, 'docstring': docs}, dtype=object)

    prompt = "Below is the code of a Python function. Please write a short comment describing what the function does. \n\n"
    df['text'] = prompt + "### Instruction:\n" + df['code'] + "\n### Response:\n" + df['docstring']

    return df

In [24]:
# Turn each split into a DataFrame carrying the prompt text.
train = create_text_dataframe(
    code_string=train_code_corpus, docstring=train_docstrings_corpus
)
valid = create_text_dataframe(
    code_string=valid_code_corpus, docstring=valid_docstrings_corpus
)
test = create_text_dataframe(
    code_string=test_code_corpus, docstring=test_docstrings_corpus
)

### Saving the datasets to CSV

In [None]:
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output directory exists before writing (no-op when it already does).
output_dir = 'finetune_llama-v2/prompt_data'
os.makedirs(output_dir, exist_ok=True)

# Write each split as <split>.csv without the index column.
for split_name, split_df in (('train', train), ('test', test), ('valid', valid)):
    split_df.to_csv(os.path.join(output_dir, f'{split_name}.csv'), index=False)