<a href="https://colab.research.google.com/github/veerumehta/FineTuner/blob/main/run_dataload_template.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
# Fetch the sample inputs: the translation cell further down reads files from "colab/data/code".
# NOTE(review): the recorded output of this cell shows the clone failing with
# "could not read Username", so that directory may be missing on a fresh run - confirm repo access.
!git clone https://github.com/soodrohit/colab.git
# transformers is pinned to an exact version; sentencepiece and Timer are not pinned.
%pip install transformers==4.26.1
%pip install sentencepiece
# NOTE(review): no visible cell imports Timer (timing uses time.perf_counter) - confirm it is needed.
%pip install Timer

Cloning into 'colab'...
fatal: could not read Username for 'https://github.com': No such device or address
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import torch
import os
import time

os.environ["HF_ENDPOINT"] = "https://huggingface.co"
from typing import List
from typing import Any
from dataclasses import dataclass
from transformers import PLBartTokenizer, PLBartForConditionalGeneration


# Maps a plain language name to the "__<name>__" spelling that
# convert_lang_code_special_format returns for it.
FAIRSEQ_LANGUAGE_CODES_MAP = {
    name: f"__{name}__"
    for name in ("java", "python", "en_XX", "javascript", "php", "ruby", "go")
}

# Model identifier per source language; do_translation indexes this mapping
# with the request's src_lang, so only the languages listed here can be used
# as a source language.
LANGUAGE_MODEL_MAP = dict(
    java="uclanlp/plbart-java-en_XX",
    python="uclanlp/plbart-python-en_XX",
    en_XX="uclanlp/plbart-multi_task-strong",
)

@dataclass
class TranslateInfo:
    """Record of one translation request: its input, its output and timing.

    Only ``input_file`` is required. Every other field starts at an empty
    default and is filled in later by the code that performs the request, so
    a freshly created instance can be printed or compared without raising
    ``AttributeError``.
    """

    input_file: str            # path of the file the input text is read from
    input_text: str = ""       # text sent to the model
    input_length: int = 0      # len(input_text)
    output_text: str = ""      # text produced by the model
    output_length: int = 0     # len(output_text)
    model_name: str = ""       # model identifier used for the request
    src_lang: str = ""         # source language code
    tgt_lang: str = ""         # target language code
    time_taken_ms: float = 0.0  # elapsed time of the request in milliseconds

    def __iter__(self):
        """Yield ``(key, value)`` pairs so that ``dict(info)`` works.

        The pairs are exactly the items of ``to_json()``.
        """
        yield from self.to_json().items()

    def to_json(self):
        """Return the record as a plain ``dict``.

        Note that the text fields are exported under the keys ``"input"`` and
        ``"output"`` rather than under their attribute names.
        """
        return {
            "input_file": self.input_file,
            "input": self.input_text,
            # Previously "output_length" was listed twice and "output" was
            # missing, so the generated text never reached the export.
            "output": self.output_text,
            "input_length": self.input_length,
            "output_length": self.output_length,
            "model_name": self.model_name,
            "src_lang": self.src_lang,
            "tgt_lang": self.tgt_lang,
            "time_taken_ms": self.time_taken_ms,
        }

def load_code_text(input_file: str, separator: str = "") -> str:
    """Read a source file and return its lines stripped and joined into one string.

    Carriage returns are removed from every line and surrounding whitespace
    is stripped (so indentation is lost); the pieces are then joined with
    ``separator``.

    Args:
        input_file: Path of the file to read. It is decoded as UTF-8 so the
            result does not depend on the platform's default encoding.
        separator: Text placed between consecutive lines. The default empty
            string keeps the historical output, which fuses adjacent lines
            (``else`` followed by ``System.out...`` becomes
            ``elseSystem.out...``). Pass a space or a newline to keep the
            tokens of neighbouring lines apart.

    Returns:
        The joined text; an empty string for an empty file.
    """
    with open(input_file, encoding="utf-8") as f:
        # One join instead of repeated string concatenation in a loop.
        return separator.join(line.replace("\r", "").strip() for line in f)

def convert_lang_code_special_format(lang: str) -> str:
    """Return the entry of FAIRSEQ_LANGUAGE_CODES_MAP for ``lang``.

    A code that has no entry in the map is returned unchanged.
    """
    # dict.get with the input itself as the fallback leaves unknown codes untouched.
    return FAIRSEQ_LANGUAGE_CODES_MAP.get(lang, lang)

def load_model_and_tokenizer(model_name_or_path, src_lang, tgt_lang):
    """Load the PLBart tokenizer and model for ``model_name_or_path``.

    Args:
        model_name_or_path: Passed to both ``from_pretrained`` calls.
        src_lang: Forwarded to the tokenizer as ``src_lang``.
        tgt_lang: Forwarded to the tokenizer as ``tgt_lang``.

    Returns:
        A ``(model, tokenizer)`` tuple - note the model comes first.
    """
    plbart_tokenizer = PLBartTokenizer.from_pretrained(
        model_name_or_path,
        src_lang=src_lang,
        tgt_lang=tgt_lang,
    )
    plbart_model = PLBartForConditionalGeneration.from_pretrained(model_name_or_path)
    return plbart_model, plbart_tokenizer

def translate(
        model_name_or_path,
        input_sequences,
        src_lang=None,
        tgt_lang=None,
        max_generation_length=128,
        num_beams=10,
        num_return_sequences=1
):
    """Generate text for ``input_sequences`` and return the decoded results.

    The model and tokenizer are obtained from ``load_model_and_tokenizer`` on
    every call.

    Args:
        model_name_or_path: Identifier forwarded to ``load_model_and_tokenizer``.
        input_sequences: Input handed to the tokenizer, which is asked for
            padded PyTorch tensors.
        src_lang: When truthy, also assigned to ``tokenizer.src_lang``.
        tgt_lang: When truthy, converted with
            ``convert_lang_code_special_format`` and looked up in
            ``tokenizer.lang_code_to_id`` to get the decoder start token id;
            otherwise ``None`` is passed as the start token id.
        max_generation_length: Forwarded to ``generate`` as ``max_length``.
        num_beams: Forwarded to ``generate``.
        num_return_sequences: Forwarded to ``generate``.

    Returns:
        The result of ``tokenizer.batch_decode`` on the generated ids, with
        special tokens skipped.
    """
    model, tokenizer = load_model_and_tokenizer(model_name_or_path, src_lang, tgt_lang)
    if src_lang:
        tokenizer.src_lang = src_lang

    if tgt_lang:
        target_code = convert_lang_code_special_format(tgt_lang)
        start_token_id = tokenizer.lang_code_to_id[target_code]
    else:
        start_token_id = None

    generation_options = {
        "decoder_start_token_id": start_token_id,
        "max_length": max_generation_length,
        "num_beams": num_beams,
        "num_return_sequences": num_return_sequences,
    }
    encoded_inputs = tokenizer(input_sequences, return_tensors='pt', padding=True)
    generated_ids = model.generate(**encoded_inputs, **generation_options)
    return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

def do_translation(request_info: TranslateInfo) -> TranslateInfo:
    """Run one request end to end and store the results on ``request_info``.

    Reads ``src_lang``, ``tgt_lang`` and ``input_file`` from the request,
    picks the model from LANGUAGE_MODEL_MAP by source language, loads the
    input text, prints it, and calls ``translate``. The generated sequences
    are joined with newlines.

    Args:
        request_info: Request with ``src_lang``, ``tgt_lang`` and
            ``input_file`` already set. It is updated in place.

    Returns:
        The same ``request_info`` object, with ``model_name``, ``input_text``,
        ``input_length``, ``output_text`` and ``output_length`` filled in.

    Raises:
        KeyError: If ``src_lang`` has no entry in LANGUAGE_MODEL_MAP.
    """
    src_lang = request_info.src_lang
    tgt_lang = request_info.tgt_lang
    input_file = request_info.input_file

    request_info.model_name = LANGUAGE_MODEL_MAP[src_lang]

    source_text = load_code_text(input_file)
    request_info.input_text = source_text
    request_info.input_length = len(source_text)
    print(f"Input code for summarization: {request_info.input_text}")

    generated = translate(request_info.model_name, [source_text], src_lang, tgt_lang)
    joined_output = "\n".join(generated)
    request_info.output_text = joined_output
    request_info.output_length = len(joined_output)
    return request_info

def get_code_files(path: str) -> list:
    """Return the paths of all files below ``path``, in a stable sorted order.

    ``os.walk`` yields directories and files in arbitrary order, which made
    the processing order (and therefore the log) differ between runs. Here
    sub-directories are visited in sorted order and the files of each
    directory are listed in sorted order.

    Args:
        path: Root directory to search recursively.

    Returns:
        A list of file paths, each built as ``os.path.join(root, name)``.
        Empty when ``path`` contains no files or does not exist.
    """
    code_files = []
    for root, dir_names, file_names in os.walk(path):
        # Sorting in place makes os.walk descend in this order (top-down walk).
        dir_names.sort()
        code_files.extend(os.path.join(root, name) for name in sorted(file_names))
    return code_files

In [3]:
def write_perf_output(requests: list, run_file_name: str = "sample_data/test.log") -> str:
    """Append one '^'-delimited row per request to the performance log.

    Changes compared with the earlier hand-joined version:
      * the header row is written only when the log is new or empty, instead
        of once per call;
      * rows no longer end with a stray '^' before the newline;
      * fields that contain '^', a double quote or a line break are quoted by
        the ``csv`` module, so they no longer break the row structure. Read
        the log back with ``csv.reader(f, delimiter="^")``;
      * the parent directory of the log is created when missing.

    Args:
        requests: Objects exposing ``input_file``, ``time_taken_ms``,
            ``input_length``, ``model_name``, ``output_text``,
            ``output_length``, ``src_lang``, ``tgt_lang`` and ``input_text``.
        run_file_name: Path of the log file; defaults to the path that was
            previously hard-coded.

    Returns:
        ``run_file_name``.
    """
    import csv  # local import so the function works without editing the import cell

    header = [
        "input_file_name",
        "time_taken_ms",
        "input_length",
        "model_name",
        "output_text",
        "output_length",
        "src_lang",
        "tgt_lang",
        "input_text",
    ]

    log_dir = os.path.dirname(run_file_name)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    # Decide before opening: append mode creates the file if it is missing.
    needs_header = (
        not os.path.exists(run_file_name) or os.path.getsize(run_file_name) == 0
    )

    # newline="" lets the csv writer control line endings itself.
    with open(run_file_name, mode="a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, delimiter="^", lineterminator="\n")
        if needs_header:
            writer.writerow(header)
        for info in requests:
            writer.writerow(
                [
                    info.input_file,
                    str(info.time_taken_ms),
                    str(info.input_length),
                    info.model_name,
                    info.output_text,
                    str(info.output_length),
                    info.src_lang,
                    info.tgt_lang,
                    info.input_text,
                ]
            )
    return run_file_name

In [None]:
# Summarize every file found under code_file_path and record per-file timing.
translate_requests = []
code_file_path = "colab/data/code"
tgt_lang = "en_XX"
fname = get_code_files(code_file_path)
for input_file in fname:
  request_info = TranslateInfo(input_file)
  # The name of the file's parent directory is used as the source language;
  # do_translation looks that name up in LANGUAGE_MODEL_MAP.
  request_info.src_lang = os.path.basename(os.path.dirname(input_file))
  request_info.tgt_lang = tgt_lang
  request_info.time_taken_ms = 0
  # The timed span covers all of do_translation: loading the model and
  # tokenizer, reading the file, and generation.
  start_time = time.perf_counter()
  request_info = do_translation(request_info)
  end_time = time.perf_counter()
  # perf_counter differences are in seconds; convert to milliseconds.
  request_info.time_taken_ms = (end_time - start_time)*1000
  translate_requests.append(request_info)

print(f"output logged in: {write_perf_output(translate_requests)}")

Input code for summarization: public class Largest {public static void main(String[] args) {double n1 = -4.5, n2 = 3.9, n3 = 2.5;if( n1 >= n2 && n1 >= n3)System.out.println(n1 + " is the largest number.");else if (n2 >= n1 && n2 >= n3)System.out.println(n2 + " is the largest number.");elseSystem.out.println(n3 + " is the largest number.");}}
Input code for summarization: # Python Program to calculate the square root# Note: change this value for a different resultnum = 8# To take the input from the user#num = float(input('Enter a number: '))num_sqrt = num ** 0.5print('The square root of %0.3f is %0.3f'%(num ,num_sqrt))
output logged in: sample_data/test.log


In [None]:
!cat sample_data/test.log

# CodeT5 Integration

In [5]:
# Summarize a Python snippet: the tokenizer is loaded from 'Salesforce/codet5-base'
# and the model weights from 'Salesforce/codet5-base-multi-sum'.
from transformers import RobertaTokenizer, T5ForConditionalGeneration
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base-multi-sum')
text = """
# Python Program to calculate the square root

# Note: change this value for a different result
num = 8 

# To take the input from the user
#num = float(input('Enter a number: '))

num_sqrt = num ** 0.5
print('The square root of %0.3f is %0.3f'%(num ,num_sqrt))
"""

input_ids = tokenizer(text, return_tensors="pt").input_ids

# max_length=20 is passed to generate; the recorded output of this cell stops
# mid-sentence, presumably because of this limit.
generated_ids = model.generate(input_ids, max_length=20)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/902 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

This function is a utility function to calculate the square root of a n - tuple in a


In [6]:
# Generate code with 'Salesforce/codet5-large-ntp-py' from a comment plus a
# function signature. The comment line appears adapted from the summary
# printed by the previous cell.
from transformers import AutoTokenizer, T5ForConditionalGeneration
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-large-ntp-py")
model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-large-ntp-py")
text = """#This function is a utility function to calculate the square root of a n. 
           def calculate_sq_root_number(number: int):
             <extra_id_0>"""
input_ids = tokenizer(text, return_tensors="pt").input_ids

# Generate a single sequence; only generated_ids[0] is decoded and printed.
generated_ids = model.generate(input_ids, max_length=128)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/511k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/11.3k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.48G [00:00<?, ?B/s]


                return int(sqrt(number))



In [7]:
# Variation of the first summarization cell: the "Python Program to calculate
# the square root" header line is dropped, the result variable is named
# num_out, and the printed message says "The output for".
# The tokenizer and model are reloaded here with the same identifiers as before.
from transformers import RobertaTokenizer, T5ForConditionalGeneration
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base-multi-sum')
text = """
# Note: change this value for a different result
num = 8 

# To take the input from the user
#num = float(input('Enter a number: '))

num_out = num ** 0.5
print('The output for %0.3f is %0.3f'%(num ,num_out))
"""

input_ids = tokenizer(text, return_tensors="pt").input_ids

generated_ids = model.generate(input_ids, max_length=20)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

This function takes the input from the user and checks if the user has a missing value in


In [8]:
# Variation: no header line, the result variable is named num_sqrt, and the
# printed message says "The output for".
from transformers import RobertaTokenizer, T5ForConditionalGeneration
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base-multi-sum')
text = """
# Note: change this value for a different result
num = 8 

# To take the input from the user
#num = float(input('Enter a number: '))

num_sqrt = num ** 0.5
print('The output for %0.3f is %0.3f'%(num ,num_sqrt))
"""

input_ids = tokenizer(text, return_tensors="pt").input_ids

generated_ids = model.generate(input_ids, max_length=20)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

This function takes the input from the user and asks the user to enter the number of


In [9]:
# Variation: no header line, the result variable is named num_sqrt, and the
# printed message says "The square root of".
from transformers import RobertaTokenizer, T5ForConditionalGeneration
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base-multi-sum')
text = """
# Note: change this value for a different result
num = 8 

# To take the input from the user
#num = float(input('Enter a number: '))

num_sqrt = num ** 0.5
print('The square root of %0.3f is %0.3f'%(num ,num_sqrt))
"""

input_ids = tokenizer(text, return_tensors="pt").input_ids

generated_ids = model.generate(input_ids, max_length=20)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

This function takes the input from the user and checks if the user has the neccess


In [10]:
# Variation: the "Python Program to calculate the square root" header line is
# kept, the result variable is named num_out, and the printed message says
# "The output for".
from transformers import RobertaTokenizer, T5ForConditionalGeneration
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base-multi-sum')
text = """
# Python Program to calculate the square root

# Note: change this value for a different result
num = 8 

# To take the input from the user
#num = float(input('Enter a number: '))

num_out = num ** 0.5
print('The output for %0.3f is %0.3f'%(num ,num_out))
"""

input_ids = tokenizer(text, return_tensors="pt").input_ids

generated_ids = model.generate(input_ids, max_length=20)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

This function is a wrapper around the square root
num function that takes a user input and


In [11]:
# Generate code from a comment-only prompt (no function signature) with
# 'Salesforce/codet5-large-ntp-py'. The comment appears adapted from the
# summary printed by the previous cell.
from transformers import AutoTokenizer, T5ForConditionalGeneration
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-large-ntp-py")
model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-large-ntp-py")
text = """#This function is a wrapper around the square root num function that takes a user input. 
             <extra_id_0>"""
input_ids = tokenizer(text, return_tensors="pt").input_ids

# Generate a single sequence, here with max_length=250.
generated_ids = model.generate(input_ids, max_length=250)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

:
#https://www.hackerrank.com/challenges/square-root-num/problem

def square_root_num(num):
    return int(math.sqrt(num))

def main():
    num = int(input("Enter a number: "))
    print(square_root_num(num))

main()



In [12]:
# Round trip: summarize the code printed by the previous generation cell
# (its leading ":" line and URL comment are not included in the input).
from transformers import RobertaTokenizer, T5ForConditionalGeneration
tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base-multi-sum')
text = """
def square_root_num(num):
    return int(math.sqrt(num))

def main():
    num = int(input("Enter a number: "))
    print(square_root_num(num))

main()
"""

input_ids = tokenizer(text, return_tensors="pt").input_ids

generated_ids = model.generate(input_ids, max_length=20)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

Calculate square root number and print it if it is not in range.


In [13]:
# Round trip in the other direction: the prompt comment is the summary
# printed by the previous cell.
from transformers import AutoTokenizer, T5ForConditionalGeneration
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-large-ntp-py")
model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-large-ntp-py")
text = """#Calculate square root number and print it if it is not in range. 
             <extra_id_0>"""
input_ids = tokenizer(text, return_tensors="pt").input_ids

# Generate a single sequence; only generated_ids[0] is decoded and printed.
generated_ids = model.generate(input_ids, max_length=250)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

:

def squareRoot(n):
    if n == 0:
        return 0
    if n == 1:
        return 1
    if n % 2 == 0:
        return squareRoot(n/2) + 1
    else:
        return squareRoot(n/2) + squareRoot(n%2)

print(squareRoot(100))



In [14]:
# Same experiment as the previous cell with a shorter prompt: the
# "if it is not in range" clause is removed.
# NOTE(review): the recorded output of this cell returns n * n and uses a
# Python 2 style print statement, i.e. it does not compute a square root.
from transformers import AutoTokenizer, T5ForConditionalGeneration
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codet5-large-ntp-py")
model = T5ForConditionalGeneration.from_pretrained("Salesforce/codet5-large-ntp-py")
text = """#Calculate square root number and print it. 
             <extra_id_0>"""
input_ids = tokenizer(text, return_tensors="pt").input_ids

# Generate a single sequence; only generated_ids[0] is decoded and printed.
generated_ids = model.generate(input_ids, max_length=250)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))

:

def squareRoot(n):
	if n == 0:
		return 0
	else:
		return n * n

print squareRoot(100)



# Salesforce CodeGen Integration

In [None]:
# Complete a function header with the causal language model
# 'Salesforce/codegen-350M-multi'. No output is recorded for this cell.
from transformers import AutoTokenizer, AutoModelForCausalLM
tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-350M-multi")
model = AutoModelForCausalLM.from_pretrained("Salesforce/codegen-350M-multi")

text = "def hello_world():"
input_ids = tokenizer(text, return_tensors="pt").input_ids

# max_length=128 is passed to generate; only generated_ids[0] is decoded and printed.
generated_ids = model.generate(input_ids, max_length=128)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))