In [None]:
# Mount drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Globals
SourceCodePath = "https://github.com/networkx/networkx"
DataStorePath = "/content/drive/MyDrive/NLP-Project/data"
ModelStorePath = "/content/drive/MyDrive/NLP-Project/finetuned-models"
PretrainedPath = "/content/drive/MyDrive/NLP-Project/pretrained"

In [None]:
# Wrapper classes

from transformers import RobertaTokenizer, T5ForConditionalGeneration, AutoTokenizer
import torch
import logging

pretrained = lambda lang : f"{PretrainedPath}/summarize_{lang}_codet5_base.bin"

class T5Model:
    """
    Loads the codeT5 base multilingual summarization model.
    Additionally, if a language name is provided, checks if pretrained weights for that langauges is available and loads it.
    """

    def __init__(self, lang = None):
        self.tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
        self.model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base-multi-sum')

        logging.info("[T5Model] Loaded Salesforce/codet5-base-multi-sum and tokenizer")

        if lang != None:
            try:
                self.model.load_state_dict(torch.load(pretrained(lang)))
                logging.info(f"Loaded model for {lang}")
            except Exception as e:
                logging.error(e)

    def predict(self,text):
        logging.debug(f"[{self.__class__.__name__}] Predicting for sequence: \n{text}")

        input_ids = self.tokenizer(text, return_tensors="pt").input_ids

        generated_ids = self.model.generate(input_ids, max_length=512)
        return self.tokenizer.decode(generated_ids[0], skip_special_tokens=True)


class T5_plus(T5Model):
    """
    Load the T5_plus model
    """

    def __init__(self):
        checkpoint = "Salesforce/codet5p-220m-py"
        self.tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        self.model = T5ForConditionalGeneration.from_pretrained(checkpoint)

        logging.info(f"[T5_plus] Loaded model and tokenizer at {checkpoint}")


In [None]:
class T5Model_Pretrained(T5Model):
    """
    Load a T5Model from a checkpoint
    """
    def __init__(self, model_path):
        self.tokenizer = RobertaTokenizer.from_pretrained(model_path)
        self.model = T5ForConditionalGeneration.from_pretrained(model_path)

        logging.info(f"[T5Model_Pretrained] Loaded model and tokenizer at {model_path}")

# Task 1: Docstring generation


Training the base codeT5 model on the netowrkx dataset

In [None]:
import torch

import json
import logging
import os
import sys
import time

FORMAT = "[%(levelname)-8s][%(asctime)s][%(filename)s:%(lineno)s - %(funcName)13s()] %(message)s"
logging.basicConfig(format=FORMAT, stream=sys.stdout, encoding='utf-8', level=logging.DEBUG)


def TokenizePair(data_pair, tokenizer):

    input_text = f"Docstring Gen: {data_pair['code']} </s>"
    target_text = f"{data_pair['docstring']} </s>"

    encoded = tokenizer(
        input_text,
        # pad_to_max_length=True,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

    decoded = tokenizer(
        target_text,
        # pad_to_max_length=True,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

    input_ids = encoded.input_ids.flatten()
    output_ids = decoded.input_ids.flatten()

    return {
        'input_ids': input_ids,
        'labels' : output_ids
    }

# Loading dataset
class CustomIterableDataset(torch.utils.data.IterableDataset):
    def __init__(self, file_paths, tokenizer):
        self.file_paths = file_paths
        self.tokenizer = tokenizer

    def __iter__(self):
        for file_path in self.file_paths:
            logging.debug(f"loading {file_path}")
            with open(file_path, 'r') as file:
                data = json.load(file)
                for data_pair in data:
                    tokenizedPair = TokenizePair(data_pair, self.tokenizer)
                    tokenizedPair["file"] = file_path
                    yield tokenizedPair


logging.info("Loading model")
t5model = T5Model()

logging.info("Defining dataset")
jsons = [f"{DataStorePath}/{f}" for f in os.listdir(DataStorePath)]
train_dataset = CustomIterableDataset(jsons, t5model.tokenizer)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1)


model = t5model.model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

logging.info("Training")
# Fine-tuning the model
num_epochs = 10
for epoch in range(num_epochs):
    for batch in train_loader:
      try:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"File {batch['file']} Loss : {loss.item()}")
        logging.debug(f"File {batch['file']} Loss : {loss.item()}")
      except Exception as e:
        logging.error(e)
      except:
        logging.error("Didn't catch that")

    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {loss.item()}")
    logging.info(f"Epoch {epoch+1}/{num_epochs} | Loss: {loss.item()}")

    timestamp = time.time()
    model.save_pretrained(f"{ModelStorePath}/fine_tuned_model.{timestamp}")
    t5model.tokenizer.save_pretrained(f"{ModelStorePath}/fine_tuned_model.{timestamp}")

logging.info("Saving")
# Save the model after training
model.save_pretrained(f"{ModelStorePath}/fine_tuned_model")
t5model.tokenizer.save_pretrained(f"{ModelStorePath}/fine_tuned_model")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
File ['/content/drive/MyDrive/NLP-Project/data/algorithms.json'] Loss : 0.06389087438583374
File ['/content/drive/MyDrive/NLP-Project/data/algorithms.json'] Loss : 0.08874723315238953
File ['/content/drive/MyDrive/NLP-Project/data/algorithms.json'] Loss : 0.00207601860165596
File ['/content/drive/MyDrive/NLP-Project/data/algorithms.json'] Loss : 0.006429915316402912
File ['/content/drive/MyDrive/NLP-Project/data/algorithms.json'] Loss : 0.003044565673917532
File ['/content/drive/MyDrive/NLP-Project/data/algorithms.json'] Loss : 0.005995127372443676
File ['/content/drive/MyDrive/NLP-Project/data/algorithms.json'] Loss : 0.0015464125899598002
File ['/content/drive/MyDrive/NLP-Project/data/algorithms.json'] Loss : 0.004033270757645369
File ['/content/drive/MyDrive/NLP-Project/data/algorithms.json'] Loss : 0.035808417946100235
File ['/content/drive/MyDrive/NLP-Project/data/algorithms.json'] Loss : 0.013969729654490948
File ['

('/content/drive/MyDrive/NLP-Project/finetuned-models/fine_tuned_model/tokenizer_config.json',
 '/content/drive/MyDrive/NLP-Project/finetuned-models/fine_tuned_model/special_tokens_map.json',
 '/content/drive/MyDrive/NLP-Project/finetuned-models/fine_tuned_model/vocab.json',
 '/content/drive/MyDrive/NLP-Project/finetuned-models/fine_tuned_model/merges.txt',
 '/content/drive/MyDrive/NLP-Project/finetuned-models/fine_tuned_model/added_tokens.json')

Training the fine tuned codeT5 (fine tuned on the downstream task of generating python docstrings using function names) model on the networkx dataset

In [None]:
import torch

import json
import logging
import os
import sys
import time

FORMAT = "[%(levelname)-8s][%(asctime)s][%(filename)s:%(lineno)s - %(funcName)13s()] %(message)s"
logging.basicConfig(format=FORMAT, stream=sys.stdout, encoding='utf-8', level=logging.DEBUG)
modelStorePath = ModelStorePath + "/python"

def TokenizePair(data_pair, tokenizer):

    input_text = f"Docstring Gen: {data_pair['code']} </s>"
    target_text = f"{data_pair['docstring']} </s>"

    encoded = tokenizer(
        input_text,
        # pad_to_max_length=True,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

    decoded = tokenizer(
        target_text,
        # pad_to_max_length=True,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

    input_ids = encoded.input_ids.flatten()
    output_ids = decoded.input_ids.flatten()

    return {
        'input_ids': input_ids,
        'labels' : output_ids
    }

# Loading dataset
class CustomIterableDataset(torch.utils.data.IterableDataset):
    def __init__(self, file_paths, tokenizer):
        self.file_paths = file_paths
        self.tokenizer = tokenizer

    def __iter__(self):
        for file_path in self.file_paths:
            logging.debug(f"loading {file_path}")
            with open(file_path, 'r') as file:
                data = json.load(file)
                for data_pair in data:
                    tokenizedPair = TokenizePair(data_pair, self.tokenizer)
                    tokenizedPair["file"] = file_path
                    yield tokenizedPair


logging.info("Loading python downstream model")
t5model = T5Model('python') # This is the downstream model for python summarization

logging.info("Defining dataset")
jsons = [f"{DataStorePath}/{f}" for f in os.listdir(DataStorePath)]
train_dataset = CustomIterableDataset(jsons, t5model.tokenizer)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1)


model = t5model.model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

logging.info("Training")
# Fine-tuning the model
num_epochs = 10
for epoch in range(num_epochs):
    for batch in train_loader:
      try:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"File {batch['file']} Loss : {loss.item()}")
        logging.debug(f"File {batch['file']} Loss : {loss.item()}")
      except Exception as e:
        logging.error(e)
      except:
        logging.error("Didn't catch that")

    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {loss.item()}")
    logging.info(f"Epoch {epoch+1}/{num_epochs} | Loss: {loss.item()}")

    timestamp = time.time()
    model.save_pretrained(f"{modelStorePath}/fine_tuned_model.{timestamp}")
    t5model.tokenizer.save_pretrained(f"{modelStorePath}/fine_tuned_model.{timestamp}")

logging.info("Saving")
# Save the model after training
model.save_pretrained(f"{modelStorePath}/fine_tuned_model")
t5model.tokenizer.save_pretrained(f"{modelStorePath}/fine_tuned_model")

File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.0030279899947345257
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.0008142722072079778
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.1275753527879715
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.003021589247509837
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.04297548532485962
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.017853016033768654
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.02168833650648594
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.05207350105047226
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.0005080531118437648
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.00045068396138958633
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.04355548322200775
File ['/content/drive/

('/content/drive/MyDrive/NLP-Project/finetuned-models/python/fine_tuned_model/tokenizer_config.json',
 '/content/drive/MyDrive/NLP-Project/finetuned-models/python/fine_tuned_model/special_tokens_map.json',
 '/content/drive/MyDrive/NLP-Project/finetuned-models/python/fine_tuned_model/vocab.json',
 '/content/drive/MyDrive/NLP-Project/finetuned-models/python/fine_tuned_model/merges.txt',
 '/content/drive/MyDrive/NLP-Project/finetuned-models/python/fine_tuned_model/added_tokens.json')

In [None]:
## Testing using examples

t5_python_model = T5Model('python')
finalModel_base = T5Model_Pretrained(ModelStorePath + "/fine_tuned_model")
finalModel_python = T5Model_Pretrained(ModelStorePath + "/python/fine_tuned_model")

inp = "@not_implemented_for('multigraph')\n@nx._dispatch\ndef peak_number(G, cores=None):\n    if nx.number_of_selfloops(G) > 0:\n        msg = 'Input graph has self loops which is not permitted; Consider using G.remove_edges_from(nx.selfloop_edges(G)).'\n        raise NetworkXError(msg)\n    if cores is None:\n        cores = nx.core_number(G)\n    H = G.copy()\n    peak_number = {}\n    while H.nodes():\n        core_degeneracy = nx.k_core(H, core_number=cores)\n        peak_number.update({node: cores[node] for node in core_degeneracy})\n        H.remove_nodes_from(core_degeneracy.nodes())\n        cores = nx.core_number(H)\n    return peak_number"
out_t5 = t5_python_model.predict("Docstring Gen: " + inp)
out_base = finalModel_base.predict("Docstring Gen: " + inp)
out_python = finalModel_python.predict("Docstring Gen: " + inp)

print(out_t5)
print("---------------------------------")
print("---------------------------------")
print("---------------------------------")
print(out_base)
print("--------------------------------")
print("--------------------------------")
print("--------------------------------")
print(out_python)


ERROR:root:Attempting to deserialize object on a CUDA device but torch.cuda.is_available() is False. If you are running on a CPU-only machine, please use torch.load with map_location=torch.device('cpu') to map your storages to the CPU.


Compute peak number of nodes in a network.
---------------------------------
---------------------------------
---------------------------------
Returns the peak number of each node.

The peak number of a node in a graph is

.. math::

   A connected graph with no self loops should
be computed by adding the self loops.

Parameters
----------
G : NetworkX graph

cores : dictionary (default=None)
    Precomputed core numbers for the graph. If None, it will be
    computed.

Returns
-------
Pal number
    The peak number of all nodes in the graph.

Raises
------
NetworkXError
    If `G` is a multigraph or if
    `cores` is None.

 
--------------------------------
--------------------------------
--------------------------------
Returns the number of edges that disconnects G.

A number of edges is a subset of the nodes of the graph which consists
of self loops. A self loop consists of self loops on either the left or right,
other nodes of the graph. A self loop consists of
the nodes of th

# Task 2: Code generation from Docstring

In [None]:
import torch

import json
import logging
import os
import sys
import time

FORMAT = "[%(levelname)-8s][%(asctime)s][%(filename)s:%(lineno)s - %(funcName)13s()] %(message)s"
logging.basicConfig(format=FORMAT, stream=sys.stdout, encoding='utf-8', level=logging.DEBUG)
modelStorePath = ModelStorePath + "/generation-from-docstring"


def TokenizePair(data_pair, tokenizer):

    input_text = f"Code Gen: {data_pair['docstring']} </s>"
    target_text = f"{data_pair['code']} </s>"

    encoded = tokenizer(
        input_text,
        # pad_to_max_length=True,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

    decoded = tokenizer(
        target_text,
        # pad_to_max_length=True,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

    input_ids = encoded.input_ids.flatten()
    output_ids = decoded.input_ids.flatten()

    return {
        'input_ids': input_ids,
        'labels' : output_ids
    }

# Loading dataset
class CustomIterableDataset(torch.utils.data.IterableDataset):
    def __init__(self, file_paths, tokenizer):
        self.file_paths = file_paths
        self.tokenizer = tokenizer

    def __iter__(self):
        for file_path in self.file_paths:
            logging.debug(f"loading {file_path}")
            with open(file_path, 'r') as file:
                data = json.load(file)
                for data_pair in data:
                    tokenizedPair = TokenizePair(data_pair, self.tokenizer)
                    tokenizedPair["file"] = file_path
                    yield tokenizedPair


logging.info("Loading model")
t5model = T5Model()

logging.info("Defining dataset")
jsons = [f"{DataStorePath}/{f}" for f in os.listdir(DataStorePath)]
train_dataset = CustomIterableDataset(jsons, t5model.tokenizer)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1)


model = t5model.model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

logging.info("Training")
# Fine-tuning the model
num_epochs = 5
for epoch in range(num_epochs):
    for batch in train_loader:
      try:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"File {batch['file']} Loss : {loss.item()}")
        logging.debug(f"File {batch['file']} Loss : {loss.item()}")
      except Exception as e:
        logging.error(e)
      except:
        logging.error("Didn't catch that")

    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {loss.item()}")
    logging.info(f"Epoch {epoch+1}/{num_epochs} | Loss: {loss.item()}")

    timestamp = time.time()
    model.save_pretrained(f"{modelStorePath}/fine_tuned_model.{timestamp}")
    t5model.tokenizer.save_pretrained(f"{modelStorePath}/fine_tuned_model.{timestamp}")

logging.info("Saving")
# Save the model after training
model.save_pretrained(f"{modelStorePath}/fine_tuned_model")
t5model.tokenizer.save_pretrained(f"{modelStorePath}/fine_tuned_model")

File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 3.406301259994507
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 1.4591442346572876
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 5.467596054077148
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 5.8901872634887695
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 2.303309679031372
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 1.7118998765945435
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 2.641272783279419
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.7780472636222839
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 1.1538182497024536
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.4273897409439087
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.4280437231063843
File ['/content/drive/MyDrive/NLP-Project/data/

In [None]:
## Testing using examples

t5_python_model = T5Model()
finalModel_base = T5Model_Pretrained(ModelStorePath + "/generation-from-docstring/fine_tuned_model.1714661974.8295977")

inp = "Returns the peak number for each vertex.\n\nA k-peak of a graph is a maximal subgraph that contains nodes of degree k or more within each other.\n\nParameters\n----------\nG : NetworkX graph\n   A graph or directed graph\ncores : dictionary, optional\n  Precomputed core numbers for the graph G.\n\nReturns\n-------\npeak_number : dictionary\n   A dictionary keyed by node to the peak number.\n\nRaises\n------\nNetworkXError\n    The k-peak is not implemented for graphs with self loops\n    or parallel edges.\n\n"
out_t5 = t5_python_model.predict("Generate Python: " + inp)
out_base = finalModel_base.predict("Code Gen: " + inp)

print(out_t5)
print("---------------------------------")
print("---------------------------------")
print("---------------------------------")
print(out_base)
print("--------------------------------")
print("--------------------------------")
print("--------------------------------")


Returns the peak number for each node in a graph.
---------------------------------
---------------------------------
---------------------------------
def k_peak(G, cores=None):
    if G.is_directed()!= nx.is_directed():
        msg = 'Input graph is not directed and is not strongly connected.'
        raise NetworkXError(msg)
    if G.is_directed()!= nx.is_directed()!= nx.is_directed()!= nx.is_directed()!= nx.is_directed()!= nx.is_directed()!= nx.is_connected(G):
        msg = 'Input graph is not connected.'
        raise NetworkXError(msg)
    return {v: 0 for v in G.nodes() if v in G.degree() > 0} 
--------------------------------
--------------------------------
--------------------------------


# Task 3: Next line prediction

Training the base T5 model on the the downstream task of next line prediction

In [None]:
import torch

import json
import logging
import os
import sys
import time

FORMAT = "[%(levelname)-8s][%(asctime)s][%(filename)s:%(lineno)s - %(funcName)13s()] %(message)s"
logging.basicConfig(format=FORMAT, stream=sys.stdout, encoding='utf-8', level=logging.DEBUG)
modelStorePath = ModelStorePath + "/generation"

def TokenizePair(data_pair, tokenizer):

  sentances = data_pair['code'].split("\n")

  for i in range(1, len(sentances)):
    sen = "\n".join(sentances[:i])
    input_text = f"Code Gen: {sen} </s>"
    target_text = f"{sentances[i]} </s>"

    encoded = tokenizer(
        input_text,
        # pad_to_max_length=True,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

    decoded = tokenizer(
        target_text,
        # pad_to_max_length=True,
        padding='max_length',
        truncation=True,
        max_length=512,
        return_tensors='pt'
    )

    input_ids = encoded.input_ids.flatten()
    output_ids = decoded.input_ids.flatten()

    yield {
        'input_ids': input_ids,
        'labels' : output_ids
    }

# Loading dataset
class CustomIterableDataset(torch.utils.data.IterableDataset):
    def __init__(self, file_paths, tokenizer):
        self.file_paths = file_paths
        self.tokenizer = tokenizer

    def __iter__(self):
        for file_path in self.file_paths:
            logging.debug(f"loading {file_path}")
            with open(file_path, 'r') as file:
                data = json.load(file)
                for data_pair in data:
                    for tokenizedPair in TokenizePair(data_pair, self.tokenizer):
                      tokenizedPair["file"] = file_path
                      yield tokenizedPair


logging.info("Loading model")
t5model = T5Model()

logging.info("Defining dataset")
jsons = [f"{DataStorePath}/{f}" for f in os.listdir(DataStorePath)]
train_dataset = CustomIterableDataset(jsons, t5model.tokenizer)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1)


model = t5model.model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

logging.info("Training")
# Fine-tuning the model
num_epochs = 3
for epoch in range(num_epochs):
    for batch in train_loader:
      try:
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        outputs = model(
            input_ids=input_ids,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()

        print(f"File {batch['file']} Loss : {loss.item()}")
        logging.debug(f"File {batch['file']} Loss : {loss.item()}")
      except Exception as e:
        logging.error(e)
      except:
        logging.error("Didn't catch that")

    print(f"Epoch {epoch+1}/{num_epochs} | Loss: {loss.item()}")
    logging.info(f"Epoch {epoch+1}/{num_epochs} | Loss: {loss.item()}")

    timestamp = time.time()
    model.save_pretrained(f"{modelStorePath}/fine_tuned_model.{timestamp}")
    t5model.tokenizer.save_pretrained(f"{modelStorePath}/fine_tuned_model.{timestamp}")

logging.info("Saving")
# Save the model after training
model.save_pretrained(f"{modelStorePath}/fine_tuned_model")
t5model.tokenizer.save_pretrained(f"{modelStorePath}/fine_tuned_model")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.48k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/703k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/294k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/12.5k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/902 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
File ['/content/drive/MyDrive/NLP-Project/data/drawing.json'] Loss : 0.07427261024713516
File ['/content/drive/MyDrive/NLP-Project/data/drawing.json'] Loss : 0.014675409533083439
File ['/content/drive/MyDrive/NLP-Project/data/drawing.json'] Loss : 0.023393414914608
File ['/content/drive/MyDrive/NLP-Project/data/drawing.json'] Loss : 0.02324634976685047
File ['/content/drive/MyDrive/NLP-Project/data/drawing.json'] Loss : 0.04883069172501564
File ['/content/drive/MyDrive/NLP-Project/data/drawing.json'] Loss : 0.06446563452482224
File ['/content/drive/MyDrive/NLP-Project/data/drawing.json'] Loss : 0.022862732410430908
File ['/content/drive/MyDrive/NLP-Project/data/drawing.json'] Loss : 0.1108323335647583
File ['/content/drive/MyDrive/NLP-Project/data/drawing.json'] Loss : 0.04748929664492607
File ['/content/drive/MyDrive/NLP-Project/data/drawing.json'] Loss : 0.03626523166894913
File ['/content/drive/MyDrive/NLP-Project/data

ERROR:root:Didn't catch that


File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.03741416335105896
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.035471394658088684
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.11493727564811707
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.03782917186617851
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.2267647236585617
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.051949143409729004
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.016503777354955673
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.06898091733455658
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.10922448337078094
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.015458476729691029
File ['/content/drive/MyDrive/NLP-Project/data/linalg.json'] Loss : 0.0632893294095993
File ['/content/drive/MyDrive/

In [None]:
## Testing using examples

finalModel = T5Model_Pretrained(ModelStorePath + "/generation/fine_tuned_model.1714589427.6643722")

input_str = "@not_implemented_for('multigraph')\n@nx._dispatch\ndef peak_number(G, cores=None):\n    if nx.number_of_selfloops(G) > 0:\n        msg = 'Input graph has self loops which is not permitted; Consider using G.remove_edges_from(nx.selfloop_edges(G)).'\n        raise NetworkXError(msg)\n    if cores is None:\n        cores = nx.core_number(G)\n    H = G.copy()\n    peak_number = {}\n    while H.nodes():\n        core_degeneracy = nx.k_core(H, core_number=cores)\n        peak_number.update({node: cores[node] for node in core_degeneracy})\n        H.remove_nodes_from(core_degeneracy.nodes())\n        cores = nx.core_number(H)\n    return peak_number"
inputs = input_str.split("\n")
for i in range(1,len(inputs)):
  print(finalModel.predict("Code Gen: " + "\n".join(inputs[:i])))

def edges(G, k, d): 
def edges(G, u, v, k, k, k, k, k, k, d): 
    if G.is_directed(): 
        return 0 
        raise NetworkXError(msg) 
    return 0 
    return 0 
    return nx.utils.powerlaw_sum(G.edges, data='name')) 
    H.add_edge(0, 1, weight=1) 
    for u, v in edges: 
        peak_number = 0 
        if G.is_multigraph(): 
    return peak_number 
    return H 
    return (0, 1, 'edge1'), 'edge1', 'edge2', 'edge3', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'edge4', 'ed

In [None]:
# Attempt BLEU score

import json
import os
import sys

final_model = "/content/drive/MyDrive/NLP-Project/finetuned/fine_tuned_model.1702233587.7906475"

from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction


model = T5Model_Pretrained(final_model)

dataset = []

jsons = [f"{DataStorePath}/{f}" for f in os.listdir(DataStorePath)]
for js in jsons:
    if js.endswith(".json"):
        with open(js) as f:
            data = json.load(f)
            for pair in data:
                dataset.append(pair)

size = len(dataset)
print(size)
inputs = [data["code"] for data in dataset]
expected_output = [data["docstring"].split() for data in dataset]
calculated_output = []
for i,s in enumerate(inputs):
  prediction = model.predict(s).split()
  print(f"{i}/{size} : {prediction[:6]}" )
  calculated_output.append(prediction)

bleu_score = corpus_bleu(expected_output, calculated_output, smoothing_function=SmoothingFunction().method1)

print(bleu_score)

2090
0/2090 : ['Preprocess', 'the', 'graph', 'to', 'add', 'edges']


Token indices sequence length is longer than the specified maximum sequence length for this model (629 > 512). Running this sequence through the model will result in indexing errors


1/2090 : ['Estimate', 'the', 'Fiedler', 'vector', 'using', 'the']
2/2090 : ['Find', 'the', 'Fiedler', 'vector', 'using', 'the']
3/2090 : ['Returns', 'a', 'function', 'that', 'solves', 'the']
4/2090 : ['Returns', 'the', 'algebraic', 'connectivity', 'of', 'an']
5/2090 : ['Find', 'the', 'Fiedler', 'vector.', 'The', 'Fiedler']
6/2090 : ['Compute', 'a', 'random', 'simple', 'ordering', 'of']
7/2090 : ['Fiedler', 'vector', 'to', 'bisection', 'graph.', 'This']
8/2090 : ['Make', 'X', 'orthogonal', 'to', 'the', 'nullspace']
9/2090 : ['Make', 'X', 'orthogonal', 'to', 'the', 'nullspace']
10/2090 : ['Returns', 'a', 'function', 'that', 'returns', 'a']
11/2090 : ['Returns', 'a', 'function', 'that', 'returns', 'a']
12/2090 : ['Returns', 'a', 'numpy', 'array', 'using', 'attributes']
13/2090 : ['Returns', 'a', 'SciPy', 'sparse', 'array', 'using']
14/2090 : ['Returns', 'the', 'Bethe', 'Hessian', 'matrix', 'of']
15/2090 : ['Returns', 'incidence', 'matrix', 'of', 'graph', 'G.']
16/2090 : ['Returns', 'adjac