In [1]:
from __future__ import absolute_import, division, print_function

import argparse
import glob
import logging
import os
import pickle
import random
import re
import shutil

import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler,TensorDataset
from torch.utils.data.distributed import DistributedSampler
import json


from tqdm import tqdm, trange
import multiprocessing
from model import Model
from transformers import (WEIGHTS_NAME, AdamW, get_linear_schedule_with_warmup,
                          RobertaConfig, RobertaForSequenceClassification, RobertaTokenizer)

In [2]:
# Tokenizer for the CodeBERT checkpoint; downloaded files are cached in the current directory.
tokenizer = RobertaTokenizer.from_pretrained(
    "microsoft/codebert-base",
    cache_dir='./',
)

In [4]:
# Peek at the first record of the validation split to see what a raw "code" field looks like.
# The encoding is pinned to UTF-8 (matching the JSONL -> CSV export cell) so the read does
# not depend on the platform's locale default.
with open("../dataset/valid_vdo.jsonl", encoding="UTF-8") as f:
    first_line = next(f, None)  # None when the file is empty

# Only the first line is needed, so the file is already closed before parsing.
if first_line is not None:
    js = json.loads(first_line.strip())
    print(js['code'])

diff --git a/token-metadata/0xc3761EB917CD790B30dAD99f6Cc5b4Ff93C4F9eA/metadata.json b/token-metadata/0xc3761EB917CD790B30dAD99f6Cc5b4Ff93C4F9eA/metadata.json "symbol": "ERC20",
"address": "0xc3761EB917CD790B30dAD99f6Cc5b4Ff93C4F9eA",
"decimals": 18,
- "dharmaVerificationStatus": {
"dharmaVerificationStatus": "VERIFIED"
}
\ No newline at end of file
-}
\ No newline at end of file



In [9]:
# Header line of the sample diff; split()/join collapses any run of whitespace to one space.
raw_header = "diff --git a/token-metadata/0xc3761EB917CD790B30dAD99f6Cc5b4Ff93C4F9eA/metadata.json b/token-metadata/0xc3761EB917CD790B30dAD99f6Cc5b4Ff93C4F9eA/metadata.json"
code = ' '.join(raw_header.split())

In [10]:
# Character length of the whitespace-normalized diff header built above.
len(code)

158

In [11]:
# Token count for the same string under the loaded tokenizer (compare with the character count).
len(tokenizer.tokenize(code))

83

In [14]:
# Keep at most 50 - 2 = 48 tokens, leaving two slots for the special tokens added afterwards.
all_tokens = tokenizer.tokenize(code)
code_tokens = all_tokens[:50 - 2]

In [15]:
# Wrap the truncated tokens with the tokenizer's CLS (start) and SEP (end) markers.
source_tokens = [tokenizer.cls_token, *code_tokens, tokenizer.sep_token]

In [16]:
# Confirm the final sequence length after adding the two special tokens.
len(source_tokens)

50

## Preprocess the JSONL file (export to CSV)

In [5]:
import json
import csv


def jsonl_to_csv(input_file, output_file, fields=('code', 'label')):
    """Convert a JSON Lines file into a CSV file with one column per field.

    Args:
        input_file: Path of the JSONL file to read (UTF-8, one JSON object per line).
        output_file: Path of the CSV file to create or overwrite (UTF-8).
        fields: Keys to extract from every record, in column order. A key that is
            missing from a record is written as an empty string.

    Returns:
        The number of data rows written (the header row is not counted).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If either file cannot be opened.
    """
    with open(input_file, 'r', encoding='UTF-8') as jsonl_file, \
            open(output_file, 'w', newline='', encoding='UTF-8') as csv_file:
        csv_writer = csv.writer(csv_file)

        # Header row
        csv_writer.writerow(fields)

        rows_written = 0
        for line in jsonl_file:
            line = line.strip()
            # Blank lines (e.g. a trailing newline at end of file) are not records;
            # passing them to json.loads would raise JSONDecodeError.
            if not line:
                continue
            json_data = json.loads(line)
            csv_writer.writerow([json_data.get(field, '') for field in fields])
            rows_written += 1

    return rows_written


# Define the input and output file paths
input_file = '../dataset/valid_vdo.jsonl'
output_file = '../dataset/valid_vdo.csv'

jsonl_to_csv(input_file, output_file)

print(f"Data has been successfully written to {output_file}")

Data has been successfully written to ../dataset/valid_vdo.csv


In [2]:
import pandas as pd
# Load the training split that was exported to CSV.
csv_file_path = '../dataset/train_vdo.csv'
df = pd.read_csv(filepath_or_buffer=csv_file_path)

In [3]:
# Show the first row to check the loaded columns (code, label).
df.iloc[0]

code     diff --git a/nerdamer.core.js b/nerdamer.core....
label                                                    0
Name: 0, dtype: object

In [5]:
from datasets import load_dataset
from transformers import AutoTokenizer, GPTNeoForSequenceClassification
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification
from torch.optim import AdamW
from transformers import get_scheduler
import torch
from tqdm.auto import tqdm
import evaluate

# Tunable settings, collected in one place instead of being repeated inline.
DATASET_NAME = "tyfann/vdo_format_classify"
TOKENIZER_NAME = "bigcode/tiny_starcoder_py"
MAX_LENGTH = 512  # tokens per example after padding/truncation
BATCH_SIZE = 8
SEED = 42  # fixes the shuffle order of the training DataLoader

# Load dataset
dataset = load_dataset(DATASET_NAME)
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_NAME, cache_dir='./')

# Register a pad token before tokenizing with padding="max_length".
tokenizer.add_special_tokens({'pad_token': '[PAD]'})


def tokenize_function(examples):
    """Tokenize the "code" field of a batch, padding/truncating to MAX_LENGTH tokens.

    Args:
        examples: Batch passed in by ``dataset.map(..., batched=True)``; only its
            "code" entry is read.

    Returns:
        Whatever ``tokenizer(...)`` returns for the batch.
    """
    return tokenizer(examples["code"], padding="max_length", truncation=True, max_length=MAX_LENGTH)


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Data preprocessing: drop the raw text, rename the target column to "labels",
# and have the dataset return torch tensors.
tokenized_datasets = tokenized_datasets.remove_columns(["code"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")

# small_train_dataset = tokenized_datasets["train"].shuffle(seed=42).select(range(1000))
# small_eval_dataset = tokenized_datasets["test"].shuffle(seed=42).select(range(1000))


# Create DataLoader objects
train_dataloader = DataLoader(
    tokenized_datasets["train"],
    shuffle=True,
    batch_size=BATCH_SIZE,
    # A dedicated seeded generator makes the shuffle order reproducible across runs.
    generator=torch.Generator().manual_seed(SEED),
)
eval_dataloader = DataLoader(tokenized_datasets["test"], batch_size=BATCH_SIZE)

Map:   0%|          | 0/3065 [00:00<?, ? examples/s]

In [4]:
# Load the sequence-classification model from the local checkpoint directory (15 labels).
model = AutoModelForSequenceClassification.from_pretrained(
    "./finetuned_model",
    num_labels=15,
)

In [7]:
# Use the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPTBigCodeForSequenceClassification(
  (transformer): GPTBigCodeModel(
    (wte): Embedding(49153, 768)
    (wpe): Embedding(8192, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-19): 20 x GPTBigCodeBlock(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPTBigCodeSdpaAttention(
          (c_attn): Linear(in_features=768, out_features=896, bias=True)
          (c_proj): Linear(in_features=768, out_features=768, bias=True)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPTBigCodeMLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=True)
          (c_proj): Linear(in_features=3072, out_features=768, bias=True)
          (act): PytorchGELUTanh()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,),

In [8]:
# Accumulate accuracy over the evaluation DataLoader, one batch at a time.
metric = evaluate.load("accuracy")
model.eval()

# Gradients are not needed for scoring, so the whole loop runs under no_grad.
with torch.no_grad():
    for batch in eval_dataloader:
        # Move every tensor of the batch to the same device as the model.
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        # Predicted class = index of the largest logit along the last dimension.
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

  sdpa_result = torch.nn.functional.scaled_dot_product_attention(


{'accuracy': 0.2540160642570281}

In [30]:

tokenizer.add_special_tokens({'pad_token': '[PAD]'})


def tokenize_function(examples):
    """Tokenize the "code" field of a batch, padded/truncated to 512 tokens."""
    encoded = tokenizer(
        examples["code"],
        max_length=512,
        truncation=True,
        padding="max_length",
    )
    return encoded


tokenized_datasets = dataset.map(tokenize_function, batched=True)

# Data preprocessing: the raw "code" column is removed in a separate cell.

dataset record example: 7


Map:   0%|          | 0/27530 [00:00<?, ? examples/s]

Map:   0%|          | 0/3065 [00:00<?, ? examples/s]

Map:   0%|          | 0/1992 [00:00<?, ? examples/s]

In [31]:
# Drop the raw "code" text column from the tokenized dataset.
# NOTE(review): this rebinds tokenized_datasets in place, so the cell is presumably not
# re-runnable once the column is gone — confirm against the datasets API.
tokenized_datasets = tokenized_datasets.remove_columns(["code"])

In [37]:
# Inspect the id the tokenizer assigned to the '[PAD]' token registered earlier.
tokenizer.pad_token_id

49152

In [34]:
# Sanity check: length of one tokenized training example (tokenized with max_length=512 above).
len(tokenized_datasets['train'][1]['input_ids'])

512