In [1]:
#preprocessing for training pipeline
import numpy as np
import pandas as pd
from transformers import *
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from datasets import Dataset
import pickle
from utils.helper import read_py150k_code, read_file_to_string
import regex as re



In [2]:
target_features = [
     'snake_case_var_ratio',
     'snake_case_class_ratio',
     'snake_case_method_ratio',
     'upper_camel_case_var_ratio',
     'upper_camel_case_class_ratio',
     'upper_camel_case_method_ratio',
     'lower_camel_case_var_ratio',
     'lower_camel_case_class_ratio',
     'lower_camel_case_method_ratio',
     'func_decorators_avg',
     'class_decorators_avg',
     'class_parents_avg',
     'comprehensions_avg',
     'generators_avg',
     'lambda_avg',
     'comment_density',
     'ds_density',
]
PY150K_DIR = "data/py150"
PY150K_CODE_DIR = "data/py150/py150_files"
PY150K_TRAIN_AST = "data/py150/python100k_train.json"
PY150K_EVAL_AST = "data/py150/python50k_eval.json"
PY150K_TRAIN_CODE = "data/py150/py150_files/python100k_train.txt"
PY150K_EVAL_CODE = "data/py150/py150_files/python50k_eval.txt"

In [3]:
py150k_df = pd.read_csv("data/py150k_metric_20220524.csv")
bigquery_df = pd.read_csv("data/bigquery_metric_20220526.csv")
bq_content_df = pd.read_csv(f'data/BigQuery/files/cubert_metadata000000000000') 

combined_df = pd.concat([py150k_df, bigquery_df], axis = 0) 
combined_df['file'] = [ x + y  for x, y in zip(combined_df['repository'], combined_df['filepath'])]
bq_content_df['file'] = [ x + y  for x, y in zip(bq_content_df['repository'], bq_content_df['filepath'])]

In [4]:
with open("data/combined_dataset/clusters/feature_set_1/full_feature_clusterer.pickle", "rb") as file:
    cluster_pred = pickle.load(file)
    labels = cluster_pred.labels_
    cluster_num = len(np.unique(labels))
combined_df['labels'] = labels

In [5]:
#get code


code_filenames = read_py150k_code(PY150K_TRAIN_CODE)
#print ((code_filenames[1]))
py150_code = []
for i in range(0, len(code_filenames)):
    #print (i)
    try:
        py150_code.append( read_file_to_string( #regex codefilenames drop data/py150/py150_files/data/ 
            f"{PY150K_CODE_DIR}/{code_filenames[i]}"
        ))
    except: # currently appending empty string for empy files
        print (f"{PY150K_CODE_DIR}/{code_filenames[i]}")
        py150_code.append( "File Error"
        )
ex_files = list(combined_df['file'])
filtered_bq = bq_content_df[bq_content_df['file'].isin(ex_files)]
bigquery_code = list(filtered_bq['content'])

data/py150/py150_files/data/PaulSec/twittor/implant.py
data/py150/py150_files/data/alexandrebarachant/Grasp-and-lift-EEG-challenge/preprocessing/aux.py
data/py150/py150_files/data/pimutils/khal/khal/khalendar/aux.py
data/py150/py150_files/data/Akagi201/learning-python/socket/backdoor_fud/backdoor_fud.py
data/py150/py150_files/data/ummahusla/Codecademy-Exercise-Answers/Language Skills/Python/Unit 6/Student Becomes the Teacher/Just Average/9-How is everything doing?.py
data/py150/py150_files/data/ummahusla/Codecademy-Exercise-Answers/Language Skills/Python/Unit 7/2-Battleship!/Hit or Miss?/12-Bad Aim.py
data/py150/py150_files/data/pimutils/khal/tests/aux.py
data/py150/py150_files/data/ummahusla/Codecademy-Exercise-Answers/Language Skills/Python/Unit 7/2-Battleship!/Hit or Miss?/10-You Win!.py


In [6]:
#hugging face dataset

print (len(filtered_bq))
combined_code = py150_code + bigquery_code

combined_df['content'] = combined_code
display(combined_df)

115050


Unnamed: 0,line_count,comment_count,comment_total_len,comment_avg_len,comment_density,id_total,lower_case,id_total_var,lower_case_var,snake_case_ratio,...,repository,filepath,forks,issue_events,stars,parse_error,func_async_count,file,labels,content
0,116.0,2.0,44.0,22.000000,0.017241,1.0,1.0,1.0,1.0,0.000000,...,,,,,,,,,19,#!/usr/bin/env python\n# -*- coding: utf-8 -*-...
1,363.0,24.0,1244.0,51.833333,0.066116,381.0,214.0,333.0,195.0,0.341207,...,,,,,,,,,-1,# -*- coding: utf-8 -*-\n# Open Source Initiat...
2,13.0,1.0,21.0,21.000000,0.076923,3.0,3.0,3.0,3.0,0.000000,...,,,,,,,,,19,"#!/usr/bin/env python\n""""""Django's command lin..."
3,34.0,0.0,0.0,0.000000,0.000000,26.0,18.0,26.0,18.0,0.307692,...,,,,,,,,,16,"""""""Installer for hippybot\n""""""\n\nimport os\nc..."
4,11.0,1.0,21.0,21.000000,0.090909,3.0,3.0,3.0,3.0,0.000000,...,,,,,,,,,19,#!/usr/bin/env python\nimport os\nimport sys\n...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
115045,1292.0,201.0,7829.0,38.950249,0.155573,1658.0,1533.0,1646.0,1527.0,0.056695,...,oscarbranson/latools,latools/helpers/plot.py,11.0,77.0,9.0,1.0,,oscarbranson/latoolslatools/helpers/plot.py,-1,"""""""\nPlotting functions.\n\n(c) Oscar Branson ..."
115046,194.0,17.0,633.0,37.235294,0.087629,63.0,42.0,57.0,41.0,0.126984,...,erjac77/ansible-module-f5bigip,library/f5bigip_ltm_monitor_snmp_dca.py,5.0,72.0,6.0,1.0,,erjac77/ansible-module-f5bigiplibrary/f5bigip_...,-1,#!/usr/bin/python\n# -*- coding: utf-8 -*-\n#\...
115047,145.0,1.0,23.0,23.000000,0.006897,111.0,76.0,99.0,76.0,0.270270,...,python-hyper/hyper-h2,test/test_config.py,139.0,376.0,754.0,1.0,,python-hyper/hyper-h2test/test_config.py,-1,"# -*- coding: utf-8 -*-\n""""""\ntest_config\n~~~..."
115048,48.0,0.0,,0.000000,0.000000,53.0,44.0,47.0,44.0,0.094340,...,techbureau/zaifbot,tests/trade/test_trade.py,14.0,198.0,42.0,1.0,,techbureau/zaifbottests/trade/test_trade.py,-1,import unittest\nfrom zaifbot.trade.trade impo...


In [7]:
#df to dataset
dataset = Dataset.from_pandas(combined_df).train_test_split(test_size=0.2)



In [8]:
from transformers import RobertaTokenizer, T5ForConditionalGeneration, PLBartTokenizer

tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')

tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="python" )
#return_tensors ='pt'


loading file https://huggingface.co/Salesforce/codet5-base/resolve/main/vocab.json from cache at C:\Users\km201/.cache\huggingface\transformers\1e2aacf615bc83f25a9d748eccb762b335eee01a29ab7a8db9b8e86cc851d489.9a48c5abf25554713c6513ab01066e53569b9a2da0d6189715951cf7c6288805
loading file https://huggingface.co/Salesforce/codet5-base/resolve/main/merges.txt from cache at C:\Users\km201/.cache\huggingface\transformers\7eaa9b856402f05e8fdd452951872ecd3c2692ea9abb86b7ab62b07e3bc5f7de.7179059568f1a130b0a79e4bac71f38545207cab0ec45ce82ca09afadb2649a3
loading file https://huggingface.co/Salesforce/codet5-base/resolve/main/added_tokens.json from cache at C:\Users\km201/.cache\huggingface\transformers\a3e93db547e41cdd21f01826d07c5679e111b02d8e969c607611c30a6acbe191.5cc6e825eb228a7a5cfd27cb4d7151e97a79fb962b31aaf1813aa102e746584b
loading file https://huggingface.co/Salesforce/codet5-base/resolve/main/special_tokens_map.json from cache at C:\Users\km201/.cache\huggingface\transformers\5941df5e4315c5

In [9]:

tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
def tokenization(example):
    #print (tokenizer(example["content"], padding='max_length', truncation=True).keys())
    return tokenizer(example["content"], padding='max_length', truncation=True)
train_dataset = dataset["train"].map(tokenization, batched=True)
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
train_dataset.format['type']

test_dataset = dataset["test"].map(tokenization, batched=True)
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.format['type']



loading file https://huggingface.co/Salesforce/codet5-base/resolve/main/vocab.json from cache at C:\Users\km201/.cache\huggingface\transformers\1e2aacf615bc83f25a9d748eccb762b335eee01a29ab7a8db9b8e86cc851d489.9a48c5abf25554713c6513ab01066e53569b9a2da0d6189715951cf7c6288805
loading file https://huggingface.co/Salesforce/codet5-base/resolve/main/merges.txt from cache at C:\Users\km201/.cache\huggingface\transformers\7eaa9b856402f05e8fdd452951872ecd3c2692ea9abb86b7ab62b07e3bc5f7de.7179059568f1a130b0a79e4bac71f38545207cab0ec45ce82ca09afadb2649a3
loading file https://huggingface.co/Salesforce/codet5-base/resolve/main/added_tokens.json from cache at C:\Users\km201/.cache\huggingface\transformers\a3e93db547e41cdd21f01826d07c5679e111b02d8e969c607611c30a6acbe191.5cc6e825eb228a7a5cfd27cb4d7151e97a79fb962b31aaf1813aa102e746584b
loading file https://huggingface.co/Salesforce/codet5-base/resolve/main/special_tokens_map.json from cache at C:\Users\km201/.cache\huggingface\transformers\5941df5e4315c5

'torch'

In [10]:
print ((train_dataset[0]))
train_dataset.save_to_disk("datasets/codet5_train.hf")
test_dataset.save_to_disk("datasets/codet5_test.hf")

{'labels': tensor(20), 'input_ids': tensor([    1,     7, 14848,    30,  7718,    17,    28,   203,   203,  8395,
          203,   565, 24314, 10211,  8392,  1491,   203,   203,   565,   411,
           84,    34,  1986, 24314, 10211,  8392,  1491, 19808,  1846,   358,
        16592,   598, 24314, 10211,  7084,  1450,  4529,  8392,  1491,  8513,
           18,  4554,   848,   999,   326,  8392,  1491,   358, 18472,   340,
         2975,   715,  7120,  5295,  4123,   487,  6635,  1047,  1998,  5550,
        17347,    84,  4438,    84,    34,  9434,  1846,  1221,  8392,  1491,
         4097,  8220,   326, 24314, 10211,  8392,  1491,  7323,  1846,  1297,
          527,   326,  1446,  1239,  6063,    30, 25892,   264,   473,  5618,
        23480,  5618,    31,  2557,    17,  8412,    10,  4521, 23480,  4521,
           31,  2412,   358,  3433,  2239,  3285, 17347,    84,    34,   225,
          468,  8054,    30,   512,  9172,   203,   203,   565,  3502,  2557,
          857,  1177,    30,

In [11]:
tokenizer = PLBartTokenizer.from_pretrained("uclanlp/plbart-python-en_XX", src_lang="python", tgt_lang="python" )

train_dataset = dataset["train"].map(tokenization, batched=True)
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
train_dataset.format['type']

test_dataset = dataset["test"].map(tokenization, batched=True)
test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
test_dataset.format['type']

loading file https://huggingface.co/uclanlp/plbart-python-en_XX/resolve/main/sentencepiece.bpe.model from cache at C:\Users\km201/.cache\huggingface\transformers\e57de2ba12d2b1d3cae7ce5921704890ac50789e8eb95100ff4c64dc98559729.c65001d1986897f4ae6d41d2c49f0e1621d3518cab63e0ffa3005e5deb5aae40
loading file https://huggingface.co/uclanlp/plbart-python-en_XX/resolve/main/tokenizer.json from cache at None
loading file https://huggingface.co/uclanlp/plbart-python-en_XX/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/uclanlp/plbart-python-en_XX/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/uclanlp/plbart-python-en_XX/resolve/main/tokenizer_config.json from cache at None
loading configuration file https://huggingface.co/uclanlp/plbart-python-en_XX/resolve/main/config.json from cache at C:\Users\km201/.cache\huggingface\transformers\68f40642f8534e3482166065ce817305c07e1f4b8ea96013fe62ab865088bddb.8929a51af95d04be

'torch'

In [12]:
print ((train_dataset[0]))
train_dataset.save_to_disk("datasets/plbart_train.hf")
test_dataset.save_to_disk("datasets/plbart_test.hf")

{'labels': tensor(20), 'input_ids': tensor([  754,  6971, 33475,  ...,     1,     1,     1]), 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0])}


In [18]:
from datasets import load_from_disk
#example method for loading datset checkpoints
test = load_from_disk('datasets/plbart_test.hf')
test.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
print ((test[0]))

{'labels': tensor(-1), 'input_ids': tensor([662, 763, 662,  ...,   1,   1,   1]), 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0])}
