In [8]:
import json
import os
import numpy as np
import regex as re
import pandas as pd

In [9]:
np.random.seed(0)

In [10]:
train_raw = json.load(open('train_ood_codesearchnet.json', 'r'))
print(json.dumps(train_raw[100], indent=4))

{
    "idx": 101,
    "doc": "Determines if an option was set or not .",
    "code": "public boolean isSet ( @ NonNull NamedOption option ) { if ( option . isBoolean ( ) ) { return option . getValue ( ) != null && option . < Boolean > getValue ( ) ; } return option . getValue ( ) != null ; }",
    "raw": "public boolean isSet(@NonNull NamedOption option) {\n      if (option.isBoolean()) {\n         return option.getValue() != null && option.<Boolean>getValue();\n      }\n\n      return option.getValue() != null;\n   }",
    "label": 1
}


In [11]:
from collections import Counter
from tqdm import tqdm
from bpe import BpeVocabulary

code_vocab_counter = Counter()
text_vocab_counter = Counter()

train_id = json.load(open('../java/train_codesearchnet_0.json', 'r'))

for row in tqdm(train_id):
    code = row['code'].split()
    text = row['doc'].split()
    code_vocab_counter.update(code)
    text_vocab_counter.update(text)

code_bpe_vocab = BpeVocabulary(vocab_size=10000, pct_bpe=0.5)
text_bpe_vocab = BpeVocabulary(vocab_size=10000, pct_bpe=0.5)

100%|██████████| 164923/164923 [00:02<00:00, 59196.12it/s]


In [12]:
code_bpe_vocab.fit(code_vocab_counter)
text_bpe_vocab.fit(text_vocab_counter)

100%|██████████| 646423/646423 [00:48<00:00, 13424.66it/s]


5000
5000


100%|██████████| 48387/48387 [00:02<00:00, 22741.81it/s]


5000
5000


In [13]:
import spacy
nlp = spacy.load('en_core_web_sm')

from langdetect import detect


In [14]:
# tokenize
def tokenize(text):
    doc = nlp(text)
    tokens = [token.text for token in doc]
    return tokens

# check if text contains special tokens like urls, emails, etc.
def contains_special_token(text):
    doc = nlp(text)
    for token in doc:
        if token.like_url or token.like_email:
            return True
    return False

In [15]:
train = []

for i, row in enumerate(tqdm(train_raw)):
    train_dict = {}
    train_dict['id'] = 'train-java-ood-{}'.format(row['idx'])
    train_dict['code'] = row['code']
    train_dict['text'] = row['doc']
    train_dict['label'] = row['label']
    train_dict['raw'] = row['raw']

    if len(tokenize(train_dict['text'])) <= 3 or contains_special_token(train_dict['text']):
        continue

    try:
        lang = detect(train_dict['text'])
        if lang != 'en':
            continue
    except:
        continue
    
    train_dict['code_bpe'] = ' '.join(code_bpe_vocab.tokenize(row['code'].split()))
    train_dict['text_bpe'] = ' '.join(text_bpe_vocab.tokenize(row['doc'].split()))
    
    train.append(train_dict)

100%|██████████| 289528/289528 [1:12:11<00:00, 66.84it/s] 


In [16]:
print(len(train))

210936


In [17]:
test_raw = json.load(open('valid_ood_codesearchnet.json', 'r'))
print(json.dumps(test_raw[0], indent=4))

{
    "idx": 1,
    "doc": "Mirrors the one ObservableSource in an Iterable of several ObservableSources that first either emits an item or sends a termination notification . <p > <img width = 640 height = 385 src = https : // raw . github . com / wiki / ReactiveX / RxJava / images / rx - operators / amb . png alt = > <dl > <dt > <b > Scheduler : < / b > < / dt > <dd > { @code amb } does not operate by default on a particular { @link Scheduler } . < / dd > < / dl >",
    "code": "@ CheckReturnValue @ NonNull @ SchedulerSupport ( SchedulerSupport . NONE ) public static < T > Observable < T > amb ( Iterable < ? extends ObservableSource < ? extends T > > sources ) { ObjectHelper . requireNonNull ( sources , \"sources is null\" ) ; return RxJavaPlugins . onAssembly ( new ObservableAmb < T > ( null , sources ) ) ; }",
    "raw": "@CheckReturnValue\n    @NonNull\n    @SchedulerSupport(SchedulerSupport.NONE)\n    public static <T> Observable<T> amb(Iterable<? extends ObservableSource<? extend

In [18]:
test = []

for row in test_raw:
    test_dict = {}
    test_dict['id'] = 'test-java-ood-{}'.format(row['idx'])
    test_dict['code'] = row['code']
    test_dict['text'] = row['doc']
    test_dict['label'] = row['label']
    test_dict['raw'] = row['raw']
    test_dict['code_bpe'] = ' '.join(code_bpe_vocab.tokenize(row['code'].split()))
    test_dict['text_bpe'] = ' '.join(text_bpe_vocab.tokenize(row['doc'].split()))

    if len(tokenize(train_dict['text'])) <= 3 or contains_special_token(train_dict['text']):
        continue

    try:
        lang = detect(train_dict['text'])
        if lang != 'en':
            continue
    except:
        continue

    test.append(test_dict)

In [19]:
np.random.shuffle(train)
np.random.shuffle(test)

valid_size = int(len(train) * 0.1)
valid = train[:valid_size]
train = train[valid_size:]

In [20]:
print(len(train))
print(len(valid))
print(len(test))

189843
21093
15954


In [21]:
with open('data_train.json', 'w') as f:
    json.dump(train, f, indent=2)

with open('data_valid.json', 'w') as f:
    json.dump(valid, f, indent=2)

with open('data_test.json', 'w') as f:
    json.dump(test, f, indent=2)


In [22]:
print(json.dumps(valid[:2], indent=4))

[
    {
        "id": "train-java-ood-275589",
        "code": "private void computeSpan ( List < AssociatedPair > points ) { Q . reshape ( points . size ( ) , 9 ) ; int index = 0 ; for ( int i = 0 ; i < points . size ( ) ; i ++ ) { AssociatedPair p = points . get ( i ) ; Point2D_F64 a = p . p2 ; Point2D_F64 b = p . p1 ; // The points are assumed to be in homogeneous coordinates.  This means z = 1 Q . data [ index ++ ] = a . x * b . x ; Q . data [ index ++ ] = a . x * b . y ; Q . data [ index ++ ] = a . x ; Q . data [ index ++ ] = a . y * b . x ; Q . data [ index ++ ] = a . y * b . y ; Q . data [ index ++ ] = a . y ; Q . data [ index ++ ] = b . x ; Q . data [ index ++ ] = b . y ; Q . data [ index ++ ] = 1 ; } if ( ! solverNull . process ( Q , 4 , nullspace ) ) throw new RuntimeException ( \"Nullspace solver should never fail, probably bad input\" ) ; // extract the span of solutions for E from the null space for ( int i = 0 ; i < 9 ; i ++ ) { X [ i ] = nullspace . unsafe_get ( i , 0 ) 