In [1]:
from pathlib import Path
# Absolute location of the dataset folder, anchored at the directory the kernel
# is in when this cell runs (later cells change the working directory).
dataset = Path.cwd() / 'dataset'

In [2]:
import os
# Work from inside the dataset folder so relative paths and shell commands in
# the following cells resolve against it.
os.chdir(dataset)

In [71]:
from pathlib import Path

# Folder the archive unpacks into; its absence is taken to mean "first run".
js_dir = dataset / 'javascript'
new_setup = not js_dir.exists()
if new_setup:
    # Fetch and unpack the JavaScript archive into the current working directory.
    ! wget -O javascript.zip https://huggingface.co/datasets/code_search_net/resolve/main/data/javascript.zip?download=true
    ! unzip javascript.zip
# Directory holding the training shards.
train = js_dir / 'final' / 'jsonl' / 'train'
# Switch into the shard directory: the gzip call below and any relative path
# used afterwards are resolved against it.
os.chdir(train)
if new_setup:
    # Decompress the freshly downloaded *.gz files in place.
    !gzip -d *.gz

In [6]:
import sqlite3, json
from contextlib import contextmanager

# Fields copied from every JSON record into the ``codes`` table, in column order.
columns = ['path', 'func_name', 'code', 'docstring', 'url']
# The same fields as a comma-separated list, ready to be spliced into SQL.
colstr = ', '.join(columns)

def process_line(line, cursor):
    """Insert one JSON-encoded record into the ``codes`` table.

    Args:
        line: A string holding a single JSON object; it must contain every
            key listed in the module-level ``columns``.
        cursor: Cursor used to run the INSERT statement.

    Raises:
        json.JSONDecodeError: If ``line`` is not valid JSON.
        KeyError: If the record lacks one of the expected keys.
    """
    record = json.loads(line)
    # One "?" placeholder per column, so the values are bound rather than
    # formatted into the SQL text.
    placeholders = ','.join('?' * len(columns))
    values = [record[name] for name in columns]
    cursor.execute(f'INSERT INTO codes ({colstr}) VALUES ({placeholders})', values)


def process_file(jsonl_file, cursor):
    """Load every record of one JSON-lines file through ``process_line``.

    Args:
        jsonl_file: ``pathlib.Path`` of the ``.jsonl`` file to read.
        cursor: Cursor forwarded unchanged to ``process_line``.
    """
    # Read as UTF-8 explicitly: the platform default encoding is not UTF-8
    # everywhere and would garble or reject non-ASCII source code.
    with jsonl_file.open('r', encoding='utf-8') as file:
        for line in file:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            # instead of letting json.loads fail on them.
            if not line.strip():
                continue
            process_line(line, cursor)

def create_table(cursor):
    """Recreate the ``codes`` table from scratch.

    Any existing ``codes`` table is dropped. The new table has an integer
    primary key ``ID`` plus one TEXT column per entry of the module-level
    ``columns``.

    Args:
        cursor: Cursor used to run the DDL statements.
    """
    cursor.execute('DROP TABLE if exists codes')
    text_columns = [f'{name} TEXT' for name in columns]
    col_defs = ', '.join(text_columns)
    ddl = f'''CREATE TABLE codes 
                      (ID INTEGER PRIMARY KEY, {col_defs})'''
    cursor.execute(ddl)

def process_directory(jsonl_dir, cursor):
    """Load every ``*.jsonl`` file found directly inside ``jsonl_dir``.

    Args:
        jsonl_dir: ``pathlib.Path`` of the directory to scan (not recursive).
        cursor: Cursor forwarded unchanged to ``process_file``.
    """
    # glob() yields entries in an arbitrary, filesystem-dependent order; sorting
    # makes the insertion order (and therefore the generated IDs) repeatable.
    for jsonl_file in sorted(jsonl_dir.glob('*.jsonl')):
        process_file(jsonl_file, cursor)

@contextmanager
def get_cursor(database_name='rsn_train'):
    """Yield a cursor on ``database_name`` inside a single transaction.

    The transaction is committed when the ``with`` body finishes normally and
    rolled back when it raises. The connection is closed in both cases.

    Args:
        database_name: Path of the SQLite database file. A relative path is
            resolved against the current working directory.

    Yields:
        sqlite3.Cursor: A cursor that is only valid inside the ``with`` body.
    """
    conn = sqlite3.connect(database_name)
    try:
        # Using the connection as a context manager only commits or rolls
        # back; it does not close the connection, hence the outer finally.
        with conn:
            yield conn.cursor()
    finally:
        conn.close()


In [7]:
# Rebuild the ``codes`` table from the training shards and report its size.
with get_cursor() as cursor:
    create_table(cursor)
    process_directory(train, cursor)
    row_count = cursor.execute('select count(*) from codes').fetchone()
    print(row_count)

(123889,)


In [8]:
# Materialise a randomly ordered copy of ``codes`` as ``shuffled``.
# RANDOM() is not seeded here, so every run of this cell produces a new order.
with get_cursor() as cursor:
    cursor.execute('DROP TABLE if exists shuffled')
    cursor.execute('create table if not exists shuffled as select * from codes ORDER BY RANDOM()')
    total = cursor.execute('select count(*) from shuffled').fetchone()
    print(total)
    first_ids = cursor.execute('select ID from shuffled limit 10').fetchall()
    print(first_ids)

(123889,)
[(71109,), (81773,), (32678,), (117619,), (107510,), (84089,), (118147,), (78091,), (74175,), (46040,)]


In [83]:
# Instruction prompt for the SRP-annotation task: it describes the tag format,
# gives two single-responsibility examples and one violation example, and lists
# the output rules.
# NOTE(review): none of the visible cells passes this string to the API (runs
# reference an existing assistant by id) - presumably it was pasted into that
# assistant's configuration; confirm before editing the wording here.
openai_instuctions = '''We are generating a dataset to train a model in detecting SRP violations in the level of functions.
Each output should be a list of 10 functions in javascript annotated as responsability sections.
Use the message as source of entropy to ensure variability in your answers.
The dataset needs to be diversified.
The generated examples should be as realistic as possible, use parts of user input as source of this realistic aspect, as they come from real users code.

If the function respects SRP, then it should in this form, note there is only one tag meaning only one responsability: 
<userRegistration>
function userRegistration(username, password) { 
  const userId = createUser(username, password); 
  logActivity(userId, "User registration"); 
} 
</userRegistration>
we can't account a simple call like this: 
const userId = createUser(username, password);
as a different responsability because responsability is about the implementation not the call.
this function follows the SRP principle and each instruction has not been considered a different responsibility because it is just a function call, and this call is a part of the unique responsibility: the flow of userRegistration. This is why the function has a single responsibility.
Here is an other example of a function that respects SRP :
<handleSubmitRequest>
  function validateAndSubmitForm(form) {
    if (validateFormData(form)) {
      submitForm(form);
    }
  }
</handleSubmitRequest>

Now if the function is not SRP then it should be in this form: 
<emailActiveClients>
function emailClients(clients) { 
  clients.forEach(client => {
    <isActiveClient>
    const clientRecord = database.lookup(client);
    if (clientRecord.isActive()) </isActiveClient> {
      email(client); 
    }
  });
}</emailActiveClients>
Note that isActiveClient region denotes a separate responsability that should be externalized to its own function.
Make each one of the 10 functions a separate snippet and don't add code explanation or other text, keep the example as if they were really extracted from code, just add the annotations as explained
Try to be tolerant in SRP, don't over split functions, if a function can be viewed as SRP than keep it in one segment
Pay attention to the structure of tags, each tag needs to have appropriate closing.
Separate the functions by line breaks.
'''

In [108]:
import re

# Matches, in one left-to-right scan, either a string literal (kept as is) or a
# comment (removed). Scanning both together keeps "//" and "/*" that sit inside
# a string, and keeps comment markers nested in another comment from being
# treated as the start of a new one.
_JS_STRING_OR_COMMENT = re.compile(
    r'(?P<string>"(?:\\.|[^"\\\n])*"'   # double-quoted string
    r"|'(?:\\.|[^'\\\n])*'"             # single-quoted string
    r'|`(?:\\.|[^`\\])*`)'              # template literal (may span lines)
    r'|(?P<block>/\*.*?\*/)'            # /* ... */ comment
    r'|(?P<line>\n?//[^\n]*)',          # // comment and a newline right before it
    flags=re.DOTALL,
)


def strip_js_comments(js_code):
    """Remove ``//`` and ``/* */`` comments from JavaScript source text.

    String and template literals are left untouched, so a URL such as
    ``"http://example.com"`` survives. A ``//`` comment is removed up to, but
    not including, the newline that ends it; a newline immediately before the
    ``//`` is removed with it. A trailing comment without a final newline is
    removed as well.

    This is a lexical approximation, not a JavaScript parser: regular
    expression literals are not recognised, so comment-like text inside one
    is still stripped.

    Args:
        js_code: JavaScript source as a string.

    Returns:
        The source text with comments removed.
    """
    def _keep_strings(match):
        # Strings are echoed back unchanged; comments collapse to nothing.
        return match.group(0) if match.group('string') is not None else ''

    return _JS_STRING_OR_COMMENT.sub(_keep_strings, js_code)

def request(index, limit = 10):
    """Build the user message for batch number ``index``.

    Reads ``limit`` rows of ``code`` from the ``shuffled`` table, strips their
    comments and joins them with blank lines under a one-line instruction.

    Args:
        index: Zero-based batch number.
        limit: Number of functions per batch.

    Returns:
        The message text to send.

    NOTE(review): the offset is ``limit * index + 1``, so the very first row of
    ``shuffled`` is never requested - confirm whether that is intentional.
    """
    query = 'select code from shuffled limit ? offset ?'
    with get_cursor() as cursor:
        offset = limit * index + 1
        rows = list(cursor.execute(query, (limit, offset)))
        snippets = [strip_js_comments(row[0]) for row in rows]
        codestr = '\n\n'.join(snippets)
        return f'Annotate these {limit} functions:\n\n{codestr}'


In [None]:
!pip install openai

In [37]:
import openai

def read_api_key(filepath):
    """Return the contents of ``filepath`` with surrounding whitespace removed.

    Args:
        filepath: Path of the text file that holds the API key.

    Returns:
        The file contents as a string, stripped of leading and trailing
        whitespace.
    """
    with open(filepath, 'r') as key_file:
        raw_key = key_file.read()
    return raw_key.strip()

# API client shared by the cells below; the key is read from the file named
# "key" inside the dataset folder rather than written into the notebook.
client = openai.OpenAI(api_key=read_api_key(dataset/'key'))


In [65]:
def submit_message(assistant_id, thread, user_message):
    """Post ``user_message`` to ``thread`` and start a run on it.

    Args:
        assistant_id: Identifier of the assistant that should process the thread.
        thread: Thread object; only its ``id`` attribute is used.
        user_message: Text added to the thread with the "user" role.

    Returns:
        The run object returned by the run-creation call.
    """
    thread_id = thread.id
    client.beta.threads.messages.create(
        thread_id=thread_id,
        role="user",
        content=user_message,
    )
    started_run = client.beta.threads.runs.create(
        thread_id=thread_id,
        assistant_id=assistant_id,
    )
    return started_run


In [85]:
# Every thread created by this notebook is appended here. Start empty: the
# previous initial value referred to `thread`, which is only created in a later
# cell, so this cell failed with NameError on a fresh kernel.
threads = []

In [96]:
# Create a fresh thread and record it in the `threads` list.
thread = client.beta.threads.create()
threads.append(thread)

In [66]:
import time

def wait_on_run(run, thread):
    """Poll a run until it is no longer "queued" or "in_progress".

    The run is re-fetched, followed by a 0.5 s pause, for as long as its status
    is one of those two values. There is no timeout.

    Args:
        run: Run object to wait for; its ``id`` and ``status`` are used.
        thread: Thread the run belongs to; only its ``id`` is used.

    Returns:
        The most recently fetched run object, or ``run`` itself if it was
        already past both statuses.
    """
    pending_statuses = ("queued", "in_progress")
    while run.status in pending_statuses:
        run = client.beta.threads.runs.retrieve(thread_id=thread.id, run_id=run.id)
        time.sleep(0.5)
    return run


In [171]:
assistant_id = "asst_5LjvuX6FRcDwdOMg2H0lzLHu"

# Seconds per unit accepted in a "Please try again in ..." retry hint.
_RETRY_UNIT_SECONDS = {'h': 3600.0, 'm': 60.0, 's': 1.0, 'ms': 0.001}


def _parse_retry_delay(error_message):
    """Return ``(hint_text, seconds)`` for a retry hint, or None if there is none."""
    hint = re.search(r"Please try again in ((?:\d+(?:\.\d+)?(?:ms|h|m|s))+)", error_message)
    if hint is None:
        return None
    # "ms" is listed before "m" so that "20ms" is read as milliseconds and not
    # as 20 minutes followed by an empty seconds part.
    parts = re.findall(r"(\d+(?:\.\d+)?)(ms|h|m|s)", hint.group(1))
    seconds = sum(float(amount) * _RETRY_UNIT_SECONDS[unit] for amount, unit in parts)
    return hint.group(1), seconds


def get_messages(i):
    """Run batch ``i`` through the assistant and return the thread's messages.

    A new thread is created for every attempt and recorded in ``threads``.
    When a run ends as "failed" with a "Please try again in ..." hint, the
    function sleeps for the hinted duration and retries. Any other final
    status ends the loop, and the messages of that thread are returned as
    they are.

    Args:
        i: Batch index passed to ``request``.

    Returns:
        The message listing of the last thread, newest first.

    Raises:
        RuntimeError: If a run failed and its error carries no retry hint.
    """
    user_input = request(i)
    while True:
        print(time.ctime())
        # A fresh thread per attempt: messages cannot be added to a thread
        # that still has an active run.
        thread = client.beta.threads.create()
        threads.append(thread)
        run = submit_message(assistant_id, thread, user_input)
        run = wait_on_run(run, thread)
        if run.status != "failed":
            break
        error = run.last_error.message if run.last_error else ''
        retry = _parse_retry_delay(error)
        if retry is None:
            # Not a rate-limit failure: retrying blindly would loop forever.
            raise RuntimeError(f'run failed and gave no retry hint: {error}')
        hint_text, delay = retry
        print(f'sleeping {hint_text}')
        time.sleep(delay)
    return client.beta.threads.messages.list(thread_id=thread.id, order="desc")


In [137]:
def write_messages(filename, messages):
    """Write the text of every assistant message to ``js_dir / filename``.

    Messages are written in the order they appear in ``messages.data``, with
    no separator between them. Only the first content part of each message is
    used. An existing file is overwritten.

    Args:
        filename: Name of the output file inside ``js_dir``.
        messages: Message listing whose ``data`` items expose ``role`` and
            ``content[0].text.value``.
    """
    # Write UTF-8 explicitly: the replies contain arbitrary source code and the
    # platform default encoding may be unable to represent it.
    with open(js_dir / filename, 'w', encoding='utf-8') as file:
        for message in messages.data:
            if message.role == "assistant":
                file.write(message.content[0].text.value)


In [138]:
def handle_index(i):
    """Fetch the assistant's answer for batch ``i`` and save it to disk.

    Progress is printed before and after the request. The answer is written
    to ``thread2-<i>.txt`` through ``write_messages``.

    Args:
        i: Batch index.
    """
    output_name = f'thread2-{i}.txt'
    print(f'requesting {i} ...')
    replies = get_messages(i)
    print(f'received {i}')
    write_messages(output_name, replies)

In [128]:
# Process batches 30..99 one after another. `i` stays a notebook-level
# variable and is read again by a later cell.
for i in range(30, 100):
    handle_index(i)

requesting 30 ...
received 30
requesting 31 ...
received 31
requesting 32 ...
received 32
requesting 33 ...
received 33
requesting 34 ...
received 34
requesting 35 ...
received 35
requesting 36 ...
received 36
requesting 37 ...
received 37
requesting 38 ...
received 38
requesting 39 ...
received 39
requesting 40 ...
received 40
requesting 41 ...
received 41
requesting 42 ...
received 42
requesting 43 ...
received 43
requesting 44 ...
received 44
requesting 45 ...
received 45
requesting 46 ...
received 46
requesting 47 ...
received 47
requesting 48 ...
received 48
requesting 49 ...
received 49
requesting 50 ...
received 50
requesting 51 ...
received 51
requesting 52 ...
received 52
requesting 53 ...
received 53
requesting 54 ...
received 54
requesting 55 ...
received 55
requesting 56 ...
received 56
requesting 57 ...
received 57
requesting 58 ...
received 58
requesting 59 ...
received 59
requesting 60 ...
received 60
requesting 61 ...
received 61
requesting 62 ...
received 62
requesting

KeyboardInterrupt: 

In [None]:
# Process batches 209..299. Any failure is re-raised and stops the loop; the
# failing batch index is printed first, except for a manual interrupt.
for i in range(209, 300):
    try:
        handle_index(i)
    except BaseException as exc:
        if not isinstance(exc, KeyboardInterrupt):
            print(f'Error happened when processing {i}')
        raise

requesting 209 ...
Thu Jan 11 10:09:00 2024
received 209
requesting 210 ...
Thu Jan 11 10:10:56 2024
received 210
requesting 211 ...
Thu Jan 11 10:13:18 2024
received 211
requesting 212 ...
Thu Jan 11 10:14:38 2024
sleeping 3m40.492s
Thu Jan 11 10:18:25 2024
received 212
requesting 213 ...
Thu Jan 11 10:21:20 2024
sleeping 8m45.657s
Thu Jan 11 10:30:13 2024
received 213
requesting 214 ...
Thu Jan 11 10:31:59 2024
sleeping 9m54.259s
Thu Jan 11 10:42:00 2024
received 214
requesting 215 ...
Thu Jan 11 10:43:47 2024
sleeping 11m31.027s
Thu Jan 11 10:55:24 2024
received 215
requesting 216 ...
Thu Jan 11 10:57:19 2024
sleeping 7m14.764s
Thu Jan 11 11:04:42 2024
received 216
requesting 217 ...
Thu Jan 11 11:05:40 2024
sleeping 6m18.604s
Thu Jan 11 11:12:06 2024
received 217
requesting 218 ...
Thu Jan 11 11:13:23 2024
sleeping 7m58.137s
Thu Jan 11 11:21:30 2024
received 218
requesting 219 ...
Thu Jan 11 11:23:01 2024
sleeping 6m18.604s
Thu Jan 11 11:29:27 2024
received 219
requesting 220 ...
T

In [166]:
# Scratch cell: a bare literal with no effect on the rest of the notebook.
0

0

SyntaxError: unexpected character after line continuation character (970412046.py, line 1)

In [132]:
# Show which roles are present in the last fetched message listing.
[m.role for m in messages.data]

['user', 'user', 'user', 'user', 'user', 'user', 'user', 'user', 'user']

In [94]:
# Preview the message text that would be sent for batch 12.
request(12)

'Annotate these 10 functions:\n\nfunction(gt, or) {\n\n\t\tvar or1 = or.values[0],\n\t\t\tor2 = or.values[1];\n\t\tvar difference1 = set.difference(gt, or1);\n\t\tvar difference2 = set.difference(gt, or2);\n\t\treturn set.intersection(difference1, difference2);\n\t}\n\nfunction Session(suite) {\n\tEmitter.call(this);\n\n\t\n\tthis.files = [];\n\n\t\n\tthis.suite = suite;\n\n\t\n\tthis.global = global;\n\n\t\n\tthis.context = new Context(this);\n\n\t\n\tthis.engine = new Mocha({\n\t\t\n\t\t\n\t\tignoreLeaks: true,\n\t\tuseColors: true,\n\t\t\n\t\ttimeout: suite.options.timeout || 10000,\n\t\t\n\t\treporter: suite.options.reporter\n\t});\n\n\t\n\t\n\t\n\n\t\n\tthis.engine.suite.emit(\'pre-require\', this.global, null, this.engine);\n\n\t\n\t\n\t\n\t\n\t\n\t\n}\n\nfunction configurePaths(cfg) {\n\tvar sourcesBasePath = path.resolve(appPath, cfg.get(\'sourcesBasePath\'))\n\t   ,sources = cfg.get(\'sources\')\n\t   ,build = cfg.get(\'build\')\n\t   ,buildBaseUri\n\t   ,buildDir = nodeEnv ==

In [95]:
# Manual re-run for a single batch. `i` is whatever value an earlier loop
# cell left behind, so the result depends on which cell ran last.
messages = get_messages(i)
messages

BadRequestError: Error code: 400 - {'error': {'message': "Can't add messages to thread_KMt8ytI3bKOUgRxeBdb0JZzS while a run run_Evr42VuchrmsIfE7rS0SuHbT is active.", 'type': 'invalid_request_error', 'param': None, 'code': None}}