In [44]:
import json
import os

def split_json_file(input_file, output_dir, chunk_size):
    # Ensure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    with open(input_file, 'r') as file:
        chunk_number = 1
        while True:
            # Read lines in chunks
            lines = [file.readline() for _ in range(chunk_size)]
            if not any(lines):
                break  # Exit loop if no more lines

            # Filter out empty lines
            lines = [line for line in lines if line.strip()]

            # Parse each line as JSON and write to a new file
            data = [json.loads(line) for line in lines]
            output_file = os.path.join(output_dir, f'chunk_{chunk_number}.json')
            with open(output_file, 'w') as out_file:
                json.dump(data, out_file, indent=4)
            
            chunk_number += 1

# Example usag
input_file = 'function_data.json'
output_dir = 'output_chunks'
chunk_size = 100  # Adjust the chunk size as needed

split_json_file(input_file, output_dir, chunk_size)

In [72]:
import json
import tensorflow as tf
from transformers import RobertaTokenizer
import numpy as np

# Initialize the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def read_functions_from_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    
    content = content.strip()
    
    if content.startswith('[') and content.endswith(']'):
        try:
            data = json.loads(content)
            for item in data:
                yield item
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON array: {e}")
    else:
        for line in content.split('\n'):
            line = line.strip()
            if line:
                try:
                    yield json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON object: {e}")
                    print(f"Problematic line: {line}")
                    continue

def preprocess_function(function_text, max_seq_length=512):
    lines = function_text.split('\n')
    declaration = lines[0]
    body = '\n'.join(lines[1:])
    
    func_name = declaration.split('(')[0].split()[-1]
    
    tokens = tokenizer.encode(body, add_special_tokens=True)
    
    sequences = [tokens[i:i+max_seq_length] for i in range(0, len(tokens), max_seq_length)]
    if len(sequences[-1]) < max_seq_length:
        sequences[-1] = sequences[-1] + [tokenizer.pad_token_id] * (max_seq_length - len(sequences[-1]))
    
    return np.array(sequences, dtype=np.int32), func_name

def create_dataset(json_file_path, batch_size=1):
    def gen():
        for func_obj in read_functions_from_json(json_file_path):
            function_text = func_obj.get('func')
            if function_text is None:
                print(f"Skipping object, no function text found: {func_obj}")
                continue
            sequences, name = preprocess_function(function_text)
            if sequences is None or name is None:
                print(f"Skipping, preprocessed data is None for: {function_text}")
                continue
            yield tf.cast(sequences, tf.int32), tf.cast(tokenizer.encode(name, add_special_tokens=True), tf.int32)
    
    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            tf.TensorSpec(shape=(None, 512), dtype=tf.int32),
            tf.TensorSpec(shape=(None,), dtype=tf.int32)
        )
    ).padded_batch(batch_size)

# Create the dataset
json_file_path = 'output_chunks/chunk_1.json'
dataset = create_dataset(json_file_path)

# Check the shape of the dataset
element_spec = dataset.element_spec
print("Input shape:", element_spec[0].shape)
print("Target shape:", element_spec[1].shape)

# Try to get the first batch
try:
    for batch in dataset.take(1):
        inputs, targets = batch
        print("Actual input shape:", inputs.shape)
        print(inputs)
        print("Actual target shape:", targets.shape)
        print(targets)
except Exception as e:
    print(f"Error when trying to get the first batch: {e}")

num_batches = sum(1 for _ in dataset)
print(f"Number of batches: {num_batches}")

class SequenceEncoder(tf.keras.layers.Layer):
    def __init__(self, vocab_size, embedding_dim, lstm_units):
        super(SequenceEncoder, self).__init__()
        self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.lstm1 = tf.keras.layers.LSTM(lstm_units, return_sequences=True)
        self.lstm2 = tf.keras.layers.LSTM(lstm_units, return_sequences=False)
        self.dense = tf.keras.layers.Dense(lstm_units, activation='relu')
        self.lstm_units = lstm_units
    
    def call(self, inputs):
        x = self.embedding(inputs)
        x = self.lstm1(x)
        x = self.lstm2(x)
        return self.dense(x)
    
    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.lstm_units)

def build_hierarchical_lstm_model(vocab_size, max_seq_length, embedding_dim=256, lstm_units=256):
    function_input = tf.keras.layers.Input(shape=(None, max_seq_length), dtype=tf.int32)
    
    sequence_encoder = SequenceEncoder(vocab_size, embedding_dim, lstm_units)
    
    encoded_sequences = tf.keras.layers.TimeDistributed(sequence_encoder)(function_input)
    
    function_lstm = tf.keras.layers.LSTM(lstm_units)(encoded_sequences)
    
    output = tf.keras.layers.Dense(vocab_size, activation='softmax')(function_lstm)
    
    model = tf.keras.Model(inputs=function_input, outputs=output)
    
    return model

vocab_size = tokenizer.vocab_size
max_seq_length = 512
model = build_hierarchical_lstm_model(vocab_size, max_seq_length)

model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

model.summary()

epochs = 10
steps_per_epoch = 100

history = model.fit(dataset, epochs=epochs, steps_per_epoch=steps_per_epoch)

print("Training completed.")

Token indices sequence length is longer than the specified maximum sequence length for this model (1703 > 512). Running this sequence through the model will result in indexing errors


Input shape: (None, None, 512)
Target shape: (None, None)
Actual input shape: (1, 4, 512)
tf.Tensor(
[[[    0 50117 50117 ...  1437  1437   403]
  [  230  3808 21770 ... 38323  5457 31245]
  [ 6972 10463  1215 ...  1437  1437  1437]
  [48565 23770  2562 ...     1     1     1]]], shape=(1, 4, 512), dtype=int32)
Actual target shape: (1, 13)
tf.Tensor(
[[    0  1215 16993  1182  6634  1215   438 38914 29015   176 11828 10887
      2]], shape=(1, 13), dtype=int32)
Number of batches: 100




Epoch 1/10


ValueError: Exception encountered when calling TimeDistributed.call().

[1mInvalid dtype: NoneType[0m

Arguments received by TimeDistributed.call():
  • inputs=tf.Tensor(shape=(None, None, 512), dtype=int32)
  • training=True
  • mask=None

In [68]:
import json
import tensorflow as tf
from transformers import RobertaTokenizer
import numpy as np

tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

def read_functions_from_json(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    
    # Remove any leading/trailing whitespace
    content = content.strip()
    
    # Check if the content starts and ends with square brackets
    if content.startswith('[') and content.endswith(']'):
        # Treat as a JSON array
        try:
            data = json.loads(content)
            for item in data:
                yield item
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON array: {e}")
    else:
        # Treat as individual JSON objects, one per line
        for line in content.split('\n'):
            line = line.strip()
            if line:
                try:
                    yield json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"Error decoding JSON object: {e}")
                    print(f"Problematic line: {line}")
                    continue

def preprocess_function(function_text, max_seq_length=512):
    # Split the function declaration (assumed to be the first line) from the body
    lines = function_text.split('\n')
    declaration = lines[0]
    body = '\n'.join(lines[1:])
    
    # Extract function name from declaration
    func_name = declaration.split('(')[0].split()[-1]
    # print("body", body)
    # Tokenize the function body
    tokens = tokenizer.encode(body, add_special_tokens=True)
    # print("tokens", tokens)
    # print("shape of tokens", len(tokens))
    # Split into sequences of max_seq_length
    sequences = [tokens[i:i+max_seq_length] for i in range(0, len(tokens), max_seq_length)]
    # Pad the last sequence if necessary
    if len(sequences[-1]) < max_seq_length:
        sequences[-1] = sequences[-1] + [tokenizer.pad_token_id] * (max_seq_length - len(sequences[-1]))
    
    return np.array(sequences, dtype=np.int32), func_name

def create_dataset(json_file_path, batch_size=1):
    def gen():
        for func_obj in read_functions_from_json(json_file_path):
            function_text = func_obj.get('func')
            if function_text is None:
                print(f"Skipping object, no function text found: {func_obj}")
                continue
            sequences, name = preprocess_function(function_text)
            # print("name", name, tokenizer.encode(name, add_special_tokens=True))
            if sequences is None or name is None:
                print(f"Skipping, preprocessed data is None for: {function_text}")
                continue
            # yield sequences, tokenizer.encode(name, add_special_tokens=True)
            yield tf.cast(sequences, tf.int32), tf.cast(tokenizer.encode(name, add_special_tokens=True), tf.int32)
    
    return tf.data.Dataset.from_generator(
        gen,
        output_signature=(
            tf.TensorSpec(shape=(None, 512), dtype=tf.int32),
            tf.TensorSpec(shape=(None,), dtype=tf.int32)
        )
    ).padded_batch(batch_size)

# Create the dataset
# json_file_path = 'output_chunks/chunk_3305.json'
json_file_path = 'output_chunks/chunk_1.json'

dataset = create_dataset(json_file_path)

# Check the shape of the dataset
element_spec = dataset.element_spec
print("Input shape:", element_spec[0].shape)
print("Target shape:", element_spec[1].shape)

# Try to get the first batch
try:
    for batch in dataset.take(1):
        inputs, targets = batch
        print("Actual input shape:", inputs.shape)
        print(inputs)
        print("Actual target shape:", targets.shape)
        print(targets)
except Exception as e:
    print(f"Error when trying to get the first batch: {e}")

Token indices sequence length is longer than the specified maximum sequence length for this model (1703 > 512). Running this sequence through the model will result in indexing errors


Input shape: (None, None, 512)
Target shape: (None, None)
Actual input shape: (1, 4, 512)
tf.Tensor(
[[[    0 50117 50117 ...  1437  1437   403]
  [  230  3808 21770 ... 38323  5457 31245]
  [ 6972 10463  1215 ...  1437  1437  1437]
  [48565 23770  2562 ...     1     1     1]]], shape=(1, 4, 512), dtype=int32)
Actual target shape: (1, 13)
tf.Tensor(
[[    0  1215 16993  1182  6634  1215   438 38914 29015   176 11828 10887
      2]], shape=(1, 13), dtype=int32)


In [69]:
num_batches = sum(1 for _ in dataset)
print(f"Number of batches: {num_batches}")

Number of batches: 100


In [70]:
import tensorflow as tf
from tensorflow.keras import layers, Model

class SequenceEncoder(layers.Layer):
    def __init__(self, vocab_size, embedding_dim, lstm_units):
        super(SequenceEncoder, self).__init__()
        self.embedding = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)
        self.lstm1 = layers.LSTM(lstm_units, return_sequences=True)
        self.lstm2 = layers.LSTM(lstm_units, return_sequences=False)
        self.dense = layers.Dense(lstm_units, activation='relu')
        self.lstm_units = lstm_units
    
    def call(self, inputs):
        x = self.embedding(inputs)
        x = self.lstm1(x)
        x = self.lstm2(x)
        return self.dense(x)
    
    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.lstm_units)

def build_hierarchical_lstm_model(vocab_size, max_seq_length, embedding_dim=256, lstm_units=256):
    # Input layer for multiple sequences (variable number of sequences per function)
    # function_input = layers.Input(shape=(None, max_seq_length))
    function_input = tf.keras.layers.Input(shape=(None, max_seq_length), dtype=tf.int32)
    
    # Sequence encoder layer
    sequence_encoder = SequenceEncoder(vocab_size, embedding_dim, lstm_units)
    
    # Apply the sequence encoder to each sequence in the function
    encoded_sequences = layers.TimeDistributed(sequence_encoder)(function_input)
    
    # LSTM layer to process all encoded sequences
    function_lstm = layers.LSTM(lstm_units)(encoded_sequences)
    
    # Output layer
    output = layers.Dense(vocab_size, activation='softmax')(function_lstm)
    
    # Full model
    model = Model(inputs=function_input, outputs=output)
    
    return model

# Create the model
vocab_size = tokenizer.vocab_size
max_seq_length = 512
model = build_hierarchical_lstm_model(vocab_size, max_seq_length)

# Compile the model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Print model summary
model.summary()



In [71]:
# import tensorflow as tf
# import numpy as np

# # Assuming your dataset is already prepared and called 'dataset'

# # Training parameters
# epochs = 10
# steps_per_epoch = 100  # Adjust based on your dataset size

# # Train the model
# history = model.fit(dataset, epochs=epochs, steps_per_epoch=steps_per_epoch)

# print("Training completed.")

# Training parameters
epochs = 10
steps_per_epoch = 100  # Adjust based on your dataset size

# Train the model
history = model.fit(dataset, epochs=epochs, steps_per_epoch=steps_per_epoch)

print("Training completed.")

Epoch 1/10


ValueError: Exception encountered when calling TimeDistributed.call().

[1mInvalid dtype: NoneType[0m

Arguments received by TimeDistributed.call():
  • inputs=tf.Tensor(shape=(None, None, 512), dtype=int32)
  • training=True
  • mask=None

In [None]:

# Function to predict function name
def predict_function_name(model, function_body):
    sequences, _ = preprocess_function(function_body)
    sequences = tf.expand_dims(sequences, axis=0)  # Add batch dimension
    predicted_tokens = model.predict(sequences)
    
    # Get the index of the most likely token for each position
    predicted_indices = np.argmax(predicted_tokens, axis=-1)[0]
    
    # Decode the indices to get the predicted name
    predicted_name = tokenizer.decode(predicted_indices)
    
    # Remove any special tokens and whitespace
    predicted_name = predicted_name.strip().replace(tokenizer.pad_token, "").replace(tokenizer.eos_token, "")
    
    return predicted_name

# Example usage
function_body = """
def example_function(x, y):
    result = x + y
    print(f"The sum of {x} and {y} is {result}")
    return result
"""

predicted_name = predict_function_name(model, function_body)
print(f"Function body:\n{function_body}")
print(f"Predicted function name: {predicted_name}")

# Try a few more examples
more_examples = [
    """
    def calculate_average(numbers):
        total = sum(numbers)
        count = len(numbers)
        return total / count if count > 0 else 0
    """,
    """
    def is_prime(n):
        if n < 2:
            return False
        for i in range(2, int(n**0.5) + 1):
            if n % i == 0:
                return False
        return True
    """
]

for i, example in enumerate(more_examples, 1):
    predicted_name = predict_function_name(model, example)
    print(f"\nExample {i}:")
    print(f"Function body:\n{example}")
    print(f"Predicted function name: {predicted_name}")