In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer  
from tensorflow.keras.preprocessing.sequence import pad_sequences  
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset



c:\Users\warun\anaconda3\envs\tensorflow\lib\site-packages\numpy\.libs\libopenblas.FB5AE2TYXYH2IJRDKGDGQ3XBKLKTF43H.gfortran-win_amd64.dll
c:\Users\warun\anaconda3\envs\tensorflow\lib\site-packages\numpy\.libs\libopenblas64__v0.3.23-gcc_10_3_0.dll


In [2]:
# Load the code dataset
data = pd.read_csv('code.csv')

In [3]:
data

Unnamed: 0,code_snippet,expected_output
0,a = 5\nb = 0\nprint(a / b),ZeroDivisionError on line 3
1,print(undefined_var),NameError on line 1
2,x = 10\nx += '5',TypeError on line 2
3,"['apple', 'banana', 'cherry'][3]",IndexError on line 1
4,class MyClass:\n def __init__(self):\n ...,AttributeError on line 3
...,...,...
431,a = '5'\nb = int('a')\nresult = a + b,TypeError on line 3
432,def custom_function(*args):\n return args[1...,IndexError on line 2
433,try:\n x = int('abc')\nexcept ValueError as...,ValueError on line 2
434,import non_existent_module,ModuleNotFoundError on line 1


In [4]:
# Encode the expected_output column
label_encoder = LabelEncoder()
data['expected_output_encoded'] = label_encoder.fit_transform(data['expected_output'])

# Tokenize the code snippets
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['code_snippet'])
total_words = len(tokenizer.word_index) + 1

# Convert text to sequences and pad them to a fixed length
input_sequences = tokenizer.texts_to_sequences(data['code_snippet'])
padded_sequences = pad_sequences(input_sequences)

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, data['expected_output_encoded'], test_size=0.2, random_state=42)

In [6]:
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.long)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.long)  # Use torch.long for multi-class classification
X_test_tensor = torch.tensor(X_test, dtype=torch.long)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.long)  # Use torch.long for multi-class classification

# Create DataLoader for training and testing
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Define the LSTM model for multi-class classification
class LSTMModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = lstm_out[:, -1, :]
        output = self.fc(lstm_out)
        return output

# Initialize the model for multi-class classification
embedding_dim = 50
hidden_dim = 100
output_dim = len(data['expected_output'].unique())  # Number of unique error types
model = LSTMModel(embedding_dim, hidden_dim, total_words, output_dim)

# Define loss and optimizer for multi-class classification
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [7]:
# Training the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for batch_x, batch_y in train_loader:
        optimizer.zero_grad()
        output = model(batch_x)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()

In [8]:
# Evaluate the model
model.eval()
with torch.no_grad():
    predictions = []
    for batch_x, batch_y in test_loader:
        output = model(batch_x)
        _, predicted_labels = torch.max(output, 1)
        predictions.extend(predicted_labels.cpu().numpy())

# Calculate accuracy for multi-class classification
accuracy = accuracy_score(y_test, predictions)
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.8863636363636364


In [9]:
# Save the trained model
torch.save(model.state_dict(), 'lstm_model.pth')

In [10]:
# Load the saved model
loaded_model = LSTMModel(embedding_dim, hidden_dim, total_words, output_dim)
loaded_model.load_state_dict(torch.load('lstm_model.pth'))
loaded_model.eval()


LSTMModel(
  (embedding): Embedding(200, 50)
  (lstm): LSTM(50, 100, batch_first=True)
  (fc): Linear(in_features=100, out_features=27, bias=True)
)

In [13]:
#predict the error type

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Define the maximum sequence length used during training
MAX_SEQUENCE_LENGTH = 100

# Define the LSTM model class
class LSTMModel(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, vocab_size, output_dim):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        lstm_out = lstm_out[:, -1, :]
        output = self.fc(lstm_out)
        return output

# Instantiate the model with the same parameters
embedding_dim = 50
hidden_dim = 100
output_dim = len(data['expected_output'].unique())  # Number of unique error types
model = LSTMModel(embedding_dim, hidden_dim, total_words, output_dim)

# Load the saved model
model.load_state_dict(torch.load('lstm_model.pth'))
model.eval()

# Function to predict error type for a given code snippet
def predict_error(code_snippet, tokenizer):
    # Tokenize and pad the code snippet
    input_sequence = tokenizer.texts_to_sequences([code_snippet])
    padded_sequence = pad_sequences(input_sequence, maxlen=MAX_SEQUENCE_LENGTH)
    input_tensor = torch.tensor(padded_sequence, dtype=torch.long)

    # Make prediction
    with torch.no_grad():
        output = model(input_tensor)
        _, predicted_label = torch.max(output, 1)

    # Convert the predicted label to the original error type
    predicted_error = label_encoder.classes_[predicted_label.item()]
    
    return predicted_error

# Example usage
code_to_predict = """
a = 5
b = 0
print(a / b)
print(a+b)
print(b-a)
This is a test
"""
predicted_error = predict_error(code_to_predict, tokenizer)
print(f"Predicted Error: {predicted_error}")

# Function to get error output using existing tool if model fails to predict
def get_error_output(code):
    global_vars = {}
    error_report = []

    for line_number, line in enumerate(code.split('\n'), 1):
        try:
            exec(line, global_vars)
        except Exception as e:
            error_message = f"Error on line {line_number}: {str(e)}"
            error_report.append(error_message)
    
    return error_report

# Example usage
error_output = get_error_output(code_to_predict)
for error in error_output:
    print(error)


Predicted Error: Success
5
-5
Error on line 4: division by zero
Error on line 7: invalid syntax (<string>, line 1)
