In [1]:
import numpy as np

# Pin NumPy's global RNG so the synthetic corpus is identical on every run
np.random.seed(42)

# Corpus configuration
num_samples = 1000   # how many (essay, score) pairs to generate
max_length = 500     # Maximum length of the essays
vocab_size = 10000   # Vocabulary size for the tokenizer


def _draw_sample(length_cap, n_vocab):
    """Draw one synthetic (essay, score) pair from NumPy's global RNG.

    The essay is a space-separated string of random integer token ids drawn
    from ``range(n_vocab)``; its token count is a random integer in
    ``[100, length_cap)``.  The score is a random integer in ``[0, 10)``.
    Draw order is length -> tokens -> score, which matters for a seeded run.
    """
    n_tokens = np.random.randint(100, length_cap)
    token_ids = np.random.choice(n_vocab, n_tokens)
    text = ' '.join(token_ids.astype(str))
    grade = np.random.randint(0, 10)
    return text, grade


# One helper call per sample keeps the per-sample RNG draw order intact
samples = [_draw_sample(max_length, vocab_size) for _ in range(num_samples)]

essays = [text for text, _ in samples]
# Scores are exposed as a numpy array
scores = np.array([grade for _, grade in samples])

In [2]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

# Tokenize the text. `vocab_size` is the value set in the data-generation
# cell.
# NOTE(review): Keras documents that `num_words` keeps only the
# `num_words - 1` most frequent words -- confirm that cap is intended.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(essays)
word_index = tokenizer.word_index

# Convert text to sequences of integers
sequences = tokenizer.texts_to_sequences(essays)

# Pad sequences to a fixed length. `max_length` is intentionally NOT
# re-assigned here: the data-generation cell is the single source of truth,
# so the padding width cannot silently drift from the essay-length setting.
data = sequence.pad_sequences(sequences, maxlen=max_length)

# Split the data into training and testing sets (first 80% / last 20%,
# in generation order -- no shuffling is applied before the split)
train_ratio = 0.8
train_samples = int(len(data) * train_ratio)

x_train = data[:train_samples]
y_train = scores[:train_samples]

x_test = data[train_samples:]
y_test = scores[train_samples:]

# Build and train the LSTM model
embedding_dim = 100
hidden_units = 64

model = Sequential()
# `input_length` is omitted on purpose: it is deprecated in current Keras
# releases and the LSTM does not need a static sequence length; the input
# shape is picked up from the data on the first `fit` call.
model.add(Embedding(vocab_size, embedding_dim))
model.add(LSTM(hidden_units))
# Single linear unit: the score is predicted as a regression target
model.add(Dense(1, activation='linear'))

model.compile(loss='mean_squared_error', optimizer='adam')
# NOTE(review): the held-out split is also passed as validation data, so the
# "Test Loss" printed below is computed on the same samples as the per-epoch
# validation loss. Carve out a separate validation split before using it for
# any tuning or early stopping.
model.fit(x_train, y_train, batch_size=32, epochs=10, validation_data=(x_test, y_test))

# Evaluate the trained model
loss = model.evaluate(x_test, y_test)
print('Test Loss:', loss)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 8.765266418457031


In [5]:
import pickle

# Persist the trained network to disk
model.save('essay_grading_model.h5')

# Serialise the fitted tokenizer alongside the model so the same
# text -> integer mapping can be restored at inference time.
# Only unpickle this file from a trusted location.
with open('tokenizer.pickle', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file, protocol=pickle.HIGHEST_PROTOCOL)