In [8]:
# Mount Google Drive under /content/drive so files stored there can be read
# by path from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [9]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt

In [10]:
# Load dataset
# Read the CSV from the mounted Drive and keep only the `input` values of
# rows whose `labels` column equals 1.
df = pd.read_csv('/content/drive/MyDrive/nlp project CA/sentence completion /grammer.csv')
keep_row = df["labels"] == 1
text = df.loc[keep_row, "input"].tolist()


In [11]:
# Preprocessing text
# Cap the number of sentences *before* splitting, so rows beyond the limit
# are never processed (and cannot raise if they are not strings).
MAX_SENTENCES = 2000  # Limit dataset size
# Each sentence becomes a list of tokens obtained by splitting on single spaces.
corpus = [line.split(" ") for line in text[:MAX_SENTENCES]]

In [12]:
# Tokenization
# Fit a word-level tokenizer on the pre-split corpus; words not seen during
# fitting are mapped to the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(corpus)

# One extra slot on top of the fitted vocabulary (index 0 is the value
# pad_sequences fills with).
total_words = 1 + len(tokenizer.word_index)
print(f"Total words: {total_words}")

Total words: 3284


In [13]:
# Create input sequences
# For every sentence, each proper prefix of its token ids becomes one training
# input and the token that immediately follows the prefix becomes its label.
input_sequences = []
labels = []
for token_ids in tokenizer.texts_to_sequences(corpus):
    for prefix_len, next_token in enumerate(token_ids[1:], start=1):
        input_sequences.append(token_ids[:prefix_len])
        labels.append(next_token)

In [14]:
# Pad sequences
# Left-pad every prefix to the length of the longest one so they stack into
# a single 2-D array.
max_sequence_len = max(map(len, input_sequences))
input_sequences = np.asarray(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)


In [15]:
# Convert labels to categorical
# One-hot encode the next-token labels over the full vocabulary size; the
# padded prefixes are used as model inputs unchanged.
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)
xs = input_sequences


In [16]:
# Build model
# Embedding -> bidirectional LSTM -> softmax over the vocabulary.
model = Sequential()
# 240-dimensional embedding for each of the `total_words` token ids.
model.add(Embedding(input_dim=total_words, output_dim=240, input_length=max_sequence_len))
# 150 LSTM units, wrapped so the sequence is read in both directions.
model.add(Bidirectional(LSTM(150)))
# One output probability per vocabulary entry.
model.add(Dense(total_words, activation="softmax"))



In [17]:
# Compile model
# Adam with a learning rate of 0.01; categorical cross-entropy matches the
# one-hot encoded labels, and accuracy is tracked as a metric.
adam = Adam(learning_rate=0.01)
model.compile(
    optimizer=adam,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)


In [19]:
# Train model
# Fit for 40 epochs with per-epoch progress output; the returned History
# object is kept for plotting.
history = model.fit(x=xs, y=ys, epochs=40, verbose=1)

Epoch 1/40


KeyboardInterrupt: 

In [None]:
# Save model weights
# Writes only the weights (not the architecture or optimizer state) to a file
# named 'my_model.weights.h5', resolved relative to the current working directory.
model.save_weights('my_model.weights.h5')

In [None]:
def plot_graphs(history, metric):
    """Plot one recorded training metric against the epoch index.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch value sequences (e.g. the value returned by
            ``model.fit``).
        metric: Key to look up in ``history.history``; also used as the
            y-axis label.
    """
    per_epoch_values = history.history[metric]
    plt.plot(per_epoch_values)
    plt.xlabel("Epochs")
    plt.ylabel(metric)
    plt.show()

In [None]:
# Plot accuracy and loss
# One figure per metric recorded during training.
for metric_name in ('accuracy', 'loss'):
    plot_graphs(history, metric_name)

In [None]:
# Load model weights for inference
# Reads weights from 'my_model.weights.h5' (the same relative file name the
# save cell writes) into the existing `model` object.
model.load_weights('my_model.weights.h5')

In [None]:
# Text generation function
def generate_text(seed_text, next_words=10):
    """Extend ``seed_text`` by greedily appending the model's most likely next word.

    Args:
        seed_text: Starting text; it is split on single spaces, the same way
            the training corpus was split before the tokenizer was fitted.
        next_words: Maximum number of words to append.

    Returns:
        The seed text followed by the generated words, separated by single
        spaces. Generation stops early once '.' has been appended, or when
        the predicted index has no entry in the tokenizer's vocabulary
        (e.g. the padding index 0).
    """
    for _ in range(next_words):
        # The tokenizer was fitted on lists of space-split tokens, so feed it
        # the seed in the same form; passing the raw string would be
        # re-tokenised by different rules than the training data.
        token_ids = tokenizer.texts_to_sequences([seed_text.split(" ")])[0]

        # Training inputs were padded to max_sequence_len (the prefixes do not
        # include the label), so inference must use that same length.
        padded = pad_sequences([token_ids], maxlen=max_sequence_len, padding='pre')

        # verbose=0 keeps the per-call progress bar out of the notebook output.
        probabilities = model.predict(padded, verbose=0)
        predicted_index = int(np.argmax(probabilities, axis=-1)[0])

        # Direct index -> word lookup instead of scanning word_index.
        output_word = tokenizer.index_word.get(predicted_index, "")
        if not output_word:
            # No vocabulary entry for this index: nothing sensible to append,
            # and the next iteration would predict the same thing again.
            break

        seed_text += " " + output_word
        if output_word == '.':
            break
    return seed_text

In [None]:
# Interactive loop for text generation
# Keep prompting until the user enters the sentinel '***'; every other input
# is used as a seed and the generated continuation is printed.
while True:
    seed_text = input("Enter sentence (type '***' to stop): ")
    if seed_text != "***":
        print(generate_text(seed_text))
        continue
    break