<a href="https://colab.research.google.com/github/zlibutmatthew/Sentence-Completion-using-Keras/blob/main/Sentence_Completion_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import LSTM, Dense, GRU, Embedding
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras import optimizers
import tensorflow as tf
from tensorflow import keras
import torch

## Importing the Data

In [None]:
# Read the training file and keep its 'text' column as a plain Python list of sentences.
train_df1 = pd.read_csv('train.csv')
train_ls1 = train_df1['text'].tolist()
# Preview the first five sentences.
train_ls1[:5]

['it is todays experience that got hcc',
 'meh  needed parts got attitude',
 'went somewhere else got looked after',
 'he was told it would be about $350',
 'this is incredibly poor workmanship']

In [None]:
# Read the test file; its sentence column is named 'Text' (capitalised, unlike the
# training file) and is kept as a plain Python list.
test_df1 = pd.read_csv('test.csv')
test_ls1 = test_df1['Text'].tolist()
# Number of test sentences loaded.
print(len(test_ls1))

92


## Preprocessing

In [None]:
# Drop test sentences containing the typographic apostrophe '’' (the "unknown character"
# originally found at index 20, in 'rented a 20’'). It is presumably absent from the
# training text, so encoding such a sentence with the training vocabulary would fail.
# Filtering by value instead of popping a fixed index keeps this cell idempotent:
# re-running it can no longer remove a second, perfectly valid sentence.
UNKNOWN_CHAR = '’'
test_ls1 = [sentence for sentence in test_ls1 if UNKNOWN_CHAR not in sentence]
print(len(test_ls1))

'rented a 20’'

In [None]:
# Flatten the training sentences into one long string: every sentence is preceded
# by a single space and terminated by a full stop.
train_st = ''.join(' ' + item + '.' for item in train_ls1)

# Total number of characters in the combined training text.
print(len(train_st))

415493


In [None]:
# Collect the distinct characters of the combined training text.
# They are sorted so the character <-> index assignment is identical on every run:
# iterating a bare set of strings follows hash order, which changes between
# interpreter sessions, and a trained model is only usable with the exact mapping
# it was trained with.
chars = sorted(set(train_st))

# Dictionary that maps integer indices to characters
int2char = dict(enumerate(chars))

# Inverse dictionary that maps characters to integer indices
char2int = {char: ind for ind, char in int2char.items()}

print(char2int)

{"'": 0, ' ': 1, '€': 2, 'c': 3, '~': 4, '-': 5, ';': 6, 'l': 7, '?': 8, 'u': 9, 'é': 10, '(': 11, '8': 12, 'í': 13, 't': 14, '6': 15, 'z': 16, 'h': 17, '4': 18, 'g': 19, '5': 20, '@': 21, 'n': 22, ')': 23, '"': 24, 's': 25, 'o': 26, 'd': 27, '!': 28, 'k': 29, 'q': 30, '%': 31, '+': 32, '7': 33, 'b': 34, 'j': 35, 'e': 36, '3': 37, '9': 38, ':': 39, '0': 40, 'à': 41, 'm': 42, 'y': 43, 'a': 44, '.': 45, 'v': 46, '1': 47, '$': 48, '&': 49, '=': 50, 'p': 51, 'x': 52, '*': 53, 'f': 54, 'i': 55, '#': 56, '2': 57, 'w': 58, '×': 59, 'r': 60}


In [None]:
def create_seq(text, length=30):
    """Cut `text` into overlapping character windows that begin right after a space.

    Every returned window holds ``length + 1`` characters: the first ``length``
    characters are the context and the final character is the one that follows it.
    A window is kept only when the character immediately before it is a space.

    Args:
        text: The string to slice.
        length: Number of context characters per window (defaults to 30, the value
            that used to be hard-coded).

    Returns:
        A list of strings, each ``length + 1`` characters long, in text order.
        The number of windows is also printed.
    """
    sequences = []
    # `end` is the index of the last character of the window; the window starts at
    # `end - length`. The loop begins at `length + 1` so that a real preceding
    # character always exists: with `end == length` the look-behind index would be
    # -1, which Python resolves to the *last* character of the text.
    for end in range(length + 1, len(text)):
        start = end - length
        if text[start - 1] == ' ':
            # keep the context plus the character that follows it
            sequences.append(text[start:end + 1])
    print('Total Sequences: %d' % len(sequences))
    return sequences

# Build the character windows from the combined training text and preview the first ten
sequences = create_seq(train_st)
sequences[0:10]

Total Sequences: 80250


['it is todays experience that go',
 'is todays experience that got h',
 'todays experience that got hcc.',
 'experience that got hcc. meh  n',
 'that got hcc. meh  needed parts',
 'got hcc. meh  needed parts got ',
 'hcc. meh  needed parts got atti',
 'meh  needed parts got attitude.',
 ' needed parts got attitude. wen',
 'needed parts got attitude. went']

In [None]:
# Integer-encode character windows with a character -> index mapping.

def encode_seq(seq, mapping=None):
    """Replace every character of every line by its integer index.

    Args:
        seq: Iterable of strings (or other iterables of characters).
        mapping: Dictionary from character to integer index. When omitted, the
            notebook-level ``char2int`` dictionary is used, which is the behaviour
            this function always had.

    Returns:
        A list with one list of integers per input line, in the same order.

    Raises:
        KeyError: If a line contains a character that is not a key of the mapping.
    """
    if mapping is None:
        # Resolved at call time so the function follows the current notebook state.
        mapping = char2int
    return [[mapping[char] for char in line] for line in seq]

# Turn each window of characters into its list of integer indices, then show the first one
sequences = encode_seq(sequences)
sequences[:1]

[[55,
  14,
  1,
  55,
  25,
  1,
  14,
  26,
  27,
  44,
  43,
  25,
  1,
  36,
  52,
  51,
  36,
  60,
  55,
  36,
  22,
  3,
  36,
  1,
  14,
  17,
  44,
  14,
  1,
  19,
  26]]

In [None]:
from sklearn.model_selection import train_test_split

# Number of distinct characters, i.e. the number of classes the model predicts
vocab = len(char2int)
sequences = np.array(sequences)

# Every column except the last is model input; the last column is the character to predict
X = sequences[:, :-1]
# One-hot encode the target character over the whole vocabulary
y = to_categorical(sequences[:, -1], num_classes=vocab)

# Hold out 10% of the windows for validation (fixed seed for a repeatable split)
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

print('Train shape:', X_tr.shape, 'Val shape:', X_val.shape)

Train shape: (72225, 30) Val shape: (8025, 30)


## Building and Training the Model

In [None]:
# define model: character embedding -> GRU -> softmax over the character vocabulary
model = Sequential()
# The input length is read from the training windows instead of repeating the literal 30,
# so the model always matches the data it is fitted on.
model.add(Embedding(vocab, 50, input_length=X_tr.shape[1], trainable=True))
model.add(GRU(150, recurrent_dropout=0.1, dropout=0.1))
model.add(Dense(vocab, activation='softmax'))
# summary() prints the table itself and returns None; wrapping it in print() only
# appended a stray "None" line to the output.
model.summary()

# compile the model (targets are one-hot encoded, hence categorical cross-entropy)
model.compile(loss='categorical_crossentropy', metrics=['acc'], optimizer='adam')
# fit the model
model.fit(X_tr, y_tr, epochs=30, verbose=2, validation_data=(X_val, y_val))

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 30, 50)            3050      
_________________________________________________________________
gru (GRU)                    (None, 150)               90900     
_________________________________________________________________
dense (Dense)                (None, 61)                9211      
Total params: 103,161
Trainable params: 103,161
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/30
2258/2258 - 186s - loss: 2.2961 - acc: 0.3349 - val_loss: 2.0046 - val_acc: 0.4037
Epoch 2/30
2258/2258 - 192s - loss: 1.8832 - acc: 0.4412 - val_loss: 1.7890 - val_acc: 0.4715
Epoch 3/30
2258/2258 - 187s - loss: 1.7231 - acc: 0.4859 - val_loss: 1.6732 - val_acc: 0.5029
Epoch 4/30
2258/2258 - 187s - loss: 1.6283 - acc: 0.5085 - val_loss: 1.6235 - val_acc: 0.5154

<tensorflow.python.keras.callbacks.History at 0x7f9f12367fd0>

## Predicting using the Model

In [None]:
# generate a sequence of characters with a language model
def generate_seq(model, mapping, seq_length, seed_text, n_chars):
    """Extend `seed_text` by `n_chars` characters predicted one at a time by `model`.

    At every step the text generated so far is integer-encoded, cut or left-padded
    to `seq_length`, fed to the model, and the most probable next character is
    appended.

    Args:
        model: Object whose ``predict(batch, verbose=0)`` returns one row of class
            scores per input row.
        mapping: Dictionary from character to integer index.
        seq_length: Number of trailing characters the model receives.
        seed_text: Text to continue.
        n_chars: Number of characters to generate.

    Returns:
        `seed_text` followed by the generated characters.

    Raises:
        KeyError: If the text contains a character that is not a key of `mapping`.
    """
    # Build the reverse lookup once instead of scanning `mapping` for every generated
    # character. setdefault keeps the first character seen for an index, matching the
    # first-match behaviour of a linear scan.
    index_to_char = {}
    for char, index in mapping.items():
        index_to_char.setdefault(index, char)

    in_text = seed_text
    # generate a fixed number of characters
    for _ in range(n_chars):
        # encode the characters as integers
        encoded = [mapping[char] for char in in_text]
        # Keep only the last `seq_length` indices. NOTE: shorter inputs are left-padded
        # with pad_sequences' default value 0, which is also a valid character index
        # whenever `mapping` assigns 0 to a character.
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict_classes() is deprecated; take the arg-max of the class scores and
        # reduce the one-row batch to a plain int so the lookup compares scalars.
        scores = model.predict(encoded, verbose=0)
        yhat = int(np.argmax(scores, axis=-1)[0])
        # reverse map integer to character (empty string if the index is unknown)
        out_char = index_to_char.get(yhat, '')
        # append the predicted character itself, not a leftover loop variable
        in_text += out_char
    return in_text

In [None]:
# Complete every test sentence: the model sees the last 30 characters and 50 new ones are generated
for item in test_ls1:
    completed = generate_seq(model, char2int, 30, item, 50)
    print([item, completed])

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
['awesome people to', 'awesome people to anyway are the best. we will be back. they are  s']
['return was', 'return was a great people. we will be back. they are  staff ']
['these guys are', 'these guys are the best. we will be back. they are  staff and ha']
['i have  rented from them', 'i have  rented from them. they are  staff and had a great job. no stars wa']
['what great individuals and', 'what great individuals and well come back. they are  staff and had a great j']
['i look forward to', 'i look forward to anyway and i was not  be back. they are  staff an']
['thanks for the', 'thanks for the staff and well come back. they are  staff and had']
['thanks 

In [None]:
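# To save the trained model so it can be loaded somewhere else:
## model.save('Insert path here')
# To load a previously saved model:
## model = keras.models.load_model('Insert path here')
# Note: generate_seq also needs the same char2int mapping that was used for training,
# so persist that dictionary alongside the model.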