In [None]:
# Optional code to download the pretrained GloVe embeddings
# from urllib.request import urlretrieve
# import os
# from zipfile import ZipFile

# def download(url, file):
#     if not os.path.isfile(file):
#         print("Download file... " + file + " ...")
#         urlretrieve(url,file)
#         print("File downloaded")

# download('http://nlp.stanford.edu/data/glove.6B.zip','Glove.zip')
# print("All the files are downloaded")
# def uncompress_features_labels(dir):
#     if(os.path.isdir('data')):
#         print('Data extracted')
#     else:
#         with ZipFile(dir) as zipf:
#             zipf.extractall('data')
# uncompress_features_labels('Glove.zip')

In [None]:
#Importing libraries
import string
import re
import os
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np
import tensorflow.keras
from tensorflow.keras.layers import Embedding,Dense,LSTM,CuDNNLSTM,Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
import sys
from random import randint
from tensorflow.keras.preprocessing.sequence import pad_sequences
import random

In [None]:
#Function to load corpus
def load_text(filename, encoding=None):
  """Read a whole text file and return its contents as a single string.

  Args:
    filename: Path of the text file to read.
    encoding: Text encoding handed to ``open``. ``None`` (the default) keeps
      the platform default.

  Returns:
    The complete file contents as a ``str``.

  Raises:
    OSError: If the file cannot be opened or read.
  """
  # The context manager closes the handle even when read() raises.
  with open(filename, 'r', encoding=encoding) as file:
    return file.read()

# Word Based Text Generation for Autocompleting Sentences

In [None]:
# Configuration for the word-level model.
# Vocabulary cap: given to the Tokenizer as num_words and used as the
# Embedding input size and the width of the softmax output layer.
VOCAB_SIZE = 30000
# Size of each word vector learned by the Embedding layer.
EMBEDDING_DIM = 128

# Number of tokens every model input is padded/truncated to.
MAX_LEN = 10
# Encoded sequences with fewer tokens than this are skipped when building X/y.
MIN_WORDS = 4

# pad_sequences modes: zeros are appended after the tokens ('post') and
# over-long sequences lose their earliest tokens ('pre').
PADDING = 'post'
TRUNCATING = 'pre'

# Samples per generator batch and number of epochs requested from training.
# NOTE: BATCH_SIZE, EPOCHS and EMBEDDING_DIM are reassigned further down for
# the character-level model, so re-run this cell before retraining the word model.
BATCH_SIZE = 256
EPOCHS = 100

In [None]:
#Function for preprocessing the text
def clean_text_word(text):
  """Normalise raw text into a flat list of lower-case word tokens.

  Full stops are kept as standalone '.' tokens so sentence boundaries stay
  visible. All other ASCII punctuation is stripped from each token, and
  tokens made up only of digits are discarded.

  Tokens that consisted purely of punctuation (e.g. '--') become empty after
  stripping and are dropped, so they no longer inflate sentence lengths.

  Args:
    text: Raw corpus text.

  Returns:
    List of non-empty, lower-cased tokens, with '.' marking sentence ends.
  """
  # Detach full stops so each one survives as its own token.
  text = text.replace('.', ' .')
  # string.punctuation already includes the backslash; only '.' is spared.
  strip_table = str.maketrans('', '', string.punctuation.replace('.', ''))

  tokens = []
  for raw in text.split():
    word = raw.translate(strip_table)
    # Skip tokens emptied by punctuation stripping, and bare numbers.
    if not word or word.isdigit():
      continue
    tokens.append(word.lower())
  return tokens

In [None]:
#Function for separating all sentences from the text
def generate_sentences(tokens):
  """Group a flat token stream into sentences that each end with '.'.

  Tokens following the last '.' never form a sentence and are discarded.
  Note that a function with this same name is defined again later in the
  file; whichever definition ran last is the one in effect.

  Args:
    tokens: Iterable of word tokens where '.' marks the end of a sentence.

  Returns:
    List of sentences, each a list of tokens including the closing '.'.
  """
  sentences = []
  current = []
  for token in tokens:
    current.append(token)
    if token == '.':
      # Sentence finished: store it and start collecting the next one.
      sentences.append(current)
      current = []
  return sentences

In [None]:
#Function for generating sequences which is used to build the train data
def generate_sequences(sentences):
  """Slide a fixed-size window over every sentence to build training lines.

  For a sentence of n tokens the window holds int(n * 1/3) context tokens plus
  the token that follows them. One space-joined line is produced per window
  position.

  Args:
    sentences: List of sentences, each a list of string tokens.

  Returns:
    List of space-joined token windows, in sentence order.
  """
  fraction = 1/3
  lines = []
  for words in sentences:
    context = int(len(words)*fraction)
    # `end` indexes the last token of the window (the prediction target).
    lines.extend(
      ' '.join(words[end-context:end+1]) for end in range(context, len(words))
    )
  return lines

In [None]:
# Load the corpus and flatten it into cleaned word tokens.
raw_text=load_text('internet_archive_scifi_v3.txt')
tokens=clean_text_word(raw_text)

# Only tokens 700..1,999,999 are used (presumably to skip front matter and
# bound memory use - TODO confirm the intent of these offsets).
sentences=generate_sentences(tokens[700:2000000])

sequences=generate_sequences(sentences)

# '.' is absent from the filter list so it stays a token of its own.
# The backslash is written as '\\' to avoid the invalid escape '\]'; the
# resulting filter string is unchanged.
tokenizer=Tokenizer(filters='!"#$%&()*+,-/:;<=>?@[\\]^_`{|}~\t\n',num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(sequences)

sen_sequences=tokenizer.texts_to_sequences(sequences)
# The encoded sequences have different lengths; NumPy >= 1.24 refuses to build
# such a ragged array unless dtype=object is given explicitly.
sen_sequences=np.array(sen_sequences,dtype=object)

In [None]:
# Split each encoded sequence into (context tokens, next-word target).
X=[]
y=[]
for sentence in sen_sequences:
  # Sequences with fewer than MIN_WORDS tokens are left out.
  if(len(sentence)>=MIN_WORDS):
    X.append(sentence[:-1])
    y.append(sentence[-1])



In [None]:
# Targets become a column vector of word indices.
y=np.array(y)
y=y.reshape((-1,1))
#Padding all train data to have equal length
# X is deliberately left as a ragged Python list: converting it with np.array
# fails on current NumPy, and pad_sequences accepts the list directly. One call
# pads every row, instead of one pad_sequences call per sample.
X_pad=pad_sequences(X,maxlen=MAX_LEN,padding=PADDING,truncating=TRUNCATING)
seq_length=X_pad.shape[1]

In [None]:
#Function for loading the pretrained glove embeddings
# def load_glove_embeddings(text_location):
#   glove_embeddings={}
#   f=open(text_location)
#   for line in f:
#     values=line.split()
#     word=values[0]
#     coefs=np.asarray(values[1:],dtype='float32')
#     glove_embeddings[word]=coefs
#   f.close()
#   return glove_embeddings

# glove_embeddings=load_glove_embeddings('data/glove.6B.200d.txt')

# EMBEDDING_DIM=200 #Change according to the embedding dimension of the file used
# word_index=tokenizer.word_index
# glove_matrix=np.zeros((len(word_index)+1,EMBEDDING_DIM))
# for word,i in word_index.items():
#   vector=glove_embeddings.get(word)
#   if vector is not None:
#     glove_matrix[i]=vector



In [None]:
# Word-level next-word model: embedding -> five stacked CuDNN LSTMs (with one
# dropout in between) -> dense head ending in a softmax over VOCAB_SIZE words.
model=Sequential([
  # GloVe alternative for the embedding layer:
  # Embedding(len(word_index)+1,EMBEDDING_DIM,weights=[glove_matrix],input_length=seq_length,trainable=False)
  Embedding(VOCAB_SIZE,EMBEDDING_DIM,input_length=seq_length),
  CuDNNLSTM(100,return_sequences=True),
  CuDNNLSTM(100,return_sequences=True),
  Dropout(0.2),
  CuDNNLSTM(100,return_sequences=True),
  CuDNNLSTM(100,return_sequences=True),
  CuDNNLSTM(100),
  #Use the commented layers instead of CuDNN if using CPU to train
  # LSTM(128,return_sequences=True,recurrent_dropout=0.2),
  # LSTM(128,return_sequences=True,recurrent_dropout=0.2),
  # LSTM(128),
  Dense(512,activation='relu'),
  Dense(VOCAB_SIZE,activation='softmax'),
])

In [None]:
# Create the checkpoint folder on first use.
if not os.path.exists('weights/'):
  os.mkdir('weights/')
  print("weights folder created")
else:
  print("Weights folder already exists")

model.compile(loss='categorical_crossentropy',optimizer=Adam(),metrics=['acc'])
# A checkpoint is written whenever training accuracy reaches a new maximum.
filepath="weights/weights-improvement-{epoch:02d}-{acc:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath,monitor='acc',verbose=1,save_best_only=True,mode='max')


In [None]:
#Generator function to divide into batches and train manually
def text_generator_v3(X, labels,vocab_size, batchsize, mode="train"):
    """Yield (inputs, one-hot targets) batches forever, cycling over the data.

    Targets are one-hot encoded batch by batch so the full one-hot matrix never
    has to be held in memory. The final batch of each pass may be smaller than
    `batchsize`.

    Args:
        X: Indexable collection of model inputs (sliced as X[start:stop]).
        labels: Indexable collection of integer class ids, aligned with X.
        vocab_size: Number of classes for the one-hot encoding.
        batchsize: Number of samples per batch; must be positive.
        mode: Accepted for compatibility; not used by this generator.

    Yields:
        Tuples (x_batch, y_batch) where y_batch is the one-hot encoding of the
        matching slice of `labels`.

    Raises:
        ValueError: If X is empty or batchsize is not positive. Without these
            checks the generator would loop forever without a usable batch.
    """
    n_samples = len(X)
    if n_samples == 0:
        raise ValueError("text_generator_v3 needs at least one sample; X is empty")
    if batchsize <= 0:
        raise ValueError("batchsize must be a positive integer")

    while True:
        for start in range(0, n_samples, batchsize):
            stop = start + batchsize
            batch_y = tensorflow.keras.utils.to_categorical(labels[start:stop],vocab_size)
            yield X[start:stop], batch_y

In [None]:

# Train the word-level model from the batch generator; the checkpoint callback
# is invoked during training.
model.fit_generator(
  text_generator_v3(X_pad,y,VOCAB_SIZE,BATCH_SIZE),
  steps_per_epoch=X_pad.shape[0]//BATCH_SIZE,
  epochs=EPOCHS,
  verbose=1,
  callbacks=[checkpoint],
)

In [None]:
def generate_text(text,model,tokenizer,reverse_word_index,max_len=10,max_words=100):
  """Complete a prompt word by word until a '.' is generated.

  At every step the running text is encoded, padded to `max_len`, and passed
  to the model. The next word is sampled from the two most probable classes:
  probability 0.7 for the best and 0.3 for the runner-up. The prompt and each
  generated word are also echoed to stdout as they are produced.

  Args:
    text: Prompt to complete.
    model: Object whose ``predict`` returns class probabilities for a batch.
    tokenizer: Object whose ``texts_to_sequences`` encodes a list of strings.
    reverse_word_index: Dict mapping class index -> word.
    max_len: Input length used for padding/truncation (default 10).
    max_words: Upper bound on generated words so a model that never emits '.'
      cannot loop forever. ``None`` disables the bound.

  Returns:
    The prompt followed by the generated words, joined by single spaces.
  """
  intext = text.lower()
  generated = []
  sys.stdout.write(text+' ')
  while max_words is None or len(generated) < max_words:
    seq = tokenizer.texts_to_sequences([intext])
    pad = pad_sequences(seq,maxlen=max_len,padding='post',truncating='pre')
    pred = model.predict(pad)
    # Indices of the two highest-probability classes, best first.
    top2 = pred[0].argsort()[-2:][::-1]
    choice = np.random.choice(top2,p=[0.7,0.3])
    # An index with no word (e.g. 0, the padding id) ends generation instead
    # of raising KeyError.
    word = reverse_word_index.get(int(choice))
    if word is None:
      break
    sys.stdout.write(word+' ')
    generated.append(word)
    intext = intext+' '+word
    if word == '.':
      break
  return ' '.join([text]+generated)


In [None]:
# `word_index` was only ever assigned in the commented-out GloVe cell, so take
# the word -> index mapping straight from the fitted tokenizer.
word_index = tokenizer.word_index
# Inverted mapping (index -> word) used to decode model predictions.
reverse_word_index = {index: word for word, index in word_index.items()}
# model.load_weights('weights/weights-improvement-10-0.44.hdf5')

In [None]:
# Prompt that the word-level model is asked to complete.
text='Their lights glitter within'
gen_text=generate_text(
  text=text,
  model=model,
  tokenizer=tokenizer,
  reverse_word_index=reverse_word_index,
)
# generated_sentences.append("epoch : 2 "+gen_text)

#                        Character Based Text Generation for Autocompleting Sentences

In [None]:
# Configuration for the character-level model.
# Input length (in characters) declared for the character model's Embedding layer.
MAXLEN = 25
# NOTE: the next three names were already assigned for the word-level model
# earlier in the notebook; this cell overwrites those values.
EMBEDDING_DIM = 32
EPOCHS = 5
BATCH_SIZE = 1024

In [None]:
# Reads the same corpus file as the word-level section; reloading it here
# presumably lets the character-level section run on its own - confirm.
raw_text=load_text('internet_archive_scifi_v3.txt')

In [None]:
def clean_text_char(text):
  """Normalise raw text into one lower-case, space-separated string.

  Isolated single ASCII letters are removed first. Full stops are then
  detached into standalone '.' tokens, all other ASCII punctuation is
  stripped, digit-only tokens are dropped, and the rest is lower-cased.

  Tokens that consisted purely of punctuation become empty after stripping
  and are dropped, so the result never contains runs of spaces.

  Args:
    text: Raw corpus text.

  Returns:
    The cleaned tokens joined by single spaces.
  """
  # Remove one-letter words (any single ASCII letter between word boundaries).
  text = re.sub(r"\b[a-zA-Z]\b", "", text)
  # Detach full stops so each one survives as its own token.
  text = text.replace('.', ' .')
  # string.punctuation already includes the backslash; only '.' is spared.
  strip_table = str.maketrans('', '', string.punctuation.replace('.', ''))

  words = []
  for raw in text.split():
    word = raw.translate(strip_table)
    # Skip tokens emptied by punctuation stripping, and bare numbers.
    if not word or word.isdigit():
      continue
    words.append(word.lower())
  return ' '.join(words)

In [None]:
# Cleaned corpus as one lower-case string; source for the character vocabulary.
upd_text=clean_text_char(raw_text)

In [None]:
#Creating dictionary for char vocabulary
chars=sorted(list(set(upd_text)))
char_to_int = dict((c, i) for i,c in enumerate(chars))
int_to_char = dict((c,i) for c,i in enumerate(chars)) 
vocab_size=len(char_to_int)


In [None]:
def generate_sentences(tokens, min_len=5):
  """Group tokens into '.'-terminated sentences, dropping very short ones.

  This definition replaces the earlier function of the same name once the
  cell has run; the earlier version applied no length filter.

  Args:
    tokens: Iterable of word tokens where '.' marks the end of a sentence.
    min_len: Minimum number of tokens (closing '.' included) a sentence needs
      to be kept. Defaults to 5, the threshold previously hard-coded here.

  Returns:
    List of kept sentences, each a list of tokens including the closing '.'.
    Tokens after the last '.' are discarded.
  """
  sentences = []
  current = []
  for token in tokens:
    current.append(token)
    if token == '.':
      if len(current) >= min_len:
        sentences.append(current)
      # Start a new sentence whether or not the finished one was kept.
      current = []
  return sentences

In [None]:
# Split the cleaned text on whitespace, skip the first 700 tokens (presumably
# front matter - TODO confirm) and group the rest into sentences.
sentences=generate_sentences(upd_text.split()[700:])

In [None]:
def generate_char_sequences(sentences, split=0.25, char_map=None):
  """Build (character window, next character) training pairs.

  Each sentence is joined into one space-separated string. A window of
  int(len(string) * split) characters slides over it one position at a time;
  the window is the input and the character right after it is the target.

  The input list is no longer modified: the joined strings are local, so the
  function can be called again on the same data.

  Args:
    sentences: List of sentences, each a list of string tokens.
    split: Fraction of the sentence length used as window size (default 0.25).
    char_map: Dict mapping character -> integer id. Defaults to the
      module-level ``char_to_int``.

  Returns:
    Tuple (X, y): X is a list of integer-id lists (one per window) and y the
    list of integer ids of the characters following each window.

  Raises:
    KeyError: If a character is missing from the mapping.
  """
  if char_map is None:
    char_map = char_to_int
  X,y = [],[]
  for words in sentences:
    sentence = ' '.join(words)
    interval = int(len(sentence)*split)
    for i in range(len(sentence)-interval):
      X.append([char_map[char] for char in sentence[i:i+interval]])
      y.append(char_map[sentence[i+interval]])
  return X,y
       

In [None]:
#Generate character sequences 
X,y=generate_char_sequences(sentences[:500000])
X,y = np.array(X), np.array(y)

In [None]:
# Create the output folder for the cached arrays if it is missing.
data_folder='data/'
if not os.path.exists(data_folder):
  os.mkdir(data_folder)
  print("Data folder created")
else:
  print("Folder exists")

filename='X_y_data.npz'
print(os.path.join(data_folder,filename))
#Saving the generated sequences
# Stored under the keys 'a' (inputs) and 'b' (targets).
np.savez_compressed(os.path.join(data_folder,filename),a=X,b=y)

In [None]:
# Reload the arrays saved above. allow_pickle=True makes np.load unpickle
# object arrays, which can execute code - only load files you created yourself.
data=np.load(os.path.join(data_folder,filename),allow_pickle=True)

In [None]:
# 'a' holds the input windows and 'b' the targets, as written by savez_compressed.
X=data['a']
y=data['b']

In [None]:
# Pad/truncate every window to the model's input length (MAXLEN, not a literal).
# NOTE: this cell overwrites X and y, so re-run the loading cell before
# running it a second time.
X=pad_sequences(X,maxlen=MAXLEN,padding='post',truncating='pre')
y = y.reshape((-1,1))
# num_classes pins the one-hot width to the softmax size; without it the width
# is inferred as max(y)+1, which is smaller whenever the highest character id
# never occurs as a target.
y = tensorflow.keras.utils.to_categorical(y,num_classes=vocab_size)

In [None]:
# Character-level next-character model: embedding -> four stacked CuDNN LSTMs
# -> batch normalisation -> dense head ending in a softmax over the vocabulary.
# The embedding width comes from EMBEDDING_DIM instead of a duplicated literal.
model=Sequential([
  Embedding(vocab_size,EMBEDDING_DIM,input_length=MAXLEN),
  CuDNNLSTM(128,return_sequences=True),
  CuDNNLSTM(128,return_sequences=True),
  CuDNNLSTM(128,return_sequences=True),
  CuDNNLSTM(128),
  BatchNormalization(momentum=0.8),
  Dense(512,activation='relu'),
  Dense(1024,activation='relu'),
  Dense(vocab_size,activation='softmax'),
])

In [None]:
model.compile(loss='categorical_crossentropy',optimizer=Adam(),metrics=['acc'])
# Create the checkpoint folder up front; otherwise the first save fails
# when the folder does not exist.
checkpoint_dir='drive/My Drive/SciFi Text Generation/weights/char_500k_weights'
os.makedirs(checkpoint_dir,exist_ok=True)
# A checkpoint is written whenever training accuracy reaches a new maximum.
call_backs=ModelCheckpoint(checkpoint_dir+'/weights-improvement-{epoch:02d}-{acc:.2f}.hdf5',monitor='acc',verbose=1,save_best_only=True,mode='max')

In [None]:
# Train the character-level model on the in-memory arrays.
model.fit(
  X,
  y,
  batch_size=BATCH_SIZE,
  epochs=EPOCHS,
  verbose=1,
  callbacks=[call_backs],
)

In [None]:
#Function for creating batches
# def generator_v4(sentences, batch_size,vocab_size,maxlen=25):
#   sentences=np.array(sentences)
#   while True:
#     idx = np.random.choice(len(sentences),batch_size)
#     X,y = generate_char_sequences(sentences[[idx]])
#     X,y = np.array(X), np.array(y)
#     X = pad_sequences(X, maxlen=maxlen, padding='post', truncating='pre')
#     y = tensorflow.keras.utils.to_categorical(y,vocab_size)
#     yield X,y


# batch_size=512
# model.fit_generator(generator_v4(sentences[:500000],batch_size,vocab_size),steps_per_epoch=len(sentences[:500000])//batch_size,epochs=100,verbose=1,callbacks=[call_backs])
  

In [None]:
def generate_text_char(text,model,char_to_int,int_to_char,max_len=25,max_chars=1000):
  """Complete a prompt character by character until a '.' is generated.

  The prompt is echoed to stdout, then each generated character is written as
  it is produced. At every step the next character is sampled from the two
  most probable classes: probability 0.7 for the best, 0.3 for the runner-up.

  Args:
    text: Prompt to complete; it is lower-cased before encoding.
    model: Object whose ``predict`` returns class probabilities for a batch.
    char_to_int: Dict mapping character -> integer id.
    int_to_char: Dict mapping integer id -> character.
    max_len: Input length used for padding/truncation (default 25).
    max_chars: Upper bound on generated characters so a model that never
      emits '.' cannot loop forever. ``None`` disables the bound.

  Returns:
    None. The generated text is only written to stdout.
  """
  sys.stdout.write(text)
  # Prompt characters missing from the vocabulary (e.g. punctuation removed
  # during cleaning) are skipped instead of raising KeyError.
  intext = [char_to_int[char] for char in text.lower() if char in char_to_int]
  produced = 0
  while max_chars is None or produced < max_chars:
    # Training inputs were padded with padding='post'; inference uses the same
    # side so short prompts are presented in the layout the model was trained on.
    pad = pad_sequences([intext],maxlen=max_len,padding='post',truncating='pre')
    pred = model.predict(pad)
    # Indices of the two highest-probability classes, best first.
    top2 = pred[0].argsort()[-2:][::-1]
    choice = int(np.random.choice(top2,p=[0.7,0.3]))
    char = int_to_char[choice]
    sys.stdout.write(char)
    intext.append(choice)
    produced += 1
    if char == '.':
      break

In [None]:
# Prompt that the character-level model is asked to complete.
text='Beneath the willow there are'
generate_text_char(
  text=text,
  model=model,
  char_to_int=char_to_int,
  int_to_char=int_to_char,
)