In [1]:
import os
import json
import numpy as np
import tensorflow as tf

In [2]:
# Record the TensorFlow version this notebook was executed with
print(tf.__version__)

2.8.0


In [3]:
# Batching configuration read by read_batches():
# BATCH_SIZE is the number of rows (parallel character streams) per batch,
# SEQUENCE_LENGTH is the number of characters in each row.
BATCH_SIZE = 16
SEQUENCE_LENGTH = 64

In [9]:
# Directory holding the raw .txt files; generated artefacts are written here too
input_data_dir = "/content/drive/MyDrive/NeuralNets/Project/data/"

# File names of the merged corpus and of the character -> index mapping
combined_fname = "combined.txt"
char_to_idx_fname = "char_to_index.json"

# Full path of the merged corpus file
combined_file_path = os.path.join(input_data_dir, combined_fname)

In [10]:
# Destination files for the encoded inputs and their one-hot labels
train_data_path = os.path.join(input_data_dir, "train.npy")
label_data_path = os.path.join(input_data_dir, "labels.npy")

In [13]:
# Combine all text file contents to a single file.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# make the concatenation order (and everything derived from it) reproducible.
with open(combined_file_path, "w") as rp:
  for fname in sorted(os.listdir(input_data_dir)):
    # The output file lives in the same directory; never feed it back into itself.
    if fname.endswith(".txt") and fname != combined_fname:
      print("Processing file: ", fname)
      f_path = os.path.join(input_data_dir, fname)
      with open(f_path, "r") as fp:
        content = fp.read()

      # Separate consecutive files with a newline
      rp.write(content)
      rp.write("\n")

Processing file:  ash.txt
Processing file:  christ.txt
Processing file:  hornpipes.txt
Processing file:  jigs.txt
Processing file:  morris.txt
Processing file:  mq.txt
Processing file:  playford.txt
Processing file:  reelsac.txt
Processing file:  reelsdg.txt
Processing file:  reelshl.txt
Processing file:  rt.txt
Processing file:  slipjigs.txt
Processing file:  uz.txt
Processing file:  waltzes.txt


In [14]:
#return file content of a text file when given a path
def read_file(path):
  """Return the entire contents of the text file at `path` as one string."""
  with open(path, "r") as handle:
    return handle.read()

In [16]:
# Load the merged corpus and give every distinct character an integer id;
# ids follow the sorted order of the characters.
text = read_file(combined_file_path)
char_to_index = {ch: idx for idx, ch in enumerate(sorted(set(text)))}

# Persist the mapping as JSON next to the data
mapping_path = os.path.join(input_data_dir, char_to_idx_fname)
with open(mapping_path, "w") as mapping_file:
  json.dump(char_to_index, mapping_file)
  print("Json file successfully written")

In [17]:
# Number of characters in the corpus (len() works on the string directly,
# no need to materialise a list of characters first)
len(text)

452499

In [None]:
# Encode the whole corpus as an int32 array of character ids
T = np.array(list(map(char_to_index.__getitem__, text)), dtype=np.int32)

# Vocabulary size = number of distinct characters in the corpus
VOCAB_SIZE = len(char_to_index)

print(VOCAB_SIZE)

93


In [None]:
def read_batches(T, vocab_size, batch_size=None, sequence_length=None):
  """Cut an encoded character stream into next-character training batches.

  The first `batch_size * (len(T) // batch_size)` entries of `T` are viewed as
  `batch_size` contiguous streams of equal length. A window of
  `sequence_length` characters is slid over all streams in lock-step (stride
  `sequence_length`); every window position yields one batch. The target for
  each input character is the character that follows it in the same stream,
  one-hot encoded over `vocab_size` classes.

  Args:
    T: 1-D array of integer character ids, each in `[0, vocab_size)`.
    vocab_size: number of classes used for the one-hot labels.
    batch_size: rows per batch; defaults to the module-level `BATCH_SIZE`.
    sequence_length: characters per row; defaults to the module-level
      `SEQUENCE_LENGTH`.

  Returns:
    `(train, labels)` as float64 arrays of shape
    `(num_batches, batch_size, sequence_length)` and
    `(num_batches, batch_size, sequence_length, vocab_size)`.
    When `T` is too short to produce a single batch, both arrays are returned
    with `num_batches == 0` but otherwise correctly shaped.

  Raises:
    ValueError: if `batch_size` or `sequence_length` is not positive.
    IndexError: if `T` contains an id outside the vocabulary or is not of an
      integer dtype.
  """
  if batch_size is None:
    batch_size = BATCH_SIZE
  if sequence_length is None:
    sequence_length = SEQUENCE_LENGTH
  if batch_size <= 0 or sequence_length <= 0:
    raise ValueError("batch_size and sequence_length must be positive")

  T = np.asarray(T)
  # Characters available to each of the parallel streams (integer division).
  batch_chars = T.shape[0] // batch_size

  # Window start offsets. The bound leaves room for the "next character"
  # target of the last input position, so a window plus its targets never
  # leaves its own stream.
  starts = range(0, batch_chars - sequence_length, sequence_length)

  train = np.zeros((len(starts), batch_size, sequence_length))
  labels = np.zeros((len(starts), batch_size, sequence_length, vocab_size))
  if len(starts) == 0:
    return train, labels

  # Row b holds the characters of stream b.
  streams = T[:batch_chars * batch_size].reshape(batch_size, batch_chars)
  row_idx = np.arange(batch_size)[:, None]
  col_idx = np.arange(sequence_length)[None, :]

  for n, start in enumerate(starts):
    train[n] = streams[:, start:start + sequence_length]
    # Targets are the inputs shifted one character to the right.
    targets = streams[:, start + 1:start + sequence_length + 1]
    # One-hot encode: labels[n, b, i, targets[b, i]] = 1
    labels[n, row_idx, col_idx, targets] = 1

  return train, labels

In [None]:
# Build the input batches and their one-hot next-character labels from the encoded corpus
train, labels = read_batches(T,VOCAB_SIZE)

In [None]:
# Sanity check: show the shapes of the generated input and label arrays
print(train.shape)
print(labels.shape)

(441, 16, 64)
(441, 16, 64, 93)


In [None]:
# Write the input and label arrays to train_data_path / label_data_path in NumPy .npy format
np.save(train_data_path, train)
np.save(label_data_path, labels)