In [None]:
import numpy as np
import tensorflow as tf
from os import listdir
from os.path import isfile, join
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Root folder holding the 'positiveReviews/' and 'negativeReviews/' sub-folders.
# Keep the trailing slash: the paths below are built by plain string concatenation.
data_path = "/path_to_your_data_folder/"


def _listReviewFiles(folder):
    """Return the sorted paths of the regular files directly inside `folder`.

    `folder` must end with a path separator because each result is built as
    `folder + name`. Sub-directories are skipped. The result is sorted
    because `listdir` yields entries in arbitrary order; a fixed order keeps
    the row order of the encoded matrices reproducible between runs.
    """
    return sorted(folder + name for name in listdir(folder) if isfile(join(folder, name)))


positiveFiles = _listReviewFiles(data_path + 'positiveReviews/')
negativeFiles = _listReviewFiles(data_path + 'negativeReviews/')

In [None]:
def tokenFiles(files,maxNumWords,tk,maxNumFiles=None):
    """Fit a tokenizer on the text of the given files.

    Args:
        files: iterable of paths to UTF-8 text files.
        maxNumWords: unused here; kept so existing callers keep working.
        tk: instance of the keras tokenizer class (anything providing
            `fit_on_texts` and `word_counts`). It is updated in place.
        maxNumFiles: optional cap on how many files are read, counted from
            the start of `files`. None (default) reads every file; 0 reads
            none.

    Returns:
        The same `tk` object that was passed in.
    """
    for fileCounter, fname in enumerate(files):
        if maxNumFiles is not None and fileCounter >= maxNumFiles:
            break

        with open(fname, "r", encoding='utf-8') as f:
            doc = f.readlines()

        # fit the tokenizer on the lines of this file
        tk.fit_on_texts(doc)

        # progress report every 100 files; uses the tokenizer passed in,
        # not whatever global tokenizer happens to exist in the notebook
        if fileCounter % 100 == 0 and fileCounter > 0:
            print(fileCounter, len(tk.word_counts))

    return tk

In [None]:
def encodeFiles(files,maxSeqLength,tk,maxNumFiles):
    """Encode each file as one fixed-length row of word ids.

    Args:
        files: iterable of paths to UTF-8 text files.
        maxSeqLength: number of ids kept per file; longer files are cut at
            the end, shorter ones are padded at the end with 0.
        tk: instance of the keras tokenizer class (anything providing
            `texts_to_sequences`).
        maxNumFiles: number of rows in the result; at most this many files
            are read, counted from the start of `files`.

    Returns:
        int32 array of shape (maxNumFiles, maxSeqLength). Row i holds the
        ids of the i-th file; rows without a corresponding file stay 0.
    """
    ids = np.zeros((maxNumFiles, maxSeqLength), dtype='int32')

    for fileCounter, fname in enumerate(files):
        if fileCounter >= maxNumFiles:
            break

        with open(fname, "r", encoding='utf-8') as f:
            lines = f.readlines()

        # encode line by line, then join the per-line id lists so that a file
        # with zero or several lines still yields exactly one sequence
        tokens = [tok for seq in tk.texts_to_sequences(lines) for tok in seq]

        # form a fixed length sequence for the document
        padded = pad_sequences([tokens], maxlen=maxSeqLength, dtype='int32', padding='post', truncating='post', value=0)
        ids[fileCounter, :] = padded[0]

        # progress report every 100 files
        if fileCounter % 100 == 0 and fileCounter > 0:
            print(fileCounter)

    return ids

In [None]:
# --- tokenizer configuration ---

# vocabulary size: only use the top n words
MAX_NUM_WORDS = 40000

# only apply to this number of files
maxNumFiles = 10

# maximum number of words kept for each document
# (document-level sentiment analysis scores a complete document or paragraph)
maxSeqLength = 250

# create the tokenizer; words outside the vocabulary are mapped to 'UNK'
t = Tokenizer(
    num_words=MAX_NUM_WORDS,
    oov_token='UNK',
)
# Alternative kept for reference -- this api in tf is not working correctly
# for attribute "document_count":
#t = tf.keras.preprocessing.text.Tokenizer(num_words = MAX_NUM_WORDS, oov_token='UNK')

In [None]:
# Fit the tokenizer on every review file to build the word list.
# Positive reviews are processed first, then negative reviews; after each
# pass the vocabulary size and document count learned so far are printed.
for finishedMessage, reviewFiles in (
    ('Positive files finished', positiveFiles),
    ('All files finished', negativeFiles),
):
    t = tokenFiles(reviewFiles, MAX_NUM_WORDS, t)

    print(finishedMessage)
    # summarize what was learned
    print(len(t.word_counts))
    print(t.document_count)

# Keep only indices 1 .. MAX_NUM_WORDS-1 (the tokenizer is 1-indexed);
# index 0 stays free for padding blanks.
t.word_index = {
    word: index
    for word, index in t.word_index.items()
    if index < MAX_NUM_WORDS
}

# save Word index file
np.save("wordIndex_all", t.word_index)

# encode all files based on tokenizer
# (one row of maxSeqLength word ids per review file)

# positive reviews
ids_p = encodeFiles(positiveFiles,maxSeqLength,t,len(positiveFiles))
np.save("idsMatrix_pos", ids_p)

# negative reviews
# (was len(negativetiveFiles): an undefined name that raised NameError)
ids_n = encodeFiles(negativeFiles,maxSeqLength,t,len(negativeFiles))
np.save("idsMatrix_neg", ids_n)

# merge all files: positive rows first, negative rows after them
all_ids = np.concatenate((ids_p, ids_n), axis=0)
np.save("idsMatrix_all", all_ids)

In [None]:
# load preprocessed data if wordIndex and idsMatrix files have been created.

# The word index is a dict, which np.save() stores as a pickled 0-d object
# array. Recent NumPy versions refuse to load pickled data unless
# allow_pickle=True is passed; .item() then unwraps the array back to the dict.
# NOTE: unpickling can execute arbitrary code -- only load files you created.
wordsList = np.load('wordIndex_all.npy', allow_pickle=True).item()

#index_to_word = {i: w for w, i in t.word_index.items()}
# Reverse from integers to words using the DICTIONARY
reverse_word_index = {index: word for word, index in wordsList.items()}


# id matrix with one row per encoded review file
ids = np.load('idsMatrix_all.npy')

#decoded_review = ' '.join([reverse_word_index.get(i, '?') for i in ids[0]])