In [3]:
import pickle
import os
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np

In [1]:
# Folders of the IMDB review corpus, given relative to the working directory.
# Labelled and unlabelled training folders:
train_dir_pos, train_dir_neg, train_dir_unsup = (
    'aclImdb/train/pos',
    'aclImdb/train/neg',
    'aclImdb/train/unsup',
)
# Labelled test folders:
test_dir_pos, test_dir_neg = (
    'aclImdb/test/pos',
    'aclImdb/test/neg',
)

# Function to load the positive reviews
def load_data_pos(dir):
    """Read every ``.txt`` review in a directory and label each one positive.

    Args:
        dir: Path of the folder holding one review per ``.txt`` file.
            (The name shadows the ``dir`` builtin; it is kept so existing
            keyword callers keep working.)

    Returns:
        A ``(reviews, labels)`` tuple of two equal-length lists: the decoded
        file contents in ascending file-name order, and a ``1`` per review.

    Raises:
        OSError: If ``dir`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # os.listdir returns entries in arbitrary, platform-dependent order;
    # sorting makes the result reproducible across runs and machines.
    fnames = sorted(name for name in os.listdir(dir) if name.endswith('.txt'))
    reviews = []
    for fname in fnames:
        with open(os.path.join(dir, fname), encoding='utf-8') as f:
            reviews.append(f.read())
    labels = [1] * len(reviews)
    return reviews, labels

# Function to load the negative reviews
def load_data_neg(dir):
    """Read every ``.txt`` review in a directory and label each one negative.

    Args:
        dir: Path of the folder holding one review per ``.txt`` file.
            (The name shadows the ``dir`` builtin; it is kept so existing
            keyword callers keep working.)

    Returns:
        A ``(reviews, labels)`` tuple of two equal-length lists: the decoded
        file contents in ascending file-name order, and a ``0`` per review.

    Raises:
        OSError: If ``dir`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # os.listdir returns entries in arbitrary, platform-dependent order;
    # sorting makes the result reproducible across runs and machines.
    fnames = sorted(name for name in os.listdir(dir) if name.endswith('.txt'))
    reviews = []
    for fname in fnames:
        with open(os.path.join(dir, fname), encoding='utf-8') as f:
            reviews.append(f.read())
    labels = [0] * len(reviews)
    return reviews, labels

# Function to load the unsupervised reviews
def load_unsupervised_data(dir):
    """Read every ``.txt`` review in a directory, without attaching labels.

    Args:
        dir: Path of the folder holding one review per ``.txt`` file.
            (The name shadows the ``dir`` builtin; it is kept so existing
            keyword callers keep working.)

    Returns:
        A list with the decoded contents of each ``.txt`` file, in ascending
        file-name order.

    Raises:
        OSError: If ``dir`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    # os.listdir returns entries in arbitrary, platform-dependent order;
    # sorting makes the result reproducible across runs and machines.
    fnames = sorted(name for name in os.listdir(dir) if name.endswith('.txt'))
    reviews = []
    for fname in fnames:
        with open(os.path.join(dir, fname), encoding='utf-8') as f:
            reviews.append(f.read())
    return reviews

In [4]:
# Labelled training split: positive reviews first, then negative ones
train_reviews_pos, train_labels_pos = load_data_pos(train_dir_pos)
train_reviews_neg, train_labels_neg = load_data_neg(train_dir_neg)
train_reviews = [*train_reviews_pos, *train_reviews_neg]
train_labels = [*train_labels_pos, *train_labels_neg]

# Labelled test split, assembled the same way
test_reviews_pos, test_labels_pos = load_data_pos(test_dir_pos)
test_reviews_neg, test_labels_neg = load_data_neg(test_dir_neg)
test_reviews = [*test_reviews_pos, *test_reviews_neg]
test_labels = [*test_labels_pos, *test_labels_neg]

# Persist each labelled split as a pickled (reviews, labels) tuple
for pkl_name, split in (
    ('train_reviews.pkl', (train_reviews, train_labels)),
    ('test_reviews.pkl', (test_reviews, test_labels)),
):
    with open(pkl_name, 'wb') as f:
        pickle.dump(split, f)

# The unlabelled reviews are pickled on their own, as a plain list
unsup_reviews = load_unsupervised_data(train_dir_unsup)
with open('unsup_reviews.pkl', 'wb') as f:
    pickle.dump(unsup_reviews, f)

In [16]:
# Tokenize the reviews. The vocabulary is fitted on the training reviews only;
# the test reviews are then encoded with that same fitted tokenizer.
max_words = 10000  # passed to Tokenizer as num_words
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(train_reviews)
x_train = tokenizer.texts_to_sequences(train_reviews)
x_test = tokenizer.texts_to_sequences(test_reviews)

# Pad sequences to a maximum review length.
# pad_sequences is called with its defaults, which per the Keras docs pad and
# truncate at the start of each sequence.
max_review_length = 500
x_train = pad_sequences(x_train, maxlen=max_review_length)
x_test = pad_sequences(x_test, maxlen=max_review_length)

# One-hot encode the labels.
# The class count is given explicitly: left to itself, to_categorical infers it
# from the largest label present, so a split that happened to contain only
# negative (0) reviews would yield a single column instead of two.
num_classes = 2  # labels are 0 (negative) and 1 (positive)
y_train = to_categorical(train_labels, num_classes=num_classes)
y_test = to_categorical(test_labels, num_classes=num_classes)

# Save the tokenized and padded sequences
np.save('x_train.npy', x_train)
np.save('x_test.npy', x_test)

# Save the one-hot encoded labels
np.save('y_train.npy', y_train)
np.save('y_test.npy', y_test)

# Save the fitted tokenizer so new text can later be encoded with the same
# vocabulary.
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)