## 1. Importing necessary libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import re
import string
import tensorflow_datasets as tfds
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from sklearn.feature_extraction.text import TfidfTransformer
from keras.preprocessing.text import text_to_word_sequence
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

## 2. Import Preprocessor module

### Basic cleaning and preprocessing

In [2]:
# Function to clean and tokenize
def clean_text(text, tokenizer, stopwords):
    """Pre-process text and generate tokens
    
    Args:
        text: Text to tokenize.

    Returns:
        Tokenized text.
    """
    text = str(text).lower()                            # Lowercase words
    text = re.sub(r"\[(.*?)\]", "", text)               # Remove [+XYZ chars] in content
    text = re.sub(r"\s+", " ", text)                    # Remove multiple spaces in content
    text = re.sub(r"\w+…|…", "", text)                  # Remove ellipsis (and last word)
    text = re.sub(r"(?<=\w)-(?=\w)", " ", text)         # Replace dash between words
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove punctuation

    tokens = tokenizer(text)                                            # Get tokens from text
    tokens = [t for t in tokens if not t in stopwords]                  # Remove stopwords
    tokens = ["" if t.isdigit() else t for t in tokens]                 # Remove digits
    tokens = [t for t in tokens if len(t) > 1]                          # Remove short tokens
    return tokens

### Writing to csv file

In [3]:
def file_write(X_train, X_test, y_train, y_test):
    train_data = pd.DataFrame(list(zip(X_train, y_train)))
    train_data.to_csv('../data/TrainingSet.csv')
    
    test_data = pd.DataFrame(list(zip(X_test, y_test)))
    test_data.to_csv('../data/TestSet.csv')

## 3. Implementation

**IMDb Reviews** is a large dataset for binary sentiment classification, consisting of 50,000 highly polar reviews (in English) with an even number of examples for training and testing purposes.

The dataset contains additional unlabelled data. A negative review has a score ≤ 4 out of 10, and a positive review has a score ≥ 7 out of 10. No more than 30 reviews are included per movie.

In [4]:
dataset = tfds.load('imdb_reviews', as_supervised=False)

train_dat = pd.DataFrame(dataset['train'])
test_dat = pd.DataFrame(dataset['test'])

X_train, y_train = pd.DataFrame([str(s.numpy()) for s in train_dat['text']], columns=['text']), train_dat['label']
X_test, y_test = pd.DataFrame([str(s.numpy()) for s in test_dat['text']], columns=['text']), test_dat['label']

2022-07-10 13:00:34.212364: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
X_train_tmp = X_train

In [6]:
X_train_tmp

Unnamed: 0,text
0,"b""This was an absolutely terrible movie. Don't..."
1,b'I have been known to fall asleep during film...
2,b'Mann photographs the Alberta Rocky Mountains...
3,b'This is the kind of film for a snowy Sunday ...
4,"b'As others have mentioned, all the women that..."
...,...
24995,"b'I have a severe problem with this show, seve..."
24996,"b'The year is 1964. Ernesto ""Che"" Guevara, hav..."
24997,b'Okay. So I just got back. Before I start my ...
24998,b'When I saw this trailer on TV I was surprise...


In [7]:
stop_words = set(stopwords.words("english"))

X_train['text'] = X_train['text'].map(lambda x:clean_text(x, word_tokenize, stop_words))
X_test['text'] = X_test['text'].map(lambda x:clean_text(x, word_tokenize, stop_words))

In [8]:
X_train

Unnamed: 0,text
0,"[bthis, absolutely, terrible, movie, dont, lur..."
1,"[bi, known, fall, asleep, films, usually, due,..."
2,"[bmann, photographs, alberta, rocky, mountains..."
3,"[bthis, kind, film, snowy, sunday, afternoon, ..."
4,"[bas, others, mentioned, women, go, nude, film..."
...,...
24995,"[bi, severe, problem, show, several, actually,..."
24996,"[bthe, year, ernesto, che, guevara, cuban, cit..."
24997,"[bokay, got, back, start, review, let, tell, o..."
24998,"[bwhen, saw, trailer, tv, surprised, may, six,..."


In [10]:
y_train

0        tf.Tensor(0, shape=(), dtype=int64)
1        tf.Tensor(0, shape=(), dtype=int64)
2        tf.Tensor(0, shape=(), dtype=int64)
3        tf.Tensor(1, shape=(), dtype=int64)
4        tf.Tensor(1, shape=(), dtype=int64)
                        ...                 
24995    tf.Tensor(0, shape=(), dtype=int64)
24996    tf.Tensor(1, shape=(), dtype=int64)
24997    tf.Tensor(0, shape=(), dtype=int64)
24998    tf.Tensor(0, shape=(), dtype=int64)
24999    tf.Tensor(1, shape=(), dtype=int64)
Name: label, Length: 25000, dtype: object

In [12]:
print(list(zip(X_train, y_train)))

[('text', <tf.Tensor: shape=(), dtype=int64, numpy=0>)]


In [9]:
# exporting resultant datasets
file_write(X_train, X_test, y_train, y_test)