## Data preprocessing for the IMDB dataset


In this step, we will download the IMDB dataset, combine the positive and negative reviews into a common dataframe, shuffle it, and upload it to an S3 bucket in txt format, encrypted with a custom KMS key.

In [None]:
import pandas as pd
import os
from IPython.display import clear_output

Setting up the S3 upload paths for the train and test datasets, the local input/output directories, and the KMS key.

In [None]:
# Destination S3 URIs for the train and test files ("yourbucket" is a placeholder).
s3_path_train = "s3://yourbucket/train.txt"
s3_path_test = "s3://yourbucket/test.txt"
# Local directory the reviews are read from (passed to Preprocess as `path`).
input_path = "./aclImdb"
# Local directory the merged files are written to (passed to Preprocess as `output_path`).
output_path = "./data"
# KMS key id handed to `aws s3 cp --sse-kms-key-id` (placeholder value).
kms_key = "yourkmskey"

Downloading the dataset and untar-ing it

In [None]:
! curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz

In [None]:
! tar -xzf aclImdb_v1.tar.gz

Below is a helper class that goes through the train/test data and the pos/neg review folders, reads every review, and combines them into one big train file and one big test file, saved as txt (the data is shuffled before saving).

In [None]:
class Preprocess:
    """Merge per-review text files into shuffled, labelled train/test files.

    The input directory must contain ``<stage>/<label>/`` folders, where
    stage is ``train`` or ``test`` and label is ``pos`` or ``neg``, each
    holding one UTF-8 encoded review per file.

    For every stage/label pair an intermediate ``<stage>_<label>.txt`` file
    (one review per line) is written to ``output_path``. For every stage a
    final ``<stage>.txt`` file is written there as well: CSV rows of
    ``text,label`` without header, with label 1 for ``pos`` and 0 for
    ``neg``, in shuffled order.

    Args:
        path: Root directory of the input data.
        output_path: Directory for the generated files; created if missing
            (including intermediate directories).
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` (default) keeps the shuffle
            non-deterministic.
    """

    def __init__(self, path, output_path, random_state=None):
        self.path = path
        self.stage = ["train", "test"]
        self.labels = ["pos", "neg"]
        self.output_path = output_path
        self.random_state = random_state
        # makedirs(exist_ok=True) copes with nested paths and avoids the
        # check-then-create race of exists() + mkdir().
        os.makedirs(os.path.abspath(self.output_path), exist_ok=True)

    def _merged_path(self, stage, label):
        """Return the path of the intermediate ``<stage>_<label>.txt`` file."""
        return os.path.join(self.output_path, "{0}_{1}.txt".format(stage, label))

    @staticmethod
    def _to_single_line(text):
        """Return *text* with its line breaks replaced by spaces.

        Leading/trailing line breaks are dropped so that each review occupies
        exactly one line of the merged file.
        """
        text = text.replace("\r\n", "\n").replace("\r", "\n").strip("\n")
        return text.replace("\n", " ")

    def process_txt(self, stage, label):
        """Write all reviews of one stage/label folder into a single file.

        The target file is overwritten, so calling this method repeatedly
        does not duplicate reviews.

        Args:
            stage: ``"train"`` or ``"test"``.
            label: ``"pos"`` or ``"neg"``.

        Returns:
            True once the merged file has been written.

        Raises:
            OSError: If the source folder or a review file cannot be read.
            UnicodeDecodeError: If a review file is not valid UTF-8.
        """
        source_dir = os.path.join(self.path, stage, label)
        # mode="w" (not "a"): a rerun must replace the file, not append to it.
        with open(self._merged_path(stage, label), mode="w", encoding="utf-8") as output:
            # Sorted for a deterministic intermediate file across platforms.
            for name in sorted(os.listdir(source_dir)):
                with open(os.path.join(source_dir, name), mode="rb") as f:
                    review = f.read().decode("utf-8")
                output.write(self._to_single_line(review) + "\n")
        return True

    def create(self):
        """Build the shuffled ``<stage>.txt`` file for every stage.

        Returns:
            True once all stage files have been written.
        """
        for stage in self.stage:
            frames = []
            for label in self.labels:
                print("{0}/{1}".format(stage, label))
                self.process_txt(stage, label)
                # Read the merged file line by line instead of through
                # read_csv(sep="\n"): that call is rejected by recent pandas
                # versions and would apply CSV quote handling to reviews
                # that contain double quotes.
                with open(self._merged_path(stage, label), encoding="utf-8") as merged:
                    texts = [line.rstrip("\n") for line in merged]
                # Blank lines (empty reviews) carry no sample.
                texts = [text for text in texts if text]
                data = pd.DataFrame({"text": texts})
                data["label"] = 1 if label == "pos" else 0
                frames.append(data)
            dataset = pd.concat(frames, axis=0)
            dataset = dataset.sample(frac=1, random_state=self.random_state)
            dataset = dataset.reset_index(drop=True)
            dataset.to_csv(
                os.path.join(self.output_path, "{0}.txt".format(stage)),
                index=False,
                header=False,
            )
            print("Dataset {0} created".format(stage))
        return True

In [None]:
# Build the merged and shuffled train/test files under output_path from the data in input_path.
Preprocess(path=input_path, output_path = output_path).create()

Uploading the data to S3

In [None]:
! aws s3 cp $(pwd)/$output_path/train.txt $s3_path_train --sse aws:kms --sse-kms-key-id $kms_key

In [None]:
! aws s3 cp $(pwd)/$output_path/test.txt $s3_path_test --sse aws:kms --sse-kms-key-id $kms_key