<h1>Download datasets: CIFAR-10 and IMDb Datasets</h1>

In [None]:
import torchvision

# CIFAR-10 training split, stored under ./data with downloading enabled
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    download=True,
)
# CIFAR-10 test split, stored under the same ./data folder
testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    download=True,
)


In [None]:
import os
import re
import pandas as pd
import requests
import tarfile
import subprocess

def load_directory_data(directory):
    """Load every review file in ``directory`` into a DataFrame.

    Files are expected to be named ``<id>_<rating>.txt``; the rating is read
    from the file name. Directory entries whose name does not follow that
    pattern (for example hidden files created by the OS) are skipped rather
    than aborting the whole load.

    Args:
        directory: Path of the folder that holds the review text files.

    Returns:
        A DataFrame with a ``sentence`` column (the file contents) and a
        ``sentiment`` column (the rating digits, kept as a string), one row
        per review file, ordered by file name.
    """
    # Raw string: "\d" in a normal string literal is an invalid escape.
    name_pattern = re.compile(r"\d+_(\d+)\.txt")
    data = {"sentence": [], "sentiment": []}
    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for file_name in sorted(os.listdir(directory)):
        match = name_pattern.fullmatch(file_name)
        if match is None:
            continue
        # Decode explicitly as UTF-8 instead of the platform default
        # (e.g. cp1252 on Windows), which can fail on non-ASCII review text.
        with open(os.path.join(directory, file_name), "r", encoding="utf-8") as f:
            data["sentence"].append(f.read())
        data["sentiment"].append(match.group(1))

    return pd.DataFrame.from_dict(data)

def load_dataset(directory):
    """Build one shuffled DataFrame from the ``pos`` and ``neg`` subfolders.

    Each subfolder of ``directory`` is loaded with ``load_directory_data`` and
    tagged with a ``polarity`` column (1 for ``pos``, 0 for ``neg``). The two
    frames are concatenated, the rows are put in random order (no seed is set
    here) and the index is renumbered from zero.
    """
    frames = []
    for subfolder, label in (("pos", 1), ("neg", 0)):
        frame = load_directory_data(os.path.join(directory, subfolder))
        frame["polarity"] = label
        frames.append(frame)

    combined = pd.concat(frames)
    shuffled = combined.sample(frac=1)
    return shuffled.reset_index(drop=True)

def download_and_load_datasets(force_download=False):
    """Return the IMDb train/test DataFrames, downloading them if needed.

    When no cached pickles exist (or ``force_download`` is true) the archive
    is downloaded into ``data/``, extracted with ``tar``, parsed with
    ``load_dataset`` and both frames are pickled inside the extracted
    ``aclImdb`` folder. Otherwise the cached pickles are loaded directly.

    Args:
        force_download: When true, ignore any cached pickles and download,
            extract and parse the dataset again.

    Returns:
        A ``(train_df, test_df)`` tuple of DataFrames.

    Raises:
        RuntimeError: If the server does not answer with HTTP 200.
        subprocess.CalledProcessError: If ``tar`` fails to extract the archive.
    """
    url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
    fname = "aclImdb.tar.gz"

    dataset = "data/"
    archive_path = os.path.join(dataset, fname)
    # Folder name is the archive name up to its first dot ("aclImdb"); the
    # extracted train/test folders are read from this same location below.
    extracted_dir = os.path.join(dataset, fname[:fname.find('.')])
    train_cache = os.path.join(extracted_dir, 'train_df.p')
    test_cache = os.path.join(extracted_dir, 'test_df.p')

    if (not force_download
            and os.path.exists(train_cache)
            and os.path.exists(test_cache)):
        print("Dataset has already downloaded, loading catched dataset instead.")
        # NOTE: unpickling can execute arbitrary code; these files are only
        # trusted because this function wrote them itself.
        train_df = pd.read_pickle(train_cache)
        test_df = pd.read_pickle(test_cache)
        return train_df, test_df

    print("Downloading...")
    # The target folder may not exist yet when this runs on its own.
    os.makedirs(dataset, exist_ok=True)
    # The timeout keeps a stalled server from hanging the call forever; the
    # context manager releases the connection even when an error is raised.
    with requests.get(url, stream=True, timeout=60) as response:
        if response.status_code != 200:
            raise RuntimeError(
                "Downloading error. (HTTP status %d)" % response.status_code
            )
        with open(archive_path, 'wb') as f:
            # Copy the raw byte stream in 1 MiB chunks instead of holding the
            # whole archive in memory at once.
            for chunk in iter(lambda: response.raw.read(1 << 20), b""):
                f.write(chunk)

    # untar
    print("Extracting...")
    # check_call raises if tar exits non-zero, so a broken archive is not
    # silently parsed as an incomplete dataset.
    subprocess.check_call(["tar", "xzf", archive_path, "-C", dataset])

    train_df = load_dataset(os.path.join(extracted_dir, "train"))
    test_df = load_dataset(os.path.join(extracted_dir, "test"))

    # to_pickle overwrites existing files, so stale caches need no removal.
    train_df.to_pickle(train_cache)
    test_df.to_pickle(test_cache)
    print("Dataset is downloaded.")

    return train_df, test_df

# Load the IMDb train/test DataFrames when this cell runs: cached pickles are
# reused if both exist, otherwise the archive is downloaded and parsed.
train_df, test_df = download_and_load_datasets()