## Dataloader
* Reference: *Natural Language Processing with PyTorch*
1. Load the data file into a DataFrame.
2. Convert the text into the right form using a tokenizer or corpus.
3. Convert the processed text into vectors.

In [1]:
import os
# Restrict this process to GPUs 0 and 1. This cell sits before the cell that
# imports torch, so the variable is already set when torch is loaded.
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

In [9]:
import os, re, string

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import notebook

In [6]:
# Pick the compute device.
# - several GPUs visible -> "cuda"   (the current CUDA device)
# - exactly one GPU      -> "cuda:0"
# - no usable GPU        -> "cpu"
# The previous version assigned `device` only inside branches that required
# at least one GPU, so on a CPU-only machine the name was never defined.
if torch.cuda.is_available():
    device = torch.device("cuda" if torch.cuda.device_count() > 1 else "cuda:0")
else:
    device = torch.device("cpu")

In [15]:
# Expected CSV layout (header row, then one review per row):
#   rating,   review,     split
#   negative, "sentence", train
#   positive, "sentence", train
f1_s = "/home/bwlee/data/yelp_review_polarity_csv/reviews_with_splits_full.csv"

# Only the first 1000 rows are read to keep experiments fast.
# Alternatives:
#   full file       -> pd.read_csv(f1_s, header=0)
#   every 5th line  -> pd.read_csv(f1_s, header=0, skiprows=lambda x: x%5>0)
df1 = pd.read_csv(f1_s, header=0, nrows=1000)

# Encode the sentiment label as an integer: 'positive' -> 1, anything else -> 0.
# A vectorised comparison replaces the former row-by-row loop, which iterated
# over index *labels* but looked rows up by *position* (.iloc) and was therefore
# only correct for a default RangeIndex.
label0 = (df1['rating'] == 'positive').astype(int).tolist()
df1['rating'] = label0

In [16]:
# Display the frame (pandas abbreviates the middle rows) to check that
# `rating` now holds 0/1 integers.
df1

Unnamed: 0,rating,review,split
0,0,the entrance was the impressive thing about th...,train
1,0,"i m a mclover , and i had no problem nwith the...",train
2,0,"less than good here , not terrible , but i see...",train
3,0,i don t know if i can ever bring myself to go ...,train
4,0,food was ok good but the service was terrible ...,train
...,...,...,...
995,0,i ve given this place so many chances and it a...,train
996,0,ordered from the ayce menu and some of the sal...,train
997,0,"generally , i love sonic . it is by far my fav...",train
998,0,how does this location stay open ? lines are n...,train


In [24]:
# Amazon sentiment files: one review sentence per line, no header row.
# Split and rating are not stored in the file; they are encoded in the file
# name (here: "train" and 0 / 1), so they are added as constant columns below.
f1_s = "/home/bwlee/work/codes/controllable/data/amazon/sentiment.train.0"
f2_s = "/home/bwlee/work/codes/controllable/data/amazon/sentiment.train.1"

# Read the file line by line instead of pd.read_csv(..., sep='\n'): recent
# pandas versions reject the line terminator as a separator with a ValueError.
# Empty lines are skipped, matching read_csv's default skip_blank_lines=True.
# Unlike read_csv, quote characters inside a line are kept verbatim.
with open(f1_s, encoding='utf-8') as review_file:
    review_lines = [line.rstrip('\r\n') for line in review_file]
df1 = pd.DataFrame({'review': [line for line in review_lines if line]})
df1['rating'] = 0
df1['split'] = 'train'
# NOTE(review): f2_s (the rating-1 file) is defined but never loaded, so df1
# contains only rating 0 / split 'train'. Any later filter on split 'val' or
# 'test' will yield an empty frame -- confirm whether that is intended.

In [25]:
# NOTE: `df1` now refers to the Amazon frame built in the previous cell; the
# Yelp frame loaded earlier under the same name has been overwritten.
df1

Unnamed: 0,review,rating,split
0,especially on moderate where the attacks are c...,0,train
1,i put this on my hair and flat ironed it and m...,0,train
2,since their price is three times that they are...,0,train
3,just had to give it the personal touch .,0,train
4,i cannot find patterns to solve the problem .,0,train
...,...,...,...
277764,i have a full sized houge grip on my glock num...,0,train
277765,there is nothing about this product which warr...,0,train
277766,the only thing it controlled was my hot flashe...,0,train
277767,there s so much personal taste involved when a...,0,train


## dataset test

In [28]:
class TestDataset(Dataset):
    """Toy map-style dataset: inputs are the integers 1..99, targets are 10 * input."""

    def __init__(self):
        # Inputs stay a lazy range; targets are materialised as a list.
        self.xs = range(1, 100)
        self.ys = [value * 10 for value in self.xs]

    def __len__(self):
        """Number of (x, y) samples."""
        return len(self.xs)

    def __getitem__(self, ii):
        """Return the (x, y) pair stored at index ``ii``."""
        return self.xs[ii], self.ys[ii]

## Dataset and DataLoader
* A `Dataset` provides access to the data.
* There are two kinds: map-style and iterable-style.
    * map-style (`Dataset`): needs `__getitem__()` and `__len__()`
    * iterable-style (`IterableDataset`): needs `__iter__()`
* A `DataLoader` turns a `Dataset` into batches of tensors.
```python
torch.utils.data.Dataset
torch.utils.data.TensorDataset(*tensors)
# *tensors (Tensor) – tensors that have the same size of the first dimension.
```
* `TensorDataset` can wrap several tensors at once, provided they all have the same size in the first dimension.

In [29]:
# Instantiate the toy dataset: 99 samples with x in 1..99 and y = 10 * x.
testdataset = TestDataset()

In [30]:
# Wrap the dataset so it is served in batches of 10. No shuffle argument is
# passed; the printed output below shows the samples in dataset order.
dataloader = DataLoader(testdataset, batch_size=10)

In [31]:
# Each iteration yields one batch: the per-sample (x, y) pairs arrive
# collated into two tensors, one for the inputs and one for the targets.
for x, y in dataloader:
    print(x, y)

tensor([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10]) tensor([ 10,  20,  30,  40,  50,  60,  70,  80,  90, 100])
tensor([11, 12, 13, 14, 15, 16, 17, 18, 19, 20]) tensor([110, 120, 130, 140, 150, 160, 170, 180, 190, 200])
tensor([21, 22, 23, 24, 25, 26, 27, 28, 29, 30]) tensor([210, 220, 230, 240, 250, 260, 270, 280, 290, 300])
tensor([31, 32, 33, 34, 35, 36, 37, 38, 39, 40]) tensor([310, 320, 330, 340, 350, 360, 370, 380, 390, 400])
tensor([41, 42, 43, 44, 45, 46, 47, 48, 49, 50]) tensor([410, 420, 430, 440, 450, 460, 470, 480, 490, 500])
tensor([51, 52, 53, 54, 55, 56, 57, 58, 59, 60]) tensor([510, 520, 530, 540, 550, 560, 570, 580, 590, 600])
tensor([61, 62, 63, 64, 65, 66, 67, 68, 69, 70]) tensor([610, 620, 630, 640, 650, 660, 670, 680, 690, 700])
tensor([71, 72, 73, 74, 75, 76, 77, 78, 79, 80]) tensor([710, 720, 730, 740, 750, 760, 770, 780, 790, 800])
tensor([81, 82, 83, 84, 85, 86, 87, 88, 89, 90]) tensor([810, 820, 830, 840, 850, 860, 870, 880, 890, 900])
tensor([91, 92, 93, 94, 95, 

### DataLoader
* If `batch_size` is `None`, batching is not applied.
* The default `batch_size` is 1.
* `batch_sampler` works for map-style datasets.

## torchtext : DataLoader, Iterator
* torchtext provides another data loader, designed for text.
* It ships with various open datasets.
* Reading from a DataFrame is still cumbersome.
* If you read directly from TSV, CSV, JSON, ... files,  
you can use `TabularDataset`.
* `torchtext.data.Dataset` requires data of type `Example`.
    * You need to create an `Example` for each data instance  
      and pass the examples as input.

In [38]:
# prepare data
from torchtext import data
from torchtext.data import Dataset, Example
from torchtext.data import Iterator, BucketIterator

# One frame per split, each keeping only the label and text columns and
# re-indexed from 0.
df = df1
train_df, val_df, test_df = (
    df[df['split'] == split_name][['rating', 'review']].reset_index(drop=True)
    for split_name in ('train', 'val', 'test')
)

Several types of tokenizer can be used.
Here, we use spaCy.

In [33]:
import spacy
# Load spaCy's small English pipeline. `nlp` is not referenced again in the
# visible cells; the text Field selects spaCy through tokenize='spacy'.
nlp = spacy.load("en_core_web_sm")

In [60]:
# Field for the review text. With use_vocab=True a vocabulary has to exist
# by the time the data loader iterates; TEXT.build_vocab(...) creates it.
TEXT = data.Field(
    sequential=True,
    use_vocab=True,
    tokenize='spacy',
    lower=True,
    batch_first=True,
    fix_length=100,
    init_token='<SOS>',
    eos_token='<EOS>',
)

# Field for the label: a single non-sequential value used as-is (no vocabulary),
# flagged as the target.
LABEL = data.Field(
    sequential=False,
    use_vocab=False,
    batch_first=False,
    is_target=True,
)

In [61]:
# Work on a small slice (first 1000 training rows) to keep the experiment fast.
train_df0 = train_df[:1000]
# Field order follows the column order of train_df: (rating, review).
fields0 = (('rating', LABEL), ('review', TEXT))
# Build one Example per row from the row values, in column order.
train_data = [
    Example.fromlist(list(row_values), fields0)
    for row_values in train_df0.itertuples(index=False, name=None)
]
# Wrap the examples in a torchtext Dataset bound to the same fields.
train_data = Dataset(train_data, fields=fields0)

Once the dataset is built, we can generate the vocabulary.  
Once the vocabulary is established, the data loader vectorizes  
each sentence.

In [62]:
# Build the vocabulary for TEXT from the reviews in train_data.
# min_freq=10 / max_size=10000: presumably tokens seen fewer than 10 times are
# dropped and the vocabulary is capped at 10000 entries -- confirm in torchtext docs.
TEXT.build_vocab(train_data, min_freq=10, max_size=10000)

In [63]:
# torchtext iterator over train_data, configured for batches of 10 examples.
iter1 = Iterator(train_data, batch_size=10)

In [64]:
# Pull a single batch from the iterator for inspection.
batch = next(iter(iter1))

In [47]:
# Labels of the batch, one per example.
# NOTE(review): this cell's execution count (47) is lower than that of the
# cell creating `batch` (64), so the stored output may come from an older batch.
batch.rating

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [65]:
# First two reviews of the batch as vocabulary indices. In the stored output
# each row starts with 2, the sentence ends with 3 and the rest is filled with 1
# -- presumably <SOS>, <EOS> and the padding token; 0 is presumably <unk>.
batch.review[:2]

tensor([[  2, 136, 118,   0,  26,   5,   0,  22,   8,  29,  20,  36,  80,   0,
           4,   3,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1],
        [  2,  58,   0,  61,   7,   0,  41,  12,   0, 165, 140,   4,   0,   9,
           6,  17,   0,  93,  20,   0,  26,  13, 140,   4,   3,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,   1,
           1,   1,   1,   1,   1

In [55]:
## check!!!! vocab is not working well
# NOTE(review): probably a stale-variable effect rather than a vocabulary bug.
# `vocab = TEXT.vocab` carries execution count 55, i.e. it ran before TEXT was
# defined (60) and before TEXT.build_vocab (62). The batch produced afterwards
# contains indices above 100, which a 4-entry vocabulary cannot produce.
# Re-run the cells top to bottom (Restart & Run All) and check again.

In [55]:
# Take the vocabulary attached to TEXT. This must run AFTER TEXT.build_vocab;
# if it ran earlier (execution count 55 vs. 62 for build_vocab), `vocab` keeps
# pointing at the vocabulary of a previous TEXT object.
vocab = TEXT.vocab

In [75]:
# Number of entries in `vocab`. A result of 4 would leave room only for the
# special tokens -- TODO confirm `vocab` is the freshly built vocabulary.
len(vocab)

4

In [74]:
# Look up the token for index 58 (an index that occurs in batch.review).
# Raises IndexError whenever `vocab` has fewer than 59 entries.
vocab.itos[58]

IndexError: list index out of range

In [70]:
# Index assigned to the token 'i'. A result of 0 presumably means the token is
# unknown to this vocabulary (0 looks like the <unk> index) -- TODO confirm.
vocab.stoi['i']

0