# Question 1 - Creating a Dataset

In this question you'll create a dataset class for the amazon sentiment analysis dataset.

Add the following to the class below:
```__init__```:
1. Enumerate the different labels and create two dict attributes: ```self.label2idx```, ```self.idx2label```.
2. Instantiate a ```TfidfVectorizer``` and use ```TfidfVectorizer.fit_transform``` to transform the sentences into tf-idf vectors. Documentation can be found [here](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html#sklearn.feature_extraction.text.TfidfVectorizer.fit_transform).
3. Set the attribute ```self.vocab_size``` using the tokenizer's ```vocabulary_``` attribute.


```__getitem__```:
1. Reimplement the method so that it returns the tf-idf vector of the sentence in a tensor. the tensor should be of shape ```[vocab_size]``` and not ```[1, vocab_size]```. You can use the ```Tensor.squeeze()``` method to do this ((documentation)[https://pytorch.org/docs/stable/generated/torch.squeeze.html#torch.squeeze])
2. You should return the idx of the label instead of the label itself.
3. The output should be in the following format: ```data = {"input_vectors": setnence, "labels": label}```

In [22]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
class ClassificationDataset(Dataset):

    def __init__(self, file_path, tokenizer=None):
        # Read data
        self.file_path = file_path
        data = pd.read_csv(self.file_path)

        # Split to sentences and labels
        self.sentences = data['reviewText'].tolist()
        self.labels = data['label'].tolist()

        # Enumerate labels
        self.label_to_idx = {tag: idx for idx, tag in enumerate(sorted(list(set(self.labels))))}
        self.idx_to_label = {idx: tag for tag, idx in self.label_to_idx.items()}

        # Tokenize sentences
        if tokenizer is not None:
            self.tokenizer = tokenizer
            self.tokenized_sen = self.tokenizer.transform(self.sentences)
        else:
            self.tokenizer = TfidfVectorizer(lowercase=True, stop_words=None)
            self.tokenized_sen = self.tokenizer.fit_transform(self.sentences)
        
        # Set vocab_size attribute
        self.vocab_size = len(self.tokenizer.vocabulary_)

    def __getitem__(self, item):
        # Tensorize sentence
        sentence = self.tokenized_sen[item]
        sentence = torch.FloatTensor(sentence.toarray()).squeeze()

        # Get label idx
        label = self.labels[item]
        label = self.label_to_idx[label]

        data = {"input_vectors": sentence, "labels": label}
        print('get_item')
        return data

    def __len__(self):
        return len(self.sentences)

In [24]:
train_dataset = ClassificationDataset('./amazon_sa/train.csv')
test_dataset = ClassificationDataset('./amazon_sa/test.csv', tokenizer=train_dataset.tokenizer)

batch_size = 4
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

In [25]:
train_dataset[0]

get_item


{'input_vectors': tensor([0., 0., 0.,  ..., 0., 0., 0.]), 'labels': 1}

In [32]:
next(iter(train_loader))

get_item
get_item
get_item
get_item


{'input_vectors': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]),
 'labels': tensor([1, 0, 1, 0])}

In [None]:
# the input to the model is a batch
# the batch is a dict with the keys:
# input_vectors,
# labels

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])

In [31]:
for batch in train_loader:
    break
print(batch)

get_item
get_item
get_item
get_item
{'input_vectors': tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]]), 'labels': tensor([0, 1, 1, 1])}


In [52]:
len(batch['input_vectors'][0])

49221

In [50]:
batch['labels']

tensor([0, 1, 1, 1])

In [53]:
train_dataset.vocab_size

49221

In [54]:
inputs, labels = batch

In [59]:
batch['labels']

tensor([0, 1, 1, 1])

# Question 2 - Modeling

In this question you will implement a simple neural network that will classify a sentence given its tf-idf vector.

Implement a model with the following architecture:
1. A linear layer from ```vocab_size``` to ```hidden_dim```.
2. A ReLU activation fn.
3. A linear layer from ```hidden_dim``` to ```num_classes```.
4. A cross Entropy Loss

```forward```:
1. If labels are passed, should return the output of the second layer and the loss.
2. Otherwise, should pass the output of the second layer and None.

In [14]:
from torch import nn

In [15]:
class TfIdfClassifier(nn.Module):

    def __init__(self, vocab_size, num_classes, hidden_dim=100):
        super(TfIdfClassifier, self).__init__()
        self.first_layer = nn.Linear(vocab_size, hidden_dim)
        self.activation = nn.ReLU()
        self.second_layer = nn.Linear(hidden_dim, num_classes)
        self.loss = nn.CrossEntropyLoss()

    def forward(self, input_vectors, labels=None):
        print('forward1')
        x = self.first_layer(input_vectors)
        print('forward2')
        x = self.activation(x)
        print('forward3')
        x = self.second_layer(x)
        print('forward4')
        if labels is None:
            return x, None
        loss = self.loss(x, labels)
        return x, loss

In [34]:
model = TfIdfClassifier(train_dataset.vocab_size, len(train_dataset.label_to_idx), hidden_dim=100)
print(model)

TfIdfClassifier(
  (first_layer): Linear(in_features=49221, out_features=100, bias=True)
  (activation): ReLU()
  (second_layer): Linear(in_features=100, out_features=2, bias=True)
  (loss): CrossEntropyLoss()
)


In [57]:
batch['labels']

tensor([0, 1, 1, 1])

In [78]:
output = model(**batch)

forward1
forward2
forward3
forward4


In [82]:
output[1]

tensor(0.7426, grad_fn=<NllLossBackward0>)

In [71]:
type(output[0])

torch.Tensor

In [45]:
type(batch)

dict

In [46]:
batch.keys()

dict_keys(['input_vectors', 'labels'])

In [37]:
# run the model

In [38]:
net = TfIdfClassifier(train_dataset.vocab_size, 2)

In [43]:
import torch.optim as optim

# using Binary cross entorpy since we have only 2 classes (positive/negative)
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)


In [83]:
criterion(predictions[0].squeeze(1), data_batch['labels'].float())

ValueError: Target size (torch.Size([4])) must be the same as input size (torch.Size([4, 2]))

In [84]:
predictions

(tensor([[-0.0732,  0.0837],
         [-0.0756,  0.0839],
         [-0.0735,  0.0843],
         [-0.0736,  0.0842]], grad_fn=<AddmmBackward0>),
 tensor(0.6173, grad_fn=<NllLossBackward0>))

In [95]:
torch.argmax(predictions[0], dim=1)

tensor([1, 1, 1, 1])

In [106]:
predictions[0][:][1] #torch.argmax(predictions[0], dim=1)]

tensor([-0.0756,  0.0839], grad_fn=<SelectBackward0>)

In [80]:
for epoch in range(2):  # loop over the dataset multiple times

    running_loss = 0.0
    for i, data_batch in enumerate(train_loader):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data_batch

        # zero the parameter gradients
        optimizer.zero_grad()

        print('before net')
        # forward + backward + optimize
        predictions = net(**data_batch)
        print('after net')

        print(data_batch['labels'].float())
        torch.argmax(a)
        loss = criterion(predictions[0].squeeze(1), data_batch['labels'].float())
        print('after loss')
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0

print('Finished Training')

get_item
get_item
get_item
get_item
before net
forward1
forward2
forward3
forward4
after net
tensor([1., 1., 1., 1.])


ValueError: Target size (torch.Size([4])) must be the same as input size (torch.Size([4, 2]))