Nome: Vinicius Freitas Schiavinato Olzon


Matrícula: 20210026803


- O objetivo deste exercício é treinar e avaliar uma rede neural recorrente (RNN
ou LSTM) para classificar um conjunto de dados sobre notícias de acordo com
suas categorias.
- A rede neural pode ser criada utilizando o PyTorch ou Tensorflow/Keras. A rede
neural deve ser avaliada através do cálculo da acurácia (quantidade de acertos
dividido pela quantidade total de testes).

### Carrega as bibliotecas necessárias para puxar o dataset

In [352]:
%pip install torch torchvision torchaudio
%pip install ipywidgets
%pip install torchtext
%pip install datasets

Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49

In [353]:
!pip install ipywidgets widgetsnbextension pandas-profiling



In [354]:
# !pip install jupyter_contrib_nbextensions
# !jupyter contrib nbextension install --user
# !jupyter nbextension enable varInspector/main

In [355]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [356]:
import torch
import datasets
import pandas as pd
from torch import nn

import matplotlib.pyplot as plt
from ipywidgets import FloatProgress
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator

In [357]:
dataset = datasets.load_dataset("okite97/news-data")

In [358]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Title', 'Excerpt', 'Category'],
        num_rows: 4686
    })
    test: Dataset({
        features: ['Title', 'Excerpt', 'Category'],
        num_rows: 828
    })
})

### Tokenização

In [359]:
tokenizer = get_tokenizer('basic_english')

In [360]:
tokenizer(dataset['train']["Excerpt"][0])

['uefa',
 'has',
 'opened',
 'disciplinary',
 'proceedings',
 'against',
 'barcelona',
 ',',
 'juventus',
 'and',
 'real',
 'madrid',
 'over',
 'their',
 'involvement',
 'in',
 'the',
 'proposed',
 'european',
 'super',
 'league',
 '.']

In [361]:
tokens_list_train = []
for i in range(len(dataset['train']['Excerpt'])):
    tokens_list_train.append(tokenizer(dataset['train']['Excerpt'][i]))

In [362]:
tokens_list_test = []
for i in range(len(dataset['test']['Excerpt'])):
    tokens_list_test.append(tokenizer(dataset['test']['Excerpt'][i]))

### Adiciona as colunas novas de tokens nos dados de treino e teste

In [363]:
dataset['train'] = dataset['train'].add_column(name='tokens', column=tokens_list_train)

In [364]:
dataset['test'] = dataset['test'].add_column(name='tokens', column=tokens_list_test)

In [365]:
dataset['train']['tokens'][0]

['uefa',
 'has',
 'opened',
 'disciplinary',
 'proceedings',
 'against',
 'barcelona',
 ',',
 'juventus',
 'and',
 'real',
 'madrid',
 'over',
 'their',
 'involvement',
 'in',
 'the',
 'proposed',
 'european',
 'super',
 'league',
 '.']

In [366]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Title', 'Excerpt', 'Category', 'tokens'],
        num_rows: 4686
    })
    test: Dataset({
        features: ['Title', 'Excerpt', 'Category', 'tokens'],
        num_rows: 828
    })
})

### Vocabulário

In [367]:
vocab_custom = build_vocab_from_iterator(dataset['train']['tokens'], specials=[''])
vocab_custom.set_default_index(vocab_custom[''])

### Tokens desconhecidos ficarão como índice 0

In [368]:
# vocab_custom = build_vocab_from_iterator(dataset['train']['tokens'], min_freq=3, specials=['<unk>'])
# vocab_custom.set_default_index(vocab_custom['<unk>'])

In [369]:
vocab_custom['european']

178

In [370]:
vocab_custom['']

0

In [371]:
vocab_custom['cleber']

0

In [372]:
len(vocab_custom)

11729

In [373]:
def vocab(tokens):
  result = []
  for token in tokens:
    result.append(vocab_custom[token])

  return result

In [374]:
vocab(['that', 'football', 'player', 'is', 'going', 'to', 'be', 'the', 'mvp'])

[18, 139, 549, 25, 797, 4, 44, 1, 9371]

In [375]:
vocab(['that', 'football', 'player', 'is', 'going', 'to', 'be', 'the', 'best'])

[18, 139, 549, 25, 797, 4, 44, 1, 509]

### Alterando 'Category' para valores numéricos

In [376]:
categoria_numerico_treino = pd.factorize(dataset['train']['Category'])[0]
categoria_numerico_teste = pd.factorize(dataset['test']['Category'])[0]

In [377]:
print(dataset['train']['Category'][:6])

['sports', 'business', 'politics', 'health', 'politics', 'sports']


In [378]:
print(categoria_numerico_treino[:6])

[0 1 2 3 2 0]


In [379]:
print(dataset['test']['Category'][:6])

['politics', 'politics', 'business', 'health', 'business', 'politics']


In [380]:
print(categoria_numerico_teste[:6])

[0 0 1 2 1 0]


In [381]:
dataset['train'] = dataset['train'].add_column(name="Category_id", column=categoria_numerico_treino)
dataset['test'] = dataset['test'].add_column(name="Category_id", column=categoria_numerico_teste)

In [382]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Title', 'Excerpt', 'Category', 'tokens', 'Category_id'],
        num_rows: 4686
    })
    test: Dataset({
        features: ['Title', 'Excerpt', 'Category', 'tokens', 'Category_id'],
        num_rows: 828
    })
})

In [383]:
dataset['train']['Category'][:6]

['sports', 'business', 'politics', 'health', 'politics', 'sports']

### Token to ID

In [384]:
def token2id(row):
    row['tokens_id'] = vocab(row['tokens'])
    return row

In [385]:
dataset = dataset.map(token2id)

In [386]:
print(dataset['train']['tokens'][0])

['uefa', 'has', 'opened', 'disciplinary', 'proceedings', 'against', 'barcelona', ',', 'juventus', 'and', 'real', 'madrid', 'over', 'their', 'involvement', 'in', 'the', 'proposed', 'european', 'super', 'league', '.']


In [387]:
print(dataset['train']['tokens_id'][0])

[577, 7, 755, 4675, 3008, 64, 229, 2, 465, 10, 210, 196, 41, 36, 3633, 6, 1, 572, 178, 123, 50, 9]


In [388]:
print(dataset['train']['Category_id'])

[0, 1, 2, 3, 2, 0, 2, 2, 2, 0, 2, 0, 1, 0, 4, 1, 1, 1, 2, 3, 1, 0, 0, 0, 2, 1, 0, 5, 1, 2, 5, 2, 2, 2, 4, 3, 2, 5, 2, 1, 2, 1, 0, 5, 2, 2, 0, 0, 2, 1, 1, 2, 2, 0, 1, 0, 4, 5, 0, 4, 1, 1, 3, 1, 1, 1, 1, 1, 2, 0, 5, 1, 1, 3, 3, 0, 1, 0, 5, 1, 5, 1, 2, 1, 1, 2, 2, 4, 0, 5, 0, 1, 1, 1, 4, 0, 2, 0, 3, 2, 1, 3, 0, 5, 5, 1, 2, 0, 1, 2, 2, 1, 4, 4, 2, 1, 3, 1, 0, 0, 0, 1, 1, 0, 0, 2, 2, 1, 1, 0, 1, 1, 2, 3, 0, 2, 0, 3, 1, 0, 2, 3, 4, 3, 1, 1, 0, 3, 1, 2, 0, 3, 2, 1, 2, 2, 0, 0, 3, 2, 1, 0, 0, 3, 3, 5, 2, 0, 3, 1, 1, 1, 2, 3, 3, 1, 1, 2, 0, 0, 2, 1, 3, 1, 1, 1, 1, 4, 3, 5, 4, 1, 0, 3, 1, 5, 0, 3, 1, 4, 0, 1, 2, 3, 2, 3, 3, 0, 5, 1, 3, 3, 1, 4, 0, 1, 1, 1, 2, 1, 1, 4, 1, 1, 0, 0, 2, 0, 1, 0, 0, 1, 3, 1, 2, 1, 3, 0, 1, 2, 0, 0, 3, 1, 4, 4, 0, 0, 0, 2, 4, 1, 2, 1, 2, 3, 0, 2, 1, 1, 1, 5, 3, 2, 2, 3, 0, 1, 2, 2, 2, 0, 0, 1, 1, 2, 1, 0, 5, 1, 1, 2, 5, 3, 0, 2, 2, 0, 0, 0, 0, 1, 2, 2, 1, 5, 0, 0, 4, 3, 3, 0, 1, 3, 2, 2, 1, 0, 2, 0, 5, 3, 1, 0, 3, 0, 0, 0, 0, 0, 3, 3, 3, 0, 3, 3, 0, 0, 2, 1, 5, 2, 1, 

In [389]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Title', 'Excerpt', 'Category', 'tokens', 'Category_id', 'tokens_id'],
        num_rows: 4686
    })
    test: Dataset({
        features: ['Title', 'Excerpt', 'Category', 'tokens', 'Category_id', 'tokens_id'],
        num_rows: 828
    })
})

In [390]:
dataset['train'].shape

(4686, 6)

### Dataloader

In [391]:
class NEWSDataset(Dataset):
  def __init__(self, dataset):
    self.data = dataset['tokens_id']
    self.labels = dataset['Category_id']

  def __len__(self):
    return len(self.data)

  def __getitem__(self, x):
    return torch.tensor(self.data[x]), torch.tensor(self.labels[x])

In [392]:
dataset_news = NEWSDataset(dataset['train'])

In [393]:
dataset_news[0][0].shape, dataset_news[0][1].shape

(torch.Size([22]), torch.Size([]))

In [394]:
embed_len = 32
hidden_dim = 16
n_layers = 1
n_classes = len(list(set(dataset['train']['Category'])))

class NewsClassifier(nn.Module):
  def __init__(self):
    super(NewsClassifier, self).__init__()

    ## embedding 4686x32
    self.embedding_layer = nn.Embedding(num_embeddings=len(vocab_custom),embedding_dim=embed_len)
    ## rnn 32x16
    self.rnn = nn.RNN(input_size=embed_len, hidden_size=hidden_dim, num_layers=n_layers)
    ## linear 16x6
    self.linear = nn.Linear(hidden_dim, n_classes)
    self.softmax = nn.Softmax(dim=0)

  def forward(self, input):
    embeddings = self.embedding_layer(input)
    final_hidden = self.rnn(embeddings)
    return self.softmax(self.linear(final_hidden[0]))
  
model = NewsClassifier()

In [395]:
print(dataset['train']['tokens_id'][0])

[577, 7, 755, 4675, 3008, 64, 229, 2, 465, 10, 210, 196, 41, 36, 3633, 6, 1, 572, 178, 123, 50, 9]


In [396]:
len(dataset['train']['tokens_id'][0])

22

In [397]:
model(torch.tensor(dataset['train']['tokens_id'][0]))

tensor([[0.0606, 0.1122, 0.0439, 0.0252, 0.0725, 0.0659],
        [0.0684, 0.0525, 0.0514, 0.0477, 0.0662, 0.0940],
        [0.0330, 0.0373, 0.0470, 0.0713, 0.0297, 0.0469],
        [0.0812, 0.0210, 0.0241, 0.0419, 0.0537, 0.0472],
        [0.0368, 0.0235, 0.0434, 0.0788, 0.0265, 0.0324],
        [0.0687, 0.0540, 0.0419, 0.0234, 0.0464, 0.0817],
        [0.0193, 0.0308, 0.0416, 0.0495, 0.0285, 0.0239],
        [0.0687, 0.0443, 0.0277, 0.0305, 0.0753, 0.0381],
        [0.0363, 0.0462, 0.0586, 0.0543, 0.0504, 0.0445],
        [0.0431, 0.0525, 0.0472, 0.0339, 0.0360, 0.0387],
        [0.0528, 0.0360, 0.0411, 0.0180, 0.0394, 0.0375],
        [0.0359, 0.0592, 0.0523, 0.0277, 0.0435, 0.0366],
        [0.0344, 0.0552, 0.0565, 0.0356, 0.0611, 0.0456],
        [0.0271, 0.0632, 0.0696, 0.0348, 0.0502, 0.0298],
        [0.0249, 0.0445, 0.0618, 0.0406, 0.0311, 0.0325],
        [0.0485, 0.0339, 0.0287, 0.0358, 0.0618, 0.0379],
        [0.0763, 0.0530, 0.0256, 0.0246, 0.0598, 0.0875],
        [0.034

In [398]:
device = ("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

NewsClassifier(
  (embedding_layer): Embedding(11729, 32)
  (rnn): RNN(32, 16)
  (linear): Linear(in_features=16, out_features=6, bias=True)
  (softmax): Softmax(dim=0)
)

In [399]:
LR = 1e-3
EPOCHS = 5
loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

In [400]:
from tqdm.notebook import tqdm

In [401]:
def train(dataloader):
    size = len(dataloader.dataset)
    train_loss = 0

    for batch, (data, label) in enumerate(dataloader):
        data, label = data.to(device), label.to(device)
        optimizer.zero_grad()
        out = model(data[0])
        label_tensor = torch.tensor(label).to(device).squeeze()

        loss = loss_fn(out, label_tensor) 

        loss.backward()
        optimizer.step()

        train_loss += loss.item()

        if batch % 1000 == 0:
            current = batch * len(data)
            print(f' loss: {loss.item():>7f}  [{current:>5d}/{size:>5d}]')

    train_loss /= size
    return train_loss

In [402]:
dataloader_train = DataLoader(NEWSDataset(dataset['train']), shuffle=True)


In [403]:
def test(dataloader):
    size = len(dataloader.dataset)
    test_loss = 0
    correct = 0

    with torch.no_grad():
        for data, label in tqdm(dataloader):
            data, label = data.to(device), label.to(device)
            out = model(data[0])
            label_tensor = torch.tensor(label).to(device).squeeze()

            loss = loss_fn(out, label_tensor)

            test_loss += loss.item()
            _, predicted = torch.max(out, 0)
            correct += (predicted == label_tensor).sum().item()

    test_loss /= size
    accuracy = correct / size

    print(f' test accuracy: {(100 * accuracy):>0.1f}%, test loss: {test_loss:>8f} ')

    return test_loss

In [404]:
dataloader_test = DataLoader(NEWSDataset(dataset['test']), shuffle=True)

In [None]:
train_losses = []
test_losses = []
for t in range(EPOCHS):
    print(f'Epoch {t+1} -------------------------------')

    print('Train')
    train_losses.append(train(dataloader_train))

    print('Test')
    test_losses.append(test(dataloader_test))

In [None]:
plt.plot(train_losses, label='Train Loss')
plt.plot(test_losses, label='Test Loss')
plt.legend()
plt.show()