Nome: Vinicius Freitas Schiavinato Olzon


Matrícula: 20210026803


- O objetivo deste exercício é treinar e avaliar uma rede neural recorrente (RNN
ou LSTM) para classificar um conjunto de dados sobre notícias de acordo com
suas categorias.
- A rede neural pode ser criada utilizando o PyTorch ou Tensorflow/Keras. A rede
neural deve ser avaliada através do cálculo da acurácia (quantidade de acertos
dividido pela quantidade total de testes).

### Carrega as bibliotecas necessárias para puxar o dataset

In [242]:
# %pip install torch torchvision torchaudio
# %pip install datasets
# %pip install torchtext

In [243]:
import torch
import datasets
from torch.utils.data import Dataset

In [244]:
dataset = datasets.load_dataset("okite97/news-data")

In [245]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Title', 'Excerpt', 'Category'],
        num_rows: 4686
    })
    test: Dataset({
        features: ['Title', 'Excerpt', 'Category'],
        num_rows: 828
    })
})

### Tokenização

In [246]:
from torchtext.data.utils import get_tokenizer
tokenizer = get_tokenizer('basic_english')

In [247]:
tokenizer(dataset['train']["Excerpt"][0])

['uefa',
 'has',
 'opened',
 'disciplinary',
 'proceedings',
 'against',
 'barcelona',
 ',',
 'juventus',
 'and',
 'real',
 'madrid',
 'over',
 'their',
 'involvement',
 'in',
 'the',
 'proposed',
 'european',
 'super',
 'league',
 '.']

In [248]:
tokens_list_train = []
for i in range(len(dataset['train']['Excerpt'])):
    tokens_list_train.append(tokenizer(dataset['train']['Excerpt'][i]))

In [249]:
tokens_list_test = []
for i in range(len(dataset['test']['Excerpt'])):
    tokens_list_test.append(tokenizer(dataset['test']['Excerpt'][i]))

### Adiciona as colunas novas de tokens nos dados de treino e teste

In [250]:
dataset['train'] = dataset['train'].add_column(name='tokens', column=tokens_list_train)

In [251]:
dataset['test'] = dataset['test'].add_column(name='tokens', column=tokens_list_test)

In [252]:
dataset['train']['tokens'][0]

['uefa',
 'has',
 'opened',
 'disciplinary',
 'proceedings',
 'against',
 'barcelona',
 ',',
 'juventus',
 'and',
 'real',
 'madrid',
 'over',
 'their',
 'involvement',
 'in',
 'the',
 'proposed',
 'european',
 'super',
 'league',
 '.']

### Vocabulário

In [253]:
from torchtext.vocab import build_vocab_from_iterator

vocab_custom = build_vocab_from_iterator(dataset['train']['tokens'],
                                         min_freq=3,
                                         specials=['<unk>'])

### Tokens desconhecidos ficarão como índice 0

In [254]:
vocab_custom.set_default_index(vocab_custom['<unk>'])

In [255]:
vocab_custom['european']

178

In [256]:
vocab_custom['<unk>']

0

In [257]:
vocab_custom['cleber']

0

In [258]:
len(vocab_custom)

4137

In [259]:
def vocab(tokens):
  result = []
  for token in tokens:
    result.append(vocab_custom[token])

  return result

In [260]:
vocab(['that', 'football', 'player', 'is', 'going', 'to', 'be', 'the', 'mvp'])

[18, 139, 549, 25, 797, 4, 44, 1, 0]

In [261]:
vocab(['that', 'football', 'player', 'is', 'going', 'to', 'be', 'the', 'best'])

[18, 139, 549, 25, 797, 4, 44, 1, 509]

### Alterando 'Category' para valores numéricos

In [262]:
import pandas as pd
categoria_numerico_treino = pd.factorize(dataset['train']['Category'])[0]
categoria_numerico_teste = pd.factorize(dataset['test']['Category'])[0]

  categoria_numerico_treino = pd.factorize(dataset['train']['Category'])[0]
  categoria_numerico_teste = pd.factorize(dataset['test']['Category'])[0]


In [263]:
print(dataset['train']['Category'][:6])

['sports', 'business', 'politics', 'health', 'politics', 'sports']


In [264]:
print(categoria_numerico_treino[:6])

[0 1 2 3 2 0]


In [265]:
print(dataset['test']['Category'][:6])

['politics', 'politics', 'business', 'health', 'business', 'politics']


In [266]:
print(categoria_numerico_teste[:6])

[0 0 1 2 1 0]


In [267]:
dataset = dataset.remove_columns("Category")

In [268]:
dataset['train'] = dataset['train'].add_column(name="Category", column=categoria_numerico_treino)
dataset['test'] = dataset['test'].add_column(name="Category", column=categoria_numerico_teste)

In [269]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Title', 'Excerpt', 'tokens', 'Category'],
        num_rows: 4686
    })
    test: Dataset({
        features: ['Title', 'Excerpt', 'tokens', 'Category'],
        num_rows: 828
    })
})

In [270]:
dataset['train']['Category'][:6]

[0, 1, 2, 3, 2, 0]

### Token to ID

In [271]:
def token2id(row):
    row['tokens_id'] = vocab(row['tokens'])
    return row

In [272]:
dataset = dataset.map(token2id)

Map: 100%|██████████| 4686/4686 [00:00<00:00, 8568.27 examples/s]
Map: 100%|██████████| 828/828 [00:00<00:00, 9841.54 examples/s]


In [273]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Title', 'Excerpt', 'tokens', 'Category', 'tokens_id'],
        num_rows: 4686
    })
    test: Dataset({
        features: ['Title', 'Excerpt', 'tokens', 'Category', 'tokens_id'],
        num_rows: 828
    })
})

In [274]:
print(dataset['train']['tokens'][0])

['uefa', 'has', 'opened', 'disciplinary', 'proceedings', 'against', 'barcelona', ',', 'juventus', 'and', 'real', 'madrid', 'over', 'their', 'involvement', 'in', 'the', 'proposed', 'european', 'super', 'league', '.']


In [275]:
print(dataset['train']['tokens_id'][0])

[577, 7, 755, 0, 3008, 64, 229, 2, 465, 10, 210, 196, 41, 36, 3633, 6, 1, 572, 178, 123, 50, 9]


In [276]:
class NEWSDataset(Dataset):
  def __init__(self, dataset):
    self.data = dataset['tokens_id']
    self.labels = dataset['Category']

  def __len__(self):
    return len(self.data)

  def __getitem__(self, x):
    return torch.tensor(self.data[x]), torch.tensor(self.labels[x])

In [277]:
dataset_news = NEWSDataset(dataset['train'])

In [278]:
len(dataset_news)

4686

In [279]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Title', 'Excerpt', 'tokens', 'Category', 'tokens_id'],
        num_rows: 4686
    })
    test: Dataset({
        features: ['Title', 'Excerpt', 'tokens', 'Category', 'tokens_id'],
        num_rows: 828
    })
})

In [280]:
dataset_news[0][0].shape

torch.Size([22])

In [281]:
dataset_news[0][1].shape

torch.Size([])

In [282]:
from torch.utils.data import DataLoader

dataloader = DataLoader(NEWSDataset(dataset['train']), shuffle=True)