<a href="https://colab.research.google.com/github/tuli-pen/NLP/blob/master/sentiment_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sentiment Analysis

### Dataset Dair-ai Emotion

Tuli Peña y Francisco Gutiérrez

In [None]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m3.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.2 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (19

In [None]:
from datasets import load_dataset

In [None]:
# Load the dataset
dataset = load_dataset("dair-ai/emotion")

# Inspect the data
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})


In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from collections import Counter


# Load the dair-ai/emotion dataset; the code below uses its 'train' split.
dataset = load_dataset("dair-ai/emotion")


def build_vocab(texts):
    """Map every whitespace-separated token in ``texts`` to an integer id.

    Ids are handed out in first-seen order starting at 1. Id 0 is deliberately
    left unassigned because ``pad_sequences`` fills short sequences with 0;
    starting at 0 would make the first token indistinguishable from padding.
    The ``'<unk>'`` entry gets the id after the last token, so the largest id
    equals ``len(vocab)`` and ``len(vocab) + 1`` is a valid embedding-table
    size.

    Args:
        texts: Iterable of strings.

    Returns:
        dict mapping token -> id, always containing an ``'<unk>'`` entry.
    """
    vocab = {}
    for text in texts:
        for word in text.split():
            if word not in vocab:
                # len(vocab) + 1 keeps ids dense while skipping the padding id 0.
                vocab[word] = len(vocab) + 1
    # setdefault: if the corpus itself contains the literal token '<unk>',
    # keep its existing id instead of creating a gap in the id range.
    vocab.setdefault('<unk>', len(vocab) + 1)
    return vocab


def tokenize(texts, vocab):
    """Convert each text into a list of vocabulary ids.

    Texts are split on whitespace; a token missing from ``vocab`` is replaced
    by the value stored under ``'<unk>'`` (``None`` if that key is absent).
    """
    unknown_id = vocab.get('<unk>')
    encoded = []
    for text in texts:
        ids = []
        for word in text.split():
            ids.append(vocab.get(word, unknown_id))
        encoded.append(ids)
    return encoded

class EmotionDataset(Dataset):
    """Torch dataset pairing tokenized texts with their labels."""

    def __init__(self, texts, labels, vocab):
        # Texts are converted to id lists once, up front; labels are kept as given.
        self.labels = labels
        self.texts = tokenize(texts, vocab)

    def __len__(self):
        """Number of examples (one per tokenized text)."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``idx``-th example as a dict with 'text' and 'label' keys."""
        sample = dict(text=self.texts[idx], label=self.labels[idx])
        return sample

def pad_sequences(sequences, max_len):
    """Bring every sequence to ``max_len`` items.

    Sequences shorter than ``max_len`` are right-padded with 0; all others are
    sliced with ``[:max_len]``. A new outer list of new inner lists is
    returned; the inputs are not modified.
    """
    padded = []
    for seq in sequences:
        missing = max_len - len(seq)
        if missing > 0:
            padded.append(seq + [0] * missing)
        else:
            padded.append(seq[:max_len])
    return padded

def collate_fn(batch):
    """Assemble a list of ``{'text', 'label'}`` samples into batch tensors.

    Token-id lists are padded via ``pad_sequences`` to the length of the
    longest one in the batch, then texts and labels are each converted to a
    ``torch.long`` tensor.

    Returns:
        dict with keys 'text' and 'label' holding the two tensors.
    """
    token_lists = []
    label_list = []
    for sample in batch:
        token_lists.append(sample['text'])
        label_list.append(sample['label'])

    longest = max(len(tokens) for tokens in token_lists)
    padded = pad_sequences(token_lists, longest)

    return {
        'text': torch.tensor(padded, dtype=torch.long),
        'label': torch.tensor(label_list, dtype=torch.long),
    }

def preprocess_data(dataset):
    """Split ``dataset`` into training and validation ``EmotionDataset`` objects.

    ``dataset['text']`` and ``dataset['label']`` are split with
    ``train_test_split`` (``test_size=0.2``, ``random_state=42``). The
    vocabulary is built from the training texts only and is shared by both
    resulting datasets.

    Returns:
        Tuple ``(train_dataset, val_dataset, vocab)``.
    """
    split = train_test_split(
        dataset['text'], dataset['label'], test_size=0.2, random_state=42
    )
    train_texts, val_texts, train_labels, val_labels = split

    vocab = build_vocab(train_texts)
    return (
        EmotionDataset(train_texts, train_labels, vocab),
        EmotionDataset(val_texts, val_labels, vocab),
        vocab,
    )

class SimpleNN(nn.Module):
    """Classifier that embeds ids, averages over dim 1 and applies two linear layers."""

    def __init__(self, vocab_size, embed_size, num_classes):
        super().__init__()
        hidden_size = 128
        # Layers are created in the same order as before so that seeded
        # initialisation yields the same parameters.
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.fc1 = nn.Linear(embed_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the output of ``fc2`` for a batch of id tensors.

        ``x`` is embedded, averaged along dimension 1, then passed through
        ``fc1`` -> ReLU -> ``fc2``.
        """
        embedded = self.embedding(x)
        pooled = embedded.mean(dim=1)
        hidden = self.relu(self.fc1(pooled))
        return self.fc2(hidden)

def train_model(train_loader, model, criterion, optimizer, num_epochs=5):
    """Train ``model`` for ``num_epochs`` passes over ``train_loader``.

    After each epoch the mean of the per-batch losses is printed. Reporting
    the loss of whichever batch happened to come last is noisy and is
    undefined when the loader yields no batches, so the epoch mean is used
    instead.

    Args:
        train_loader: Iterable of batches; each batch is a dict with a
            'text' entry (passed to ``model``) and a 'label' entry (passed to
            ``criterion`` as the target).
        model: Module being trained; switched to train mode every epoch.
        criterion: Callable ``(outputs, labels) -> scalar loss tensor``.
        optimizer: Optimizer over ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        list[float]: one mean loss per epoch (``nan`` for an epoch in which
        the loader yielded no batches).
    """
    epoch_losses = []
    for epoch in range(num_epochs):
        model.train()
        loss_sum = 0.0
        num_batches = 0
        for batch in train_loader:
            texts = batch['text']
            labels = batch['label']

            outputs = model(texts)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loss_sum += loss.item()
            num_batches += 1

        # Guard against an empty loader instead of failing on an unbound `loss`.
        mean_loss = loss_sum / num_batches if num_batches else float('nan')
        epoch_losses.append(mean_loss)
        print(f'Epoch {epoch + 1}/{num_epochs}, Loss: {mean_loss}')

    return epoch_losses

def evaluate_model(val_loader, model):
    """Print and return the accuracy (in percent) of ``model`` on ``val_loader``.

    The model is put in eval mode and gradients are disabled. For every batch
    the predicted class is the index of the largest output along dimension 1,
    which is compared with the batch's 'label' entry.

    Args:
        val_loader: Iterable of batches; each batch is a dict with a 'text'
            entry (passed to ``model``) and a 'label' entry.
        model: Module to evaluate. It is left in eval mode afterwards.

    Returns:
        float: ``100 * correct / total``, or ``nan`` when the loader yields no
        samples (previously this case raised ``ZeroDivisionError``).
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in val_loader:
            texts = batch['text']
            labels = batch['label']
            predicted = model(texts).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # Avoid dividing by zero when there was nothing to evaluate.
    accuracy = 100 * correct / total if total else float('nan')
    print(f'La precicion del modelo es: {accuracy:.2f}%')
    return accuracy

# Seed PyTorch's global RNG so that weight initialisation and DataLoader
# shuffling are repeatable on a fresh kernel run.
SEED = 42
torch.manual_seed(SEED)

# Split the 'train' split into training/validation datasets and build the vocabulary.
train_dataset, val_dataset, vocab = preprocess_data(dataset['train'])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn)

# Embedding table gets one more row than there are vocabulary entries.
vocab_size = len(vocab) + 1
embed_size = 100
# Number of distinct label values present in the 'train' split.
num_classes = len(set(dataset['train']['label']))

model = SimpleNN(vocab_size, embed_size, num_classes)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

train_model(train_loader, model, criterion, optimizer)

evaluate_model(val_loader, model)



Epoch 1/5, Loss: 1.3864412307739258
Epoch 2/5, Loss: 1.635778784751892
Epoch 3/5, Loss: 1.4418880939483643
Epoch 4/5, Loss: 0.7727164626121521
Epoch 5/5, Loss: 0.821401059627533
La precicion del modelo es: 70.56%
