# Simple NN for text classification

Download IMDB data as it is described [here](https://towardsdatascience.com/sentiment-analysis-with-python-part-1-5ce197074184).

In [1]:
reviews_train = []
for line in open("../data/imdb/full_train.txt", "r"):
    reviews_train.append(line.strip())
    
reviews_test = []
for line in open("../data/imdb/full_test.txt", "r"):
    reviews_test.append(line.strip())

In [2]:
train_target = [1 if i < 12500 else 0 for i in range(25000)]
test_target = [1 if i < 12500 else 0 for i in range(25000)]

In [3]:
print(f"Train size – {len(reviews_train)}")
print(f"Test size – {len(reviews_test)}")

Train size – 25000
Test size – 25000


In [4]:
import torch
import torch.nn as nn

class SentimentClassificationModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.fc1 = nn.Linear(embed_dim, embed_dim)
        self.fc2 = nn.Linear(embed_dim, num_class)
        self.init_weights()

    def init_weights(self):
        init_range = 0.5
        self.embedding.weight.data.uniform_(-init_range, init_range)
        self.fc1.weight.data.uniform_(-init_range, init_range)
        self.fc1.bias.data.zero_()
        self.fc2.weight.data.uniform_(-init_range, init_range)
        self.fc2.bias.data.zero_()

    def forward(self, text):
        embedded = self.embedding(text)
        fc1_output = self.fc1(embedded)
        pooled_output, _ = fc1_output.max(dim=1)
        return self.fc2(pooled_output)

In [5]:
from itertools import chain

train_tokens = list(chain(*[[token for token in sample.lower().split()] for sample in reviews_train]))

In [6]:
from collections import Counter

train_vocabulary = Counter(train_tokens)

In [7]:
train_vocabulary.most_common(10)

[('the', 322198),
 ('a', 159953),
 ('and', 158572),
 ('of', 144462),
 ('to', 133967),
 ('is', 104171),
 ('in', 90527),
 ('i', 70480),
 ('this', 69714),
 ('that', 66292)]

In [8]:
len(train_vocabulary)

251637

In [9]:
UNKNOWN_TOKEN = "unknown"

index_to_token = [UNKNOWN_TOKEN] + list(train_vocabulary.keys())
token_to_index = {token: index + 1 for index, token in enumerate(train_vocabulary.keys())}
token_to_index[UNKNOWN_TOKEN] = 0

In [10]:
token_to_index["hi"]

24615

In [11]:
index_to_token[24615]

'hi'

In [12]:
EMBEDDING_DIM = 100
BATCH_SIZE = 64
MAX_INPUT_LENGTH = 100

### Define batching

In [13]:
def generate_batch(input_data, max_length = MAX_INPUT_LENGTH):
    texts = torch.tensor([padding(sample["text"], max_length) for sample in input_data], dtype=torch.long)
    labels = torch.tensor([sample["label"] for sample in input_data], dtype=torch.long)
    return texts, labels

def padding(text_tokens, max_length, padding_token = 0):
    if len(text_tokens) >= max_length:
        return text_tokens[:max_length]
    return text_tokens + [padding_token]*(max_length - len(text_tokens))

In [14]:
prepared_data = []

for label, text in zip(train_target, reviews_train):
    text_tokens = [token_to_index[token.lower()] for token in text.split()]
    prepared_data.append({"label": label, "text": text_tokens})

### Train and test strategy

In [15]:
from torch.utils.data import DataLoader

def train(input_data):

    # Train the model
    train_loss = 0
    train_acc = 0
    data = DataLoader(input_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=generate_batch)
    for i, (text, label) in enumerate(data):
        optimizer.zero_grad()
        text, label = text.to(device), label.to(device)
        output = model(text)
        loss = criterion(output, label)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
        train_acc += (output.argmax(1) == label).sum().item()

    return train_loss / len(input_data), train_acc / len(input_data)

def test(input_data):
    loss = 0
    acc = 0
    data = DataLoader(input_data, batch_size=BATCH_SIZE, collate_fn=generate_batch)
    for text, label in data:
        text, label = text.to(device), label.to(device)
        with torch.no_grad():
            output = model(text)
            loss = criterion(output, label)
            loss += loss.item()
            acc += (output.argmax(1) == label).sum().item()

    return loss / len(input_data), acc / len(input_data)

## Training

In [16]:
import random
import numpy

def set_seed(seed: int, n_gpu: int):
    random.seed(seed)
    numpy.random.seed(seed)
    torch.manual_seed(seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed)

In [17]:
set_seed(42, 1)

In [18]:
device = torch.device("cuda:1" if torch.cuda.is_available() else "cpu")

In [19]:
model = SentimentClassificationModel(vocab_size=len(index_to_token), embed_dim=EMBEDDING_DIM, num_class=len(set(train_target))).to(device)

In [20]:
import time
from torch.utils.data.dataset import random_split

N_EPOCHS = 5
min_valid_loss = float('inf')

criterion = torch.nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

train_len = int(len(prepared_data) * 0.95)
train_data, validation_data = \
    random_split(prepared_data, [train_len, len(prepared_data) - train_len])

for epoch in range(N_EPOCHS):

    start_time = time.time()
    train_loss, train_acc = train(train_data)
    valid_loss, valid_acc = test(validation_data)

    secs = int(time.time() - start_time)
    mins = secs / 60
    secs = secs % 60

    print('Epoch: %d' %(epoch + 1), " | time in %d minutes, %d seconds" %(mins, secs))
    print(f'\tLoss: {train_loss:.4f}(train)\t|\tAcc: {train_acc * 100:.1f}%(train)')
    print(f'\tLoss: {valid_loss:.4f}(valid)\t|\tAcc: {valid_acc * 100:.1f}%(valid)')

Epoch: 1  | time in 0 minutes, 1 seconds
	Loss: 0.0470(train)	|	Acc: 52.7%(train)
	Loss: 0.0010(valid)	|	Acc: 66.4%(valid)
Epoch: 2  | time in 0 minutes, 1 seconds
	Loss: 0.0097(train)	|	Acc: 67.4%(train)
	Loss: 0.0014(valid)	|	Acc: 57.0%(valid)
Epoch: 3  | time in 0 minutes, 1 seconds
	Loss: 0.0085(train)	|	Acc: 72.6%(train)
	Loss: 0.0011(valid)	|	Acc: 64.5%(valid)
Epoch: 4  | time in 0 minutes, 1 seconds
	Loss: 0.0076(train)	|	Acc: 76.3%(train)
	Loss: 0.0008(valid)	|	Acc: 73.3%(valid)
Epoch: 5  | time in 0 minutes, 1 seconds
	Loss: 0.0069(train)	|	Acc: 79.1%(train)
	Loss: 0.0006(valid)	|	Acc: 78.0%(valid)


In [22]:
prepared_test_data = []

for label, text in zip(test_target, reviews_test):
    text_tokens = [token_to_index.get(token.lower(), 0) for token in text.split()]
    prepared_test_data.append({"label": label, "text": text_tokens})

In [23]:
test(prepared_test_data)

(tensor(5.3276e-05, device='cuda:1'), 0.75504)