In [31]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm, tqdm_notebook

In [33]:
# Run on the first GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Only query the GPU name when a GPU is actually present: calling
# torch.cuda.get_device_name on a CPU-only machine raises, which would stop
# the notebook before the CPU fallback above could take effect.
if device.type == 'cuda':
    print(f'Using device: {torch.cuda.get_device_name(0)}')
else:
    print('Using device: cpu')

In [14]:
# Path of the reviews CSV, relative to the directory the notebook is run from.
DATA_PATH = "./imdb_reviews.csv"

In [15]:
# Read the reviews and replace the textual labels with integer classes
# (negative -> 0, positive -> 1). A label outside this mapping raises KeyError.
sentiment = dict(negative=0, positive=1)
df = pd.read_csv(DATA_PATH)
df['sentiment'] = list(map(sentiment.__getitem__, df['sentiment']))
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [20]:
class Sequences(Dataset):
    """Bag-of-words dataset built from a DataFrame of labelled reviews.

    The frame must expose a ``review`` column (text) and a ``sentiment``
    column (label). The reviews are vectorized once, at construction time,
    with a ``CountVectorizer`` (English stop words removed, ``max_df=0.99``,
    ``min_df=0.005``); the fitted vectorizer is kept on the instance so new
    text can be transformed with the same vocabulary later.

    Attributes:
        vectorizer: the fitted ``CountVectorizer``.
        sequences: sparse document-term matrix, one row per review.
        labels: list of labels, aligned with the rows of ``sequences``.
        token2idx: mapping token -> column index (the vectorizer vocabulary).
        idx2token: inverse mapping column index -> token.
    """

    def __init__(self, df):
        vectorizer = CountVectorizer(stop_words='english', max_df=0.99, min_df=0.005)
        reviews = df.review.tolist()

        self.vectorizer = vectorizer
        self.sequences = vectorizer.fit_transform(reviews)
        self.labels = df.sentiment.tolist()

        # Keep both lookup directions between tokens and column indices.
        self.token2idx = vectorizer.vocabulary_
        self.idx2token = dict((idx, token) for token, idx in self.token2idx.items())

    def __len__(self):
        """Return the number of vectorized reviews (rows of the matrix)."""
        num_rows = self.sequences.shape[0]
        return num_rows

    def __getitem__(self, i):
        """Return ``(dense_counts, label)`` for row ``i``.

        ``dense_counts`` is the sparse row converted with ``toarray()``, so it
        keeps a leading axis of length 1 (shape ``(1, vocab_size)``).
        """
        row = self.sequences[i, :]
        return row.toarray(), self.labels[i]

In [40]:
# Vectorize the whole corpus once and wrap it in a batched loader.
dataset = Sequences(df)

# shuffle=True re-orders the samples at every epoch; without it each epoch
# replays exactly the same batches in file order, which hurts SGD-style
# optimisation and makes the model sensitive to how the CSV happens to be
# sorted.
train_loader = DataLoader(dataset, batch_size=4096, shuffle=True)

# Sanity check: one sample is a dense (1, vocab_size) count vector.
print(dataset[5][0].shape)

(1, 2891)


In [65]:
class BagOfWordsClassifier(nn.Module):
    """Three-layer fully connected network producing one raw logit per sample.

    Args:
        vocab_size: number of input features (size of the bag-of-words vector).
        hidden1: width of the first hidden layer.
        hidden2: width of the second hidden layer.
    """

    def __init__(self, vocab_size, hidden1, hidden2):
        super().__init__()
        self.fc1 = nn.Linear(vocab_size, hidden1)
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.fc3 = nn.Linear(hidden2, 1)

    def forward(self, inputs):
        """Map count vectors to logits.

        ``inputs`` has any size-1 axis at position 1 squeezed away and is cast
        to float before the first layer. No sigmoid is applied to the result.
        """
        activations = inputs.squeeze(1).float()
        # Both hidden layers are followed by a ReLU; the output layer is linear.
        for hidden_layer in (self.fc1, self.fc2):
            activations = torch.relu(hidden_layer(activations))
        logits = self.fc3(activations)
        return logits

In [66]:
# The input layer is sized from the fitted vocabulary; the two hidden layers
# are 128 and 64 units wide. The model is then moved to the selected device.
vocab_size = len(dataset.token2idx)
model = BagOfWordsClassifier(vocab_size, 128, 64)
model = model.to(device)
print(model)

BagOfWordsClassifier(
  (fc1): Linear(in_features=2891, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (fc3): Linear(in_features=64, out_features=1, bias=True)
)


In [67]:
# Binary cross-entropy on raw logits: the model's last layer has no sigmoid,
# so the loss is responsible for it.
criterion = nn.BCEWithLogitsLoss()

# Adam over the trainable parameters only.
optimizer = optim.Adam(
    list(filter(lambda param: param.requires_grad, model.parameters())),
    lr=0.001,
)

In [68]:
# Training loop: a fixed number of passes over train_loader, with gradient
# clipping, recording the mean batch loss of every epoch in train_losses.
N_EPOCHS = 10
MAX_GRAD_NORM = 3  # upper bound on the total gradient norm before each step

model.train()
train_losses = []
for epoch in range(N_EPOCHS):
    progress_bar = tqdm_notebook(train_loader, leave=False)
    losses = []

    for inputs, target in progress_bar:
        inputs, target = inputs.to(device), target.to(device)

        model.zero_grad()

        output = model(inputs)
        # Drop only the trailing logit axis: (batch, 1) -> (batch,). A bare
        # squeeze() would also collapse a batch of size 1 to a 0-d tensor,
        # which no longer matches the (1,) target and makes the loss raise.
        loss = criterion(output.squeeze(-1), target.float())

        loss.backward()

        # clip_grad_norm_ is the in-place replacement for the deprecated
        # clip_grad_norm.
        nn.utils.clip_grad_norm_(model.parameters(), MAX_GRAD_NORM)

        optimizer.step()

        batch_loss = loss.item()
        progress_bar.set_description(f'Loss: {batch_loss:.3f}')
        losses.append(batch_loss)

    # Mean of the batch losses; NaN rather than ZeroDivisionError when the
    # loader yielded no batches.
    epoch_loss = sum(losses) / len(losses) if losses else float('nan')
    train_losses.append(epoch_loss)

    tqdm.write(f'Epoch #{epoch + 1}\tTrain Loss: {epoch_loss:.3f}')

  0%|          | 0/13 [00:00<?, ?it/s]

Epoch #1	Train Loss: 0.623


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch #2	Train Loss: 0.392


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch #3	Train Loss: 0.285


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch #4	Train Loss: 0.257


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch #5	Train Loss: 0.245


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch #6	Train Loss: 0.238


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch #7	Train Loss: 0.232


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch #8	Train Loss: 0.225


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch #9	Train Loss: 0.218


  0%|          | 0/13 [00:00<?, ?it/s]

Epoch #10	Train Loss: 0.208


In [51]:
# Sample sentence used to try out predict_sentiment on the trained model.
test_text = "Yikes, I didn't like the burger at all!"

In [59]:
def predict_sentiment(text):
    """Print the model's sentiment verdict for a single piece of text.

    The text is vectorized with the vectorizer fitted on ``dataset``, pushed
    through the global ``model`` (switched to eval mode) and the logit is
    turned into a probability with a sigmoid. A probability above 0.5 is
    reported as positive, anything else as negative. Nothing is returned.
    """
    model.eval()
    bow_counts = dataset.vectorizer.transform([text]).toarray()

    with torch.no_grad():
        test_vector = torch.LongTensor(bow_counts).to(device)
        logit = model(test_vector)
        prediction = torch.sigmoid(logit).item()

    # Guard clause for the non-positive case; the positive case falls through.
    if not prediction > 0.5:
        print(f'{prediction:0.3}: Negative sentiment')
        return
    print(f'{prediction:0.3}: Positive sentiment')

In [60]:
# Score the sample sentence; the function prints the probability and verdict.
predict_sentiment(test_text)

0.429: Negative sentiment
