In [0]:
# run in colab

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd

reviews_df = pd.read_csv("/content/drive/My Drive/kaggle_reviews.csv", encoding="cp1251")
reviews_df.sample(5)

Unnamed: 0.1,Unnamed: 0,type,review,label,file
47917,47917,train,I must admit a slight disappointment with this...,pos,8126_7.txt
84089,84089,train,This is quite possibly one of the worst movies...,unsup,40681_0.txt
7061,7061,test,"I was prepared for a bad movie, and a bad movi...",neg,5105_3.txt
52098,52098,train,I gave this a 2 instead of a 1 only because I ...,unsup,1188_0.txt
394,394,test,There is something about this show that keeps ...,neg,10355_4.txt


In [0]:
import re
from string import punctuation

def clean_text(text):
    text = re.sub(r"[^a-zA-Z0-9"+punctuation+" ]", "", text)
    return text

reviews_df["review"] = reviews_df["review"].apply(clean_text)

In [0]:
unsup_df = reviews_df.loc[reviews_df["label"] == "unsup"]
reviews_df = reviews_df.loc[reviews_df["label"] != "unsup"]

test_df = reviews_df.loc[reviews_df["type"] == "test"]
train_df = reviews_df.loc[reviews_df["type"] == "train"]
train_df = train_df.sample(frac=1).reset_index(drop=True)
val_frac = 0.8
val_df = train_df[int(len(train_df)*val_frac):]
train_df = train_df[:int(len(train_df)*val_frac)]

train_df.to_csv('_train.csv', index=False, encoding="utf-8")
test_df.to_csv('_test.csv', index=False, encoding="utf-8")
val_df.to_csv('_val.csv', index=False, encoding="utf-8")

In [0]:
import spacy

nlp = spacy.load('en')

def tokenizer(text): # create a tokenizer function
    text_list = []
    for word in nlp(text):
        l = word.lemma_
        t = word.text
        if l == "-PRON-":
            w = t.lower()
        else:
            w = l.lower()
        if w and not w.startswith(" ") and not w.endswith(" "):
          if " " in w:
            text_list += w.split(" ")
          else:
            text_list.append(w)
    return text_list

In [0]:
unsup_sents = [tokenizer(review) for review in unsup_df["review"]]

In [0]:
import gensim

model = gensim.models.Word2Vec(unsup_sents, size=300, window=7, min_count=3, workers=4)
model.wv.save_word2vec_format("model.w2v")

In [0]:
from torchtext.vocab import Vectors

vectors = Vectors(name="model.w2v", cache=".")

In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torchtext.data import Field, LabelField, TabularDataset, BucketIterator

classes = {
    'neg': 0,
    'pos': 1
}

TEXT = Field(include_lengths=True, batch_first=True, 
             tokenize=tokenizer,
             eos_token='<eos>',
             lower=True)

LABEL = LabelField(dtype=torch.float, use_vocab=True, preprocessing=lambda x: classes[x])

In [0]:
pd.read_csv("_train.csv")

Unnamed: 0.1,Unnamed: 0,type,review,label,file
0,45223,train,"John Holmes is so famous, he's infamous (as th...",pos,5701_8.txt
1,29069,train,This movie is yet another in the long line of ...,neg,2412_1.txt
2,36704,train,Considering that I felt like picking up a new ...,neg,9285_3.txt
3,28273,train,I am a great fan of David Lynch and have every...,neg,1697_2.txt
4,42713,train,This movie leaves the intellectual mind thinki...,pos,3442_10.txt
...,...,...,...,...,...
19995,41669,train,This movie surprised me! Not ever having heard...,pos,2502_8.txt
19996,34762,train,"And I thought The Beach was bad, with the diff...",neg,7537_1.txt
19997,36790,train,The plot sounded like it had promise. To be ho...,neg,9362_1.txt
19998,39217,train,"First ever viewing: July 21, 2008<br /><br />V...",pos,11546_9.txt


In [0]:
train_data, val_data, test_data = TabularDataset.splits(
    path='.', train='_train.csv',
    validation='_val.csv', test='_test.csv', format='csv',
    skip_header=True,
    fields=[(None, None), (None, None), ('text', TEXT), ('label', LABEL), (None, None)])

In [0]:
BATCH_SIZE = 64

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, val_data, test_data), 
    batch_sizes=(BATCH_SIZE, BATCH_SIZE, BATCH_SIZE),
    shuffle=True, sort=False, 
    device = device)

In [0]:
TEXT.build_vocab(train_data, min_freq=3, vectors=vectors)
LABEL.build_vocab(train_data)

In [0]:
class NeuralModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, pad_idx):
        
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        
        self.conv_0 = nn.Conv2d(in_channels = 1, out_channels = n_filters, kernel_size = (filter_sizes[0], embedding_dim))
        self.conv_1 = nn.Conv2d(in_channels = 1, out_channels = n_filters, kernel_size = (filter_sizes[1], embedding_dim))
        self.conv_2 = nn.Conv2d(in_channels = 1, out_channels = n_filters, kernel_size = (filter_sizes[2], embedding_dim))
        
        self.out = nn.Linear(len(filter_sizes) * n_filters, 1)
        
    def forward(self, text):
        embeds = self.embedding(text)        
        embeds = embeds.unsqueeze(1)
                
        conved_0 = F.relu(self.conv_0(embeds).squeeze(3))
        conved_1 = F.relu(self.conv_1(embeds).squeeze(3))
        conved_2 = F.relu(self.conv_2(embeds).squeeze(3))
                    
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
                
        cat = torch.cat((pooled_0, pooled_1, pooled_2), dim = 1)
        
        ret = self.out(cat)
        return ret

In [0]:
input_dim = len(TEXT.vocab)
embedding_dim = 300
n_filters = 100
filter_sizes = [3,4,5]
output_dim = 1
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]

model = NeuralModel(input_dim, embedding_dim, n_filters, filter_sizes, output_dim, pad_idx)

In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

optimizer = optim.Adam(model.parameters())

criterion = nn.BCEWithLogitsLoss()

model = model.to(device)
criterion = criterion.to(device)

In [0]:
def bin_acc(y_pred, y):
    rounded_pred = torch.round(torch.sigmoid(y_pred))
    correct = (rounded_pred == y).float()
    acc = correct.sum() / len(correct)
    return acc

In [0]:
def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()
        
        y_preds = model(batch.text[0]).squeeze(1)
        loss = criterion(y_preds, batch.label)
        acc = bin_acc(y_preds, batch.label)
        
        loss.backward()
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    with torch.no_grad():
        for batch in iterator:
            y_preds = model(batch.text[0]).squeeze(1)
            loss = criterion(y_preds, batch.label)
            acc = bin_acc(y_preds, batch.label)
            
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [0]:
import time

def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [0]:
def train_model(model, train_iter, val_iter, optimizer,
                criterion, num_epochs=100, es_epochs=3):
    best_loss = float('inf')
    best_epoch = 0
    epoch_no_improve = 0

    for epoch in range(num_epochs):
        start_time = time.time()
        
        train_loss, train_acc = train(model, train_iter, optimizer, criterion)
        valid_loss, valid_acc = evaluate(model, val_iter, criterion)
        
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        if valid_loss < best_loss:
            best_loss = valid_loss
            best_epoch = epoch + 1
            torch.save(model.state_dict(), 'model.pt')
            epoch_no_improve = 0
        else:
            epoch_no_improve += 1

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
        print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')
        
        if epoch_no_improve >= es_epochs:
            print("\n")
            print("Early stopping!")
            print(f"Best epoch: {best_epoch:02}")
            break

In [0]:
train_model(model, train_iterator, valid_iterator, optimizer, criterion)

Epoch: 01 | Epoch Time: 0m 51s
	Train Loss: 0.435 | Train Acc: 79.47%
	 Val. Loss: 0.346 |  Val. Acc: 84.79%
Epoch: 02 | Epoch Time: 0m 52s
	Train Loss: 0.232 | Train Acc: 91.20%
	 Val. Loss: 0.337 |  Val. Acc: 85.32%
Epoch: 03 | Epoch Time: 0m 51s
	Train Loss: 0.103 | Train Acc: 97.38%
	 Val. Loss: 0.310 |  Val. Acc: 87.06%
Epoch: 04 | Epoch Time: 0m 52s
	Train Loss: 0.031 | Train Acc: 99.81%
	 Val. Loss: 0.312 |  Val. Acc: 87.88%
Epoch: 05 | Epoch Time: 0m 51s
	Train Loss: 0.009 | Train Acc: 100.00%
	 Val. Loss: 0.323 |  Val. Acc: 87.76%
Epoch: 06 | Epoch Time: 0m 52s
	Train Loss: 0.005 | Train Acc: 100.00%
	 Val. Loss: 0.338 |  Val. Acc: 87.99%


Early stopping!
Best epoch: 03


In [0]:
model.load_state_dict(torch.load('model.pt'))

test_loss, test_acc = evaluate(model, test_iterator, criterion)

print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.300 | Test Acc: 87.77%
