In [1]:
import pandas as pd
import numpy as np
import spacy
import thinc.extra.datasets
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
train, test = thinc.extra.datasets.imdb()

Downloading data from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz


In [10]:
!python -m spacy download en_core_web_md

Collecting en_core_web_md==2.3.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-2.3.0/en_core_web_md-2.3.0.tar.gz (50.8 MB)
[K     |████████████████████████████████| 50.8 MB 937 kB/s eta 0:00:01     |██████████████████████▎         | 35.3 MB 1.2 MB/s eta 0:00:14     |██████████████████████▌         | 35.6 MB 1.2 MB/s eta 0:00:13
Building wheels for collected packages: en-core-web-md
  Building wheel for en-core-web-md (setup.py) ... [?25ldone
[?25h  Created wheel for en-core-web-md: filename=en_core_web_md-2.3.0-py3-none-any.whl size=50921514 sha256=f91a1193e7be0cd42ddbec262aad1c752456f317e0e2aca31bd30fc3da0588d8
  Stored in directory: /tmp/pip-ephem-wheel-cache-mtl8x5wt/wheels/a9/30/7d/40a0d13f1ddae5b6398c9f407391942152348eb9eae62fa21e
Successfully built en-core-web-md
Installing collected packages: en-core-web-md
Successfully installed en-core-web-md-2.3.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via s

# TF-IDF based

In [3]:
def spacy_tokenizer(sentence):
    import spacy
    nlp = spacy.load('en_core_web_md')
    mytokens = nlp(sentence)
    mytokens = [ word.lemma_ for word in mytokens if not word.is_punct and not word.is_stop]
    return mytokens

In [70]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [75]:
df_train = pd.DataFrame(train, columns=['x', 'y'])
df_test = pd.DataFrame(test, columns=['x', 'y'])

In [72]:
pipeline = Pipeline(steps=[('tfidf', TfidfVectorizer(tokenizer=spacy_tokenizer)),
                           ('clf', LogisticRegression())
                          ])

In [79]:
# 20-30 minutes
pipeline.fit(df_train['x'], df_train['y'])

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='...\w+\\b',
                                 tokenizer=<function spacy_tokenizer at 0x7fd855b045f0>,
                                 use_idf=True, vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_i

In [81]:
pd.to_pickle(pipeline, './tf_idf_logistic.pkl')

In [83]:
# 20 mins
y_pred_test = pipeline.predict_proba(df_test['x'])

In [84]:
sklearn.metrics.roc_auc_score(df_test['y'], y_pred_test[:, 1])

0.9485079168000001

In [86]:
df_test['y'].sum()

12500

# embedding based

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets
from tensorboardX import SummaryWriter
import random

In [5]:
TEXT = data.Field(tokenize = 'spacy', tokenizer_language='en_core_web_md', lower=True, include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)

In [6]:
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
train_data, valid_data = train_data.split()

downloading aclImdb_v1.tar.gz


aclImdb_v1.tar.gz: 100%|██████████| 84.1M/84.1M [00:26<00:00, 3.22MB/s]


In [7]:
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_data, 
                 max_size=MAX_VOCAB_SIZE,
                 vectors='glove.6B.300d',
                 unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

.vector_cache/glove.6B.zip: 862MB [07:05, 2.03MB/s]                               
100%|█████████▉| 399226/400000 [00:34<00:00, 10642.58it/s]

In [8]:
BATCH_SIZE = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

cuda


In [9]:
class ImdbRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers,
              is_bidirectional=False, dropout=0.0, output_dim=1, padding_idx=None):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, 
                                      padding_idx=padding_idx)

        self.lstm = nn.LSTM(embedding_dim, hidden_dim, 
                            num_layers=n_layers, bidirectional=is_bidirectional,
                           dropout=dropout)

        self.fc = nn.Linear((is_bidirectional+1)*hidden_dim, output_dim)

        self.is_bidirectional = is_bidirectional
    
    def forward(self, input_sequence, sequence_length):
    
        embeddings = self.embedding(input_sequence)

        packed_embeddings = nn.utils.rnn.pack_padded_sequence(embeddings, 
                                                            sequence_length)

        packed_output, (hidden_state, cell_state) = self.lstm(packed_embeddings)

        if self.is_bidirectional:
            output = torch.cat((hidden_state[-2,:,:], hidden_state[-1,:,:]), dim = 1)
        else:
            output = hidden_state[-1,:,:]
        scores = self.fc(output)

        return scores

In [14]:
vocab_size = len(TEXT.vocab)
embedding_dim = 300 # This needs to match the size of the pre-trained embeddings!
hidden_dim = 256
num_layers = 3
dropout = 0.5
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
model = ImdbRNN(vocab_size=vocab_size, embedding_dim=embedding_dim,hidden_dim=hidden_dim,
                  n_layers=num_layers,  dropout=dropout,
                  padding_idx=pad_idx)

In [15]:
# Initialize word embeddings
glove_vectors = TEXT.vocab.vectors
model.embedding.weight.data.copy_(glove_vectors)
# Zero out <unk> and <pad> tokens
unk_idx = TEXT.vocab.stoi[TEXT.unk_token]
model.embedding.weight.data[unk_idx] = torch.zeros(embedding_dim)
model.embedding.weight.data[pad_idx] = torch.zeros(embedding_dim)
# Define our loss function, optimizer, and move things to GPU
criterion = nn.BCEWithLogitsLoss()
model = model.to(device)
criterion = criterion.to(device)
optimizer = optim.Adam(model.parameters())

In [16]:
import sklearn.metrics
def roc(scores, y):    
    scores = torch.sigmoid(scores).detach().cpu()
    return sklearn.metrics.roc_auc_score(y.cpu(), scores)

def train(model, iterator, optimizer, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for batch in iterator:
        
        optimizer.zero_grad()
        
        text, text_lengths = batch.text
        
        predictions = model(text, text_lengths).squeeze(1)
        
        loss = criterion(predictions, batch.label)
        
        acc = roc(predictions, batch.label)
        
        loss.backward()
        
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
def evaluate(model, iterator, criterion):
    
    epoch_loss = 0
    epoch_acc = 0
    
    model.eval()
    
    with torch.no_grad():
    
        for batch in iterator:
            text, text_lengths = batch.text
            
            predictions = model(text, text_lengths).squeeze(1)
            
            loss = criterion(predictions, batch.label)
            
            acc = roc(predictions, batch.label)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
      
    
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [17]:
summary_writer = SummaryWriter(log_dir=f"tf_log/")
num_epochs = 5
best_valid_loss = float('inf')
for epoch in range(num_epochs):    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
    
    # Log the training results
    summary_writer.add_scalar("training/accuracy", train_acc, epoch)
    summary_writer.add_scalar("training/loss", train_loss, epoch)
    
    # Log the validation results
    summary_writer.add_scalar("validation/accuracy", valid_acc, epoch)
    summary_writer.add_scalar("validation/loss", valid_loss, epoch)
    
    print(f'Epoch {epoch+1}') 
    print(f'train_loss : {train_loss} val_loss : {valid_loss}')
    print(f'train_accuracy : {train_acc*100} val_accuracy : {valid_acc*100}')
    if valid_loss <= best_valid_loss:
        torch.save(model.state_dict(), './state_dict.pt')
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(best_valid_loss,
                                                                                        valid_loss))
        best_valid_loss = min(best_valid_loss, valid_loss)
    print(25*'==')
 
   
# After completing all epochs, visualize our word vectors
vecs = model.embedding.weight.data
labels = [l.encode('utf8') for l in TEXT.vocab.itos]
summary_writer.add_embedding(vecs, 
                             metadata=labels)
summary_writer.close()
# Print test performance
test_loss, test_accuracy = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f}\nTest Acc: {test_accuracy*100:.2f}%')

Epoch 1
train_loss : 0.6623532661121257 val_loss : 0.6884514497498334
train_accuracy : 63.10034960354696 val_accuracy : 66.15630441180073
Validation loss decreased (inf --> 0.688451).  Saving model ...
Epoch 2
train_loss : 0.6889677321823844 val_loss : 0.69243919142222
train_accuracy : 55.66974941201753 val_accuracy : 75.55368059955272
Epoch 3
train_loss : 0.5672216991873553 val_loss : 0.45616439136408143
train_accuracy : 81.30125140256432 val_accuracy : 90.91957832496959
Validation loss decreased (0.688451 --> 0.456164).  Saving model ...
Epoch 4
train_loss : 0.30173119065100257 val_loss : 0.29135100189912116
train_accuracy : 95.51699222123537 val_accuracy : 95.15840968623243
Validation loss decreased (0.456164 --> 0.291351).  Saving model ...
Epoch 5
train_loss : 0.15816742665793773 val_loss : 0.32047024268214985
train_accuracy : 98.42993224657101 val_accuracy : 95.70143335059566
Test Loss: 0.376
Test Acc: 94.47%


# use wrapper from textattact

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchtext import data
from torchtext import datasets
from tensorboardX import SummaryWriter
import random

In [2]:
TEXT = data.Field(tokenize = 'spacy', 
                  tokenizer_language='en_core_web_md', 
                  lower=True, 
                  include_lengths = True)
LABEL = data.LabelField(dtype = torch.float)

In [3]:
train_data, test_data = datasets.IMDB.splits(TEXT, LABEL)
train_data, valid_data = train_data.split()

In [4]:
MAX_VOCAB_SIZE = 25000
TEXT.build_vocab(train_data, 
                 max_size=MAX_VOCAB_SIZE,
                 vectors='glove.6B.300d',
                 unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

In [5]:
BATCH_SIZE = 128
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
train_iterator, valid_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE,
    sort_within_batch = True,
    device = device)

cuda


In [6]:
from textattack.models.helpers import LSTMForClassification

In [7]:
import thinc.extra.datasets
train, test = thinc.extra.datasets.imdb()

In [8]:
from textattack.commands.train_model.run_training import train_model

In [9]:
class Args:
    def __init__(self):
        
        self.output_dir = './ta_model/'
        self.enable_wandb = False
        self.allowed_labels = []
        self.batch_size = 128 
        self.grad_accum_steps = 1
        self.num_train_epochs = 5
        self.learning_rate = 0.001
        self.warmup_proportion = 0.1
        self.max_length = 256
        self.weights_name = 'pytorch_model.bin'
        self.config_name = 'config.json'
        self.tb_writer_step = 1000
        self.checkpoint_steps = -1
        self.checkpoint_every_epoch = True
        self.early_stopping_epochs = -1
        
        

In [10]:
args = Args()

In [None]:
model = LSTMForClassification(hidden_size=256, depth=3, dropout=0.5, max_seq_length=256)

In [None]:
vocab_size = len(TEXT.vocab)
embedding_dim = 300 # This needs to match the size of the pre-trained embeddings!
hidden_dim = 256
num_layers = 3
dropout = 0.5
pad_idx = TEXT.vocab.stoi[TEXT.pad_token]
model = ImdbRNN(vocab_size=vocab_size, embedding_dim=embedding_dim,hidden_dim=hidden_dim,
                  n_layers=num_layers,  dropout=dropout,
                  padding_idx=pad_idx)