In [136]:
# config

data_path = '../../../../NLP/data/' # please change to your own path
seq_len = 30
fix_embedding = True # fix embedding during training
batch_size = 128
epoch = 5
lr = 0.001
emb_size = 250

In [137]:
# load data
def load_training_data(path='training_label.txt'):
    if 'training_label' in path:
        with open(data_path+path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            lines = [line.strip('\n').split('+++$+++') for line in lines]
        x = [line[1] for line in lines]
        y = [int(line[0].strip()) for line in lines]
        return x, y
    else:
        with open(data_path+path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
            x = [line.strip('\n') for line in lines]
        return x
def load_testing_data(path='testing_data.txt'):
    # 把testing時需要的data讀進來
    with open(data_path + path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        X = ["".join(line.strip('\n').split(",")[1:]).strip() for line in lines[1:]]
    return X

In [138]:
# train word2vec
from gensim.models import word2vec
def train_word2vec(x):
    model = word2vec.Word2Vec(x, size=250, window=5, min_count=5, workers=4, iter=10, sg=1)
    return model

# model.save()

In [139]:
# load data
train_x, train_y = load_training_data('training_label.txt')
train_x_no_label = load_training_data('training_nolabel.txt')
test_x = load_testing_data('testing_data.txt')

In [140]:
# replace special character
def preprocess(data):
    '''
    Credit goes to https://www.kaggle.com/gpreda/jigsaw-fast-compact-solution
    '''
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~`" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
    def clean_special_chars(text, punct):
        for p in punct:
            text = text.replace(p, ' ')
        return text

    data = [clean_special_chars(s, punct) for s in data]
    return data

In [141]:
from keras.preprocessing import text, sequence
tokenizer = text.Tokenizer()
tokenizer.fit_on_texts(train_x + train_x_no_label + test_x)
# clean punct
train_x = preprocess(train_x)
test_x = preprocess(test_x)
# tokenize sentence and give each word index
train_x = tokenizer.texts_to_sequences(train_x)
test_x = tokenizer.texts_to_sequences(test_x)
# cut sentence by maxlen
train_x = sequence.pad_sequences(train_x, maxlen=seq_len)
test_x = sequence.pad_sequences(test_x, maxlen=seq_len)

In [142]:
## build word embedding matrix
from gensim.models import Word2Vec
import numpy as np
def build_embedding_matrix(word_index, embedding):
    embedding_matrix = np.zeros((len(word_index)+1, emb_size))
    unknow_words = []
    for word, i in word_index.items():
        try:
            embedding_matrix[i] = embedding[word]
        except KeyError:
            unknow_words.append(word)
    return embedding_matrix, unknow_words
embedding = Word2Vec.load(data_path + 'word2vec.model')
embedding_matrix, unknow_words = build_embedding_matrix(tokenizer.word_index, embedding.wv)

In [143]:
# prepare Data loader
import torch
from torch.utils import data

class TwitterDataset(data.Dataset):
    """
    Expected data shape like:(data_num, data_len)
    Data can be a list of numpy array or a list of lists
    input data shape : (data_num, seq_len, feature_dim)
    
    __len__ will return the number of data
    """
    def __init__(self, X, y):
        self.data = X
        self.label = y
    def __getitem__(self, idx):
        if self.label is None: return self.data[idx]
        return self.data[idx], self.label[idx]
    def __len__(self):
        return len(self.data)

In [144]:
# model
import torch
from torch.nn import functional as F
from torch import nn
class TwitterSentimentModel(torch.nn.Module):
    def __init__(self, embedding_matrix, fix_embedding, hidden_dim, num_layers, dropout):
        super(TwitterSentimentModel, self).__init__()
        self.embedding = torch.nn.Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1])
        self.embedding.weight = torch.nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float))
        self.embedding.weight.requires_grad = False if fix_embedding else True
        self.embedding_dim = embedding_matrix.shape[1]
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.lstm = torch.nn.LSTM(self.embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.classifier = torch.nn.Sequential( 
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, 1),
            nn.Sigmoid())
    def forward(self, x):
        inputs = self.embedding(x)
        x, _ = self.lstm(inputs, None)
        # x.shape = (batch_size, seq_len, hidden_size)
        # get the last output
        x = x[:, -1, :]
        x = self.classifier(x)
        return x 

LSTM Parameters:
- **input_size** – The number of expected features in the input x
- **hidden_size** – The number of features in the hidden state h
- **num_layers** – Number of recurrent layers. E.g., setting num_layers=2 would mean stacking two LSTMs together to form a stacked LSTM, with the second LSTM taking in outputs of the first LSTM and computing the final results. Default: 1
- **bias** – If False, then the layer does not use bias weights b_ih and b_hh. Default: True
- **batch_first** – If True, then the input and output tensors are provided as (batch, seq, feature). Default: False
- **dropout** – If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, with dropout probability equal to dropout. Default: 0
- **bidirectional** – If True, becomes a bidirectional LSTM. Default: False

Input of LSTM: **input, (h_0, c_0)** `self.lstm(inputs, None)`
- **input of shape (seq_len, batch, input_size)**: tensor containing the features of the input sequence. The input can also be a packed variable length sequence. See torch.nn.utils.rnn.pack_padded_sequence() or torch.nn.utils.rnn.pack_sequence() for details.
- **h_0 of shape (num_layers * num_directions, batch, hidden_size)**: tensor containing the initial hidden state for each element in the batch. If the LSTM is bidirectional, num_directions should be 2, else it should be 1.
- **c_0 of shape (num_layers * num_directions, batch, hidden_size)**: tensor containing the initial cell state for each element in the batch.If (h_0, c_0) is not provided, both h_0 and c_0 default to zero.

Output of LSTM: **output, (h_n, c_n)**


<img src='LSTM1.png' width="500" height="800"></img>
<img src='LSTM2.png' width="500" height="800"></img>

In [145]:
# valid loop
def valid(model, val_loader, criterion):
    total_loss, total_acc = 0., 0.
    v_batch = len(val_loader)
    model.eval()
    with torch.no_grad():
        for i, (x, y) in enumerate(val_loader):
            x = x.to(device, dtype=torch.long)
            y = y.to(device, dtype=torch.float)
            outputs = model(x)
            outputs = outputs.squeeze()
            loss = criterion(outputs, y)
            correct = accuracy(outputs, y, 0.5)
            total_acc += (correct/batch_size)
            total_loss += loss.item()
        print("Valid | Loss:{:.5f} Acc: {:.3f} ".format(total_loss/v_batch, total_acc/v_batch*100))
    return total_loss/v_batch, total_acc/v_batch

In [146]:
import os
import shutil
def save_checkpoint(state, is_best, checkpoint):
    """Saves model and training parameters at checkpoint + 'last.pth.tar'. If is_best==True, also saves
    checkpoint + 'best.pth.tar'

    Args:
        state: (dict) contains model's state_dict, may contain other keys such as epoch, optimizer state_dict
        is_best: (bool) True if it is the best model seen till now
        checkpoint: (string) folder where parameters are to be saved
    """
    filepath = os.path.join(checkpoint, 'last.pth.tar')
    if not os.path.exists(checkpoint):
        print("Checkpoint Directory does not exist! Making directory {}".format(checkpoint))
        os.mkdir(checkpoint)
#     else:
#         print("Checkpoint Directory exists! ")
    torch.save(state, filepath)
    if is_best:
        shutil.copyfile(filepath, os.path.join(checkpoint, 'best.pth.tar'))


def load_checkpoint(checkpoint, model, optimizer=None):
    """Loads model parameters (state_dict) from file_path. If optimizer is provided, loads state_dict of
    optimizer assuming it is present in checkpoint.

    Args:
        checkpoint: (string) filename which needs to be loaded
        model: (torch.nn.Module) model for which the parameters are loaded
        optimizer: (torch.optim) optional: resume optimizer from checkpoint
    """
    if not os.path.exists(checkpoint):
        raise ("File doesn't exist {}".format(checkpoint))
    checkpoint = torch.load(checkpoint)
    model.load_state_dict(checkpoint['state_dict'])

    if optimizer:
        optimizer.load_state_dict(checkpoint['optim_dict'])

    return checkpoint

In [151]:
# train loop
from torch.utils.tensorboard import SummaryWriter
def train(model, train_loader, val_loader, criterion, optimizer, n_epochs, model_dir, restore_file=None):
    if restore_file is not None:
        load_checkpoint(restore_file, model, optimizer) 
    total = sum(p.numel() for p in model.parameters())
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print('\nstart training, parameter total:{}, trainable:{}\n'.format(total, trainable))
    model.train()
    total_loss, total_acc, best_acc = 0, 0, 0
    t_batch = len(train_loader)
    v_batch = len(val_loader)
    show_idx = 50
    writer = SummaryWriter()
    with torch.enable_grad():
        for epoch in range(n_epochs):
            total_loss, total_acc = 0., 0.
            for i, (x, y) in enumerate(train_loader):
                x = x.to(device, dtype=torch.long)
                y = y.to(device, dtype=torch.float)
                optimizer.zero_grad()
                outputs = model(x)
                outputs = outputs.squeeze()
                loss = criterion(outputs, y)
                loss.backward()
                optimizer.step()
                correct = accuracy(outputs, y, 0.5)
                total_acc += (correct/batch_size)
                total_loss += loss.item()
                if (i+1) % show_idx == 0:
                    print('[ Epoch{}: {}/{} ] loss:{:.3f} acc:{:.3f} '.format(
                        epoch+1, i+1, t_batch, loss.item(), correct*100/batch_size), end='\r')
            print('\nTrain | Loss:{:.5f} Acc: {:.3f}'.format(total_loss/t_batch, total_acc/t_batch*100))
            val_loss, val_acc = valid(model, val_loader, criterion)
            if val_acc > best_acc:
                best_acc = val_acc
                print('---find best score---')
                save_checkpoint({'epoch': epoch + 1,
                                'state_dict': model.state_dict(),
                                'optim_dict': optimizer.state_dict()},
                                is_best=True,
                                checkpoint=model_dir)
            print('-----------------------------------------------')
            model.train() # 將model的模式設為train，這樣optimizer就可以更新model的參數（因為剛剛轉成eval模式）
            writer.add_scalar('Loss/train', total_loss/t_batch, epoch)
            writer.add_scalar('Loss/val', val_loss, epoch)
            writer.add_scalar('Accuracy/train', total_acc/t_batch*100, epoch)
            writer.add_scalar('Accuracy/val', val_acc, epoch)

In [148]:
# metrics
def accuracy(outputs, labels, threshold):
    outputs[outputs>=threshold] = 1
    outputs[outputs<threshold] = 0
    correct = torch.sum(torch.eq(outputs, labels)).item()
    return correct

In [149]:
# main
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_train = int(len(train_x)*0.8)
tr_x, val_x = train_x[:num_train], train_x[num_train:]
tr_y, val_y = train_y[:num_train], train_y[num_train:]
# train, val data loader
train_dataset = TwitterDataset(X=tr_x, y=tr_y)
train_loader = torch.utils.data.DataLoader(dataset = train_dataset,
                                    batch_size = batch_size,
                                    shuffle = True)
val_dataset = TwitterDataset(X=val_x, y=val_y)
val_loader = torch.utils.data.DataLoader(dataset = val_dataset,
                                    batch_size = batch_size,
                                    shuffle = False)


model = TwitterSentimentModel(embedding_matrix, fix_embedding=True, hidden_dim=256, num_layers=1, dropout=0.2)
model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.BCELoss()

In [150]:
train(model, train_loader, val_loader, criterion, optimizer, 10, 'model_dir', None)


start training, parameter total:64063199, trainable:520449

[ Epoch1: 1250/1250 ] loss:0.491 acc:75.000 
Train | Loss:0.46112 Acc: 78.278
Valid | Loss:0.43288 Acc: 79.673 
-----------------------------------------------
[ Epoch2: 1250/1250 ] loss:0.379 acc:83.594 
Train | Loss:0.41651 Acc: 80.839
Valid | Loss:0.42033 Acc: 80.696 
-----------------------------------------------
[ Epoch3: 1250/1250 ] loss:0.388 acc:80.469 
Train | Loss:0.39562 Acc: 82.019
Valid | Loss:0.41438 Acc: 80.791 
-----------------------------------------------
[ Epoch4: 1250/1250 ] loss:0.387 acc:81.250 
Train | Loss:0.37823 Acc: 82.895
Valid | Loss:0.40145 Acc: 81.842 
-----------------------------------------------
[ Epoch5: 1250/1250 ] loss:0.378 acc:82.812 
Train | Loss:0.36040 Acc: 83.834
Valid | Loss:0.40268 Acc: 81.849 
-----------------------------------------------
[ Epoch6: 1250/1250 ] loss:0.351 acc:85.938 
Train | Loss:0.33891 Acc: 84.904
Valid | Loss:0.41217 Acc: 81.577 
---------------------------

### Semi Supervised learning

In [152]:
# predict no label data
train_x_no_label = load_training_data('training_nolabel.txt')
train_x_no_label = preprocess(train_x_no_label)
train_x_no_label = tokenizer.texts_to_sequences(train_x_no_label)
train_x_no_label = sequence.pad_sequences(train_x_no_label, maxlen=seq_len)

In [153]:
# add the no label data to train
train_x_no_label_dataset = TwitterDataset(train_x_no_label, None)
train_x_no_label_dataloader = torch.utils.data.DataLoader(dataset = train_x_no_label_dataset,
                                    batch_size = batch_size,
                                    shuffle = False)

In [154]:
# load model
model = TwitterSentimentModel(embedding_matrix, fix_embedding=True, hidden_dim=256, num_layers=1, dropout=0.2)
load_checkpoint('model_dir/best.pth.tar', model, optimizer)
model.to(device)
model.eval()
pred = []
with torch.no_grad():
    for i, x in enumerate(train_x_no_label_dataloader):
        x = x.to(device, dtype=torch.long)
        outputs = model(x)
        outputs = outputs.squeeze()
        pred += outputs.float().tolist()

In [155]:
pred = np.array(pred)

train_x_semi = train_x_no_label[(pred>=0.8) | (pred <=0.2)]
y_semi = (pred[(pred>=0.8) | (pred <=0.2)]>=0.5).astype('int')

new_train_x = np.concatenate([tr_x, train_x_semi])
new_y = np.concatenate([tr_y, y_semi])

new_train_dataset = TwitterDataset(new_train_x, new_y)
new_train_dataloader = torch.utils.data.DataLoader(new_train_dataset, batch_size=batch_size, shuffle=True)

In [157]:
train(model, new_train_dataloader, val_loader, criterion, optimizer, 10, 'semi', 'model_dir/best.pth.tar')


start training, parameter total:64063199, trainable:520449

[ Epoch1: 7100/7129 ] loss:0.134 acc:95.312  
Train | Loss:0.12113 Acc: 97.428
Valid | Loss:0.40268 Acc: 81.849 
---find best score---
-----------------------------------------------
[ Epoch2: 7100/7129 ] loss:0.102 acc:99.219  
Train | Loss:0.12110 Acc: 97.427
Valid | Loss:0.40268 Acc: 81.849 
-----------------------------------------------
[ Epoch3: 7100/7129 ] loss:0.128 acc:96.875  
Train | Loss:0.12110 Acc: 97.414
Valid | Loss:0.40268 Acc: 81.849 
-----------------------------------------------
[ Epoch4: 7100/7129 ] loss:0.128 acc:97.656  
Train | Loss:0.12113 Acc: 97.423
Valid | Loss:0.40268 Acc: 81.849 
-----------------------------------------------
[ Epoch5: 7100/7129 ] loss:0.127 acc:96.094  
Train | Loss:0.12108 Acc: 97.421
Valid | Loss:0.40268 Acc: 81.849 
-----------------------------------------------
[ Epoch6: 7100/7129 ] loss:0.109 acc:98.438  
Train | Loss:0.12118 Acc: 97.421
Valid | Loss:0.40268 Acc: 81.849 