# Add Frameworks and Libraries

In [0]:
# Import libraries
import os
import pandas as pd
import pickle
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import json
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_sample_weight
from multiprocessing import Pool
from nltk.tokenize import word_tokenize
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
from google.colab import drive

# Setup Environments

In [2]:
# nltk, plt setup
nltk.download('punkt')
%matplotlib inline

# drive setup
drive.mount('/content/drive')
dataset_url = "/content/drive/My Drive/NCKUDMPH2/task1/dataset/"
program_url = "/content/drive/My Drive/NCKUDMPH2/task1/program/"

# setup gpu
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


# Load Abstract Dataset

In [0]:
abstract = """With the emergence of mobile and wearable devices, push notification becomes a powerful tool to connect and maintain the relationship with app users, but sending inappropriate or too many messages at the wrong time may result in the app being removed by the users. In order to maintain the retention rate and the delivery rate of advertisement, we adopt deep neural network (DNN) to develop a pop-up recommendation system “Click-sequence-aware deeP neural network (DNN)-based Pop-uPs recOmmendation (C-3PO)” enabled by collaborative filtering-based hybrid user behavioral analysis. We further verified the system with real data collected from the product security master, clean master, and CM browser, supported by Leopard Mobile Inc. (Cheetah Mobile Taiwan Agency). In this way, we can know precisely about users’ preference and frequency to click on the push notification/pop-ups, decrease the troublesome to users efficiently, and meanwhile increase the click-through rate of push notifications/pop-ups."""

sent_text = pd.Series(nltk.sent_tokenize(abstract))
testData = pd.DataFrame(sent_text, columns=['Abstract'])

# Setup Hyperparameters

In [0]:
embedding_dim = 300
hidden_dim = 512
learning_rate = 1e-4
max_epoch = 15
batch_size = 64

# Helpers

## Tokenize Helpers

In [0]:
# tokenize the words
def collect_words(data_path, n_workers=4):
    df = pd.read_csv(data_path, dtype=str)
        
    sent_list = []
    for i in df.iterrows():
        sent_list += i[1]['Abstract'].split('$$$')

    chunks = [
        ' '.join(sent_list[i:i + len(sent_list) // n_workers])
        for i in range(0, len(sent_list), len(sent_list) // n_workers)
    ]
    with Pool(n_workers) as pool:
        chunks = pool.map_async(word_tokenize, chunks)
        words = set(sum(chunks.get(), []))

    return words

## Data Formatting Helpers

In [0]:
def label_to_onehot(labels):
    label_dict = {'BACKGROUND': 0, 'OBJECTIVES':1, 'METHODS':2, 'RESULTS':3, 'CONCLUSIONS':4, 'OTHERS':5}
    onehot = [0,0,0,0,0,0]
    for l in labels.split('/'):
        onehot[label_dict[l]] = 1
    return onehot

def sentence_to_indices(sentence, word_dict):
    return [word_dict.get(word,UNK_TOKEN) for word in word_tokenize(sentence)]

def get_dataset(data_path, word_dict, n_workers=4):
    dataset = testData

    results = [None] * n_workers
    with Pool(processes=n_workers) as pool:
        for i in range(n_workers):
            batch_start = (len(dataset) // n_workers) * i
            if i == n_workers - 1:
                batch_end = len(dataset)
            else:
                batch_end = (len(dataset) // n_workers) * (i + 1)
            
            batch = dataset[batch_start: batch_end]
            results[i] = pool.apply_async(preprocess_samples, args=(batch,word_dict))

        pool.close()
        pool.join()

    processed = []
    for result in results:
        processed += result.get()
    return processed

def preprocess_samples(dataset, word_dict):
    processed = []
    for sample in tqdm(dataset.iterrows(), total=len(dataset)):
        processed.append(preprocess_sample(sample[1], word_dict))

    return processed

def preprocess_sample(data, word_dict):
    processed = {}
    processed['Abstract'] = [sentence_to_indices(sent, word_dict) for sent in data['Abstract'].split('$$$')]
    if 'Task 1' in data:
        processed['Label'] = [label_to_onehot(label) for label in data['Task 1'].split(' ')]
        
    return processed

## Data Packing Helpers

In [0]:
# Data Packing
class AbstractDataset(Dataset):
    def __init__(self, data, pad_idx, max_len = 500):
        self.data = data
        self.pad_idx = pad_idx
        self.max_len = max_len
    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.data[index]
        
    def collate_fn(self, datas):
        # get max length in this batch
        max_sent = max([len(data['Abstract']) for data in datas])
        max_len = max([min(len(sentence), self.max_len) for data in datas for sentence in data['Abstract']])
        batch_abstract = []
        batch_label = []
        sent_len = []
        for data in datas:
            # padding abstract to make them in same length
            pad_abstract = []
            for sentence in data['Abstract']:
                if len(sentence) > max_len:
                    pad_abstract.append(sentence[:max_len])
                else:
                    pad_abstract.append(sentence+[self.pad_idx]*(max_len-len(sentence)))
            sent_len.append(len(pad_abstract))
            pad_abstract.extend([[self.pad_idx]*max_len]*(max_sent-len(pad_abstract)))
            batch_abstract.append(pad_abstract)
            # gather labels
            if 'Label' in data:
                pad_label = data['Label']
                pad_label.extend([[0]*6]*(max_sent-len(pad_label)))
                
                batch_label.append(pad_label)
        return torch.LongTensor(batch_abstract), torch.FloatTensor(batch_label), sent_len

## Score Helpers

In [0]:
# Score methods
class F1():
    def __init__(self):
        self.threshold = 0.4
        self.n_precision = 0
        self.n_recall = 0
        self.n_corrects = 0
        self.name = 'F1'

    def reset(self):
        self.n_precision = 0
        self.n_recall = 0
        self.n_corrects = 0

    def update(self, predicts, groundTruth):
        predicts = predicts > self.threshold
        self.n_precision += torch.sum(predicts).data.item()
        self.n_recall += torch.sum(groundTruth).data.item()
        self.n_corrects += torch.sum(groundTruth.type(torch.uint8) * predicts).data.item()

    def get_score(self):
        recall = self.n_corrects / self.n_recall
        precision = self.n_corrects / (self.n_precision + 1e-20) #prevent divided by zero
        return 2 * (recall * precision) / (recall + precision + 1e-20)

    def print_score(self):
        score = self.get_score()
        return '{:.5f}'.format(score)

## Train Helpers

In [0]:
# Train helpers
def _run_epoch(epoch, training):
    model.train(training)
    if training:
        description = 'Train'
        dataset = trainData
        shuffle = True
    else:
        description = 'Valid'
        dataset = validData
        shuffle = False

    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, collate_fn=dataset.collate_fn, num_workers=8)

    trange = tqdm(enumerate(dataloader), total=len(dataloader), desc=description)
    loss = 0
    f1_score = F1()
    for i, (x, y, sent_len) in trange:
        o_labels, batch_loss = _run_iter(x,y)
        if training:
            opt.zero_grad()
            batch_loss.backward()
            opt.step()

        loss += batch_loss.item()
        f1_score.update(o_labels.cpu(), y)

        trange.set_postfix(
            loss=loss / (i + 1), f1=f1_score.print_score())
        del o_labels, batch_loss # Flush memory

    if training:
        history['train'].append({'f1':f1_score.get_score(), 'loss':loss/ len(trange)})
    else:
        history['valid'].append({'f1':f1_score.get_score(), 'loss':loss/ len(trange)})
    
    

def _run_iter(x,y):
    abstract = x.to(device)
    labels = y.to(device)
    o_labels = model(abstract)
    l_loss = criteria(o_labels, labels)
    l_loss = (l_loss * sample_weights).mean()
    del abstract, labels  # flush
    return o_labels, l_loss

def save(epoch):
    if not os.path.exists(program_url+'model'):
        os.makedirs(program_url+'model')
    torch.save(model.state_dict(), program_url+'model/model.pkl.'+str(epoch))
    with open(program_url+'model/history.json', 'w') as f:
        json.dump(history, f, indent=4)

# Data-Preprocessing

In [0]:
# Create a word dictionary
PAD_TOKEN = 0
UNK_TOKEN = 1

if os.path.exists(program_url+'dicitonary.pkl'):
    with open(program_url+'dicitonary.pkl','rb') as f:
        word_dict = pickle.load(f)
else:
    words = set()
    words |= collect_words(dataset_url+'trainset.csv')

    word_dict = {'<pad>':PAD_TOKEN,'<unk>':UNK_TOKEN}
    for word in words:
        word_dict[word]=len(word_dict)

    with open(program_url+'dicitonary.pkl','wb') as f:
        pickle.dump(word_dict, f)

In [12]:
print('[INFO] Start processing testset...')
test = get_dataset(dataset_url+'testset.csv', word_dict, n_workers=8)

[INFO] Start processing testset...


0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
0it [00:00, ?it/s]
100%|██████████| 4/4 [00:00<00:00, 828.87it/s]


In [0]:
testData = AbstractDataset(test, PAD_TOKEN, max_len = 128)

# Create GloVe Embedding vectors

In [0]:
glove_name = 'glove.840B.300d'

if os.path.exists(program_url+'embedding_matrix_{}'.format(glove_name)):
    with open(program_url+'embedding_matrix_{}'.format(glove_name),'rb') as f:
        embedding_matrix = pickle.load(f)
else:
    # Parse the unzipped file (a .txt file) to build an index that maps 
    # words (as strings) to their vector representation (as number vectors)
    wordvector_path = dataset_url+'glove/{}.txt'.format(glove_name)
    embeddings_index = {}
    f = open(wordvector_path, encoding='utf8')
    for line in f:
        values = line.split()
        word = ''.join(values[:-300])
        coefs = np.asarray(values[-300:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()
    print('Found %s word vectors.' % len(embeddings_index))

    # Preparing the GloVe word-embeddings matrix
    max_words = len(word_dict)
    embedding_matrix = np.zeros((max_words, embedding_dim))
    for word, i in word_dict.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

    embedding_matrix = torch.FloatTensor(embedding_matrix)
    with open(program_url+'embedding_matrix_{}'.format(glove_name),'wb') as f:
        pickle.dump(embedding_matrix, f)

# Create Nets

In [0]:
class GloVeNet(nn.Module):
    def __init__(self, vocabulary_size):
        super(GloVeNet, self).__init__()
        self.embedding_size = embedding_dim
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocabulary_size, self.embedding_size)
        self.embedding.weight = torch.nn.Parameter(embedding_matrix)
        self.word_rnn = nn.GRU(self.embedding_size,
                                self.hidden_dim,
                                bidirectional=True,
                                batch_first=True)
        self.sent_rnn = nn.GRU(self.hidden_dim*2, self.hidden_dim, bidirectional=True, batch_first=True)
        
        # 1024 => 512
        self.l1 = nn.Linear(self.hidden_dim*2, self.hidden_dim)
        torch.nn.init.kaiming_normal_(self.l1.weight)
        # 512 => 6
        self.l2 = nn.Linear(self.hidden_dim, 6)
    

    def forward(self, x):
        # input 32, 11, 64
        x = self.embedding(x)  # emb 32, 11, 64, 300
        b,s,w,e = x.shape
        x = x.view(b,s*w,e)  # sent*word 32, 704, 300
        x, __ = self.word_rnn(x)  # rnn 32, 704, 1024
        x = x.view(b,s,w,-1)  # unwrap sw 32, 11, 64, 1024
        # 32 batch, 11 sentence, each 64 words
        x = torch.max(x,dim=2)[0]  # 32, 11, 1024
        x, __ = self.sent_rnn(x)  # 32, 11, 1024
        x = torch.relu(self.l1(x))  # 32, 11, 512
        x = torch.sigmoid(self.l2(x)) # 32, 11, 6
        return x

# Start Predicting

In [16]:
model = GloVeNet(len(word_dict))
opt = torch.optim.RMSprop(model.parameters(), lr=learning_rate)
criteria = torch.nn.BCELoss(reduction='none')
model.to(device)

best_model = 709652
model.load_state_dict(state_dict=torch.load(os.path.join(program_url,'model/model.pkl.{}'.format(best_model))))

# Use trained model to predict
model.train(False)
dataloader = DataLoader(dataset=testData,
                            batch_size=64,
                            shuffle=False,
                            collate_fn=testData.collate_fn,
                            num_workers=8)
trange = tqdm(enumerate(dataloader), total=len(dataloader), desc='Predict')
prediction = []
for i, (x, y, sent_len) in trange:
    o_labels = model(x.to(device))
    result = o_labels>0.4
    for idx, o_label in enumerate(result):
        prediction.append(o_label[:sent_len[idx]].to('cpu'))
prediction = torch.cat(prediction).detach().numpy().astype(int)

Predict: 100%|██████████| 1/1 [00:00<00:00,  7.67it/s]


## Submit Prediction Results

In [18]:
prediction

array([[1, 0, 0, 0, 0, 0],
       [0, 1, 1, 0, 0, 0],
       [0, 1, 0, 1, 0, 0],
       [0, 1, 0, 1, 0, 0]])

In [23]:
sent_text

0    With the emergence of mobile and wearable devi...
1    In order to maintain the retention rate and th...
2    We further verified the system with real data ...
3    In this way, we can know precisely about users...
dtype: object