## This notebook is a framework for neural NLP projects

##### The whole system consists of 5 blocks:
1. tokenizer
2. embedding
3. dataset
4. train_test
5. arg_parser

The overall pipeline is: arg_parser -> tokenizer -> embedding -> dataset -> train_test

### tokenizer

In [10]:
import os
import pickle
import numpy as np

class Tokenizer(object):
    """Whitespace tokenizer that maps words to integer ids.

    Id 0 is used as the padding value. Words seen by ``fit_on_text`` receive
    ids starting at 1 in order of first appearance. At encoding time every
    unseen word is mapped to ``len(word2idx) + 1``.

    Args:
        max_seq_len: Target length used by ``text_to_sequence``; 0 disables
            padding/truncation and a plain Python list is returned.
        lower: Lower-case text before splitting.
        pad_all: When true, ``text_to_sequence`` returns the fixed-length
            padded array; otherwise only the truncated (unpadded) array.
    """

    def __init__(self, max_seq_len=0, lower=True, pad_all=False):
        self.lower = lower
        self.pad_all = pad_all
        self.max_seq_len = max_seq_len
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 1  # next free id; 0 is left for padding

    def fit_on_text(self, text):
        """Add every previously unseen whitespace-separated word of ``text`` to the vocabulary."""
        if self.lower:
            text = text.lower()
        for word in text.split():
            if word not in self.word2idx:
                self.word2idx[word] = self.idx
                self.idx2word[self.idx] = word
                self.idx += 1

    def pad_and_truncate(self, sequence, maxlen, dtype='int64', padding='post',
                         truncating='post', value=0, pad_all=False):
        """Cut ``sequence`` to at most ``maxlen`` items and optionally pad it.

        Args:
            sequence: Sequence of ids.
            maxlen: Maximum (and, when padding, exact) output length.
            dtype: NumPy dtype of the returned array.
            padding: ``'post'`` pads at the end, anything else pads at the front.
            truncating: ``'pre'`` keeps the last ``maxlen`` items, anything else
                keeps the first ``maxlen`` items.
            value: Fill value used for padding.
            pad_all: Return the padded array of length ``maxlen`` when true,
                otherwise only the truncated array.

        Returns:
            A NumPy array of ``dtype``.
        """
        if truncating == 'pre':
            # An explicit start index avoids the ``sequence[-0:]`` pitfall,
            # which would keep the whole sequence when maxlen == 0.
            trunc = sequence[max(len(sequence) - maxlen, 0):]
        else:
            trunc = sequence[:maxlen]
        trunc = np.asarray(trunc, dtype=dtype)
        if not pad_all:
            return trunc

        padded = np.full(maxlen, value, dtype=dtype)
        kept = len(trunc)
        if padding == 'post':
            padded[:kept] = trunc
        else:
            # ``padded[-kept:]`` would select the whole buffer when kept == 0.
            padded[maxlen - kept:] = trunc
        return padded

    def text_to_sequence(self, text, reverse=False, padding='post', truncating='post'):
        """Encode ``text`` as ids, optionally reversed and padded/truncated.

        Returns a list when ``max_seq_len`` is 0, otherwise the array produced
        by ``pad_and_truncate``. Empty text is encoded as ``[0]``.
        """
        if self.lower:
            text = text.lower()
        unknownidx = len(self.word2idx) + 1
        sequence = [self.word2idx.get(word, unknownidx) for word in text.split()]
        if not sequence:
            sequence = [0]
        if reverse:
            sequence = sequence[::-1]
        if self.max_seq_len != 0:
            return self.pad_and_truncate(sequence, self.max_seq_len, padding=padding,
                                         truncating=truncating, pad_all=self.pad_all)
        return sequence

def build_tokenizer(save_name, text=None, max_seq_len=0, pad_all=False):
    """Load a pickled tokenizer from ``save_name`` or build and cache a new one.

    Args:
        save_name: Path of the pickle cache file.
        text: Corpus used to fit a new tokenizer; required when the cache
            file does not exist yet.
        max_seq_len: Forwarded to ``Tokenizer`` when building.
        pad_all: Forwarded to ``Tokenizer`` when building.

    Returns:
        The loaded or newly built tokenizer. A cached tokenizer is returned
        as stored; ``max_seq_len`` and ``pad_all`` are not applied to it.

    Raises:
        ValueError: If no cache exists and ``text`` is None.
    """
    if os.path.exists(save_name):
        print('loading tokenizer:', save_name)
        # NOTE: unpickling can execute arbitrary code; only load trusted cache files.
        with open(save_name, 'rb') as cache:
            return pickle.load(cache)

    if text is None:
        raise ValueError('text is required to build a tokenizer when no cache file exists')
    print('building tokenizer:')
    tokenizer = Tokenizer(max_seq_len, pad_all=pad_all)
    tokenizer.fit_on_text(text)
    with open(save_name, 'wb') as cache:
        pickle.dump(tokenizer, cache)
    return tokenizer

    

### embedding

In [None]:
def _load_word_vec(path, word2idx=None):
    """Read a text embedding file into a ``{word: vector}`` dict.

    Each line is split on single spaces; the first token is the word and the
    remaining tokens are parsed as float32 components.

    Args:
        path: Path to the embedding text file (read as UTF-8, undecodable
            bytes ignored).
        word2idx: Optional vocabulary; when given, only words present in it
            are kept.

    Returns:
        dict mapping each kept word to a float32 NumPy vector.
    """
    word_vec = {}
    # The context manager closes the file even if a line fails to parse.
    with open(path, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        for line in fin:
            tokens = line.rstrip().split(' ')
            word = tokens[0]
            if not word:
                # Blank line: nothing to store.
                continue
            if word2idx is None or word in word2idx:
                word_vec[word] = np.asarray(tokens[1:], dtype='float32')
    return word_vec


def build_embedding_matrix(word2idx, embed_dim, load_path, save_path):
    """Load a cached embedding matrix or build one from pretrained vectors.

    The matrix has ``len(word2idx) + 2`` rows: row 0 is all zeros, and one
    extra row exists beyond the largest vocabulary id. Rows are initialised
    uniformly in [-0.01, 0.01] and overwritten with the pretrained vector
    for every vocabulary word found in ``load_path``.

    Args:
        word2idx: Mapping from word to row index.
        embed_dim: Embedding dimensionality (number of columns).
        load_path: Path of the pretrained word-vector text file.
        save_path: Path of the pickle cache for the finished matrix.

    Returns:
        The embedding matrix as a NumPy array.
    """
    if os.path.exists(save_path):
        print('loading embedding_matrix:', save_path)
        # NOTE: unpickling can execute arbitrary code; only load trusted cache files.
        with open(save_path, 'rb') as cache:
            return pickle.load(cache)

    print('loading word vectors...')
    embedding_matrix = np.random.uniform(-1e-2, 1e-2, size=(len(word2idx) + 2, embed_dim))
    embedding_matrix[0] = 0
    word_vec = _load_word_vec(load_path, word2idx=word2idx)
    print('building embedding_matrix:', save_path)
    # Guard against an empty vocabulary, which would otherwise divide by zero.
    hit_rate = float(len(word_vec)) / len(word2idx) if word2idx else 0.0
    print('hit rate is: ', hit_rate)
    for word, i in word2idx.items():
        vec = word_vec.get(word)
        if vec is not None:
            # Words without a pretrained vector keep their random initialisation.
            embedding_matrix[i] = vec
    with open(save_path, 'wb') as cache:
        pickle.dump(embedding_matrix, cache)
    return embedding_matrix

### dataset

In [None]:
class Dataset_to_modify(data.Dataset):
    """Dataset over paired ``(context, label)`` samples.

    ``da[0]`` holds the contexts and ``da[1]`` the labels. With ``sort=True``
    the samples are reordered by context length, longest first.
    """

    def __init__(self,da,sort=False):
        self.context, self.label = da[0], da[1]
        if sort:
            self.sort_data()

    def sort_data(self):
        """Reorder contexts and labels together by descending context length."""
        lengths = torch.tensor([len(sample) for sample in self.context])
        _, order = torch.sort(lengths, descending=True)
        order = order.long()
        self.context = [self.context[pos] for pos in order]
        self.label = [self.label[pos] for pos in order]

    def __len__(self):
        """Number of samples."""
        return len(self.context)

    def __getitem__(self, index):
        """Return the ``(context, label)`` pair stored at ``index``."""
        sample = self.context[index]
        target = self.label[index]
        return sample, target

def collate_fn_no_shuffle(batch):
    '''
    Collate a list of samples into batched tensors, zero-padding sequences.

    :param batch: list of samples; each sample is a tensor, an int, or a
        sequence (e.g. a ``(context, label)`` tuple) of those
    :return: for sequence samples, a list with one collated entry per field;
        for 0-dim tensors, a stacked 1-D tensor; for 1-D tensors, a long
        tensor of shape (batch, longest) right-padded with 0; for ints, a
        1-D long tensor
    :raises TypeError: if the sample type is not supported
    '''
    # Imported locally: ``collections.Sequence`` was removed in Python 3.10 and
    # this notebook has no file-level ``collections`` import to extend.
    from collections.abc import Sequence

    first = batch[0]
    if torch.is_tensor(first):
        # A 0-dim tensor is a scalar, i.e. the label of a sample.
        if first.dim() == 0:
            return torch.stack(batch)
        max_len = max(len(item) for item in batch)
        padded = torch.zeros(len(batch), max_len, dtype=torch.long)
        for row, item in enumerate(batch):
            padded[row, :len(item)] = item
        return padded
    if isinstance(first, int):
        return torch.tensor(batch).long()
    # Strings are sequences of strings and would recurse without end.
    if isinstance(first, Sequence) and not isinstance(first, (str, bytes)):
        # Transpose samples into fields and collate each field with this same function.
        return [collate_fn_no_shuffle(samples) for samples in zip(*batch)]
    raise TypeError('collate_fn_no_shuffle: unsupported sample type {}'.format(type(first)))

In [None]:
# Demo 1: shuffled batches; collate_fn_no_shuffle pads every batch to the
# length of its own longest sequence.
# NOTE(review): `myds` and `da` are not defined in the cells shown here.
# Presumably `myds` is the dataset class defined in this notebook and `da` is a
# (contexts, labels) pair; confirm before running.
mydata = myds(da)
myloader = data.DataLoader(dataset=mydata,batch_size=3,shuffle=True,collate_fn=collate_fn_no_shuffle)
for context ,label in myloader:
    print(context,label)
    print()

# Demo 2: less padding. Samples are pre-sorted by length so each batch holds
# sequences of similar length; the loader itself does not shuffle, and each
# batch is shuffled internally after it has been fetched (as the train
# function would do).
import ipdb
lp_dataset = myds(da,sort=True)
# Iterate the sorted dataset; building this loader from `mydata` would ignore the sort.
lp_loader = data.DataLoader(dataset=lp_dataset,batch_size=3,shuffle=False,collate_fn=collate_fn_no_shuffle)
for context ,label in lp_loader:
    print(context,label)
    print('---------------')

    # Apply one random permutation to contexts and labels so pairs stay aligned.
    rand_id = torch.randperm(context.shape[0])
    context_shuffle = context[rand_id]
    label_shuffle = label[rand_id]
    print(context_shuffle,label_shuffle)

### arg_parser

In [None]:
# Imports needed by this cell; they were used here but never imported.
import argparse
import logging
import random
import sys
from time import localtime, strftime

import numpy as np
import torch

# Root logger echoes INFO messages to stdout; a per-run log file is added at the end.
logger = logging.getLogger()
logger.setLevel(logging.INFO)
logger.addHandler(logging.StreamHandler(sys.stdout))

# Command-line hyper-parameters and switches.
# NOTE(review): the `store_true` flags below default to False when omitted,
# although their help text says 'default True' -- confirm the intended default.
# NOTE(review): the defaults of --model_name ('bert_spc') and --dataset
# ('twitter') have no entry in `model_classes` / `dataset_files` below, so the
# lookups raise KeyError unless those options are passed explicitly or the
# tables are extended.
parser = argparse.ArgumentParser()
parser.add_argument('--model_name', default='bert_spc', type=str)
parser.add_argument('--dataset', default='twitter', type=str, help='twitter, restaurant, laptop')
parser.add_argument('--optimizer', default='adam', type=str)
parser.add_argument('--initializer', default='xavier_uniform_', type=str)
parser.add_argument('--learning_rate', default=1e-3, type=float, help='try 5e-5, 2e-5 for BERT, 1e-3 for others')
parser.add_argument('--dropout', default=0.1, type=float)
parser.add_argument('--l2reg', default=0.01, type=float)
parser.add_argument('--num_epoch', default=40, type=int, help='try larger number for non-BERT models')
parser.add_argument('--batch_size', default=32, type=int, help='try 16, 32, 64 for BERT models')
parser.add_argument('--log_step', default=5, type=int)
parser.add_argument('--embed_dim', default=300, type=int)
parser.add_argument('--hidden_dim', default=300, type=int)
parser.add_argument('--bert_dim', default=768, type=int)
parser.add_argument('--pretrained_bert_name', default='bert-base-uncased', type=str)
parser.add_argument('--max_seq_len', default=80, type=int)
parser.add_argument('--polarities_dim', default=3, type=int)
parser.add_argument('--hops', default=3, type=int)
parser.add_argument('--device', default=None, type=str, help='e.g. cuda:0')
parser.add_argument('--seed', default=None, type=int, help='set seed for reproducibility')
parser.add_argument('--classifier', action="store_true",  help='default True')
parser.add_argument('--add_loss', action="store_true",  help='default True')
parser.add_argument('--tabsa', action="store_true",  help='default True')
parser.add_argument('--tabsa_with_absa', action="store_true",  help='default True') # if true, then use target
parser.add_argument('--classifier_with_absa', action="store_true",  help='default True')
parser.add_argument('--classifier_with_absa_target', action="store_true",  help='default True')
parser.add_argument('--valset_ratio', default=0, type=float, help='set ratio between 0 and 1 for validation support')

opt = parser.parse_args()

# Seed every random number generator in use and force deterministic cuDNN.
if opt.seed is not None:
    random.seed(opt.seed)
    np.random.seed(opt.seed)
    torch.manual_seed(opt.seed)
    torch.cuda.manual_seed(opt.seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

# Lookup tables that turn the string options into concrete objects.
model_classes = {
    'lstm': LSTM
}
dataset_files = {
    'sem15':{
        'train':'./datasets/semeval2015/ABSA-15_Restaurants_Train_Final_1197.seg',
        'test':'./datasets/semeval2015/ABSA-15_Restaurants_Test_Gold_524.seg.seg',
        'classifier':'./datasets/semeval2015/ABSA-15_Restaurants_Train_Final_1197_classifier_tabsa.seg',
        'classifier_absa_target':'./datasets/semeval2015/ABSA-15_Restaurants_Train_Final_1197_classifier_tabas_with_target.seg'
    },
    'sem16':{
        'train':'./datasets/semeval2016/ABSA16_Restaurants_Train_SB1_v2_1741.seg',
        'test':'./datasets/semeval2016/EN_REST_SB1_TEST.xml.gold_611.seg',
        'classifier':'./datasets/semeval2016/ABSA16_Restaurants_Train_SB1_v2_1741_classifier_tabsa.seg',
        'classifier_absa_target':'./datasets/semeval2016/ABSA16_Restaurants_Train_SB1_v2_1741_classifier_tabsa_with_target.seg'
    }
}

input_colses = {
    'lstm': ['text_raw_indices','aspect_indices']
}

initializers = {
    'xavier_uniform_': torch.nn.init.xavier_uniform_,
    # In-place variant, matching the key and the other entries; the
    # un-suffixed `xavier_normal` is deprecated.
    'xavier_normal_': torch.nn.init.xavier_normal_,
    'orthogonal_': torch.nn.init.orthogonal_
}
optimizers = {
    'adadelta': torch.optim.Adadelta,  # default lr=1.0
    'adagrad': torch.optim.Adagrad,  # default lr=0.01
    'adam': torch.optim.Adam,  # default lr=0.001
    'adamax': torch.optim.Adamax,  # default lr=0.002
    'asgd': torch.optim.ASGD,  # default lr=0.01
    'rmsprop': torch.optim.RMSprop,  # default lr=0.01
    'sgd': torch.optim.SGD,
}

# Resolve the string options on `opt` into classes, file tables and callables.
opt.model_class = model_classes[opt.model_name]
opt.dataset_file = dataset_files[opt.dataset]
opt.inputs_cols = input_colses[opt.model_name]
opt.initializer = initializers[opt.initializer]
opt.optimizer = optimizers[opt.optimizer]
# Use CUDA when available unless a device was given explicitly.
opt.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') \
    if opt.device is None else torch.device(opt.device)

# Also log to a file named after the model, dataset and start time.
log_file = '{}-{}-{}.log'.format(opt.model_name, opt.dataset, strftime("%y%m%d-%H%M", localtime()))
logger.addHandler(logging.FileHandler(log_file))