In [1]:
import re
import os
import pickle
import numpy as np
import pandas as pd
import logging
import torch
import math
import time
from torch.utils.data import Dataset
from utils.utils import newsample, getId2idx, tokenize, getVocab, my_collate, Partition_Sampler, convert_tokens_to_words
from data.configs.demo import config
from torch.utils.data import DataLoader
from collections import defaultdict

from transformers import BertTokenizer,BertModel,BertTokenizerFast
from utils.MIND import MIND

logger = logging.getLogger(__name__)

In [6]:
# Stand-alone tokenizer for the inspection cells below
# (decoding cached token ids, checking how a sentence is split).
t = BertTokenizerFast.from_pretrained('bert-base-uncased')

In [22]:
class MIND(Dataset):
    """ Map Style Dataset for MIND, use bert tokenizer

    Args:
        config(dict): pre-defined dictionary of hyper parameters
        news_file(str): path of news_file
        behaviors_file(str): path of behaviors_file
        shuffle_pos(bool): whether to shuffle the position of the positive candidate among the sampled negatives (train mode only)
    """

    def __init__(self, config, news_file, behaviors_file, shuffle_pos=False):
        reducer_map = {
            'matching': '',
            'bm25': '_bm25',
            'bow': '',
        }
        # initiate the whole iterator
        self.npratio = config.npratio
        self.shuffle_pos = shuffle_pos
        self.signal_length = config.signal_length
        self.his_size = config.his_size
        self.impr_size = config.impr_size
        self.k = config.k
        self.ascend_history = config.ascend_history
        self.reducer = config.reducer
        self.granularity = config.granularity

        pat = re.search("MIND/(.*_(.*)/)news", news_file)
        self.mode = pat.group(2)

        self.cache_directory = "/".join(["data/cache", config.embedding, pat.group(1)])
        self.news_path = self.cache_directory + "news" + reducer_map[self.reducer] + ".pkl"
        self.behav_path = self.cache_directory + "{}/{}".format(self.impr_size, re.search("(\w*\.)tsv", behaviors_file).group(1) + ".pkl")

        # only preprocess on the master node, the worker can directly load the cache
        if config.rank in [-1, 0]:
            if not os.path.exists(self.behav_path):
                logger.info("encoding user behaviors of {}...".format(behaviors_file))
                os.makedirs(self.cache_directory + str(self.impr_size), exist_ok=True)
                self.behaviors_file = behaviors_file
                try:
                    # VERY IMPORTANT!!!
                    # The nid2idx dictionary must follow the original order of news in news.tsv
                    self.nid2index = getId2idx("data/dictionaries/nid2idx_{}_{}.json".format(config.scale, self.mode))
                except FileNotFoundError:
                    config.construct_nid2idx()
                    self.nid2index = getId2idx("data/dictionaries/nid2idx_{}_{}.json".format(config.scale, self.mode))
                try:
                    self.uid2index = getId2idx("data/dictionaries/uid2idx_{}.json".format(config.scale))
                except FileNotFoundError:
                    config.construct_uid2idx()
                    self.uid2index = getId2idx("data/dictionaries/uid2idx_{}.json".format(config.scale))

                self.init_behaviors()


            if not os.path.exists(self.news_path):
                from transformers import BertTokenizerFast
                logger.info("encoding news of {}...".format(news_file))
                self.news_file = news_file
                self.max_news_length = 512
                # there are only two types of vocabulary
                self.tokenizer = BertTokenizerFast.from_pretrained(config.bert, cache=config.path + "bert_cache/")
                self.nid2index = getId2idx("data/dictionaries/nid2idx_{}_{}.json".format(config.scale, self.mode))

                if config.reducer == "matching":
                    from utils.utils import DoNothing
                    reducer = DoNothing()
                elif config.reducer == "bm25":
                    from utils.utils import BM25
                    reducer = BM25()
                elif config.reducer == "bow":
                    from utils.utils import DoNothing
                    reducer = DoNothing()

                self.init_news(reducer)

        # synchronize all processes
        if config.world_size > 1:
            dist.barrier()

        logger.info("process NO.{} loading cached user behavior from {}".format(config.rank, self.behav_path))
        with open(self.behav_path, "rb") as f:
            behaviors = pickle.load(f)
            for k,v in behaviors.items():
                setattr(self, k, v)
    
        logger.info("process NO.{} loading cached news tokenization from {}".format(config.rank, self.news_path))
        with open(self.news_path, "rb") as f:
            news = pickle.load(f)
            self.encoded_news = news['encoded_news']
            self.attn_mask = news['attn_mask']
            if self.granularity in ['avg','sum']:
                self.subwords = news['subwords_all'][:, :self.signal_length]
            elif self.granularity == 'first':
                self.subwords = news['subwords_first'][:, :self.signal_length]
            else:
                self.subwords = None
        
        if self.reducer == 'bm25':
            try:
                with open(self.cache_directory + "news_matching.pkl", "rb") as f:
                    news = pickle.load(f)
                    self.encoded_news_original = news['encoded_news']
                    self.attn_mask_original = news['attn_mask']
                    if self.granularity in ['avg','sum']:
                        self.subwords_original = news['subwords_all'][:, :self.signal_length]
                    elif self.granularity == 'first':
                        self.subwords_original = news['subwords_first'][:, :self.signal_length]
                    else:
                        self.subwords_original = None
            except FileNotFoundError:
                raise FileNotFoundError("You should always encode with matching reducer before the first time of bm25 reducer")


        if config.reducer == "matching":
            if not config.no_dedup:
                from utils.utils import DeDuplicate
                refiner = DeDuplicate(self.signal_length)
        elif config.reducer == "bm25":
            from utils.utils import Truncate
            refiner = Truncate(self.k + 1)
        elif config.reducer == "bow":
            from utils.utils import CountFreq
            refiner = CountFreq(self.signal_length)
        else:
            refiner = None

        logger.info("reducing news of {}...".format(news_file))
        self.init_refinement(refiner)


    def init_news(self, reducer):
        """
            1. encode news text to tokens
            2. rerank words in the news text according to reduction methods
            2. get subword indices

            No assignment to self
        """
        articles = [""]
        subwords_all = [[]]
        subwords_first = [[]]
        with open(self.news_file, "r", encoding="utf-8") as rd:
            for idx in rd:
                nid, vert, subvert, title, ab, url, _, _ = idx.strip("\n").split("\t")
                article = " ".join(["[CLS]", title, ab, vert, subvert])
                tokens = self.tokenizer.tokenize(article)[:512]
                # unify subwords
                words = convert_tokens_to_words(tokens)
                articles.append(' '.join(words))
        
        # rank words according to reduction rules
        articles = reducer(articles)

        article_toks = []
        attention_masks = []
        for article in articles:
            tokens = self.tokenizer.tokenize(article)
            
            # maintain subword entry
            subword_all = []
            # mask subword entry
            subword_first = []

            i = -1
            j = -1
            for token in tokens:
                if token.startswith('##'):
                    j += 1
                    # subword.append([0,0])
                    subword_all.append([i,j])
                    subword_first.append([0,0])

                else:
                    i += 1
                    j += 1
                    subword_all.append([i,j])
                    subword_first.append([i,j])

            pad_length = self.max_news_length - len(tokens)

            article_toks.append(self.tokenizer.convert_tokens_to_ids(tokens[:self.max_news_length]) + [0] * pad_length)
            attention_masks.append([1] * min(len(tokens), self.max_news_length) + [0] * pad_length)
            subwords_all.append(subword_all)
            subwords_first.append(subword_first)

        # encode news
        encoded_news = np.asarray(article_toks)
        attn_mask = np.asarray(attention_masks)

        for i,subword in enumerate(subwords_all):
            pad_length = self.max_news_length - len(subword)

            subwords_all[i].extend([[0,0]] * pad_length)
            subwords_first[i].extend([[0,0]] * pad_length)

        subwords_all = np.asarray(subwords_all)
        subwords_first = np.asarray(subwords_first)

        with open(self.news_path, "wb") as f:
            pickle.dump(
                {
                    "encoded_news": encoded_news,
                    "subwords_first": subwords_first,
                    "subwords_all": subwords_all,
                    "attn_mask": attn_mask
                },
                f
            )


    def init_behaviors(self):
        """
            init behavior logs given behaviors file.
        """
        # list of list of history news index
        histories = []
        # list of user index
        uindexes = []
        # list of impression indexes
        # self.impr_indexes = []

        impr_index = 0

        # only store positive behavior
        if self.mode == "train":
            # list of lists, each list represents a
            imprs = []
            negatives = []

            with open(self.behaviors_file, "r", encoding="utf-8") as rd:
                for idx in rd:
                    _, uid, time, history, impr = idx.strip("\n").split("\t")

                    history = [self.nid2index[i] for i in history.split()]

                    impr_news = [self.nid2index[i.split("-")[0]] for i in impr.split()]
                    labels = [int(i.split("-")[1]) for i in impr.split()]

                    # user will always in uid2index
                    uindex = self.uid2index[uid]
                    # store negative samples of each impression
                    negative = []

                    for news, label in zip(impr_news, labels):
                        if label == 1:
                            imprs.append((impr_index, news))
                        else:
                            negative.append(news)

                    # 1 impression correspond to 1 of each of the following properties
                    histories.append(history)
                    negatives.append(negative)
                    uindexes.append(uindex)

                    impr_index += 1

            self.imprs = imprs
            self.histories = histories
            self.negatives = negatives
            self.uindexes = uindexes

            save_dict = {
                "imprs": self.imprs,
                "histories": self.histories,
                "negatives": self.negatives,
                "uindexes": self.uindexes
            }

        # store every behavior
        elif self.mode == "dev":
            # list of every cdd news index along with its impression index and label
            imprs = []

            with open(self.behaviors_file, "r", encoding="utf-8") as rd:
                for idx in rd:
                    _, uid, time, history, impr = idx.strip("\n").split("\t")

                    history = [self.nid2index[i] for i in history.split()]

                    impr_news = [self.nid2index[i.split("-")[0]] for i in impr.split()]
                    labels = [int(i.split("-")[1]) for i in impr.split()]
                    # user will always in uid2index
                    uindex = self.uid2index[uid]

                    # store every impression
                    for i in range(0, len(impr_news), self.impr_size):
                        imprs.append((impr_index, impr_news[i:i+self.impr_size], labels[i:i+self.impr_size]))

                    # 1 impression correspond to 1 of each of the following properties
                    histories.append(history)
                    uindexes.append(uindex)

                    impr_index += 1

            self.imprs = imprs
            self.histories = histories
            self.uindexes = uindexes

            save_dict = {
                "imprs": self.imprs,
                "histories": self.histories,
                "uindexes": self.uindexes
            }

        # store every behavior
        elif self.mode == "test":
            # list of every cdd news index along with its impression index and label
            imprs = []

            with open(self.behaviors_file, "r", encoding="utf-8") as rd:
                for idx in rd:
                    _, uid, time, history, impr = idx.strip("\n").split("\t")

                    history = [self.nid2index[i] for i in history.split()]

                    impr_news = [self.nid2index[i] for i in impr.split()]
                    # user will always in uid2index
                    uindex = self.uid2index[uid]

                    # store every impression
                    for i in range(0, len(impr_news), self.impr_size):
                        imprs.append((impr_index, impr_news[i:i+self.impr_size]))

                    # 1 impression correspond to 1 of each of the following properties
                    histories.append(history)
                    uindexes.append(uindex)

                    impr_index += 1

            self.imprs = imprs
            self.histories = histories
            self.uindexes = uindexes

            save_dict = {
                "imprs": self.imprs,
                "histories": self.histories,
                "uindexes": self.uindexes
            }

        with open(self.behav_path, "wb") as f:
            pickle.dump(save_dict, f)
    

    def init_refinement(self, refiner):
        """
            token level refinement, determined by reducer
            
            matching -> deduplicate
            bm25 -> truncate
            bow -> count
        """
        if not refiner:
            return

        refined_news, refined_mask = refiner(self.encoded_news, self.attn_mask)
        if self.reducer == 'matching':
            self.encoded_news = refined_news
            self.attn_mask_dedup = refined_mask
            # truncate the attention mask
            self.attn_mask = self.attn_mask[:, :self.signal_length]
        
        elif self.reducer == 'bm25':
            self.encoded_news = refined_news
            self.attn_mask = refined_mask
            # truncate the original text tokens
            self.encoded_news_original = self.encoded_news_original[:, :self.signal_length]
            self.attn_mask_original = self.attn_mask_original[:, :self.signal_length]

        elif self.reducer == 'bow':
            self.encoded_news = refined_news
            self.attn_mask = refined_mask

    def __len__(self):
        """
            return length of the whole dataset
        """
        return len(self.imprs)

    def __getitem__(self,index):
        """ return data
        Args:
            index: the index for stored impression

        Returns:
            back_dic: dictionary of data slice
        """

        impr = self.imprs[index] # (impression_index, news_index)
        impr_index = impr[0]
        impr_news = impr[1]


        user_index = [self.uindexes[impr_index]]

        # each time called to return positive one sample and its negative samples
        if self.mode == "train":
            # user"s unhis news in the same impression
            negs = self.negatives[impr_index]
            neg_list, neg_num = newsample(negs, self.npratio)

            cdd_ids = [impr_news] + neg_list
            cdd_size = self.npratio + 1

            label = np.asarray([1] + [0] * self.npratio)

            if self.shuffle_pos:
                s = np.arange(0, len(label), 1)
                np.random.shuffle(s)
                cdd_ids = np.asarray(cdd_ids)[s]
                label = np.asarray(label)[s]

            label = np.arange(0, len(cdd_ids), 1)[label == 1][0]

            his_ids = self.histories[impr_index][:self.his_size]

            cdd_mask = torch.zeros((cdd_size, 1))
            cdd_mask[:neg_num + 1] = 1

            # true means the corresponding history news is padded
            his_mask = torch.zeros((self.his_size, 1), dtype=bool)
            his_mask[:len(his_ids)] = 1

            if self.ascend_history:
                his_ids = his_ids + [0] * (self.his_size - len(his_ids))
            else:
                his_ids = his_ids[::-1] + [0] * (self.his_size - len(his_ids))

            cdd_encoded_index = self.encoded_news[cdd_ids]
            cdd_attn_mask = self.attn_mask[cdd_ids]
            his_encoded_index = self.encoded_news[his_ids]
            his_attn_mask = self.attn_mask[his_ids]

            back_dic = {
                "user_index": np.asarray(user_index),
                "cdd_id": np.asarray(cdd_ids),
                "his_id": np.asarray(his_ids),
                "cdd_encoded_index": cdd_encoded_index,
                "his_encoded_index": his_encoded_index,
                "cdd_attn_mask": cdd_attn_mask,
                "his_attn_mask": his_attn_mask,
                "cdd_mask": cdd_mask,
                "his_mask": his_mask,
                "label": label
            }

            if self.subwords is not None:
                cdd_subword_index = self.subwords[cdd_ids]
                his_subword_index = self.subwords[his_ids]
                back_dic["cdd_subword_index"] = cdd_subword_index
                back_dic["his_subword_index"] = his_subword_index

            if self.reducer == "matching":
                his_attn_mask_dedup = self.attn_mask_dedup[his_ids]
                back_dic["his_refined_mask"] = his_attn_mask_dedup

            elif self.reducer == "bm25":
                back_dic["cdd_encoded_index"] = self.encoded_news_original[cdd_ids]
                # placeholder
                back_dic["his_refined_mask"] = None

            elif self.reducer == "bow":
                # placeholder
                back_dic["his_refined_mask"] = None

            return back_dic

        # each time called return one sample, and no labels
        elif self.mode == "dev":
            cdd_ids = impr_news
            cdd_size = len(cdd_ids)

            his_ids = self.histories[impr_index][:self.his_size]
            # true means the corresponding history news is padded
            his_mask = torch.zeros((self.his_size, 1), dtype=bool)
            his_mask[:len(his_ids)] = 1

            if self.ascend_history:
                his_ids = his_ids + [0] * (self.his_size - len(his_ids))
            else:
                his_ids = his_ids[::-1] + [0] * (self.his_size - len(his_ids))

            user_index = [self.uindexes[impr_index]]
            label = impr[2]

            cdd_encoded_index = self.encoded_news[cdd_ids]
            cdd_attn_mask = self.attn_mask[cdd_ids]
            his_encoded_index = self.encoded_news[his_ids]
            his_attn_mask = self.attn_mask[his_ids]

            back_dic = {
                "impr_index": impr_index + 1,
                "user_index": np.asarray(user_index),
                "cdd_id": np.asarray(cdd_ids),
                "his_id": np.asarray(his_ids),
                "cdd_encoded_index": cdd_encoded_index,
                "his_encoded_index": his_encoded_index,
                "cdd_attn_mask": cdd_attn_mask,
                "his_attn_mask": his_attn_mask,
                "his_mask": his_mask,
                "label": np.asarray(label)
            }

            if self.subwords is not None:
                cdd_subword_index = self.subwords[cdd_ids]
                his_subword_index = self.subwords[his_ids]
                back_dic["cdd_subword_index"] = cdd_subword_index
                back_dic["his_subword_index"] = his_subword_index

            if self.reducer == "matching":
                his_attn_mask_dedup = self.attn_mask_dedup[his_ids]
                back_dic["his_refined_mask"] = his_attn_mask_dedup

            elif self.reducer == "bm25":
                back_dic["cdd_encoded_index"] = self.encoded_news_original[cdd_ids]
                # placeholder
                back_dic["his_refined_mask"] = None

            elif self.reducer == "bow":
                # placeholder
                back_dic["his_refined_mask"] = None

            return back_dic

        elif self.mode == "test":
            cdd_ids = impr_news
            cdd_size = len(cdd_ids)

            his_ids = self.histories[impr_index][:self.his_size]
            # true means the corresponding history news is padded
            his_mask = torch.zeros((self.his_size, 1), dtype=bool)
            his_mask[:len(his_ids)] = 1

            if self.ascend_history:
                his_ids = his_ids + [0] * (self.his_size - len(his_ids))
            else:
                his_ids = his_ids[::-1] + [0] * (self.his_size - len(his_ids))

            user_index = [self.uindexes[impr_index]]

            cdd_encoded_index = self.encoded_news[cdd_ids]
            cdd_attn_mask = self.attn_mask[cdd_ids]
            his_encoded_index = self.encoded_news[his_ids]
            his_attn_mask = self.attn_mask[his_ids]

            back_dic = {
                "impr_index": impr_index + 1,
                "user_index": np.asarray(user_index),
                "cdd_id": np.asarray(cdd_ids),
                "his_id": np.asarray(his_ids),
                "cdd_encoded_index": cdd_encoded_index,
                "his_encoded_index": his_encoded_index,
                "cdd_attn_mask": cdd_attn_mask,
                "his_attn_mask": his_attn_mask,
                "his_mask": his_mask,
            }

            if self.subwords is not None:
                cdd_subword_index = self.subwords[cdd_ids]
                his_subword_index = self.subwords[his_ids]
                back_dic["cdd_subword_index"] = cdd_subword_index
                back_dic["his_subword_index"] = his_subword_index

            if self.reducer == "matching":
                his_attn_mask_dedup = self.attn_mask_dedup[his_ids]
                back_dic["his_refined_mask"] = his_attn_mask_dedup

            elif self.reducer == "bm25":
                back_dic["cdd_encoded_index"] = self.encoded_news_original[cdd_ids]
                # placeholder
                back_dic["his_refined_mask"] = None

            elif self.reducer == "bow":
                # placeholder
                back_dic["his_refined_mask"] = None

            return back_dic

        else:
            raise ValueError("Mode {} not defined".format(self.mode))

In [23]:
# Pick the reducer for this run by overriding the imported demo config in place;
# the two alternatives are left commented out.
# config.reducer = 'bm25'
config.reducer = 'bow'
# config.reducer = 'matching'

# Further overrides that were tried, currently disabled.
# config.signal_length = 10
# config.scale = 'large'
# config.impr_size = 100
# config.mode = 'test'

# Build the dataset on the demo training split; the "MIND/<name>_<mode>/" part of the
# path is what the MIND class parses to find the mode and the cache directory.
path = config.path + 'MIND/MINDdemo_train/'
a = MIND(config, path + 'news.tsv', path + 'behaviors.tsv')

[2021-08-31 19:54:51,838] INFO (__main__) process NO.0 loading cached user behavior from data/cache/bert/MINDdemo_train/10/behaviors..pkl
[2021-08-31 19:54:51,856] INFO (__main__) process NO.0 loading cached news tokenization from data/cache/bert/MINDdemo_train/news.pkl
[2021-08-31 19:54:52,609] INFO (__main__) reducing news of ../../../Data/MIND/MINDdemo_train/news.tsv...
[2021-08-31 19:54:52,610] INFO (utils.utils) reducing to Bag-of-Words...


In [28]:
# Shape of the encoded click history returned for sample 1 of the dataset.
a[1]['his_encoded_index'].shape

(50, 100, 2)

In [9]:
# Toy check of boolean masking: a mask over the first two axes keeps the
# innermost pairs whose first entry equals 1 and flattens them into rows.
r = np.array([[[1,1],[2,2],[2,3]],[[1,1],[1,2],[1,3]]])
r[r[:,:,0] == 1]

array([[1, 1],
       [1, 1],
       [1, 2],
       [1, 3]])

In [7]:
# Decode the first 110 cached token ids of news 3 back into word pieces to eyeball the encoding.
t.convert_ids_to_tokens(a.encoded_news[3][:110])

['[CLS]',
 'the',
 'cost',
 'of',
 'trump',
 "'",
 's',
 'aid',
 'freeze',
 'in',
 'the',
 'trenches',
 'of',
 'ukraine',
 "'",
 's',
 'war',
 'lt',
 '.',
 'ivan',
 'mo',
 '##lch',
 '##ane',
 '##ts',
 'peeked',
 'over',
 'a',
 'parapet',
 'of',
 'sand',
 'bags',
 'at',
 'the',
 'front',
 'line',
 'of',
 'the',
 'war',
 'in',
 'ukraine',
 '.',
 'next',
 'to',
 'him',
 'was',
 'an',
 'empty',
 'helmet',
 'propped',
 'up',
 'to',
 'trick',
 'sniper',
 '##s',
 ',',
 'already',
 'per',
 '##for',
 '##ated',
 'with',
 'multiple',
 'holes',
 '.',
 'news',
 'news',
 '##world',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PA

In [4]:
# Subword table of news 3: init_news stores one [word index, token index] pair per token.
a.subwords[3][:110]

array([[ 0,  0],
       [ 1,  1],
       [ 2,  2],
       [ 3,  3],
       [ 4,  4],
       [ 5,  5],
       [ 6,  6],
       [ 7,  7],
       [ 8,  8],
       [ 9,  9],
       [10, 10],
       [11, 11],
       [12, 12],
       [13, 13],
       [14, 14],
       [15, 15],
       [16, 16],
       [17, 17],
       [18, 18],
       [19, 19],
       [20, 20],
       [20, 21],
       [20, 22],
       [20, 23],
       [21, 24],
       [22, 25],
       [23, 26],
       [24, 27],
       [25, 28],
       [26, 29],
       [27, 30],
       [28, 31],
       [29, 32],
       [30, 33],
       [31, 34],
       [32, 35],
       [33, 36],
       [34, 37],
       [35, 38],
       [36, 39],
       [37, 40],
       [38, 41],
       [39, 42],
       [40, 43],
       [41, 44],
       [42, 45],
       [43, 46],
       [44, 47],
       [45, 48],
       [46, 49],
       [47, 50],
       [48, 51],
       [49, 52],
       [49, 53],
       [50, 54],
       [51, 55],
       [52, 56],
       [52, 57],
       [52, 58

In [3]:
loader1 = DataLoader(a, batch_size=1, pin_memory=False, num_workers=0, drop_last=False, shuffle=False, sampler=Partition_Sampler(a,2,0))
records1 = list(loader1

In [7]:
# History attention mask of the first batch collected from loader1.
records1[0]['his_attn_mask']

tensor([[[1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         ...,
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]])

In [3]:
# Read the cached news encoding directly to inspect what init_news wrote.
# NOTE: the path is absolute and machine specific; only load pickles that this project wrote itself.
with open('/data/workspace/Peitian/Code/Document-Reduction/Code/data/cache/bert/MINDdemo_train/news.pkl', 'rb') as f:
    # the context manager closes the file handle, the previous one-liner left it open
    dic = pickle.load(f)

In [4]:
# Look at the cached table in which only the first piece of every word keeps its index pair.
sb = dic['subwords_first']
sb

array([list([]),
       list([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11], [12, 12], [13, 13], [14, 14], [15, 15], [16, 16], [0, 0], [17, 18], [18, 19], [19, 20], [20, 21], [21, 22], [22, 23], [23, 24], [24, 25], [25, 26], [26, 27], [27, 28], [28, 29], [29, 30], [30, 31], [31, 32], [32, 33], [0, 0], [0, 0]]),
       list([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11], [12, 12], [13, 13], [14, 14], [15, 15], [16, 16], [17, 17], [18, 18], [19, 19], [0, 0], [20, 21], [21, 22], [22, 23], [23, 24], [24, 25], [25, 26], [26, 27], [27, 28], [28, 29], [0, 0], [0, 0]]),
       ...,
       list([[0, 0], [1, 1], [2, 2], [3, 3], [4, 4], [5, 5], [6, 6], [7, 7], [8, 8], [9, 9], [10, 10], [11, 11], [12, 12], [13, 13], [14, 14], [15, 15], [16, 16], [17, 17], [18, 18], [19, 19], [0, 0], [20, 21], [21, 22], [22, 23], [23, 24], [24, 25], [25, 26], [26, 27], [27, 28], [28, 29], [29, 30], [30, 31], [3

In [17]:
# NOTE(review): `a` is sliced like a 2-D array here (see the output below), not like the MIND
# dataset bound to `a` in an earlier cell - this cell relies on kernel state that is not shown.
np.unique(a[:,0]),a

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]),
 array([[0, 0],
        [1, 1],
        [2, 2],
        [2, 3],
        [3, 4],
        [4, 5],
        [5, 6],
        [6, 7],
        [7, 8],
        [8, 9]]))

In [4]:
# Check how the tokenizer lower-cases a sentence and splits an apostrophe contraction.
t.tokenize("I don't give a fuck")

['i', 'don', "'", 't', 'give', 'a', 'fuck']