In [9]:
import os

# path : store the current path to convert back to it later
# path = os.getcwd()
# os.chdir(os.path.join('..', '..', 'notebook_format'))

# from formats import load_style
# load_style(plot_style=False)

import random

In [6]:
# os.chdir(path)

# 1. magic for inline plot
# 2. magic to print version
# 3. magic so that the notebook will reload external python modules
# 4. magic to enable retina (high resolution) plots
# https://gist.github.com/minrk/3301035
%matplotlib inline
# %load_ext watermark
# %load_ext autoreload
# %autoreload 2
# %config InlineBackend.figure_format='retina'

import time
import fasttext
import tokenizers

# %watermark -a 'Ethen' -d -t -v -p numpy,tokenizers

In [40]:
from tokenizers import ByteLevelBPETokenizer
from tokenizers.implementations import BaseTokenizer
from tokenizers import Tokenizer

In [48]:
# https://github.com/ethen8181/machine-learning/tree/master/deep_learning/multi_label/fasttext_module
from fasttext_module.model import FasttextPipeline


<h1 id="MultiLabel-Text-Classification-with-FastText">MultiLabel Text Classification with FastText<a class="anchor-link" href="#MultiLabel-Text-Classification-with-FastText">¶</a></h1>


<pre><code>__label__sauce __label__cheese How much does potato starch affect a cheese sauce recipe?
__label__food-safety __label__acidity Dangerous pathogens capable of growing in acidic environments
__label__cast-iron __label__stove How do I cover up the white spots on my cast iron stove?</code></pre>

<p> This file format is expected by <a href="https://fasttext.cc/">Fasttext</a>, the library we'll be using to train our tag classifier.</p>


<h2 id="Quick-Introduction-to-Fasttext">Quick Introduction to Fasttext<a class="anchor-link" href="#Quick-Introduction-to-Fasttext">¶</a></h2>


<ul>
<li>Given a word, predict me which other words should go around (skipgram).</li>
<li>Given a sentence with a missing word, find me the missing word (cbow).</li>
<li>Given a sentence, tell me which label corresponds to this sentence (classification).</li>
</ul>


<h2 id="Data-Preparation">Data Preparation<a class="anchor-link" href="#Data-Preparation">¶</a></h2>


In [7]:
# download the data and un-tar it under the 'data' folder

# -P or --directory-prefix specifies which directory to download the data to
!wget https://dl.fbaipublicfiles.com/fasttext/data/cooking.stackexchange.tar.gz -P data
    
# -C specifies the target directory to extract an archive to
!tar xvzf data/cooking.stackexchange.tar.gz -C data

wget: /data2/wangyh/anaconda3/lib/libuuid.so.1: no version information available (required by wget)
--2021-02-05 13:54:17--  https://dl.fbaipublicfiles.com/fasttext/data/cooking.stackexchange.tar.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 172.67.9.4, 104.22.75.142, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 457609 (447K) [application/x-tar]
Saving to: ‘data/cooking.stackexchange.tar.gz’


2021-02-05 13:54:20 (493 KB/s) - ‘data/cooking.stackexchange.tar.gz’ saved [457609/457609]

cooking.stackexchange.id
cooking.stackexchange.txt
readme.txt


In [8]:
!head -n 3 data/cooking.stackexchange.txt

__label__sauce __label__cheese How much does potato starch affect a cheese sauce recipe?
__label__food-safety __label__acidity Dangerous pathogens capable of growing in acidic environments
__label__cast-iron __label__stove How do I cover up the white spots on my cast iron stove?


In [10]:
def train_test_split_file(input_path: str,
                          output_path_train: str,
                          output_path_test: str,
                          test_size: float,
                          random_state: int=1234,
                          encoding: str='utf-8',
                          verbose: bool=True):
    
    random.seed(random_state)

    # we record the number of data in the training and test
    count_train = 0
    count_test = 0
    train_range = 1 - test_size

    with open(input_path, encoding=encoding) as f_in, \
         open(output_path_train, 'w', encoding=encoding) as f_train, \
         open(output_path_test, 'w', encoding=encoding) as f_test:

        for line in f_in:
            random_num = random.random()
            if random_num < train_range:
                f_train.write(line)
                count_train += 1
            else:
                f_test.write(line)
                count_test += 1

    if verbose:
        print('train size: ', count_train)
        print('test size: ', count_test)

In [11]:
def prepend_file_name(path: str, name: str) -> str:
    """
    e.g. data/cooking.stackexchange.txt
    prepend 'train' to the base file name
    data/train_cooking.stackexchange.txt
    """
    directory = os.path.dirname(path)
    file_name = os.path.basename(path)
    return os.path.join(directory, name + '_' + file_name)

In [12]:
data_dir = 'data'
test_size = 0.2
input_path = os.path.join(data_dir, 'cooking.stackexchange.txt')
input_path_train = prepend_file_name(input_path, 'train')
input_path_test = prepend_file_name(input_path, 'test')
random_state = 1234
encoding = 'utf-8'

train_test_split_file(input_path, input_path_train, input_path_test, test_size, random_state, encoding)
print('train path: ', input_path_train)
print('test path: ', input_path_test)

train size:  12297
test size:  3107
train path:  data/train_cooking.stackexchange.txt
test path:  data/test_cooking.stackexchange.txt



<h2 id="Model-Training">Model Training<a class="anchor-link" href="#Model-Training">¶</a></h2>



<p>We can refer to the full list of parameters from <a href="https://fasttext.cc/docs/en/python-module.html#train_supervised-parameters">Fasttext's documentation page</a>. Like with all machine learning models, feel free to experiment with various hyperparameters, and see which one leads to better performance.</p>


In [13]:
# lr = learning rate
# lrUpdateRate similar to batch size
fasttext_params = {
    'input': input_path_train,
    'lr': 0.1,
    'lrUpdateRate': 1000,
    'thread': 8,
    'epoch': 10,
    'wordNgrams': 1,
    'dim': 100,
    'loss': 'ova'
}
model = fasttext.train_supervised(**fasttext_params)

print('vocab size: ', len(model.words))
print('label size: ', len(model.labels))
print('example vocab: ', model.words[:5])
print('example label: ', model.labels[:5])

vocab size:  14496
label size:  733
example vocab:  ['</s>', 'to', 'a', 'How', 'the']
example label:  ['__label__baking', '__label__food-safety', '__label__substitutions', '__label__equipment', '__label__bread']



<p>Although not used here, fasttext has a parameter called <code>bucket</code>. It can be a bit unintuitive what the parameter controls. We note down the <a href="https://github.com/facebookresearch/fastText/issues/641">explanation provided by the package maintainer</a>.</p>
<blockquote><p>The size of the model will increase linearly with the number of buckets. The size of the input matrix is DIM x (VS + BS), where VS is the number of words in the vocabulary and BS is the number of buckets. The number of buckets does not have other influence on the model size.
The buckets are used for hashed features (such as character ngrams or word ngrams), which are used in addition to word features. In the input matrix, each word is represented by a vector, and the additional ngram features are represented by a fixed number of vectors (which corresponds to the number of buckets).</p>
</blockquote>
<p>The loss function that we've specified is one versus all, <code>ova</code> for short. This type of loss function handles the multiple labels by building independent binary classifiers for each label.</p>
<p>Upon training the model, we can take a look at the prediction generated by the model via passing a question to the <code>.predict</code> method.</p>


In [14]:
text = 'How much does potato starch affect a cheese sauce recipe?'
model.predict(text, k=2)

(('__label__sauce', '__label__cheese'), array([0.80807722, 0.56986266]))


<p>The annotated tags for this question were <code>__label__sauce</code> and <code>__label__cheese</code>. Meaning we got both the prediction correct when asking for the top 2 tags. i.e. the precision@2 (precision at 2) for this example is 100%.</p>


In [15]:
text = 'Dangerous pathogens capable of growing in acidic environments'
model.predict(text, k=2)

(('__label__food-safety', '__label__storage-method'),
 array([0.21734752, 0.06561483]))


<p>In this example, the annotated tags were <code>__label__food-safety</code> and <code>__label__acidity</code>. In other words, 1 of our predicted tag was wrong, hence the precision@2 is 50%.</p>
<p>Notice the second prediction's score is pretty low, when calling the <code>.predict</code> method, we can also provide a threshold to cutoff predictions lower than that value.</p>


In [16]:
text = 'Dangerous pathogens capable of growing in acidic environments'
model.predict(text, k=2, threshold=0.1)

(('__label__food-safety',), array([0.21734752]))


<p>The <code>.predict</code> method also supports batch prediction, where we pass in a list of text.</p>


In [17]:
texts = [
    'How much does potato starch affect a cheese sauce recipe?',
    'Dangerous pathogens capable of growing in acidic environments'
]

batch_results = model.predict(texts, k=2)
batch_results

([['__label__sauce', '__label__cheese'],
  ['__label__food-safety', '__label__storage-method']],
 [array([0.8080772 , 0.56986266], dtype=float32),
  array([0.21734752, 0.06561483], dtype=float32)])


<p>To perform this type of evaluation all together on our train and test file, we can leverage the <code>.test</code> method from the model to evaluate the overall precision and recall metrics.</p>


In [19]:
?model.test

In [18]:
def print_results(model, input_path, k):
    num_records, precision_at_k, recall_at_k = model.test(input_path, k)
    f1_at_k = 2 * (precision_at_k * recall_at_k) / (precision_at_k + recall_at_k)

    print("records\t{}".format(num_records))
    print("Precision@{}\t{:.3f}".format(k, precision_at_k))
    print("Recall@{}\t{:.3f}".format(k, recall_at_k))
    print("F1@{}\t{:.3f}".format(k, f1_at_k))
    print()

In [20]:
for k in range(1, 3):
    print('train metrics:')
    print_results(model, input_path_train, k)

    print('test metrics:')
    print_results(model, input_path_test, k)

train metrics:
records	12297
Precision@1	0.491
Recall@1	0.213
F1@1	0.297

test metrics:
records	3107
Precision@1	0.411
Recall@1	0.177
F1@1	0.248

train metrics:
records	12297
Precision@2	0.363
Recall@2	0.315
F1@2	0.337

test metrics:
records	3107
Precision@2	0.310
Recall@2	0.268
F1@2	0.287




<h2 id="Tokenizer">Tokenizer<a class="anchor-link" href="#Tokenizer">¶</a></h2>



<p>using <a href="https://nbviewer.jupyter.org/github/ethen8181/machine-learning/blob/master/deep_learning/subword/bpe.ipynb">Byte Pair Encoding</a> to tokenize the raw text into subwords.</p>


In [21]:
FASTTEXT_LABEL = '__label__'

In [22]:
def create_text_file(input_path: str, output_path: str, encoding: str='utf-8'):
    with open(input_path, encoding=encoding) as f_in, \
         open(output_path, 'w', encoding=encoding) as f_out:

        for line in f_in:
            try:
                tokens = []
                for token in line.split(' '):
                    if FASTTEXT_LABEL not in token:
                        tokens.append(token)

                text = ' '.join(tokens)
            except ValueError as e:
                continue

            f_out.write(text)

In [23]:
text_input_path = prepend_file_name(input_path_train, 'text')
print('text only train file: ', text_input_path)

create_text_file(input_path_train, text_input_path)

text only train file:  data/text_train_cooking.stackexchange.txt


In [24]:
!head -n 3 data/text_train_cooking.stackexchange.txt

Dangerous pathogens capable of growing in acidic environments
How do I cover up the white spots on my cast iron stove?
What's the purpose of a bread box?



<p>For our tokenizer, we'll be using <a href="https://github.com/huggingface/tokenizers/tree/master/bindings/python">HuggingFace's Tokenizers</a>. Similar to Fasttext, the input expects the path to our text.</p>


In [26]:
tokenizer = ByteLevelBPETokenizer(lowercase=True)

tokenizer.train(
    text_input_path,
    vocab_size=10000,
    min_frequency=2,
    show_progress=True
)


<p>After training the tokenizer, we can use it to tokenize any new incoming text.</p>


In [27]:
text = 'How much does potato starch affect a cheese sauce recipe?'
encoded_text = tokenizer.encode(text)
encoded_text

Encoding(num_tokens=11, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [28]:
encoded_text.tokens

['how',
 'Ġmuch',
 'Ġdoes',
 'Ġpotato',
 'Ġstarch',
 'Ġaffect',
 'Ġa',
 'Ġcheese',
 'Ġsauce',
 'Ġrecipe',
 '?']


<p>We now read in the original training/test file and tokenized the text part with our tokenizer, and write it back to a new file. We'll train the fasttext model on this new tokenized file.</p>


In [30]:
def tokenize_text(tokenizer: BaseTokenizer, text: str) -> str:
    """
    Given the raw text, tokenize it using the trained tokenizer and
    outputs the tokenized tetx.
    """
    return ' '.join(tokenizer.encode(text).tokens)

In [31]:
def create_tokenized_file(input_path: str, output_path: str,
                          tokenizer: BaseTokenizer, encoding: str='utf-8'):
    
    with open(input_path, encoding=encoding) as f_in, \
         open(output_path, 'w', encoding=encoding) as f_out:

        for line in f_in:
            try:
                # the labels remains untouched during the preprocessing step as its
                # already in a format that fasttext can consume
                tokens = []
                labels = []
                
                for token in line.split(' '):
                    if FASTTEXT_LABEL in token:
                        labels.append(token)
                    else:
                        tokens.append(token)

                text = ' '.join(tokens)
                label = ' '.join(labels)
            except ValueError as e:
                continue

            tokenized_text = tokenize_text(tokenizer, text)
            new_line = label + ' ' + tokenized_text
            
            f_out.write(new_line)
            f_out.write('\n')

In [32]:
input_path_train_tokenized = prepend_file_name(input_path_train, 'tokenized')
print('tokenized train file: ', input_path_train_tokenized)
create_tokenized_file(input_path_train, input_path_train_tokenized, tokenizer)

input_path_test_tokenized = prepend_file_name(input_path_test, 'tokenized')
print('tokenized test file: ', input_path_test_tokenized)
create_tokenized_file(input_path_test, input_path_test_tokenized, tokenizer)

tokenized train file:  data/tokenized_train_cooking.stackexchange.txt
tokenized test file:  data/tokenized_test_cooking.stackexchange.txt


In [33]:
!head -n 3 data/tokenized_train_cooking.stackexchange.txt

__label__food-safety __label__acidity dang er ous Ġpat hog ens Ġcapable Ġof Ġgrowing Ġin Ġacidic Ġenviron ments Ċ
__label__cast-iron __label__stove how Ġdo Ġi Ġcover Ġup Ġthe Ġwhite Ġspots Ġon Ġmy Ġcast Ġiron Ġstove ? Ċ
__label__storage-method __label__equipment __label__bread what 's Ġthe Ġpurpose Ġof Ġa Ġbread Ġbox ? Ċ


In [34]:
fasttext_params['input'] = input_path_train_tokenized
tokenized_model = fasttext.train_supervised(**fasttext_params)

print('vocab size: ', len(tokenized_model.words))
print('label size: ', len(tokenized_model.labels))
print('example vocab: ', tokenized_model.words[:5])
print('example label: ', tokenized_model.labels[:5])

vocab size:  7342
label size:  733
example vocab:  ['</s>', 'Ċ', '?', 'Ġto', 'Ġa']
example label:  ['__label__baking', '__label__food-safety', '__label__substitutions', '__label__equipment', '__label__bread']



<p>We print out the evaluation metric for the new model based on tokenized text and compare it with the original model that was trained on the raw text.</p>


In [35]:
for k in range(1, 3):
    print('train metrics:')
    print_results(tokenized_model, input_path_train_tokenized, k)

    print('test metrics:')
    print_results(tokenized_model, input_path_test_tokenized, k)

train metrics:
records	12297
Precision@1	0.498
Recall@1	0.216
F1@1	0.301

test metrics:
records	3107
Precision@1	0.454
Recall@1	0.196
F1@1	0.274

train metrics:
records	12297
Precision@2	0.366
Recall@2	0.318
F1@2	0.341

test metrics:
records	3107
Precision@2	0.331
Recall@2	0.285
F1@2	0.306



In [36]:
for k in range(1, 3):
    print('train metrics:')
    print_results(model, input_path_train, k)

    print('test metrics:')
    print_results(model, input_path_test, k)

train metrics:
records	12297
Precision@1	0.491
Recall@1	0.213
F1@1	0.297

test metrics:
records	3107
Precision@1	0.411
Recall@1	0.177
F1@1	0.248

train metrics:
records	12297
Precision@2	0.363
Recall@2	0.315
F1@2	0.337

test metrics:
records	3107
Precision@2	0.310
Recall@2	0.268
F1@2	0.287




<p>Both the tokenizer and fasttext model has API to save and load the model.</p>


In [38]:
directory = 'model'
if not os.path.isdir(directory):
    os.makedirs(directory, exist_ok=True)

tokenizer_checkpoint = os.path.join(directory, 'tokenizer.json')
tokenizer.save(tokenizer_checkpoint)

In [39]:
tokenized_model_checkpoint = os.path.join(directory, 'tokenized_cooking_model.fasttext')
tokenized_model.save_model(tokenized_model_checkpoint)

In [41]:
loaded_tokenizer = Tokenizer.from_file(tokenizer_checkpoint)
loaded_model = fasttext.load_model(tokenized_model_checkpoint)

In [42]:
encoded_text = loaded_tokenizer.encode(text)
encoded_text.tokens

['how',
 'Ġmuch',
 'Ġdoes',
 'Ġpotato',
 'Ġstarch',
 'Ġaffect',
 'Ġa',
 'Ġcheese',
 'Ġsauce',
 'Ġrecipe',
 '?']


<p>Now, to predict new labels for incoming text, we need to tokenize the raw text before feeding it to the model.</p>


In [43]:
def predict(text, tokenizer, model, k, threshold=0.1):
    tokenized_text = tokenize_text(tokenizer, text)
    return model.predict(tokenized_text, k=k, threshold=threshold)

In [44]:
text = 'Which baking dish is best to bake a banana bread ?'
predict(text, loaded_tokenizer, loaded_model, k=3)

(('__label__baking', '__label__bread', '__label__cookies'),
 array([0.98968184, 0.96886617, 0.19683622]))

In [45]:
def batch_predict(texts, tokenizer, model, k, threshold=0.1):
    tokenized_texts = [tokenize_text(tokenizer, text) for text in texts]
    return model.predict(tokenized_texts, k=k, threshold=threshold)

In [46]:
texts = [
    'Which baking dish is best to bake a banana bread ?',
    'Why not put knives in the dishwasher?',
    'How do I cover up the white spots on my cast iron stove?'
]
batch_results = batch_predict(texts, loaded_tokenizer, loaded_model, k=2, threshold=0.0)
batch_results

([['__label__baking', '__label__bread'],
  ['__label__equipment', '__label__coffee'],
  ['__label__cast-iron', '__label__equipment']],
 [array([0.98968184, 0.96886617], dtype=float32),
  array([0.37023538, 0.16452648], dtype=float32),
  array([0.6150979 , 0.35578486], dtype=float32)])


<h2 id="Fasttext-Text-Classification-Pipeline">Fasttext Text Classification Pipeline<a class="anchor-link" href="#Fasttext-Text-Classification-Pipeline">¶</a></h2>



<p>The following provides a sample code on how to wrap a <code>FasttextPipeline</code> class on top of the fasttext model to allow for hyperparameter tuning. The <a href="https://github.com/ethen8181/machine-learning/tree/master/deep_learning/multi_label/fasttext_module"><code>fasttext_module</code></a> can be found here for those interested.</p>


In [49]:
model_id = 'cooking'

fasttext_params = {
    "lr": 0.1,
    "lrUpdateRate": 1000,
    "thread": 6,
    "epoch": 10,
    "wordNgrams": 1,
    "dim": 100,
    "loss": "ova"
}

fasttext_hyper_params = {
    'dim': [80, 100],
    'epoch': [15]
}

fasttext_search_parameters =  {
    "n_iter": 2,
    "n_jobs": 1,
    "verbose": 1,
    "scoring": "f1@1",
    "random_state": 1234
}

val_size = 0.1
split_random_state = 1234

In [50]:
fasttext_pipeline = FasttextPipeline(model_id,
                                     fasttext_params,
                                     fasttext_hyper_params,
                                     fasttext_search_parameters)

# fit the pipeline by giving it the training text file and specify the
# size of the validation split that will be used for hyperparameter tuning

# note that here the file in input_path_train should already be tokenized and
# in the format that fasttext expects
fasttext_pipeline.fit_file(input_path_train, val_size, split_random_state)

# check the hyperparameter tuning result stored in a pandas DataFrame
fasttext_pipeline.df_tune_results_

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   10.5s finished


Unnamed: 0,params,epoch,dim,train_precision@1,train_recall@1,train_f1@1,test_precision@1,test_recall@1,test_f1@1
0,"{'epoch': 15, 'dim': 80}",15,80,0.618,0.268,0.374,0.482,0.21,0.292
1,"{'epoch': 15, 'dim': 100}",15,100,0.617,0.268,0.373,0.477,0.207,0.289


In [52]:
# save and load the model back
model_checkpoint_dir = fasttext_pipeline.save('model')
fasttext_pipeline_loaded = FasttextPipeline.load(model_checkpoint_dir)

In [53]:
# compute the evaluation metric on the train and test text dataset
k = 1
score_str_train = fasttext_pipeline.score_str(input_path_train, k)
score_str_test = fasttext_pipeline.score_str(input_path_test, k)
print('train' + score_str_train)
print('test' + score_str_test)

train metric - num_records: 12297, precision@1: 0.644, recall@1: 0.28, f1@1: 0.39
test metric - num_records: 3107, precision@1: 0.496, recall@1: 0.214, f1@1: 0.299


In [54]:
# use the trained model to predict on new incoming text
k = 2
threshold = 0.1
texts = [
    'Which baking dish is best to bake a banana bread ?',
    'Why not put knives in the dishwasher?',
    'How do I cover up the white spots on my cast iron stove?'
]
batch_results = fasttext_pipeline.predict(texts, k, threshold)
batch_results

([['__label__baking', '__label__bread'],
  ['__label__equipment', '__label__cleaning'],
  ['__label__cast-iron']],
 [array([0.9875784, 0.914911 ], dtype=float32),
  array([0.9481645 , 0.15611489], dtype=float32),
  array([0.60767317], dtype=float32)])


<h1 id="Reference">Reference<a class="anchor-link" href="#Reference">¶</a></h1>



<ul>
<li><a href="https://github.com/huggingface/tokenizers/tree/master/bindings/python">Github: Tokenizers</a></li>
<li><a href="https://fasttext.cc/docs/en/supervised-tutorial.html">Fasttext Documentation: Text Classification</a></li>
<li><a href="https://www.quora.com/What-is-the-main-difference-between-word2vec-and-fastText">Quora: What is the main difference between word2vec and fastText?</a></li>
<li><a href="https://arxiv.org/abs/1607.01759">Paper: A. Joulin, E. Grave, P. Bojanowski, T. Mikolov - Bag of Tricks for Efficient Text Classification (2016)</a></li>
</ul>
