# Apply Trained Models
## Token Classifiers for Linguistic Labels

Import programming libraries:

In [1]:
import config

# Libraries for data, file, and model loading
import pandas as pd
import joblib
import os, re
import numpy as np

# Libraries for word embeddings
from gensim.models import FastText, Word2Vec
from gensim.utils import tokenize
from gensim import utils
from gensim.test.utils import get_tmpfile

# Libraries for Experiment 1 scikit-learn estimators
import sklearn.metrics
from sklearn.preprocessing import MultiLabelBinarizer
from skmultilearn.problem_transform import ClassifierChain
from sklearn.ensemble import RandomForestClassifier

# from sklearn.multiclass import OneVsRestClassifier
# from sklearn.linear_model import SGDClassifier
# from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer

### 1. Create Word Embeddings

Train custom word embeddings on metadata descriptions from the University of Edinburgh Heritage Collections' Archives catalog.

* Data file: `descriptions_by_fonds`
* Date of harvesting: October 2020
* Harvesting and transformation code: [annot-prep/PreparationForAnnotation.ipynb](https://github.com/thegoose20/annot-prep/blob/main/PreparationForAnnotation.ipynb)

References:
* https://radimrehurek.com/gensim/models/fasttext.html
* https://radimrehurek.com/gensim/auto_examples/tutorials/run_fasttext.html#sphx-glr-auto-examples-tutorials-run-fasttext-py

In [2]:
# Directory that holds the description files used to train the embeddings.
# NOTE(review): the token after `+` appears to be a placeholder left by the
# export of this notebook — confirm the real sub-directory before running.
dir_path = config.inf_data_path+<BT_RAWTEXT>
file_list = os.listdir(dir_path)
# Report how many entries the directory contains
print(len(file_list))

1079


In [3]:
class CorpusIterator:
    """Restartable stream of tokenized lines from a directory of ``.txt`` files.

    Every iteration re-reads the directory, so one instance (or several) can
    be consumed repeatedly, e.g. once for vocabulary building and once per
    training pass.

    Args:
        corpus_dir: Directory containing the plaintext files. When omitted,
            the notebook-level ``dir_path`` is looked up at iteration time,
            so ``CorpusIterator()`` behaves as before.

    Yields:
        list[str]: the lowercased tokens of one line, as produced by
        ``gensim.utils.tokenize``.

    Raises:
        ValueError: if a directory entry does not end in ``.txt``.
    """

    def __init__(self, corpus_dir=None):
        self.corpus_dir = corpus_dir

    def __iter__(self):
        corpus_dir = dir_path if self.corpus_dir is None else self.corpus_dir
        # os.listdir() returns entries in arbitrary order; sorting makes every
        # pass over the corpus visit the files in the same sequence.
        for fonds_f in sorted(os.listdir(corpus_dir)):
            # Explicit check instead of `assert` (asserts vanish under -O), and
            # a suffix test so names such as "x.txt.bak" are rejected.
            if not fonds_f.endswith(".txt"):
                raise ValueError(
                    "All files should be Plaintext, but found: {!r}".format(fonds_f)
                )
            # os.path.join works whether or not corpus_dir ends in a separator
            file_path = os.path.join(corpus_dir, fonds_f)
            with utils.open(file_path, 'r', encoding='utf-8') as f:
                for line in f:
                    # Lowercase the line before tokenizing it
                    yield list(tokenize(line.lower()))

Define the hyperparameters for the unsupervised training of the fastText model:

In [4]:
# --- fastText hyperparameters for the unsupervised training run ---

# Architecture label: "cbow" (Continuous Bag of Words, the default) or "skipgram"
training_arch = "cbow"

# Optimisation
alpha = 0.025     # learning rate (default = 0.025)
negative = 5      # negative words sampled for the "ns" objective (default = 5)
sample = 0.001    # downsampling threshold for higher-frequency words (default = 0.001)
# The training objective is left at its default ("ns"); the options were:
# losses = ["ns", "hs", "softmax"]
# loss = losses[0]

# Embedding size, context and corpus passes
vector_dimensions = 100   # alternatives noted: 50, 300
context_window = 5        # context window (default = 5)
epochs = 5                # number of epochs (default = 5)
min_count = 5             # ignore words seen fewer than this many times (default = 5)

# Character n-grams (subword vectors): minimum and maximum length
# (defaults are 3 and 6); max_n = 0 disables character n-grams
min_n, max_n = 2, 6
bucket = 2000000          # hash buckets for n-grams (default = 2000000)

# Vocabulary ordering: 1 = sort by descending frequency (default = 1)
sorted_vocab = 1

# Thread count is not set here (default = 12)
# threads = 12

In [5]:
# Instantiate the (still untrained) fastText model from the hyperparameters.
# gensim selects the architecture through `sg` (1 = skip-gram, otherwise CBOW),
# so it is derived from `training_arch`; without it, CBOW would always be
# trained, whatever the architecture label says.
if training_arch not in ("cbow", "skipgram"):
    raise ValueError("training_arch must be 'cbow' or 'skipgram', got {!r}".format(training_arch))

embedding_model = FastText(
    sg=1 if training_arch == "skipgram" else 0,
    alpha=alpha, negative=negative, sample=sample,
    vector_size=vector_dimensions, window=context_window,
    epochs=epochs, min_count=min_count, min_n=min_n,
    max_n=max_n, bucket=bucket, sorted_vocab=sorted_vocab
)

In [6]:
# First pass over the corpus: collect the vocabulary
vocab_corpus = CorpusIterator()
embedding_model.build_vocab(corpus_iterable=vocab_corpus)

# Keep the corpus size recorded by the model during that pass
total_examples = embedding_model.corpus_count

In [7]:
# Fit the embeddings on a fresh pass over the corpus.
# The call stays the cell's final expression so its return value is displayed.
embedding_model.train(
    corpus_iterable=CorpusIterator(),
    total_examples=total_examples,
    epochs=epochs,
)

(7321545, 10119275)

Save the model:

In [8]:
# Name the model file after its architecture label and vector size
name_template = "fasttext_{a}_{d}d.model"
file_name = name_template.format(a=training_arch, d=vector_dimensions)
print(file_name)

fasttext_cbow_100d.model


In [9]:
# Make sure the output directory exists so save() does not fail when
# "models" has not been created yet
os.makedirs("models", exist_ok=True)
embedding_model.save(os.path.join("models", file_name))

### 2. Data Preprocessing

In [None]:
# Transform the data so that there are columns for the text and the field name
# Assign IDs to fields
# Tokenize text
# Part-of-speech tag text
# Assign IDs to tokens

### 3. Feature Extraction

In [102]:
# Pair the two feature lists element by element
feature_data = list(zip(feature_list1, feature_list2))

# Look up one fastText vector per (lowercased) token and stack them into
# the feature matrix; the first element of each pair is not used here
word_vectors = embedding_model.wv
feature_list = [word_vectors[tok.lower()] for _tok_id, tok in feature_data]
X = np.array(feature_list)

### 4. Prediction with Trained Classifiers

In [104]:
# Paths to the saved artifacts used for prediction.

# Baseline: Stereotype & Omission labels, no features
# baseline_model = "models/baseline/sgd-svm_F-tfidf_T-so.joblib"

# [Baseline] Linguistic Label
# NOTE(review): `clf` still contains the literal placeholders {a} and {d};
# no .format() call is applied to it in this cell.
clf = "models/for_reuse/multilabel_linguistic/cc-{a}_F-fastText{d}_T-linglabels.joblib"
# Path to the saved label binarizer for the linguistic labels
mlb = "models/for_reuse/multilabel_linguistic/mlb_linglabels.joblib"

# Experiment 2 (path not set yet):
# exp2_model_file = 

# Experiment 3 (path not set yet):
# exp3_model_file

In [105]:
# Load the trained classifier from disk.
# NOTE(review): `exp1_ling_model` is not assigned in the cells shown here; the
# `clf` path template from the previous cell may be what was intended — confirm.
# joblib.load unpickles its input, so only load files from a trusted source.
trained_model = joblib.load(exp1_ling_model)   #baseline_model_file

In [106]:
# Run the loaded classifier on the feature matrix X
# NOTE(review): presumably one row of label indicators per token — confirm
predictions = trained_model.predict(X)

In [107]:
# Reuse the label binarizer that was saved alongside the classifier instead
# of fitting a new one on this data: a refitted binarizer derives its classes
# from whatever labels occur here, so its columns are not guaranteed to line
# up with the columns the classifier predicts.
# NOTE(review): assumes this file holds the binarizer fitted at training time — confirm.
mlb = joblib.load("models/for_reuse/multilabel_linguistic/mlb_linglabels.joblib")
tags = df[col]
# transform() (not fit_transform()) keeps the saved class order
y = mlb.transform(tags)
print(y[:20])

[[0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]
 [0 0 0 1]]


In [108]:
# Convert the binary prediction matrix back into tuples of label names
predictions_text = mlb.inverse_transform(predictions)

In [109]:
# Show the first 20 decoded predictions as a quick sanity check
print(predictions_text[:20])

[('O',), ('O',), ('O',), ('O',), ('O',), ('O',), ('O',), ('O',), ('O',), ('O',), ('O',), ('O',), ('O',), ('O',), ('O',), ('O',), ('O',), ('O',), ('O',), ('O',)]


Add the predictions to the input data:

In [110]:
# Append the decoded predictions as the right-most column, named after the
# gold-label column with a "pred_" prefix
pred_col = "pred_"+col
df.insert(loc=df.shape[1], column=pred_col, value=predictions_text)

# Preview the first 20 rows
df.head(20)

Unnamed: 0,description_id,sentence_id,token_id,token,pos,field,token_offsets,tag,ann_id,fold,pred_tag
0,0,0,0,Identifier,NN,Identifier,"(0, 10)",[O],[99999],[split4],"(O,)"
1,0,0,1,:,:,Identifier,"(10, 11)",[O],[99999],[split4],"(O,)"
2,0,0,2,AA5,NN,Identifier,"(12, 15)",[O],[99999],[split4],"(O,)"
3,1,1,3,Title,NN,Title,"(17, 22)",[O],[99999],[split2],"(O,)"
4,1,1,4,:,:,Title,"(22, 23)",[O],[99999],[split2],"(O,)"
5,1,1,5,Papers,NNS,Title,"(24, 30)",[O],[99999],[split2],"(O,)"
6,1,1,6,of,IN,Title,"(31, 33)",[O],[99999],[split2],"(O,)"
7,1,1,7,The,DT,Title,"(34, 37)",[O],"[14384, 24275, 52952]","[split2, split2, split2]","(O,)"
8,1,1,8,Very,NNP,Title,"(38, 42)",[O],"[14384, 52952, 24275]","[split2, split2, split2]","(O,)"
9,1,1,9,Rev,NNP,Title,"(43, 46)",[O],"[14384, 24275, 26233, 52952]","[split2, split2, split2, split2]","(O,)"


In [113]:
# Count how often each predicted label tuple occurs. The column is addressed
# as "pred_"+col (the name it was inserted under) rather than the hard-coded
# attribute `pred_tag`, so this also works when `col` is not "tag".
df["pred_"+col].value_counts()

(O,)    753521
Name: pred_tag, dtype: int64