[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/x1ew/UD-LinguisticStudy/blob/main/UD_LinguisticStudy.ipynb)

# ***Import Data***

In [None]:
# Download the UD v2.14 treebanks, documentation and tools archives (named in the URL below).
# Only the treebanks archive is extracted by the active cells in the visible part of this notebook.
!curl --remote-name-all https://lindat.mff.cuni.cz/repository/xmlui/bitstream/handle/11234/1-5502{/ud-treebanks-v2.14.tgz,/ud-documentation-v2.14.tgz,/ud-tools-v2.14.tgz}

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  527M  100  527M    0     0  19.4M      0  0:00:27  0:00:27 --:--:-- 22.4M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  105M  100  105M    0     0  20.0M      0  0:00:05  0:00:05 --:--:-- 20.9M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  875k  100  875k    0     0  3974k      0 --:--:-- --:--:-- --:--:-- 3978k


In [None]:
%%bash

# Stop at the first failing command. Without this, a failed `tar` or `cd`
# would let the `rm -rf` loops below run in the parent working directory.
set -euo pipefail

# Extract ud-treebanks-v2.14.tgz (non-verbose: the file listing is thousands of lines long)
tar -xzf ud-treebanks-v2.14.tgz

cd ud-treebanks-v2.14

# Pass 1: keep only treebanks that ship at least 3 .conllu files.
for dir in */; do
  # Count the .conllu files directly inside this treebank directory
  count=$(find "$dir" -maxdepth 1 -type f -name "*.conllu" | wc -l)

  if [ "$count" -lt 3 ]; then
    # If fewer than 3 .conllu files, remove the directory
    echo "Removing directory $dir (contains $count .conllu files)"
    rm -rf "$dir"
  fi
done

# Pass 2: inside the remaining treebanks, delete everything that is not a .conllu file.
for dir in */; do
  if [ -d "$dir" ]; then
    echo "Processing directory: $dir"
    # `{} +` batches the paths into as few `rm` invocations as possible
    find "$dir" -maxdepth 1 -type f ! -name "*.conllu" -exec rm -f {} +
  fi
done

# Write the treebank directory names, ordered from smallest to largest on-disk size,
# one per line; the Python cells read this file to choose which treebanks to use.
du -sh * | sort -h | awk '{print $2}' > ../normalized_dir_names.txt

ud-treebanks-v2.14/
ud-treebanks-v2.14/UD_Manx-Cadhan/
ud-treebanks-v2.14/UD_Manx-Cadhan/README.md
ud-treebanks-v2.14/UD_Manx-Cadhan/gv_cadhan-ud-train.conllu
ud-treebanks-v2.14/UD_Manx-Cadhan/LICENSE.txt
ud-treebanks-v2.14/UD_Manx-Cadhan/gv_cadhan-ud-test.txt
ud-treebanks-v2.14/UD_Manx-Cadhan/stats.xml
ud-treebanks-v2.14/UD_Manx-Cadhan/gv_cadhan-ud-train.txt
ud-treebanks-v2.14/UD_Manx-Cadhan/gv_cadhan-ud-test.conllu
ud-treebanks-v2.14/UD_Old_East_Slavic-Birchbark/
ud-treebanks-v2.14/UD_Old_East_Slavic-Birchbark/stats.xml
ud-treebanks-v2.14/UD_Old_East_Slavic-Birchbark/orv_birchbark-ud-test.conllu
ud-treebanks-v2.14/UD_Old_East_Slavic-Birchbark/orv_birchbark-ud-train.conllu
ud-treebanks-v2.14/UD_Old_East_Slavic-Birchbark/orv_birchbark-ud-test.txt
ud-treebanks-v2.14/UD_Old_East_Slavic-Birchbark/LICENSE.txt
ud-treebanks-v2.14/UD_Old_East_Slavic-Birchbark/orv_birchbark-ud-dev.conllu
ud-treebanks-v2.14/UD_Old_East_Slavic-Birchbark/orv_birchbark-ud-train.txt
ud-treebanks-v2.14/UD_Old_East_S

In [None]:
# import os

# Extract ud-treebanks-v2.14.tgz
# !tar -xzvf ud-treebanks-v2.14.tgz

# Extract ud-documentation-v2.14.tgz
#!tar -xzvf ud-documentation-v2.14.tgz

# Extract ud-tools-v2.14.tgz
#!tar -xzvf ud-tools-v2.14.tgz

In [None]:
!ls

normalized_dir_names.txt  ud-documentation-v2.14.tgz  ud-treebanks-v2.14
sample_data		  ud-tools-v2.14.tgz	      ud-treebanks-v2.14.tgz


In [None]:
# %pip installs into the environment of the running kernel (unlike `!pip`).
# conllu is pinned to the version recorded in this cell's saved output.
%pip install conllu==5.0.1 transformers

Collecting conllu
  Downloading conllu-5.0.1-py3-none-any.whl.metadata (21 kB)
Downloading conllu-5.0.1-py3-none-any.whl (16 kB)
Installing collected packages: conllu
Successfully installed conllu-5.0.1


In [None]:
import torch

# Select the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device  # bare last expression so the notebook displays the selected device

device(type='cuda', index=0)

# ***Dataset Preparation and Baseline Model Training***

## ***Dataset***


In [None]:
import os
import random

def sep_file(dir_path, num_countries, num_selected=10, names_file='./normalized_dir_names.txt'):
    """Collect the train/dev/test ``.conllu`` paths for a window of treebanks.

    Treebank directory names are read from ``names_file`` (one name per line)
    and a contiguous slice of them is selected.

    NOTE: despite its name, ``num_countries`` is the *start offset* into the
    list of names, not a count. ``num_selected`` entries starting at that
    offset are used (fewer if the list ends earlier).

    Args:
        dir_path: Directory that contains the treebank sub-directories.
        num_countries: Zero-based index of the first treebank name to use.
        num_selected: How many consecutive treebank names to use.
        names_file: Text file listing the treebank directory names.

    Returns:
        A tuple ``(l_train, l_dev, l_test)`` of lists of file paths.
    """
    with open(names_file, 'r', encoding='utf-8') as file:
        # Ignore blank lines: an empty name would resolve to dir_path itself.
        lang_names = [line.strip() for line in file if line.strip()]

    selected_names = lang_names[num_countries:num_countries + num_selected]

    paths_by_split = {'train': [], 'dev': [], 'test': []}
    for name in selected_names:
        dir_name = os.path.join(dir_path, name)
        # os.listdir order is arbitrary; sort so the result is reproducible.
        for file_name in sorted(os.listdir(dir_name)):
            if not file_name.endswith('.conllu'):
                continue
            # Same precedence as before: 'train' is checked first, then 'test', then 'dev'.
            for split in ('train', 'test', 'dev'):
                if split in file_name:
                    paths_by_split[split].append(os.path.join(dir_name, file_name))
                    break

    return paths_by_split['train'], paths_by_split['dev'], paths_by_split['test']

In [None]:
import conllu

def prepare_data(files_dir, excluded_tags=('PUNCT', 'ADP', 'DET', 'CCONJ', 'X', '_')):
    """Read CoNLL-U files and return parallel per-sentence token and tag lists.

    A token is kept only when its UPOS tag is not in ``excluded_tags`` and its
    surface form is not blank. Tokens and labels are built from the same
    filtered pairs so that ``all_tokens[i][j]`` always belongs to
    ``all_labels[i][j]``. Previously only the token list skipped blank forms,
    which could shift the labels.

    Args:
        files_dir: Iterable of paths to ``.conllu`` files.
        excluded_tags: UPOS values whose tokens are dropped.

    Returns:
        ``(all_tokens, all_labels)``: two lists with one inner list per
        sentence. Sentences left with no tokens are skipped.
    """
    all_tokens, all_labels = [], []
    for path in files_dir:
        with open(path, 'r', encoding='utf-8') as f:
            sentences = conllu.parse(f.read())

        for sentence in sentences:
            kept = [
                (token['form'], token['upostag'])
                for token in sentence
                if token['upostag'] not in excluded_tags and token['form'].strip()
            ]
            if kept:  # skip sentences with nothing left after filtering
                all_tokens.append([form for form, _ in kept])
                all_labels.append([tag for _, tag in kept])

    return all_tokens, all_labels

In [None]:
# NOTE: sep_file uses `num_countries` as a start offset, so this selects
# entries 30..39 (zero-based) of normalized_dir_names.txt, not 30 treebanks.
l_train, l_dev, l_test  = sep_file(dir_path='/content/ud-treebanks-v2.14', num_countries=30)

22075

In [None]:
l_train

['/content/ud-treebanks-v2.14/UD_Uyghur-UDT/ug_udt-ud-train.conllu',
 '/content/ud-treebanks-v2.14/UD_Ancient_Hebrew-PTNK/hbo_ptnk-ud-train.conllu',
 '/content/ud-treebanks-v2.14/UD_French-Sequoia/fr_sequoia-ud-train.conllu',
 '/content/ud-treebanks-v2.14/UD_Turkish-IMST/tr_imst-ud-train.conllu',
 '/content/ud-treebanks-v2.14/UD_Gothic-PROIEL/got_proiel-ud-train.conllu',
 '/content/ud-treebanks-v2.14/UD_Scottish_Gaelic-ARCOSG/gd_arcosg-ud-train.conllu',
 '/content/ud-treebanks-v2.14/UD_Coptic-Scriptorium/cop_scriptorium-ud-train.conllu',
 '/content/ud-treebanks-v2.14/UD_Latin-UDante/la_udante-ud-train.conllu',
 '/content/ud-treebanks-v2.14/UD_Armenian-BSUT/hy_bsut-ud-train.conllu',
 '/content/ud-treebanks-v2.14/UD_English-LinES/en_lines-ud-train.conllu']

In [None]:
def create_mapping_dics(train_labels):
    """Build the tag <-> integer id lookup tables from the training labels.

    Id 0 is reserved for the placeholder tag ``"O"``; every distinct tag found
    in ``train_labels`` receives the next free id (1, 2, ...) in order of first
    appearance.

    Args:
        train_labels: List of sentences, each a list of tag strings.

    Returns:
        ``(label2id, id2label)``: a dict from tag to id and its inverse.
    """
    # dict.fromkeys keeps insertion order, i.e. order of first occurrence.
    distinct_tags = list(dict.fromkeys(
        tag for sentence_tags in train_labels for tag in sentence_tags
    ))

    label2id = {"O": 0}
    for position, tag in enumerate(distinct_tags, start=1):
        label2id[tag] = position

    id2label = dict(zip(label2id.values(), label2id.keys()))

    return label2id, id2label

In [None]:
def convert_labels_to_ids(labels, label2id):
    """Translate per-sentence tag lists into per-sentence id lists.

    Tags that are missing from ``label2id`` are mapped to the id of ``'O'``.

    Args:
        labels: List of sentences, each a list of tag strings.
        label2id: Dict from tag to integer id; ``'O'`` is looked up only when
            an unknown tag is encountered.

    Returns:
        A list with one list of integer ids per input sentence.
    """
    return [
        [label2id[tag] if tag in label2id else label2id['O'] for tag in sentence_tags]
        for sentence_tags in labels
    ]

In [None]:
def tokenize_and_align_labels(sentences, labels, max_length, tokenizer, label_all_tokens=True):
    """Tokenize pre-split sentences and expand word-level labels to sub-tokens.

    Args:
        sentences: List of sentences, each a list of word strings.
        labels: List of per-sentence label-id lists, parallel to ``sentences``.
        max_length: Length budget in words; sequences are padded/truncated to
            ``min(512, max_length + 2)`` tokens (the ``+ 2`` presumably leaves
            room for the two special tokens added by the tokenizer).
        tokenizer: Callable tokenizer whose result exposes
            ``word_ids(batch_index=...)`` and supports item assignment.
        label_all_tokens: When True (default, the previous hard-coded
            behaviour) every sub-token of a word gets the word's label; when
            False only the first sub-token is labelled and the rest get -100.

    Returns:
        The tokenizer output with an added ``"labels"`` entry, where positions
        without a source word (special/padding tokens) are labelled -100.
    """
    tokenized_inputs = tokenizer(
        sentences,
        padding='max_length',
        truncation=True,
        max_length=min(512, max_length + 2),
        is_split_into_words=True,
    )

    new_labels = []
    for i, label in enumerate(labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)

        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # No source word for this position.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First sub-token of a word, or a continuation that should be labelled too.
                label_ids.append(label[word_idx])
            else:
                # Continuation sub-token that is excluded from the labels.
                label_ids.append(-100)
            previous_word_idx = word_idx

        new_labels.append(label_ids)

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import conllu

class UD_Dataset(Dataset):
    """Torch dataset view over a mapping of equally long encoded columns."""

    def __init__(self, encodings):
        # Mapping from field name (e.g. "input_ids", "labels") to a sequence of rows.
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the ``input_ids`` column."""
        return len(self.encodings["input_ids"])

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, one entry per field."""
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        return sample

In [None]:
def load_data(l_train, l_dev, l_test, tokenizer_name="distilbert-base-uncased"):
    """Read the three splits, encode them and wrap them as torch datasets.

    The label vocabulary is built from the training split only; tags that occur
    only in dev/test are mapped to the ``'O'`` id by ``convert_labels_to_ids``.

    Args:
        l_train: Paths of the training ``.conllu`` files.
        l_dev: Paths of the development ``.conllu`` files.
        l_test: Paths of the test ``.conllu`` files.
        tokenizer_name: Checkpoint name passed to ``AutoTokenizer.from_pretrained``.

    Returns:
        ``(train_dataset, dev_dataset, test_dataset, train_tokens, dev_tokens,
        test_tokens, label2id, id2label)``.
    """
    # Imported here so this function does not rely on a later cell having
    # already imported AutoTokenizer.
    from transformers import AutoTokenizer

    train_tokens, train_labels = prepare_data(l_train)
    dev_tokens, dev_labels = prepare_data(l_dev)
    test_tokens, test_labels = prepare_data(l_test)

    label2id, id2label = create_mapping_dics(train_labels)
    print('label2id: ', label2id)
    print('id2label: ', id2label)

    train_label_ids = convert_labels_to_ids(train_labels, label2id)
    dev_label_ids = convert_labels_to_ids(dev_labels, label2id)
    test_label_ids = convert_labels_to_ids(test_labels, label2id)

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    # Longest training sentence, measured in words.
    # NOTE(review): this word count is used as the sub-token length cap in
    # tokenize_and_align_labels, so sentences whose words split into several
    # sub-tokens can be truncated; confirm this is intended.
    max_length = max(len(sentence_ids) for sentence_ids in train_label_ids)

    train_encodings = tokenize_and_align_labels(train_tokens, train_label_ids, max_length, tokenizer)
    dev_encodings = tokenize_and_align_labels(dev_tokens, dev_label_ids, max_length, tokenizer)
    test_encodings = tokenize_and_align_labels(test_tokens, test_label_ids, max_length, tokenizer)

    train_dataset = UD_Dataset(train_encodings)
    print(train_dataset[0])  # sanity check: show the first encoded training example
    dev_dataset = UD_Dataset(dev_encodings)
    test_dataset = UD_Dataset(test_encodings)

    return train_dataset, dev_dataset, test_dataset, train_tokens, dev_tokens, test_tokens, label2id, id2label

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel, BertTokenizerFast, AutoTokenizer

# train_dataset, dev_dataset, test_dataset, label2id, id2label = load_data(l_train, l_dev, l_test, label2id)
train_dataset, dev_dataset, test_dataset, train_tokens, dev_tokens, test_tokens, label2id, id2label = load_data(l_train, l_dev, l_test)

len(train_dataset), len(dev_dataset), len(test_dataset)

label2id:  {'O': 0, 'NOUN': 1, 'NUM': 2, 'VERB': 3, 'PRON': 4, 'INTJ': 5, 'ADV': 6, 'ADJ': 7, 'AUX': 8, 'PROPN': 9, 'PART': 10, 'SCONJ': 11, 'SYM': 12}
id2label:  {0: 'O', 1: 'NOUN', 2: 'NUM', 3: 'VERB', 4: 'PRON', 5: 'INTJ', 6: 'ADV', 7: 'ADJ', 8: 'AUX', 9: 'PROPN', 10: 'PART', 11: 'SCONJ', 12: 'SYM'}
{'input_ids': tensor([  101,   100,   100,  1300, 29837, 23673, 15394, 25573,   100,   100,
         1300, 29837, 23673, 15394, 25573,   100,   100,   100,   100,   102,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,  

(21701, 6740, 6884)

## ***Model***


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute token-level accuracy and weighted precision/recall/F1.

    Positions whose gold label is -100 (special, padding or otherwise ignored
    tokens) are excluded before scoring.

    Args:
        pred: Prediction object exposing ``label_ids`` and ``predictions``
            arrays; the class is chosen by ``argmax`` over the last axis of
            ``predictions``.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # A boolean mask selects the scored positions in one step, in the same
    # row-major order as flattening and looping would.
    scored = labels != -100
    true_labels = labels[scored]
    true_preds = preds[scored]

    accuracy = accuracy_score(true_labels, true_preds)
    # zero_division=0 keeps the value sklearn already falls back to for classes
    # with no predicted/true samples, but without emitting a warning per call.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, true_preds, average='weighted', zero_division=0
    )

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification

# label2id already contains the placeholder tag 'O' (id 0), so the number of
# classes is exactly len(label2id). The previous `+ 1` created an extra class
# with no id2label entry, which would raise KeyError if it were ever predicted.
# The mappings are also stored in the model config.
model = DistilBertForTokenClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

batch_size = 8

# Baseline run: all model parameters are trainable; evaluate on dev after every epoch.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5476,0.621758,0.776673,0.78205,0.776673,0.773964
2,0.4689,0.538874,0.81056,0.817482,0.81056,0.808137
3,0.4316,0.527149,0.827918,0.833603,0.827918,0.828375
4,0.29,0.555555,0.831697,0.836229,0.831697,0.831736


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5476,0.621758,0.776673,0.78205,0.776673,0.773964
2,0.4689,0.538874,0.81056,0.817482,0.81056,0.808137
3,0.4316,0.527149,0.827918,0.833603,0.827918,0.828375
4,0.29,0.555555,0.831697,0.836229,0.831697,0.831736
5,0.2275,0.591091,0.837387,0.841295,0.837387,0.837194


TrainOutput(global_step=13565, training_loss=0.3896432831504448, metrics={'train_runtime': 2326.9283, 'train_samples_per_second': 46.63, 'train_steps_per_second': 5.83, 'total_flos': 4569594615303300.0, 'train_loss': 0.3896432831504448, 'epoch': 5.0})

In [None]:
trainer.evaluate(eval_dataset=test_dataset)

{'eval_loss': 0.6024723649024963,
 'eval_accuracy': 0.8432245475047031,
 'eval_precision': 0.8474981288091815,
 'eval_recall': 0.8432245475047031,
 'eval_f1': 0.8431260539738633,
 'eval_runtime': 37.7603,
 'eval_samples_per_second': 182.308,
 'eval_steps_per_second': 22.802,
 'epoch': 5.0}

In [None]:
def show_preds_and_labels(pred):
    """Convert predicted and gold label ids to tag names, per sentence.

    Positions whose gold label is -100 are dropped from both outputs. Tag names
    are looked up in the notebook-level ``id2label`` mapping.

    Args:
        pred: Prediction object exposing ``label_ids`` and ``predictions``;
            the predicted id is the ``argmax`` over the last axis.

    Returns:
        ``(true_preds, true_labels)``: two lists with one list of tag names
        per sentence.
    """
    gold_ids = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)

    true_preds, true_labels = [], []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the (prediction, gold) pairs at scored positions.
        scored_pairs = [(p, g) for p, g in zip(predicted_row, gold_row) if g != -100]
        true_preds.append([id2label[p] for p, _ in scored_pairs])
        true_labels.append([id2label[g] for _, g in scored_pairs])

    return true_preds, true_labels

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def random_sample_results(tokens, dataset, n=5):
    """Print predictions next to the gold tags for ``n`` random examples.

    Uses the notebook-level ``tokenizer``, ``trainer`` and
    ``show_preds_and_labels``.

    Args:
        tokens: Original word lists, index-aligned with ``dataset``.
        dataset: Encoded dataset whose items provide ``'input_ids'``.
        n: Number of examples to draw without replacement.
    """
    special_tokens = ('[CLS]', '[PAD]', '[SEP]')

    picked = random.sample(range(len(dataset)), n)
    original_sentences = [tokens[idx] for idx in picked]
    wordpieces = [
        tokenizer.convert_ids_to_tokens(dataset[idx]['input_ids']) for idx in picked
    ]

    prediction_output = trainer.predict([dataset[idx] for idx in picked])
    predicted_tags, gold_tags = show_preds_and_labels(prediction_output)

    rows = zip(original_sentences, wordpieces, predicted_tags, gold_tags)
    for number, (words, pieces, predicted, gold) in enumerate(rows, start=1):
        print(f"Sample {number}:")
        print('ORG:       ', words)
        # Hide the special tokens so only real sub-tokens are shown.
        print('Tokenized: ', [piece for piece in pieces if piece not in special_tokens])
        print('Pred:      ', predicted)
        print('truth:     ', gold)
        print()

In [None]:
random_sample_results(test_tokens, test_dataset)

Sample 1:
ORG:        ['sunt', 'fere', 'cantionum', 'inventores', 'qui', 'stantia', 'carmen', 'incomitatum', 'relinquunt', 'quin', 'sibi', 'rithimi', 'concrepantiam', 'reddant']
Tokenized:  ['sun', '##t', 'fe', '##re', 'can', '##tion', '##um', 'inventor', '##es', 'qui', 'stan', '##tia', 'carmen', 'inc', '##omi', '##tat', '##um', 're', '##lin', '##qu', '##unt', 'qui', '##n', 'si', '##bi', 'ri', '##thi', '##mi', 'con', '##cre', '##pan', '##tia', '##m', 'red', '##dant']
Pred:       ['AUX', 'AUX', 'ADV', 'ADV', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'PRON', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'VERB', 'VERB', 'VERB', 'VERB', 'SCONJ', 'SCONJ', 'PRON', 'PRON', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'VERB', 'VERB']
truth:      ['AUX', 'AUX', 'ADV', 'ADV', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'PRON', 'NOUN', 'NOUN', 'NOUN', 'ADJ', 'ADJ', 'ADJ', 'ADJ', 'VERB', 'VERB', 'VERB', 'VERB', 'SCONJ', 'SCONJ', 'PRON', 'PRON', 'NOUN', 'NOUN', 'NOUN', 'NOUN'

In [None]:
# Remove the previous run's log directory before the next experiment.
# Explicit shell escape (no reliance on automagic); -f keeps the cell from failing when ./logs is absent.
!rm -rf logs

# ***Model Adjustment and Partial Freezing***

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    """Compute token-level accuracy and weighted precision/recall/F1.

    Same metric function as the one defined in the baseline section earlier in
    this notebook (re-declared here). Positions whose gold label is -100 are
    excluded before scoring.

    Args:
        pred: Prediction object exposing ``label_ids`` and ``predictions``
            arrays; the class is chosen by ``argmax`` over the last axis of
            ``predictions``.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    # A boolean mask selects the scored positions in one step, in the same
    # row-major order as flattening and looping would.
    scored = labels != -100
    true_labels = labels[scored]
    true_preds = preds[scored]

    accuracy = accuracy_score(true_labels, true_preds)
    # zero_division=0 keeps the value sklearn already falls back to for classes
    # with no predicted/true samples, but without emitting a warning per call.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, true_preds, average='weighted', zero_division=0
    )

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
        "f1": f1,
    }

In [None]:
from transformers import DistilBertTokenizerFast, DistilBertForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification

# label2id already contains the placeholder tag 'O' (id 0), so the number of
# classes is exactly len(label2id). The previous `+ 1` created an extra class
# with no id2label entry, which would raise KeyError if it were ever predicted.
# The mappings are also stored in the model config.
model = DistilBertForTokenClassification.from_pretrained(
    'distilbert-base-uncased',
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

batch_size = 8

# Freeze the embedding layer so its weights are not updated during training.
for param in model.distilbert.embeddings.parameters():
    param.requires_grad = False

# Freeze the first 4 transformer layers; the remaining layers and the
# classification head stay trainable.
for param in model.distilbert.transformer.layer[:4].parameters():
    param.requires_grad = False

# Same hyper-parameters as the baseline run, so only the freezing differs.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

trainer.train()

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.8801,0.870613,0.663736,0.672899,0.663736,0.653944
2,0.7423,0.726923,0.719536,0.729147,0.719536,0.713442
3,0.642,0.676921,0.745847,0.75228,0.745847,0.741745
4,0.5789,0.642425,0.760317,0.765365,0.760317,0.758593
5,0.5099,0.630568,0.766191,0.769941,0.766191,0.764041


  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=13565, training_loss=0.7004151577357213, metrics={'train_runtime': 1394.3028, 'train_samples_per_second': 77.82, 'train_steps_per_second': 9.729, 'total_flos': 4569594615303300.0, 'train_loss': 0.7004151577357213, 'epoch': 5.0})

In [None]:
trainer.evaluate(eval_dataset=test_dataset)

{'eval_loss': 0.6348324418067932,
 'eval_accuracy': 0.7739509937953518,
 'eval_precision': 0.778746233678203,
 'eval_recall': 0.7739509937953518,
 'eval_f1': 0.7724361225287745,
 'eval_runtime': 37.162,
 'eval_samples_per_second': 185.243,
 'eval_steps_per_second': 23.169,
 'epoch': 5.0}

In [None]:
random_sample_results(test_tokens, test_dataset)

Sample 1:
ORG:        ['sunt', 'fere', 'cantionum', 'inventores', 'qui', 'stantia', 'carmen', 'incomitatum', 'relinquunt', 'quin', 'sibi', 'rithimi', 'concrepantiam', 'reddant']
Tokenized:  ['sun', '##t', 'fe', '##re', 'can', '##tion', '##um', 'inventor', '##es', 'qui', 'stan', '##tia', 'carmen', 'inc', '##omi', '##tat', '##um', 're', '##lin', '##qu', '##unt', 'qui', '##n', 'si', '##bi', 'ri', '##thi', '##mi', 'con', '##cre', '##pan', '##tia', '##m', 'red', '##dant']
Pred:       ['AUX', 'AUX', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'PRON', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'VERB', 'VERB', 'VERB', 'VERB', 'SCONJ', 'SCONJ', 'PRON', 'PRON', 'VERB', 'NOUN', 'NOUN', 'VERB', 'VERB', 'VERB', 'NOUN', 'NOUN', 'VERB', 'VERB']
truth:      ['AUX', 'AUX', 'ADV', 'ADV', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'NOUN', 'PRON', 'NOUN', 'NOUN', 'NOUN', 'ADJ', 'ADJ', 'ADJ', 'ADJ', 'VERB', 'VERB', 'VERB', 'VERB', 'SCONJ', 'SCONJ', 'PRON', 'PRON', 'NOUN', 'NOUN', 'NOUN', 'NOU