In [39]:
from tokenizer import BertTokenizer
import nltk

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', local_files_only=True)

# Load the NLTK POS tagger
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Example sentence
sentence = "The quick brown fox jumps over the lazy dog."

# Tokenize the sentence using the BERT tokenizer
encoding = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)

# Convert input_ids to tokens
tokens = tokenizer.convert_ids_to_tokens(encoding['input_ids'][0].tolist())

# Convert tokens to strings
token_strings = [token if token != '[PAD]' else '' for token in tokens]

# Perform POS tagging using NLTK
pos_tags = nltk.pos_tag(token_strings)

# Create an array of POS tags corresponding to each input token
pos_tags_array = [tag for _, tag in pos_tags]

print(pos_tags_array)


['IN', 'DT', 'JJ', 'NN', 'NN', 'VBZ', 'IN', 'DT', 'JJ', 'NN', '.', 'NN']


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Larsk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Larsk\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [41]:
encoding["input_ids"].shape, len(pos_tags_array)

(torch.Size([1, 12]), 12)

In [8]:
sentence1 = "Hello"
sentence2 = "Hello World"
sentence3 = "Hello World!"
sentence4 = "How are you today? Isn't it a nice day?"

In [15]:
encoding = tokenizer(sentence2, return_tensors='pt', padding=True, truncation=True)

In [16]:
encoding

{'input_ids': tensor([[ 101, 7592, 2088,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1]])}

In [43]:
nltk.download('tagsets')
# Get all the POS tags from NLTK
pos_tags_nltk = set([tag for tag, _ in nltk.data.load('help/tagsets/upenn_tagset.pickle').items()])

# Create a vocabulary dictionary for POS tags
pos_tag_vocab = {tag: index for index, tag in enumerate(pos_tags_nltk)}

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\Larsk\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping help\tagsets.zip.


In [44]:
pos_tag_vocab

{'JJR': 0,
 ':': 1,
 'VBZ': 2,
 'RBR': 3,
 'IN': 4,
 'FW': 5,
 'LS': 6,
 '--': 7,
 'WP': 8,
 'SYM': 9,
 'NN': 10,
 'PRP': 11,
 'RP': 12,
 'VBN': 13,
 'VBP': 14,
 '$': 15,
 'RBS': 16,
 'VBG': 17,
 'PRP$': 18,
 'PDT': 19,
 ')': 20,
 'UH': 21,
 'RB': 22,
 'VB': 23,
 'EX': 24,
 "''": 25,
 '.': 26,
 'WRB': 27,
 'WP$': 28,
 ',': 29,
 'DT': 30,
 'NNP': 31,
 '``': 32,
 'POS': 33,
 'JJS': 34,
 'WDT': 35,
 'VBD': 36,
 'MD': 37,
 'TO': 38,
 'NNS': 39,
 'JJ': 40,
 'NNPS': 41,
 '(': 42,
 'CC': 43,
 'CD': 44}