In [7]:
import torch
import torch.nn.functional as F

from transformers import BartForConditionalGeneration, BartTokenizer

model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", forced_bos_token_id=0)
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")

from transformers.optimization import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

# Assume `model` is your BART model and `tokenizer` is your BART tokenizer
input_text = "Input text goes here."
target_summary = "Desired summary goes here."

input_ids = tokenizer.encode(input_text, return_tensors="pt", padding='max_length', max_length=20, truncation=True)
target_ids = tokenizer.encode(target_summary, return_tensors="pt", padding='max_length', max_length=20, truncation=True)

# Generate model predictions
output_probs = model(input_ids).logits

# Compute negative log-likelihood loss
loss = F.nll_loss(output_probs.view(-1, output_probs.shape[-1]), target_ids.view(-1))

# Backpropagate and update model parameters
loss.backward()
optimizer.step()



In [10]:
loss.item()

-1.435591697692871

In [49]:
import torch
import torch.nn.functional as F

from transformers import BartForConditionalGeneration, BartTokenizer

model = BartForConditionalGeneration.from_pretrained("facebook/bart-large", forced_bos_token_id=0)
tokenizer = BartTokenizer.from_pretrained("facebook/bart-large")

from transformers.optimization import AdamW

optimizer = AdamW(model.parameters(), lr=5e-5)

# Assume `model` is your BART model and `tokenizer` is your BART tokenizer
input_text = "Input text goes here."
target_summary = "Desired summary goes here."

input_ids = tokenizer.encode(input_text, return_tensors="pt", padding='max_length', max_length=20, truncation=True)
target_ids = tokenizer.encode(target_summary, return_tensors="pt", padding='max_length', max_length=20, truncation=True)

# Generate model predictions
output_logits = model(input_ids).logits
output_probs = torch.nn.functional.log_softmax(output_logits, dim=-1)
output_probs = output_probs.view(-1, model.config.vocab_size)

# Compute negative log-likelihood loss
nll_loss = F.nll_loss(output_probs.view(-1, output_probs.shape[-1]), target_ids.view(-1))

# Compute MSE length loss
generated_summary_length = len(output_probs[0])  # Length of the generated summary
mse_length_loss = F.mse_loss(torch.tensor([generated_summary_length], dtype=torch.float), 
                             torch.tensor([target_length], dtype=torch.float))

# Combine the losses (you can adjust the weighting between the two losses)
lambda_nll = 1.0  # Weight for the NLL loss
lambda_mse = 0.01  # Weight for the MSE length loss
combined_loss = lambda_nll * nll_loss + lambda_mse * mse_length_loss

# Backpropagate and update model parameters
combined_loss.backward()
optimizer.step()

In [59]:
model.config.vocab_size

50265

In [50]:
output_logits

tensor([[[ 17.4297,  -1.5130,  13.2232,  ...,  -2.0266,  -2.3495,   6.0859],
         [ 17.4297,  -1.5130,  13.2232,  ...,  -2.0266,  -2.3495,   6.0859],
         [-11.1451,  -4.1054,  10.1617,  ...,  -4.7435,  -4.6772,  -3.2661],
         ...,
         [ -5.9960,  -1.9598,  13.1244,  ...,   0.0651,  -1.0115,   2.3164],
         [ -5.7085,  -1.9905,  13.2075,  ...,   0.1223,  -0.9930,   2.3492],
         [ -6.1833,  -2.0719,  13.1411,  ...,   0.1684,  -1.0041,   2.2347]]],
       grad_fn=<AddBackward0>)

In [51]:
output_probs

tensor([[ -0.4539, -19.3966,  -4.6604,  ..., -19.9102, -20.2331, -11.7977],
        [ -0.4539, -19.3966,  -4.6604,  ..., -19.9102, -20.2331, -11.7977],
        [-25.5730, -18.5333,  -4.2663,  ..., -19.1714, -19.1051, -17.6941],
        ...,
        [-21.2853, -17.2491,  -2.1649,  ..., -15.2242, -16.3008, -12.9729],
        [-21.0157, -17.2978,  -2.0997,  ..., -15.1849, -16.3002, -12.9580],
        [-21.4130, -17.3016,  -2.0886,  ..., -15.0613, -16.2338, -12.9950]],
       grad_fn=<ViewBackward0>)

In [52]:
target_ids

tensor([[    0, 28324,  7651,  4819,  1411,   259,     4,     2,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1]])

In [56]:
gt_logits = target_ids[0]
gt_logits

tensor([    0, 28324,  7651,  4819,  1411,   259,     4,     2,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1])

In [57]:
gt_logits = gt_logits.view(-1)
gt_logits

tensor([    0, 28324,  7651,  4819,  1411,   259,     4,     2,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1])

In [45]:
nll_loss

tensor(13.9941, grad_fn=<NllLossBackward0>)

In [41]:
output_probs

tensor([[[ -0.4539, -19.3966,  -4.6604,  ..., -19.9102, -20.2331, -11.7977],
         [ -0.4539, -19.3966,  -4.6604,  ..., -19.9102, -20.2331, -11.7977],
         [-25.5730, -18.5333,  -4.2663,  ..., -19.1714, -19.1051, -17.6941],
         ...,
         [-21.2853, -17.2491,  -2.1649,  ..., -15.2242, -16.3008, -12.9729],
         [-21.0157, -17.2978,  -2.0997,  ..., -15.1849, -16.3002, -12.9580],
         [-21.4130, -17.3016,  -2.0886,  ..., -15.0613, -16.2338, -12.9950]]],
       grad_fn=<LogSoftmaxBackward0>)

In [30]:
generated_summary_length

20

In [31]:
mse_length_loss

tensor(100.)

In [32]:
loss.item()

-1.435591697692871

In [33]:
combined_loss

tensor(-0.4356, grad_fn=<AddBackward0>)

In [16]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /home/worachotn/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/worachotn/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/worachotn/nltk_data...


True

In [17]:
from nltk.corpus import wordnet

lemmatizer = nltk.stem.WordNetLemmatizer()  # Initiate nltk lemmatizer

In [18]:
def nltk_to_pos(pos):
    """ Simple function for converting nltk pos to wordnet pos"""
    if pos.startswith('J'):
        return wordnet.ADJ
    elif pos.startswith('V'):
        return wordnet.VERB
    elif pos.startswith('N'):
        return wordnet.NOUN
    elif pos.startswith('R'):
        return wordnet.ADV
    else:
        return None

In [55]:
def simple_tokenize(sentence):
    """ Simple function for tokenizing text with nltk """
    return nltk.word_tokenize(sentence)

In [19]:
def lemmatize_text(text):
    """ Function to lemmatize text according to the wordnet POS of each token """

    tokenized_text = nltk.word_tokenize(text)
    POS_assigned_text = nltk.pos_tag(tokenized_text)

    available_POS = map(lambda x: (x[0], nltk_to_pos(x[1])), POS_assigned_text)

    lemmatized_text = [token if pos is None
                       else lemmatizer.lemmatize(token, pos)
                       for token, pos in available_POS]

    return lemmatized_text

In [97]:
data = {
    'article': ['#Person1#: Ms. Dawson, I need you to take a dictation for me.\n#Person2#: Yes, sir...\n#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?\n',
               '#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?\n#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.\n'],
    'topic': ['intra-office', 'communications']
}

In [98]:
data['article']

['#Person1#: Ms. Dawson, I need you to take a dictation for me.\n#Person2#: Yes, sir...\n#Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?\n',
 '#Person2#: Sir, does this apply to intra-office communications only? Or will it also restrict external communications?\n#Person1#: It should apply to all communications, not only in this office between employees, but also any outside communications.\n']

In [99]:
lemmatized_tokens = []

In [100]:
for i in range(len(data['article'])):
    # print(data['article'][i])
    lemmatized_tokens.append(lemmatize_text(data['article'][i]))

In [101]:
print(lemmatized_tokens)

[['#', 'Person1', '#', ':', 'Ms.', 'Dawson', ',', 'I', 'need', 'you', 'to', 'take', 'a', 'dictation', 'for', 'me', '.', '#', 'Person2', '#', ':', 'Yes', ',', 'sir', '...', '#', 'Person1', '#', ':', 'This', 'should', 'go', 'out', 'as', 'an', 'intra-office', 'memorandum', 'to', 'all', 'employee', 'by', 'this', 'afternoon', '.', 'Are', 'you', 'ready', '?'], ['#', 'Person2', '#', ':', 'Sir', ',', 'do', 'this', 'apply', 'to', 'intra-office', 'communication', 'only', '?', 'Or', 'will', 'it', 'also', 'restrict', 'external', 'communication', '?', '#', 'Person1', '#', ':', 'It', 'should', 'apply', 'to', 'all', 'communication', ',', 'not', 'only', 'in', 'this', 'office', 'between', 'employee', ',', 'but', 'also', 'any', 'outside', 'communication', '.']]


In [102]:
original_tokens = []

In [103]:
for i in range(len(data['article'])):
    # print(data['article'][i])
    original_tokens.append(simple_tokenize(data['article'][i]))

In [104]:
# data['lemmatize'] = data['article'].apply(lambda x: lemmatize_text(x))

In [105]:
# lemmatized_tokens = lemmatize_text('#Person1#: Ms. Dawson, I need you to take a dictation for me.\n #Person2#: Yes, sir...\n #Person1#: This should go out as an intra-office memorandum to all employees by this afternoon. Are you ready?\n')

In [106]:
for i in range(len(lemmatized_tokens)):
    print(i)
    break

0


In [107]:
original_list

['#',
 'Person2',
 '#',
 ':',
 'Sir',
 ',',
 'does',
 'this',
 'apply',
 'to',
 'intra-office',
 '[TAG]communications[TAG]',
 'only',
 '?',
 'Or',
 'will',
 'it',
 'also',
 'restrict',
 'external',
 '[TAG]communications[TAG]',
 '?',
 '#',
 'Person1',
 '#',
 ':',
 'It',
 'should',
 'apply',
 'to',
 'all',
 '[TAG]communications[TAG]',
 ',',
 'not',
 'only',
 'in',
 'this',
 'office',
 'between',
 'employees',
 ',',
 'but',
 'also',
 'any',
 'outside',
 '[TAG]communications[TAG]',
 '.']

In [108]:
tagged_tokens = []

In [109]:
for i in range(len(lemmatized_tokens)):
    if data['topic'][i] not in data['article'][i]:
        raise ValueError("Topic " + str(assigned_topics[i]) + "is not included in topic file.")

    # Extract all the seed words according to the corresponding topic
    token_topics = data['topic'][i]
    print(token_topics)
    original_list = original_tokens[i]

    for j, token in enumerate(lemmatized_tokens[i]):
        # If the lemmatized form of the token is in topic seeds, tag the original token
        if token.lower() in token_topics:
            print(token.lower())
            original_list[j] = '[TAG]' + original_list[j] + '[TAG]'

    tagged_tokens.append(" ".join(original_list))

intra-office
i
a
intra-office
communications
communication
communication
communication
communication


In [110]:
tagged_tokens

['# Person1 # : Ms. Dawson , [TAG]I[TAG] need you to take [TAG]a[TAG] dictation for me . # Person2 # : Yes , sir ... # Person1 # : This should go out as an [TAG]intra-office[TAG] memorandum to all employees by this afternoon . Are you ready ?',
 '# Person2 # : Sir , does this apply to intra-office [TAG]communications[TAG] only ? Or will it also restrict external [TAG]communications[TAG] ? # Person1 # : It should apply to all [TAG]communications[TAG] , not only in this office between employees , but also any outside [TAG]communications[TAG] .']