In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import string
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader

In [2]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
df = pd.read_csv('/kaggle/input/economic-documents/english_economic_docs.csv')
df = df.dropna(subset=['text'])

def preprocess_text(text):
    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\r', '', text)
    text = text.strip()
    return text

df['cleaned_text'] = df['text'].apply(preprocess_text)
corpus = df['cleaned_text'].tolist()
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
words = vectorizer.get_feature_names_out()

df_tfidf = pd.DataFrame(X.T.toarray(), index=words, columns=[f"Doc {i+1}" for i in range(len(corpus))])

df_tfidf

Unnamed: 0,Doc 1,Doc 2,Doc 3,Doc 4,Doc 5,Doc 6,Doc 7,Doc 8,Doc 9,Doc 10,...,Doc 1386,Doc 1387,Doc 1388,Doc 1389,Doc 1390,Doc 1391,Doc 1392,Doc 1393,Doc 1394,Doc 1395
00,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0
000,0.0,0.0,0.018049,0.0,0.0,0.0,0.000963,0.0,0.009711,0.006896,...,0.0,0.038235,0.007421,0.0,0.0,0.0,0.012977,0.0,0.0184,0.0
0001,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0
0002,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0
000800,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
청산리방법,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0
한국은행,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0
햄연지,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0
현지지도,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0


In [4]:
top_words_list = []

for doc_index in range(len(corpus)):
    important_words = df_tfidf.iloc[:, doc_index]
    vocab_size = (important_words > 0).sum()
    
    N = round(vocab_size * 0.25)
    
    top_important_words = important_words.sort_values(ascending=False).head(N)
    
    rounded_scores = [round(score, 4) for score in top_important_words.values.tolist()]
    
    top_words_list.append({
        "Document": f"Doc {doc_index + 1}",
        "Vocabulary Size": vocab_size,
        "N": N,
        "Top N Words": top_important_words.index.tolist(),
        "TF-IDF Scores": rounded_scores
    })

top_words_df = pd.DataFrame(top_words_list)

top_words_df

Unnamed: 0,Document,Vocabulary Size,N,Top N Words,TF-IDF Scores
0,Doc 1,156,39,"[university, hcm, vietnam, of, vnu, law, and, ...","[0.3857, 0.291, 0.268, 0.2576, 0.2256, 0.2085,..."
1,Doc 2,1102,276,"[the, buyout, of, buyouts, leveraged, equity, ...","[0.4737, 0.3434, 0.2894, 0.2842, 0.243, 0.2122..."
2,Doc 3,1836,459,"[the, scammer, victim, scam, to, scammers, sca...","[0.4544, 0.3612, 0.3058, 0.2653, 0.2137, 0.199..."
3,Doc 4,451,113,"[the, company, traded, publicly, of, public, s...","[0.4355, 0.3394, 0.2757, 0.2141, 0.1949, 0.179..."
4,Doc 5,99,25,"[vnpt, the, vietnam, vinasat, vietnamese, of, ...","[0.7007, 0.2143, 0.214, 0.2116, 0.1568, 0.1288..."
...,...,...,...,...,...
1390,Doc 1391,227,57,"[businesswomen, she, businesswoman, asia, in, ...","[0.3283, 0.3239, 0.3107, 0.2886, 0.2529, 0.248..."
1391,Doc 1392,723,181,"[reclaimed, reclamation, the, land, of, sq, mi...","[0.373, 0.3409, 0.3313, 0.3272, 0.2309, 0.1613..."
1392,Doc 1393,78,20,"[win, game, scenario, longmans, lose, follett,...","[0.6673, 0.3668, 0.2591, 0.1506, 0.1467, 0.136..."
1393,Doc 1394,1658,414,"[madoff, the, of, he, his, and, to, in, was, f...","[0.8426, 0.2317, 0.1277, 0.1277, 0.1267, 0.125..."


In [5]:
labeled_sentences = []

for doc_index in range(len(corpus)):
    document = corpus[doc_index]
    
    sentences = document.split('.')

    for sentence in sentences:
        if sentence == []: continue
        tokens = sentence.split()
        
        top_important_words = top_words_df.iloc[doc_index]["Top N Words"]
        top_important_score = top_words_df.iloc[doc_index]["TF-IDF Scores"]
        
        sentence_labels = []
        for token in tokens:
            token_lower = token.lower()

            if token_lower in stop_words:
                sentence_labels.append(0)
            elif token_lower in top_important_words:
                token_index = top_important_words.index(token_lower)
                sentence_labels.append(top_important_score[token_index])
            else:
                sentence_labels.append(0)
        
        labeled_sentences.append({
            "text": sentence,
            "tokens": tokens,
            "labels_not_process": sentence_labels
        })

labeled_df = pd.DataFrame(labeled_sentences)
labeled_df = labeled_df[labeled_df['tokens'].apply(lambda x: x != [])]
labeled_df

Unnamed: 0,text,tokens,labels_not_process
0,English The University of Economics and Law (U...,"[English, The, University, of, Economics, and,...","[0, 0, 0.3857, 0, 0.1433, 0, 0.2085, 0, 0, 0.0..."
1,It is a member institution of university of V...,"[It, is, a, member, institution, of, universit...","[0, 0, 0, 0, 0.1081, 0, 0.3857, 0, 0.268, 0, 0..."
2,It was previously known as VNU-HCM Faculty of...,"[It, was, previously, known, as, VNU-HCM, Facu...","[0, 0, 0, 0, 0, 0, 0, 0, 0.1433, 0, 0, 0, 0, 0..."
3,To be able to meet the demands of socio-econo...,"[To, be, able, to, meet, the, demands, of, soc...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.268, 0, 0,..."
4,As one of the renowned and well-known univers...,"[As, one, of, the, renowned, and, well-known, ...","[0, 0, 0, 0, 0, 0, 0, 0.0847, 0, 0, 0, 0.3857,..."
...,...,...,...
168051,He had two sons and a daughter from an earlie...,"[He, had, two, sons, and, a, daughter, from, a...","[0, 0, 0, 0.0283, 0, 0, 0, 0, 0, 0, 0]"
168052,One of his sons had predeceased him in a car ...,"[One, of, his, sons, had, predeceased, him, in...","[0, 0, 0, 0.0283, 0, 0.0225, 0, 0, 0, 0, 0]"
168053,"Mundell died on April 4, 2021, from cholangio...","[Mundell, died, on, April, 4,, 2021,, from, ch...","[0.5927, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.02..."
168054,He was aged 89,"[He, was, aged, 89]","[0, 0, 0, 0]"


In [6]:
def softmax_with_zeros(x):
    non_zero_values = [xi for xi in x if xi != 0]
    if not non_zero_values: return x
    softmax_values = np.exp(non_zero_values - np.max(non_zero_values))
    softmax_values = softmax_values / softmax_values.sum()
    result = []
    non_zero_idx = 0
    for xi in x:
        if xi != 0:
            result.append(softmax_values[non_zero_idx])
            non_zero_idx += 1
        else:
            result.append(0)
    return result

labeled_df['labels'] = labeled_df['labels_not_process'].apply(lambda x: softmax_with_zeros(x))
labeled_df[['tokens', 'labels_not_process', 'labels']]

Unnamed: 0,tokens,labels_not_process,labels
0,"[English, The, University, of, Economics, and,...","[0, 0, 0.3857, 0, 0.1433, 0, 0.2085, 0, 0, 0.0...","[0, 0, 0.05456938432184652, 0, 0.0428228996809..."
1,"[It, is, a, member, institution, of, universit...","[0, 0, 0, 0, 0.1081, 0, 0.3857, 0, 0.268, 0, 0...","[0, 0, 0, 0, 0.1533325610874464, 0, 0.20239255..."
2,"[It, was, previously, known, as, VNU-HCM, Facu...","[0, 0, 0, 0, 0, 0, 0, 0, 0.1433, 0, 0, 0, 0, 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0.43969499349347674, ..."
3,"[To, be, able, to, meet, the, demands, of, soc...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.268, 0, 0,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.1506442652..."
4,"[As, one, of, the, renowned, and, well-known, ...","[0, 0, 0, 0, 0, 0, 0, 0.0847, 0, 0, 0, 0.3857,...","[0, 0, 0, 0, 0, 0, 0, 0.11410845714254773, 0, ..."
...,...,...,...
168051,"[He, had, two, sons, and, a, daughter, from, a...","[0, 0, 0, 0.0283, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0, 0]"
168052,"[One, of, his, sons, had, predeceased, him, in...","[0, 0, 0, 0.0283, 0, 0.0225, 0, 0, 0, 0, 0]","[0, 0, 0, 0.5014499959351804, 0, 0.49855000406..."
168053,"[Mundell, died, on, April, 4,, 2021,, from, ch...","[0.5927, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.02...","[0.4693018645628851, 0, 0, 0, 0, 0, 0, 0, 0, 0..."
168054,"[He, was, aged, 89]","[0, 0, 0, 0]","[0, 0, 0, 0]"


In [7]:
from transformers import AutoTokenizer, AutoModel, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import torch

In [8]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

class BERTWeighted(nn.Module):
    def __init__(self, pretrained_model_name='bert-base-uncased'):
        super(BERTWeighted, self).__init__()
        self.bert = AutoModel.from_pretrained(pretrained_model_name)
        self.linear = nn.Linear(self.bert.config.hidden_size, 1)
        
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state
        word_weights = self.linear(embeddings).squeeze(-1)
        word_weights = word_weights.masked_fill(attention_mask == 0, float('-inf'))
        batch_size, seq_len = attention_mask.shape
        for i in range(batch_size):
            mask_indices = attention_mask[i].nonzero(as_tuple=True)[0]
            first_pos = mask_indices[0]
            last_pos = mask_indices[-1]
            word_weights[i, first_pos] = float('-inf')
            word_weights[i, last_pos] = float('-inf')
        word_weights = torch.nn.functional.softmax(word_weights, dim=-1)
        return word_weights

model = BERTWeighted('bert-base-uncased')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERTWeighted(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_

In [9]:
data = []

for index, row in labeled_df.iterrows():
    data.append({
        'tokens': row['tokens'],
        'labels': row['labels']
    })

dataset = Dataset.from_pandas(labeled_df)
split_dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

In [10]:
def tokenize_and_align_labels(row):
    tokenized_inputs = tokenizer(row['tokens'], padding='max_length', max_length=128, truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.word_ids()
    
    labels = row['labels']
    aligned_labels = []
    
    for i, word_id in enumerate(word_ids):
        if word_id is None:
            aligned_labels.append(0)
        else:
            aligned_labels.append(labels[word_id])

    if len(aligned_labels) > 128:
        aligned_labels = aligned_labels[:128]
    elif len(aligned_labels) < 128:
        aligned_labels = aligned_labels + [0] * (128 - len(aligned_labels))
    
    tokenized_inputs['labels'] = aligned_labels
    return tokenized_inputs

train_tokenized_dataset = train_dataset.map(tokenize_and_align_labels, batched=False)
train_tokenized_dataset

Map:   0%|          | 0/132277 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'tokens', 'labels_not_process', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 132277
})

In [11]:
eval_tokenized_dataset = eval_dataset.map(tokenize_and_align_labels, batched=False)
eval_tokenized_dataset

Map:   0%|          | 0/33070 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'tokens', 'labels_not_process', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 33070
})

In [12]:
def collate_fn(batch):
    input_ids = torch.tensor([item["input_ids"] for item in batch], dtype=torch.long)
    attention_mask = torch.tensor([item["attention_mask"] for item in batch], dtype=torch.long)
    labels = torch.tensor([item["labels"] for item in batch], dtype=torch.float)
    
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

train_dataloader = DataLoader(train_tokenized_dataset, batch_size=128, collate_fn=collate_fn)
eval_dataloader = DataLoader(eval_tokenized_dataset, batch_size=128, collate_fn=collate_fn)

In [13]:
print("Start training")

model.train()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

try:
    for epoch in range(5):
        total_loss = 0
        for batch in train_dataloader:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].squeeze().to(device)
            attention_mask = batch["attention_mask"].squeeze().to(device)
            labels = batch["labels"].to(device)

            word_weights = model(input_ids, attention_mask)
            epsilon = 1e-8
            word_weights = word_weights + epsilon
            loss = F.kl_div(torch.log(word_weights), labels, reduction='batchmean')

            loss.backward()
            optimizer.step()

            total_loss += loss.item()
        avg_loss = total_loss / len(train_dataloader)
        print(f"Epoch {epoch + 1}, Loss: {avg_loss}")
    
except Exception as e:
    print(f"An error occurred: {e}")

finally:
    torch.save(model.state_dict(), "/kaggle/working/roberta_token_weight.pth")
    print("Model weights saved.")

Start training
Epoch 1, Loss: 0.7878018331712173
Epoch 2, Loss: 0.7019639829141259
Epoch 3, Loss: 0.6826295789251936
Epoch 4, Loss: 0.6658562721839495
Epoch 5, Loss: 0.6486146280855459
Model weights saved.


In [14]:
model.load_state_dict(torch.load("/kaggle/working/roberta_token_weight.pth"))
model.to(device)
model.eval()
    
total_loss = 0.0
with torch.no_grad():
    for batch in eval_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        word_weights = model(input_ids, attention_mask)
        epsilon = 1e-8
        word_weights = word_weights + epsilon
        loss = F.kl_div(torch.log(word_weights), labels, reduction='batchmean')
        total_loss += loss.item()

avg_loss = total_loss / len(eval_dataloader)
print(f"Evaluation Loss: {avg_loss}")

  model.load_state_dict(torch.load("/kaggle/working/roberta_token_weight.pth"))


Evaluation Loss: 0.694877955213937


In [15]:
model.load_state_dict(torch.load("/kaggle/working/roberta_token_weight.pth"))
model.to(device)
model.eval()

test_sentences = [
    "This is the worst university in Vietnam",
    "The price of oil affects many products",
    "People buy goods when they have more money",
    "A job loss can hurt the economy",
    "Banks lend money to help businesses grow",
    "Inflation makes prices go up",
    "Workers earn wages for their labor",
    "Saving money is important for the future",
    "Taxes help pay for government services",
    "When the economy grows, people can find more jobs",
    "Demand for food goes up during holidays"
]

for test_sentence in test_sentences:
    test_tokens = test_sentence.split()
    inputs = tokenizer(test_tokens, truncation=True, padding='max_length', max_length=128, return_tensors="pt", is_split_into_words=True)
    inputs = inputs.to(device)

    with torch.no_grad():
        word_weights = model(inputs['input_ids'], inputs['attention_mask'])

        tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
        
        print(f"Testing Sentence: {test_sentence}")
        predicted_sum = 0
        for token, weight in zip(tokens, word_weights[0].cpu().numpy()):
            if token != '[PAD]':
                print(f"Token: {token}, Prediction: {weight}")
                predicted_sum += weight
        print(f"Sum of Predicted Weights: {predicted_sum}")
        print("\n" + "-"*50 + "\n")

  model.load_state_dict(torch.load("/kaggle/working/roberta_token_weight.pth"))


Testing Sentence: This is the worst university in Vietnam
Token: [CLS], Prediction: 0.0
Token: this, Prediction: 3.1623756058252184e-06
Token: is, Prediction: 2.034253611782333e-06
Token: the, Prediction: 1.7877973732538521e-06
Token: worst, Prediction: 0.07219025492668152
Token: university, Prediction: 0.32257142663002014
Token: in, Prediction: 2.1872153865842847e-06
Token: vietnam, Prediction: 0.6052290797233582
Token: [SEP], Prediction: 0.0
Sum of Predicted Weights: 0.9999999329220373

--------------------------------------------------

Testing Sentence: The price of oil affects many products
Token: [CLS], Prediction: 0.0
Token: the, Prediction: 1.343196231573529e-06
Token: price, Prediction: 0.21979159116744995
Token: of, Prediction: 1.6111100649141008e-06
Token: oil, Prediction: 0.3882220387458801
Token: affects, Prediction: 0.025909261777997017
Token: many, Prediction: 0.16761015355587006
Token: products, Prediction: 0.1984640210866928
Token: [SEP], Prediction: 0.0
Sum of Predict