In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import string

In [2]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
df = pd.read_csv('/kaggle/input/economic-documents/english_economic_docs.csv')
df = df.dropna(subset=['text'])

def preprocess_text(text):
    text = re.sub(r'\[\d+\]', '', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\n', ' ', text)
    text = re.sub(r'\r', '', text)
    text = text.strip()
    return text

df['cleaned_text'] = df['text'].apply(preprocess_text)
corpus = df['cleaned_text'].tolist()
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
words = vectorizer.get_feature_names_out()

df_tfidf = pd.DataFrame(X.T.toarray(), index=words, columns=[f"Doc {i+1}" for i in range(len(corpus))])

df_tfidf

Unnamed: 0,Doc 1,Doc 2,Doc 3,Doc 4,Doc 5,Doc 6,Doc 7,Doc 8,Doc 9,Doc 10,...,Doc 1386,Doc 1387,Doc 1388,Doc 1389,Doc 1390,Doc 1391,Doc 1392,Doc 1393,Doc 1394,Doc 1395
00,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0
000,0.0,0.0,0.018049,0.0,0.0,0.0,0.000963,0.0,0.009711,0.006896,...,0.0,0.038235,0.007421,0.0,0.0,0.0,0.012977,0.0,0.0184,0.0
0001,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0
0002,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0
000800,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
청산리방법,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0
한국은행,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0
햄연지,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0
현지지도,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000,0.000000,...,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0000,0.0


In [4]:
top_words_list = []

for doc_index in range(len(corpus)):
    important_words = df_tfidf.iloc[:, doc_index]
    vocab_size = (important_words > 0).sum()
    
    N = round(vocab_size * 0.15)
    
    top_important_words = important_words.sort_values(ascending=False).head(N)
    
    rounded_scores = [round(score, 4) for score in top_important_words.values.tolist()]
    
    top_words_list.append({
        "Document": f"Doc {doc_index + 1}",
        "Vocabulary Size": vocab_size,
        "N": N,
        "Top N Words": top_important_words.index.tolist(),
        "TF-IDF Scores": rounded_scores
    })

top_words_df = pd.DataFrame(top_words_list)

top_words_df

Unnamed: 0,Document,Vocabulary Size,N,Top N Words,TF-IDF Scores
0,Doc 1,156,23,"[university, hcm, vietnam, of, vnu, law, and, ...","[0.3857, 0.291, 0.268, 0.2576, 0.2256, 0.2085,..."
1,Doc 2,1102,165,"[the, buyout, of, buyouts, leveraged, equity, ...","[0.4737, 0.3434, 0.2894, 0.2842, 0.243, 0.2122..."
2,Doc 3,1836,275,"[the, scammer, victim, scam, to, scammers, sca...","[0.4544, 0.3612, 0.3058, 0.2653, 0.2137, 0.199..."
3,Doc 4,451,68,"[the, company, traded, publicly, of, public, s...","[0.4355, 0.3394, 0.2757, 0.2141, 0.1949, 0.179..."
4,Doc 5,99,15,"[vnpt, the, vietnam, vinasat, vietnamese, of, ...","[0.7007, 0.2143, 0.214, 0.2116, 0.1568, 0.1288..."
...,...,...,...,...,...
1390,Doc 1391,227,34,"[businesswomen, she, businesswoman, asia, in, ...","[0.3283, 0.3239, 0.3107, 0.2886, 0.2529, 0.248..."
1391,Doc 1392,723,108,"[reclaimed, reclamation, the, land, of, sq, mi...","[0.373, 0.3409, 0.3313, 0.3272, 0.2309, 0.1613..."
1392,Doc 1393,78,12,"[win, game, scenario, longmans, lose, follett,...","[0.6673, 0.3668, 0.2591, 0.1506, 0.1467, 0.136..."
1393,Doc 1394,1658,249,"[madoff, the, of, he, his, and, to, in, was, f...","[0.8426, 0.2317, 0.1277, 0.1277, 0.1267, 0.125..."


In [5]:
labeled_sentences = []

for doc_index in range(len(corpus)):
    document = corpus[doc_index]
    
    sentences = document.split('.')

    for sentence in sentences:
        if sentence == []: continue
        tokens = sentence.split()
        
        top_important_words = top_words_df.iloc[doc_index]["Top N Words"]
        
        sentence_labels = []
        for token in tokens:
            token_lower = token.lower()

            if token_lower in stop_words:
                sentence_labels.append(0)
            elif token_lower in top_important_words:
                sentence_labels.append(1)
            else:
                sentence_labels.append(0)
        
        labeled_sentences.append({
            "text": sentence,
            "tokens": tokens,
            "labels": sentence_labels
        })

labeled_df = pd.DataFrame(labeled_sentences)
labeled_df

Unnamed: 0,text,tokens,labels
0,English The University of Economics and Law (U...,"[English, The, University, of, Economics, and,...","[0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, ..."
1,It is a member institution of university of V...,"[It, is, a, member, institution, of, universit...","[0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0]"
2,It was previously known as VNU-HCM Faculty of...,"[It, was, previously, known, as, VNU-HCM, Facu...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
3,To be able to meet the demands of socio-econo...,"[To, be, able, to, meet, the, demands, of, soc...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
4,As one of the renowned and well-known univers...,"[As, one, of, the, renowned, and, well-known, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, ..."
...,...,...,...
168051,He had two sons and a daughter from an earlie...,"[He, had, two, sons, and, a, daughter, from, a...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
168052,One of his sons had predeceased him in a car ...,"[One, of, his, sons, had, predeceased, him, in...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
168053,"Mundell died on April 4, 2021, from cholangio...","[Mundell, died, on, April, 4,, 2021,, from, ch...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
168054,He was aged 89,"[He, was, aged, 89]","[0, 0, 0, 0]"


In [6]:
labeled_df = labeled_df[labeled_df['tokens'].apply(lambda x: x != [])]
labeled_df

Unnamed: 0,text,tokens,labels
0,English The University of Economics and Law (U...,"[English, The, University, of, Economics, and,...","[0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, ..."
1,It is a member institution of university of V...,"[It, is, a, member, institution, of, universit...","[0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0]"
2,It was previously known as VNU-HCM Faculty of...,"[It, was, previously, known, as, VNU-HCM, Facu...","[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
3,To be able to meet the demands of socio-econo...,"[To, be, able, to, meet, the, demands, of, soc...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, ..."
4,As one of the renowned and well-known univers...,"[As, one, of, the, renowned, and, well-known, ...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, ..."
...,...,...,...
168051,He had two sons and a daughter from an earlie...,"[He, had, two, sons, and, a, daughter, from, a...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
168052,One of his sons had predeceased him in a car ...,"[One, of, his, sons, had, predeceased, him, in...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]"
168053,"Mundell died on April 4, 2021, from cholangio...","[Mundell, died, on, April, 4,, 2021,, from, ch...","[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
168054,He was aged 89,"[He, was, aged, 89]","[0, 0, 0, 0]"


In [7]:
from transformers import AutoTokenizer, BertForTokenClassification, Trainer, TrainingArguments
from datasets import Dataset, DatasetDict
import torch

In [8]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = BertForTokenClassification.from_pretrained('bert-base-uncased', num_labels=2)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [9]:
data = []

for index, row in labeled_df.iterrows():
    data.append({
        'tokens': row['tokens'],
        'labels': row['labels']
    })

dataset = Dataset.from_pandas(labeled_df)
split_dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=42)
train_dataset = split_dataset['train']
eval_dataset = split_dataset['test']

In [10]:
def tokenize_and_align_labels(row):
    tokenized_inputs = tokenizer(row['tokens'], padding='max_length', max_length=128, truncation=True, is_split_into_words=True)
    word_ids = tokenized_inputs.word_ids()
    
    labels = row['labels']
    aligned_labels = []
    
    for i, word_id in enumerate(word_ids):
        if word_id is None:
            aligned_labels.append(0)
        else:
            aligned_labels.append(labels[word_id])

    if len(aligned_labels) > 128:
        aligned_labels = aligned_labels[:128]
    elif len(aligned_labels) < 128:
        aligned_labels = aligned_labels + [0] * (128 - len(aligned_labels))
    
    tokenized_inputs['labels'] = aligned_labels
    return tokenized_inputs

train_tokenized_dataset = train_dataset.map(tokenize_and_align_labels, batched=False)
train_tokenized_dataset

Map:   0%|          | 0/132277 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'tokens', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 132277
})

In [11]:
eval_tokenized_dataset = eval_dataset.map(tokenize_and_align_labels, batched=False)
eval_tokenized_dataset

Map:   0%|          | 0/33070 [00:00<?, ? examples/s]

Dataset({
    features: ['text', 'tokens', 'labels', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 33070
})

In [12]:
training_args = TrainingArguments(
    output_dir='/kaggle/working/',
    num_train_epochs=5,
    per_device_train_batch_size=128,
    per_device_eval_batch_size=128,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='/kaggle/working/logs',
    report_to='none',
    evaluation_strategy="epoch",
    eval_steps=None,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized_dataset,
    eval_dataset=eval_tokenized_dataset,
)

trainer.train()



Epoch,Training Loss,Validation Loss
1,0.0911,0.044466
2,0.0429,0.042425
3,0.0389,0.041696
4,0.0357,0.042145
5,0.0332,0.043809




TrainOutput(global_step=2585, training_loss=0.04784633324731941, metrics={'train_runtime': 8171.6377, 'train_samples_per_second': 80.937, 'train_steps_per_second': 0.316, 'total_flos': 4.320443886345984e+16, 'train_loss': 0.04784633324731941, 'epoch': 5.0})

In [13]:
test_sentences = [
    "This is the worst university in Vietnam",
    "The price of oil affects many products",
    "People buy goods when they have more money",
    "A job loss can hurt the economy",
    "Banks lend money to help businesses grow",
    "Inflation makes prices go up",
    "Workers earn wages for their labor",
    "Saving money is important for the future",
    "Taxes help pay for government services",
    "When the economy grows, people can find more jobs",
    "Demand for food goes up during holidays"
]

model.eval()
for test_sentence in test_sentences:
    test_tokens = test_sentence.split()
    inputs = tokenizer(test_tokens, padding='max_length', max_length=128, truncation=True, return_tensors="pt", is_split_into_words=True)
    inputs = inputs.to(device)
    
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits

    predictions = torch.argmax(logits, dim=-1)
    predicted_labels = predictions.cpu().numpy()
    
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    print(f"Testing sentence: {test_sentence}")
    for token, label in zip(tokens, predicted_labels[0]):
        if token != '[PAD]': 
            print(f"Token: {token}, Prediction: {label}")
    print("\n" + "-"*50 + "\n")

Testing sentence: This is the worst university in Vietnam
Token: [CLS], Prediction: 0
Token: this, Prediction: 0
Token: is, Prediction: 0
Token: the, Prediction: 0
Token: worst, Prediction: 0
Token: university, Prediction: 1
Token: in, Prediction: 0
Token: vietnam, Prediction: 1
Token: [SEP], Prediction: 0

--------------------------------------------------

Testing sentence: The price of oil affects many products
Token: [CLS], Prediction: 0
Token: the, Prediction: 0
Token: price, Prediction: 1
Token: of, Prediction: 0
Token: oil, Prediction: 1
Token: affects, Prediction: 0
Token: many, Prediction: 1
Token: products, Prediction: 1
Token: [SEP], Prediction: 0

--------------------------------------------------

Testing sentence: People buy goods when they have more money
Token: [CLS], Prediction: 0
Token: people, Prediction: 1
Token: buy, Prediction: 1
Token: goods, Prediction: 1
Token: when, Prediction: 0
Token: they, Prediction: 0
Token: have, Prediction: 0
Token: more, Prediction: 0
