In [None]:
!pip install transformers[torch] datasets nltk



In [None]:
import json
import pandas as pd
import numpy as np
import torch
import re
from datasets import Dataset
from sklearn.metrics import f1_score
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from nltk.corpus import wordnet
import nltk
from nltk.tokenize import word_tokenize
import random

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
classes = ['CE', 'ENV', 'BME', 'PE', 'METAL', 'ME', 'EE', 'CPE', 'OPTIC', 'NANO', 'CHE',
           'MATENG', 'AGRI', 'EDU', 'IE', 'SAFETY', 'MATH', 'MATSCI']

mlb = MultiLabelBinarizer(classes=classes)

In [None]:
with open('data/train_for_student.json', 'r', encoding='utf-8') as f:
    train_for_student = json.load(f)


In [None]:
# Improved text preprocessing
def clean_text(text):
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\w\s\.\-\,]', '', text)
    return text.lower().strip()

# Enhanced data augmentation techniques
def synonym_replacement(words, n):
    new_words = words.copy()
    random_word_list = list(set([word for word in words if word.isalnum()]))
    random.shuffle(random_word_list)
    num_replaced = 0
    for random_word in random_word_list:
        synonyms = []
        for syn in wordnet.synsets(random_word):
            for l in syn.lemmas():
                synonyms.append(l.name())
        if len(synonyms) >= 1:
            synonym = random.choice(list(set(synonyms)))
            new_words = [synonym if word == random_word else word for word in new_words]
            num_replaced += 1
        if num_replaced >= n:
            break
    return new_words

def random_insertion(words, n):
    new_words = words.copy()
    for _ in range(n):
        add_word(new_words)
    return new_words

def add_word(new_words):
    synonyms = []
    counter = 0
    while len(synonyms) < 1:
        random_word = new_words[random.randint(0, len(new_words)-1)]
        synonyms = get_synonyms(random_word)
        counter += 1
        if counter >= 10:
            return
    random_synonym = synonyms[0]
    random_idx = random.randint(0, len(new_words)-1)
    new_words.insert(random_idx, random_synonym)

def get_synonyms(word):
    synonyms = set()
    for syn in wordnet.synsets(word):
        for l in syn.lemmas():
            synonym = l.name().replace("_", " ").replace("-", " ").lower()
            synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
            synonyms.add(synonym)
    if word in synonyms:
        synonyms.remove(word)
    return list(synonyms)

def random_swap(words, n):
    new_words = words.copy()
    for _ in range(n):
        new_words = swap_word(new_words)
    return new_words

def swap_word(new_words):
    random_idx_1 = random.randint(0, len(new_words)-1)
    random_idx_2 = random_idx_1
    counter = 0
    while random_idx_2 == random_idx_1:
        random_idx_2 = random.randint(0, len(new_words)-1)
        counter += 1
        if counter > 3:
            return new_words
    new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
    return new_words

def random_deletion(words, p):
    if len(words) == 1:
        return words
    new_words = []
    for word in words:
        r = random.uniform(0, 1)
        if r > p:
            new_words.append(word)
    if len(new_words) == 0:
        rand_int = random.randint(0, len(words)-1)
        return [words[rand_int]]
    return new_words

def eda(sentence, alpha_sr=0.1, alpha_ri=0.1, alpha_rs=0.1, p_rd=0.1, num_aug=9):
    words = word_tokenize(sentence)
    words = [word for word in words if word is not None]
    num_words = len(words)

    augmented_sentences = []
    num_new_per_technique = int(num_aug/4)+1

    #sr
    for _ in range(num_new_per_technique):
        n_sr = max(1, int(alpha_sr*num_words))
        a_words = synonym_replacement(words, n_sr)
        augmented_sentences.append(' '.join(a_words))

    #ri
    for _ in range(num_new_per_technique):
        n_ri = max(1, int(alpha_ri*num_words))
        a_words = random_insertion(words, n_ri)
        augmented_sentences.append(' '.join(a_words))

    #rs
    for _ in range(num_new_per_technique):
        n_rs = max(1, int(alpha_rs*num_words))
        a_words = random_swap(words, n_rs)
        augmented_sentences.append(' '.join(a_words))

    #rd
    for _ in range(num_new_per_technique):
        a_words = random_deletion(words, p_rd)
        augmented_sentences.append(' '.join(a_words))

    augmented_sentences = [sentence for sentence in augmented_sentences if len(sentence) > 10]
    random.shuffle(augmented_sentences)

    if num_aug >= 1:
        augmented_sentences = augmented_sentences[:num_aug]
    else:
        keep_prob = num_aug / len(augmented_sentences)
        augmented_sentences = [s for s in augmented_sentences if random.uniform(0, 1) < keep_prob]

    augmented_sentences.append(sentence)
    return augmented_sentences

def augment_text(text, title_length, n_aug=9):
    title = text[:title_length]
    body = text[title_length:]
    augmented_bodies = eda(body, num_aug=n_aug)
    return [f"{title}{aug_body}" for aug_body in augmented_bodies]

def process_data(data, augment=True):
    rows = []
    all_classes = []

    for id, info in data.items():
        title = clean_text(info['Title'])
        abstract = clean_text(info['Abstract'])
        text = f"{title} [SEP] {abstract}"
        title_length = len(title) + 6

        if augment and 'Classes' in info:
            texts = augment_text(text, title_length)
            for aug_text in texts:
                rows.append({"text": aug_text})
                all_classes.append(info['Classes'])
        else:
            rows.append({"text": text})
            if 'Classes' in info:
                all_classes.append(info['Classes'])

    df = pd.DataFrame(rows)
    if all_classes:
        labels = mlb.fit_transform(all_classes)
        df['labels'] = [label.tolist() for label in labels]
    return df

In [None]:
train_df = process_data(train_for_student)

train_df.head(5)

Unnamed: 0,text,labels
0,activated carbon derived from bacterial cellul...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
1,activated carbon derived from bacterial cellul...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
2,activated carbon derived from bacterial cellul...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
3,activated carbon derived from bacterial cellul...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
4,activated carbon derived from bacterial cellul...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."


In [None]:
train_data, test_data = train_test_split(train_df, test_size=0.4, random_state=42)
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)


hg_train_data = Dataset.from_pandas(train_data)
hg_val_data = Dataset.from_pandas(val_data)
hg_test_data = Dataset.from_pandas(test_data)

print(hg_train_data)

Dataset({
    features: ['text', 'labels', '__index_level_0__'],
    num_rows: 2724
})


In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_dataset(data):
    tokenized_data = tokenizer(
        data['text'],
        truncation=True,
        padding='max_length',
        max_length=512,
    )
    return tokenized_data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [None]:
dataset_train = hg_train_data.map(tokenize_dataset, batched=True)
dataset_val = hg_val_data.map(tokenize_dataset, batched=True)
dataset_test = hg_test_data.map(tokenize_dataset, batched=True)

print(dataset_train[0])

Map:   0%|          | 0/2724 [00:00<?, ? examples/s]

Map:   0%|          | 0/908 [00:00<?, ? examples/s]

Map:   0%|          | 0/908 [00:00<?, ? examples/s]

{'text': 'final electrical test process enhancement for integrated circuits [SEP] 2019 ieee.this research was conducted in a manufacturing company producing integrated circuits for global customers. the purpose of this research was to identify ways to reduce invalid defects from the contact open test in the final test process. a cause and effect diagram was used to identify possible causes of problems. after identifying the root causes, they were prioritised by applying the pareto concept to failure mode and effects analysis fmea to identify the causes with the most significant impact. the ahp analytic hierarchy process method was then used to select the most appropriate machine while design of experiment doe was carried out to find the significant factors and optimal parameter settings. in addition, the control plan and standard operating procedure were developed to eliminate other defects. based on the result, this research was a successful solution for the company since it reduced t

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(classes),
    problem_type='multi_label_classification'
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Optimizing thresholds for each label
def optimize_thresholds(predictions, labels, num_points=100):
    best_thresholds = []

    for i in range(predictions.shape[1]):
        best_f1 = 0
        best_threshold = 0.5

        # Get predictions for current class
        class_predictions = predictions[:, i]
        sorted_preds = np.sort(class_predictions)

        # Try different threshold values
        for percentile in range(1, num_points):
            threshold = sorted_preds[int(len(sorted_preds) * (percentile/num_points))]
            binary_predictions = (class_predictions >= threshold).astype(int)
            f1 = f1_score(labels[:, i], binary_predictions)

            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold

        best_thresholds.append(best_threshold)

    return best_thresholds

# Custom compute_metrics function
def compute_metrics(p):
    logits = p.predictions
    labels = p.label_ids
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.tensor(logits))

    # Optimize thresholds
    thresholds = optimize_thresholds(probs.numpy(), labels)
    predictions = np.array([(probs[:, i] > thresholds[i]).numpy().astype(int) for i in range(len(thresholds))]).T

    # Calculate F1 score
    f1 = f1_score(y_true=labels, y_pred=predictions, average='macro')
    return {
        'macro_f1': f1,
        'thresholds': thresholds
    }


In [None]:
class MultilabelTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.label_weights = self.compute_class_weights()

    def compute_class_weights(self):
        labels = np.array(self.train_dataset['labels'])
        pos_counts = np.sum(labels, axis=0)
        neg_counts = len(labels) - pos_counts
        weights = np.sqrt(neg_counts / (pos_counts + 1))  # Square root smoothing
        return torch.FloatTensor(weights).to(self.args.device)

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Improved focal loss with dynamic alpha
        alpha = 0.75  # Increased focus on positive samples
        gamma = 2.0

        bce_loss = torch.nn.BCEWithLogitsLoss(reduction='none',
                                             pos_weight=self.label_weights)
        base_loss = bce_loss(logits, labels.float())

        probs = torch.sigmoid(logits)
        p_t = probs * labels + (1 - probs) * (1 - labels)
        focal_weight = ((1 - p_t) ** gamma) * alpha

        loss = (focal_weight * base_loss).mean()

        return (loss, outputs) if return_outputs else loss

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=2e-5,
    num_train_epochs=20,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',
    greater_is_better=True,
    fp16=True,
    gradient_checkpointing=True,
    save_total_limit=2,
    seed=42
)

trainer = MultilabelTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    data_collator=DataCollatorWithPadding(tokenizer)
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
trainer.train()

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Macro F1,Thresholds
100,0.1375,0.123189,0.383564,"[0.4634745717048645, 0.4339214563369751, 0.41063809394836426, 0.4657519459724426, 0.43452128767967224, 0.3993430435657501, 0.42811381816864014, 0.4672406017780304, 0.4391760230064392, 0.35353976488113403, 0.49486562609672546, 0.43590155243873596, 0.5364646911621094, 0.4221472442150116, 0.4323030412197113, 0.35667070746421814, 0.4545024335384369, 0.4797626733779907]"
200,0.0995,0.0884,0.640592,"[0.5439535975456238, 0.4536249041557312, 0.46566084027290344, 0.5270883440971375, 0.4883520007133484, 0.44984573125839233, 0.45489591360092163, 0.4888705611228943, 0.48239865899086, 0.5006413459777832, 0.4646890163421631, 0.49806973338127136, 0.5344606637954712, 0.4965553879737854, 0.49237120151519775, 0.441040962934494, 0.5127002000808716, 0.5115107297897339]"
300,0.0723,0.064804,0.780781,"[0.5024795532226562, 0.5708099007606506, 0.45471426844596863, 0.5590493679046631, 0.5370413661003113, 0.4681827425956726, 0.4696570634841919, 0.5360700488090515, 0.4532316029071808, 0.5401659607887268, 0.5038318037986755, 0.5091542601585388, 0.4821700155735016, 0.509512722492218, 0.5082084536552429, 0.5329720973968506, 0.5080788135528564, 0.49108219146728516]"
400,0.051,0.044086,0.895936,"[0.4893280863761902, 0.48677369952201843, 0.5279858708381653, 0.4904949367046356, 0.5087195038795471, 0.5205115675926208, 0.4564097821712494, 0.4656001031398773, 0.5857529640197754, 0.6500231027603149, 0.5176928043365479, 0.5212122201919556, 0.5527207851409912, 0.5527207851409912, 0.5101075768470764, 0.6173973083496094, 0.5432570576667786, 0.5166107416152954]"
500,0.035,0.030036,0.950676,"[0.587765634059906, 0.49163898825645447, 0.43236297369003296, 0.5427724123001099, 0.5211817622184753, 0.5135388374328613, 0.5106948018074036, 0.5331239700317383, 0.5876473188400269, 0.4678179919719696, 0.5004594326019287, 0.5748125314712524, 0.5215929746627808, 0.48136240243911743, 0.5051267743110657, 0.6808785796165466, 0.5611246228218079, 0.5211055874824524]"
600,0.0245,0.02102,0.971314,"[0.6059739589691162, 0.4885731637477875, 0.37903353571891785, 0.5450435876846313, 0.5856937170028687, 0.5333366990089417, 0.5624769330024719, 0.4880470335483551, 0.6428809762001038, 0.4157288372516632, 0.4929966330528259, 0.6076632142066956, 0.5399536490440369, 0.4937174916267395, 0.47158825397491455, 0.6875250935554504, 0.5973730087280273, 0.5139352679252625]"
700,0.0171,0.015202,0.978604,"[0.64029860496521, 0.4701283872127533, 0.3954259753227234, 0.5933147668838501, 0.6791787147521973, 0.47344422340393066, 0.45513808727264404, 0.5439233779907227, 0.6677979230880737, 0.4245312809944153, 0.5153226852416992, 0.6304574012756348, 0.4978199005126953, 0.5798763036727905, 0.5054090023040771, 0.6529056429862976, 0.6078377962112427, 0.421016126871109]"
800,0.0135,0.011171,0.981182,"[0.6784334778785706, 0.4887408912181854, 0.39367637038230896, 0.642432451248169, 0.6807724833488464, 0.5136913061141968, 0.5139886736869812, 0.42375609278678894, 0.642993152141571, 0.3939094841480255, 0.5310877561569214, 0.6417592167854309, 0.4479130506515503, 0.5945512652397156, 0.4353613257408142, 0.7292305827140808, 0.512174129486084, 0.46550899744033813]"
900,0.0103,0.00898,0.98297,"[0.6951311230659485, 0.3752484917640686, 0.4028031826019287, 0.6659538745880127, 0.7171109318733215, 0.4208376109600067, 0.530084490776062, 0.4250681698322296, 0.704871654510498, 0.40874868631362915, 0.5028151869773865, 0.6553360223770142, 0.40945693850517273, 0.6405234336853027, 0.4374629855155945, 0.7057850360870361, 0.563978374004364, 0.5115259885787964]"
1000,0.0085,0.007394,0.98297,"[0.7516481280326843, 0.37330448627471924, 0.36285629868507385, 0.6555565595626831, 0.7115311622619629, 0.34687307476997375, 0.5619661808013916, 0.35231319069862366, 0.6981236934661865, 0.3589146137237549, 0.47488993406295776, 0.7002802491188049, 0.4104018211364746, 0.5994850993156433, 0.3809892237186432, 0.7371581792831421, 0.5687152743339539, 0.41353633999824524]"


Trainer is attempting to log a value of "[0.4634745717048645, 0.4339214563369751, 0.41063809394836426, 0.4657519459724426, 0.43452128767967224, 0.3993430435657501, 0.42811381816864014, 0.4672406017780304, 0.4391760230064392, 0.35353976488113403, 0.49486562609672546, 0.43590155243873596, 0.5364646911621094, 0.4221472442150116, 0.4323030412197113, 0.35667070746421814, 0.4545024335384369, 0.4797626733779907]" of type <class 'list'> for key "eval/thresholds" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Trainer is attempting to log a value of "[0.5439535975456238, 0.4536249041557312, 0.46566084027290344, 0.5270883440971375, 0.4883520007133484, 0.44984573125839233, 0.45489591360092163, 0.4888705611228943, 0.48239865899086, 0.5006413459777832, 0.4646890163421631, 0.49806973

TrainOutput(global_step=1200, training_loss=0.043947543328007065, metrics={'train_runtime': 1542.2914, 'train_samples_per_second': 52.986, 'train_steps_per_second': 1.653, 'total_flos': 1.0060706957377536e+16, 'train_loss': 0.043947543328007065, 'epoch': 14.035087719298245})

In [None]:
optimized_thresholds = [0.61925291046766, 0.44774172613880536, 0.41414180606778695,
                        0.57874606204562, 0.60049749592320619, 0.45082401185132921,
                        0.53109708623883023, 0.45403000155140193, 0.60902179300296211,
                        0.42565307019191844, 0.50018945232073527, 0.59107437696539019,
                        0.47245547015847508, 0.55197602234063902, 0.46691181553592994,
                        0.62757920445700602, 0.52659046101203611, 0.48735158959652163]
test_predictions = trainer.predict(dataset_test)

test_predictions

PredictionOutput(predictions=array([[-1.796875  , -2.5273438 , -1.2998047 , ..., -1.5449219 ,
         1.7285156 , -1.59375   ],
       [-1.6074219 , -1.9453125 ,  1.8339844 , ..., -1.6660156 ,
         1.3505859 , -1.3964844 ],
       [-1.6074219 , -2.3769531 , -1.7666016 , ..., -1.3710938 ,
         1.6025391 , -1.9404297 ],
       ...,
       [-1.7392578 , -0.95458984,  0.9604492 , ..., -1.0488281 ,
        -1.0253906 ,  2.1230469 ],
       [-1.6787109 , -1.78125   , -2.234375  , ..., -2.5761719 ,
        -1.4150391 , -1.1035156 ],
       [-1.2773438 , -0.7597656 , -1.2978516 , ..., -1.6445312 ,
        -1.7451172 , -0.4633789 ]], dtype=float32), label_ids=array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 1, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]]), metrics={'test_loss': 0.008786835707724094, 'test_macro_f1': 0.9861547605171077, 'test_thresholds': [0.3970029056072235, 0.643

In [None]:
# Apply sigmoid to the logits to get probabilities
test_probabilities = torch.sigmoid(torch.tensor(test_predictions.predictions)).numpy()

# Apply the thresholds to get binary predictions
binary_predictions = np.zeros(test_probabilities.shape)
for i in range(len(classes)):  # Iterate over the classes
    binary_predictions[:, i] = (test_probabilities[:, i] >= optimized_thresholds[i]).astype(int)

# Extract the true labels
true_labels = test_predictions.label_ids

In [None]:
# Trainer evaluate
trainer.evaluate(dataset_test)

Trainer is attempting to log a value of "[0.3970029056072235, 0.6433292627334595, 0.39916738867759705, 0.40910276770591736, 0.4239349663257599, 0.4163219630718231, 0.35711899399757385, 0.4184592366218567, 0.698535144329071, 0.7848142385482788, 0.5111141800880432, 0.6610482335090637, 0.7259401679039001, 0.5082160830497742, 0.4765948951244354, 0.3432316780090332, 0.6710395812988281, 0.5670377612113953]" of type <class 'list'> for key "eval/thresholds" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.008786835707724094,
 'eval_macro_f1': 0.9861547605171077,
 'eval_thresholds': [0.3970029056072235,
  0.6433292627334595,
  0.39916738867759705,
  0.40910276770591736,
  0.4239349663257599,
  0.4163219630718231,
  0.35711899399757385,
  0.4184592366218567,
  0.698535144329071,
  0.7848142385482788,
  0.5111141800880432,
  0.6610482335090637,
  0.7259401679039001,
  0.5082160830497742,
  0.4765948951244354,
  0.3432316780090332,
  0.6710395812988281,
  0.5670377612113953],
 'eval_runtime': 12.6484,
 'eval_samples_per_second': 71.788,
 'eval_steps_per_second': 4.506,
 'epoch': 14.035087719298245}

In [None]:
# Compute f1 metric
final_f1 = f1_score(true_labels, binary_predictions, average='macro')

print("\nFinal Evaluation:")
print(f"Macro F1 Score: {final_f1}")


Final Evaluation:
Macro F1 Score: 0.9940481326918496


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
with open('data/test_for_student.json', 'r', encoding='utf-8') as f:
    test_for_student = json.load(f)

In [None]:
def process_test_data(data):
    texts = []
    ids = []
    for id, info in data.items():
        title = clean_text(info['Title'])
        abstract = clean_text(info['Abstract'])
        text = f"{title} [SEP] {abstract}"
        texts.append(text)
        ids.append(id)
    return texts, ids

In [None]:
test_texts, test_ids = process_test_data(test_for_student)

In [None]:
# Tokenize the test data
inputs = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")
inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to the same devic

In [None]:
# Make predictions using the model
model.to(device)
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits

# Convert logits to probabilities using sigmoid function
sigmoid = torch.nn.Sigmoid()
test_probabilities = sigmoid(logits).cpu().numpy()

In [None]:
# Create a binary predictions array based on thresholds
binary_predictions = np.zeros(test_probabilities.shape)
for i in range(len(classes)):  # Iterate over each class
    binary_predictions[:, i] = (test_probabilities[:, i] >= optimized_thresholds[i]).astype(int)

In [None]:
output_predictions = []
for i, test_id in enumerate(test_ids):
    row = [test_id] + binary_predictions[i].tolist()
    output_predictions.append(row)

# Convert the output to a DataFrame for easy saving
columns = ["id"] + classes
submission_df = pd.DataFrame(output_predictions, columns=columns)

# Save to a CSV file (optional)
submission_df.to_csv("submission.csv", index=False)
print("Predictions saved to submission.csv")

Predictions saved to submission.csv
