In [None]:
!pip install transformers[torch] datasets



In [None]:
import json
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import re
from datasets import Dataset
from sklearn.metrics import f1_score
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import EarlyStoppingCallback

In [None]:
classes = ['CE', 'ENV', 'BME', 'PE', 'METAL', 'ME', 'EE', 'CPE', 'OPTIC', 'NANO', 'CHE',
           'MATENG', 'AGRI', 'EDU', 'IE', 'SAFETY', 'MATH', 'MATSCI']

mlb = MultiLabelBinarizer(classes=classes)

In [None]:
with open('data/train_for_student.json', 'r', encoding='utf-8') as f:
    train_for_student = json.load(f)


In [None]:
def clean_text(text):
    # Remove special characters except periods
    text = re.sub(r'[^\w\s\.]', ' ', text)
    text = ' '.join(text.split())
    return text

def augment_text(text):
    # Random swap of sentences if there are multiple sentences
    sentences = text.split('.')
    if len(sentences) > 1:
        idx1, idx2 = np.random.randint(0, len(sentences), 2)
        sentences[idx1], sentences[idx2] = sentences[idx2], sentences[idx1]
    return '.'.join(sentences)


def process_data(data):
    rows = []
    all_classes = []

    for id, info in data.items():
        # Original text
        title = clean_text(info['Title'])
        abstract = clean_text(info['Abstract'])
        text = f"{title} [SEP] {abstract}"

        # Augmented text
        if 'Classes' in info:  # Only augment training data
            aug_text = augment_text(text)
            rows.append({"text": aug_text})
            all_classes.append(info['Classes'])

        rows.append({"text": text})
        if 'Classes' in info:
            all_classes.append(info['Classes'])

    df = pd.DataFrame(rows)
    if all_classes:
        one_hot_encoded_labels = mlb.fit_transform(all_classes)
        df['labels'] = [[float(label) for label in labels] for labels in one_hot_encoded_labels]
    return df


In [None]:
train_df = process_data(train_for_student)

train_df.head(5)

Unnamed: 0,text,labels
0,Activated carbon derived from bacterial cellul...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,Activated carbon derived from bacterial cellul...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,The algorithm of static hand gesture recogniti...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
3,The algorithm of static hand gesture recogniti...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, ..."
4,Alternative Redundant Residue Number System Co...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, ..."


In [None]:
train_data, test_data = train_test_split(train_df, test_size=0.4, random_state=42)
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)


hg_train_data = Dataset.from_pandas(train_data)
hg_val_data = Dataset.from_pandas(val_data)
hg_test_data = Dataset.from_pandas(test_data)

print(hg_train_data)

Dataset({
    features: ['text', 'labels', '__index_level_0__'],
    num_rows: 544
})


In [None]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_dataset(data):
    tokenized_data = tokenizer(
        data['text'],
        truncation=True,
        padding='max_length',
        max_length=512,
    )
    return tokenized_data



In [None]:
dataset_train = hg_train_data.map(tokenize_dataset, batched=True)
dataset_val = hg_val_data.map(tokenize_dataset, batched=True)
dataset_test = hg_test_data.map(tokenize_dataset, batched=True)

print(dataset_train[0])

Map:   0%|          | 0/544 [00:00<?, ? examples/s]

Map:   0%|          | 0/182 [00:00<?, ? examples/s]

Map:   0%|          | 0/182 [00:00<?, ? examples/s]

{'text': 'Vibrational Analysis on Patellofemoral Joint Degradation of Swine s Knee [SEP] Published under licence by IOP Publishing Ltd.Knee s severe patellofemoral joint degeneration gives a lot of people difficulty to endure daily life because walking or any leg movement can cause serious pain. At present physical examination x ray and MRI are often used to diagnose the condition however each of the methods has its own disadvantages. For instance physical examination is highly dependent on the skill of the doctor who practiced the examination which cannot avoid misdiagnosing the condition. X ray can detect wounds and tears but the accuracy is also not top notch since the result of an x ray is a 2 dimension picture. MRI is the most reliable method for diagnosing the condition however the cost is very high. We propose that other than x ray and MRI patellofemoral joint degeneration can be identified by analyzing vibrational signals obtained from an accelerometer attached to the patella w

In [None]:
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(classes),
    problem_type='multi_label_classification'
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Optimizing thresholds for each label
def optimize_thresholds(predictions, labels):
    best_thresholds = []
    for i in range(labels.shape[1]):
        best_f1 = 0
        best_threshold = 0.5
        for threshold in np.arange(0.1, 0.9, 0.05):
            preds = (predictions[:, i] > threshold).astype(int)
            f1 = f1_score(labels[:, i], preds, average='binary')
            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold
        best_thresholds.append(best_threshold)
    return best_thresholds

# Custom compute_metrics function
def compute_metrics(p):
    logits = p.predictions
    labels = p.label_ids
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.tensor(logits))

    # Optimize thresholds
    thresholds = optimize_thresholds(probs.numpy(), labels)
    predictions = np.array([(probs[:, i] > thresholds[i]).numpy().astype(int) for i in range(len(thresholds))]).T

    # Calculate F1 score
    f1 = f1_score(y_true=labels, y_pred=predictions, average='macro')
    return {
        'macro_f1': f1,
        'thresholds': thresholds
    }


In [None]:
class MultilabelTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.label_weights = self.compute_class_weights()

    def compute_class_weights(self):
        labels = self.train_dataset['labels']
        pos_counts = np.sum(labels, axis=0)
        neg_counts = len(labels) - pos_counts
        # Improved weight calculation with smoothing
        pos_weights = np.log1p(neg_counts / (pos_counts + 1))
        return torch.FloatTensor(pos_weights).to(self.args.device)

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Focal Loss implementation
        alpha = 0.25
        gamma = 2.0

        # Compute BCE loss
        bce_loss = torch.nn.BCEWithLogitsLoss(reduction='none', pos_weight=self.label_weights)
        base_loss = bce_loss(logits.view(-1, self.model.config.num_labels),
                            labels.float().view(-1, self.model.config.num_labels))

        # Apply focal loss modulation
        probs = torch.sigmoid(logits)
        p_t = probs * labels + (1 - probs) * (1 - labels)
        focal_weight = (1 - p_t) ** gamma
        focal_loss = focal_weight * base_loss

        loss = focal_loss.mean()

        return (loss, outputs) if return_outputs else loss

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-5,  # Lower learning rate
    num_train_epochs=20,  # More epochs with early stopping
    per_device_train_batch_size=8,  # Smaller batch size
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,  # Increased gradient accumulation
    warmup_ratio=0.2,  # Longer warmup
    weight_decay=0.02,  # Increased weight decay
    logging_steps=10,
    evaluation_strategy="epoch",  # More frequent evaluation
    eval_steps=100,
    save_strategy="epoch",
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',
    greater_is_better=True,
    fp16=True,
    gradient_checkpointing=True,
    save_total_limit=2,
    seed=42,
    # Additional parameters
    warmup_steps=200,
    adam_epsilon=1e-8,
    max_grad_norm=0.5,  # Reduced gradient clipping
    label_smoothing_factor=0.1  # Label smoothing
)

trainer = MultilabelTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
    data_collator=DataCollatorWithPadding(tokenizer)
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [None]:
trainer.train()

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss,Macro F1,Thresholds
1,0.1029,0.113836,0.588822,"[0.45000000000000007, 0.40000000000000013, 0.40000000000000013, 0.5000000000000001, 0.45000000000000007, 0.45000000000000007, 0.40000000000000013, 0.45000000000000007, 0.3500000000000001, 0.45000000000000007, 0.5000000000000001, 0.5000000000000001, 0.3500000000000001, 0.3500000000000001, 0.45000000000000007, 0.45000000000000007, 0.5000000000000001, 0.5000000000000001]"
2,0.1023,0.113742,0.589414,"[0.45000000000000007, 0.45000000000000007, 0.40000000000000013, 0.5500000000000002, 0.45000000000000007, 0.45000000000000007, 0.40000000000000013, 0.45000000000000007, 0.3500000000000001, 0.45000000000000007, 0.45000000000000007, 0.5000000000000001, 0.40000000000000013, 0.3500000000000001, 0.45000000000000007, 0.5000000000000001, 0.5000000000000001, 0.5000000000000001]"
3,0.098,0.112205,0.588088,"[0.45000000000000007, 0.40000000000000013, 0.40000000000000013, 0.5500000000000002, 0.45000000000000007, 0.40000000000000013, 0.40000000000000013, 0.45000000000000007, 0.3500000000000001, 0.45000000000000007, 0.5000000000000001, 0.5000000000000001, 0.40000000000000013, 0.3500000000000001, 0.5000000000000001, 0.45000000000000007, 0.5000000000000001, 0.5000000000000001]"
4,0.0976,0.110394,0.598487,"[0.45000000000000007, 0.45000000000000007, 0.40000000000000013, 0.5500000000000002, 0.45000000000000007, 0.40000000000000013, 0.40000000000000013, 0.45000000000000007, 0.40000000000000013, 0.45000000000000007, 0.5000000000000001, 0.5000000000000001, 0.40000000000000013, 0.3500000000000001, 0.5000000000000001, 0.45000000000000007, 0.5000000000000001, 0.45000000000000007]"
5,0.0947,0.109411,0.605353,"[0.45000000000000007, 0.40000000000000013, 0.40000000000000013, 0.5500000000000002, 0.45000000000000007, 0.45000000000000007, 0.45000000000000007, 0.45000000000000007, 0.40000000000000013, 0.5000000000000001, 0.5000000000000001, 0.5000000000000001, 0.3500000000000001, 0.40000000000000013, 0.5000000000000001, 0.45000000000000007, 0.5000000000000001, 0.5000000000000001]"
6,0.0951,0.109129,0.616129,"[0.45000000000000007, 0.40000000000000013, 0.45000000000000007, 0.5500000000000002, 0.5000000000000001, 0.45000000000000007, 0.45000000000000007, 0.45000000000000007, 0.40000000000000013, 0.45000000000000007, 0.5000000000000001, 0.5000000000000001, 0.3500000000000001, 0.40000000000000013, 0.5000000000000001, 0.45000000000000007, 0.5000000000000001, 0.5000000000000001]"
7,0.0884,0.107531,0.613976,"[0.45000000000000007, 0.40000000000000013, 0.3500000000000001, 0.5500000000000002, 0.45000000000000007, 0.45000000000000007, 0.40000000000000013, 0.45000000000000007, 0.40000000000000013, 0.5000000000000001, 0.45000000000000007, 0.5000000000000001, 0.3500000000000001, 0.30000000000000004, 0.5000000000000001, 0.40000000000000013, 0.5000000000000001, 0.5000000000000001]"
8,0.0889,0.106182,0.621058,"[0.45000000000000007, 0.45000000000000007, 0.45000000000000007, 0.5500000000000002, 0.5000000000000001, 0.45000000000000007, 0.5000000000000001, 0.5000000000000001, 0.40000000000000013, 0.5500000000000002, 0.45000000000000007, 0.5000000000000001, 0.3500000000000001, 0.30000000000000004, 0.5000000000000001, 0.5000000000000001, 0.5000000000000001, 0.45000000000000007]"
9,0.0854,0.103311,0.649215,"[0.45000000000000007, 0.45000000000000007, 0.3500000000000001, 0.5500000000000002, 0.45000000000000007, 0.45000000000000007, 0.45000000000000007, 0.45000000000000007, 0.40000000000000013, 0.5000000000000001, 0.5000000000000001, 0.5000000000000001, 0.3500000000000001, 0.3500000000000001, 0.5000000000000001, 0.40000000000000013, 0.45000000000000007, 0.5000000000000001]"
10,0.0825,0.102303,0.652208,"[0.45000000000000007, 0.45000000000000007, 0.40000000000000013, 0.5500000000000002, 0.5000000000000001, 0.40000000000000013, 0.45000000000000007, 0.5000000000000001, 0.40000000000000013, 0.45000000000000007, 0.5000000000000001, 0.45000000000000007, 0.40000000000000013, 0.30000000000000004, 0.5000000000000001, 0.40000000000000013, 0.5500000000000002, 0.5000000000000001]"


Trainer is attempting to log a value of "[0.45000000000000007, 0.40000000000000013, 0.40000000000000013, 0.5000000000000001, 0.45000000000000007, 0.45000000000000007, 0.40000000000000013, 0.45000000000000007, 0.3500000000000001, 0.45000000000000007, 0.5000000000000001, 0.5000000000000001, 0.3500000000000001, 0.3500000000000001, 0.45000000000000007, 0.45000000000000007, 0.5000000000000001, 0.5000000000000001]" of type <class 'list'> for key "eval/thresholds" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Trainer is attempting to log a value of "[0.45000000000000007, 0.45000000000000007, 0.40000000000000013, 0.5500000000000002, 0.45000000000000007, 0.45000000000000007, 0.40000000000000013, 0.45000000000000007, 0.3500000000000001, 0.45000000000000007, 0.45000000000000007,

TrainOutput(global_step=340, training_loss=0.0812572016435511, metrics={'train_runtime': 570.0294, 'train_samples_per_second': 19.087, 'train_steps_per_second': 0.596, 'total_flos': 2863059523338240.0, 'train_loss': 0.0812572016435511, 'epoch': 20.0})

In [None]:
optimized_thresholds = [0.45000000000000007, 0.45000000000000007, 0.3500000000000001, 0.5500000000000002, 0.45000000000000007, 0.45000000000000007, 0.45000000000000007, 0.45000000000000007, 0.40000000000000013, 0.5000000000000001, 0.5000000000000001, 0.5000000000000001, 0.3500000000000001, 0.3500000000000001, 0.5000000000000001, 0.40000000000000013, 0.45000000000000007, 0.5000000000000001]

test_predictions = trainer.predict(dataset_test)

test_predictions

PredictionOutput(predictions=array([[-0.90234375, -1.4208984 , -0.67578125, ..., -1.2382812 ,
         0.02883911, -1.6005859 ],
       [-0.10638428, -1.4746094 , -1.6679688 , ..., -0.82958984,
        -0.2939453 , -0.7216797 ],
       [-1.1162109 , -0.70996094, -1.1884766 , ..., -1.0722656 ,
         0.23156738, -1.4609375 ],
       ...,
       [-1.2050781 , -0.8051758 , -1.7177734 , ..., -1.28125   ,
        -1.2470703 ,  0.22473145],
       [-1.4111328 , -0.26879883,  0.2644043 , ..., -1.1220703 ,
        -1.3242188 , -0.1026001 ],
       [-0.35888672, -1.1582031 , -1.2294922 , ..., -1.5732422 ,
        -1.2197266 , -0.4880371 ]], dtype=float32), label_ids=array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 1., 1., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]], dtype=float32), metrics={'test_loss': 0.07988698035478592, 'test_macro_f1': 0.7521813299757

In [None]:
# Apply sigmoid to the logits to get probabilities
test_probabilities = torch.sigmoid(torch.tensor(test_predictions.predictions)).numpy()

# Apply the thresholds to get binary predictions
binary_predictions = np.zeros(test_probabilities.shape)
for i in range(len(classes)):  # Iterate over the classes
    binary_predictions[:, i] = (test_probabilities[:, i] >= optimized_thresholds[i]).astype(int)

# Extract the true labels
true_labels = test_predictions.label_ids

In [None]:
# Trainer evaluate
trainer.evaluate(dataset_test)

Trainer is attempting to log a value of "[0.5000000000000001, 0.45000000000000007, 0.45000000000000007, 0.5000000000000001, 0.45000000000000007, 0.40000000000000013, 0.5000000000000001, 0.45000000000000007, 0.5500000000000002, 0.5000000000000001, 0.45000000000000007, 0.6000000000000002, 0.5000000000000001, 0.45000000000000007, 0.5000000000000001, 0.45000000000000007, 0.5500000000000002, 0.45000000000000007]" of type <class 'list'> for key "eval/thresholds" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.07988698035478592,
 'eval_macro_f1': 0.7521813299757242,
 'eval_thresholds': [0.5000000000000001,
  0.45000000000000007,
  0.45000000000000007,
  0.5000000000000001,
  0.45000000000000007,
  0.40000000000000013,
  0.5000000000000001,
  0.45000000000000007,
  0.5500000000000002,
  0.5000000000000001,
  0.45000000000000007,
  0.6000000000000002,
  0.5000000000000001,
  0.45000000000000007,
  0.5000000000000001,
  0.45000000000000007,
  0.5500000000000002,
  0.45000000000000007],
 'eval_runtime': 2.4825,
 'eval_samples_per_second': 73.312,
 'eval_steps_per_second': 9.265,
 'epoch': 20.0}

In [None]:
# Compute f1 metric
final_f1 = f1_score(true_labels, binary_predictions, average='macro')

print("\nFinal Evaluation:")
print(f"Macro F1 Score: {final_f1}")


Final Evaluation:
Macro F1 Score: 0.6911226719038173


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
with open('data/test_for_student.json', 'r', encoding='utf-8') as f:
    test_for_student = json.load(f)

In [None]:
def process_test_data(data):
    texts = []
    ids = []
    for id, info in data.items():
        title = clean_text(info['Title'])
        abstract = clean_text(info['Abstract'])
        text = f"{title} [SEP] {abstract}"
        texts.append(text)
        ids.append(id)
    return texts, ids

In [None]:
test_texts, test_ids = process_test_data(test_for_student)

In [None]:
# Tokenize the test data
inputs = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")
inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to the same devic

In [None]:
# Make predictions using the model
model.to(device)
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits

# Convert logits to probabilities using sigmoid function
sigmoid = torch.nn.Sigmoid()
test_probabilities = sigmoid(logits).cpu().numpy()

In [None]:
# Create a binary predictions array based on thresholds
binary_predictions = np.zeros(test_probabilities.shape)
for i in range(len(classes)):  # Iterate over each class
    binary_predictions[:, i] = (test_probabilities[:, i] >= optimized_thresholds[i]).astype(int)

In [None]:
output_predictions = []
for i, test_id in enumerate(test_ids):
    row = [test_id] + binary_predictions[i].tolist()
    output_predictions.append(row)

# Convert the output to a DataFrame for easy saving
columns = ["id"] + classes
submission_df = pd.DataFrame(output_predictions, columns=columns)

# Save to a CSV file (optional)
submission_df.to_csv("submission.csv", index=False)
print("Predictions saved to submission.csv")

Predictions saved to submission.csv
