In [5]:
!pip install transformers[torch] datasets



In [6]:
import json
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import re
from datasets import Dataset
from sklearn.metrics import f1_score
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import EarlyStoppingCallback

In [7]:
classes = ['CE', 'ENV', 'BME', 'PE', 'METAL', 'ME', 'EE', 'CPE', 'OPTIC', 'NANO', 'CHE',
           'MATENG', 'AGRI', 'EDU', 'IE', 'SAFETY', 'MATH', 'MATSCI']

mlb = MultiLabelBinarizer(classes=classes)

In [8]:
with open('data/train_for_student.json', 'r', encoding='utf-8') as f:
    train_for_student = json.load(f)


In [9]:
# 1. Improved text preprocessing
def clean_text(text):
    # More comprehensive text cleaning
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'[^\w\s\.\-\,]', '', text)  # Keep periods, hyphens, and commas
    text = text.lower().strip()
    return text

# 2. Enhanced data augmentation
def augment_text(text, title_length):
    augmentations = []

    # Original text
    augmentations.append(text)

    # Back translation simulation (simplified version)
    sentences = text[title_length:].split('.')
    if len(sentences) > 1:
        shuffled_sentences = sentences.copy()
        np.random.shuffle(shuffled_sentences)
        augmented = text[:title_length] + '.'.join(shuffled_sentences)
        augmentations.append(augmented)

    # Synonym replacement simulation
    technical_terms = ['analysis', 'study', 'research', 'development', 'investigation']
    for term in technical_terms:
        if term in text.lower():
            for replacement in technical_terms:
                if replacement != term:
                    augmented = text.lower().replace(term, replacement)
                    augmentations.append(augmented)
                    break

    return augmentations

def process_data(data, augment=True):
    rows = []
    all_classes = []

    for id, info in data.items():
        title = clean_text(info['Title'])
        abstract = clean_text(info['Abstract'])
        text = f"{title} [SEP] {abstract}"
        title_length = len(title) + 6  # length of title plus [SEP]

        if augment and 'Classes' in info:
            texts = augment_text(text, title_length)
            for aug_text in texts:
                rows.append({"text": aug_text})
                all_classes.append(info['Classes'])
        else:
            rows.append({"text": text})
            if 'Classes' in info:
                all_classes.append(info['Classes'])

    df = pd.DataFrame(rows)
    if all_classes:
        mlb = MultiLabelBinarizer(classes=classes)
        labels = mlb.fit_transform(all_classes)
        df['labels'] = [label.tolist() for label in labels]
    return df

In [10]:
train_df = process_data(train_for_student)

train_df.head(5)

Unnamed: 0,text,labels
0,activated carbon derived from bacterial cellul...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
1,activated carbon derived from bacterial cellul...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ..."
2,the algorithm of static hand gesture recogniti...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
3,the algorithm of static hand gesture recogniti...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
4,the algorithm of static hand gesture recogniti...,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."


In [11]:
train_data, test_data = train_test_split(train_df, test_size=0.4, random_state=42)
val_data, test_data = train_test_split(test_data, test_size=0.5, random_state=42)


hg_train_data = Dataset.from_pandas(train_data)
hg_val_data = Dataset.from_pandas(val_data)
hg_test_data = Dataset.from_pandas(test_data)

print(hg_train_data)

Dataset({
    features: ['text', 'labels', '__index_level_0__'],
    num_rows: 786
})


In [12]:
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_dataset(data):
    tokenized_data = tokenizer(
        data['text'],
        truncation=True,
        padding='max_length',
        max_length=512,
    )
    return tokenized_data

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [13]:
dataset_train = hg_train_data.map(tokenize_dataset, batched=True)
dataset_val = hg_val_data.map(tokenize_dataset, batched=True)
dataset_test = hg_test_data.map(tokenize_dataset, batched=True)

print(dataset_train[0])

Map:   0%|          | 0/786 [00:00<?, ? examples/s]

Map:   0%|          | 0/262 [00:00<?, ? examples/s]

Map:   0%|          | 0/263 [00:00<?, ? examples/s]

{'text': 'heterogeneous photocatalytic degradation of diuron on zinc oxide influence of surface-dependent adsorption on kinetics, degradation pathway, and toxicity of intermediates [SEP] 2019heterogeneous photocatalytic reaction has been generally applied for degradation of toxic contaminants. degradations of a compound using the same kind of catalyst that was synthesized differently are commonly found in literature. however, the reported degradation intermediates are normally inconsistent. this issue is especially important for the degradation of toxic compounds because intermediates may be more toxic than their parent compounds and understanding the reason is necessary if appropriate catalysts are to be designed. this work systematically compares the photocatalytic degradation of diuron, a toxic recalcitrant herbicide, on two forms of zinc oxide zno, i.e., conventional particles with zinc- and oxygen-terminated polar surfaces as the dominating planes, and nanorods with mixed-terminat

In [14]:
model = AutoModelForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(classes),
    problem_type='multi_label_classification'
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Optimizing thresholds for each label
def optimize_thresholds(predictions, labels, num_points=100):
    best_thresholds = []

    for i in range(predictions.shape[1]):
        best_f1 = 0
        best_threshold = 0.5

        # Get predictions for current class
        class_predictions = predictions[:, i]
        sorted_preds = np.sort(class_predictions)

        # Try different threshold values
        for percentile in range(1, num_points):
            threshold = sorted_preds[int(len(sorted_preds) * (percentile/num_points))]
            binary_predictions = (class_predictions >= threshold).astype(int)
            f1 = f1_score(labels[:, i], binary_predictions)

            if f1 > best_f1:
                best_f1 = f1
                best_threshold = threshold

        best_thresholds.append(best_threshold)

    return best_thresholds

# Custom compute_metrics function
def compute_metrics(p):
    logits = p.predictions
    labels = p.label_ids
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.tensor(logits))

    # Optimize thresholds
    thresholds = optimize_thresholds(probs.numpy(), labels)
    predictions = np.array([(probs[:, i] > thresholds[i]).numpy().astype(int) for i in range(len(thresholds))]).T

    # Calculate F1 score
    f1 = f1_score(y_true=labels, y_pred=predictions, average='macro')
    return {
        'macro_f1': f1,
        'thresholds': thresholds
    }


In [16]:
class MultilabelTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.label_weights = self.compute_class_weights()

    def compute_class_weights(self):
        labels = np.array(self.train_dataset['labels'])
        pos_counts = np.sum(labels, axis=0)
        neg_counts = len(labels) - pos_counts
        weights = np.sqrt(neg_counts / (pos_counts + 1))  # Square root smoothing
        return torch.FloatTensor(weights).to(self.args.device)

    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Improved focal loss with dynamic alpha
        alpha = 0.75  # Increased focus on positive samples
        gamma = 2.0

        bce_loss = torch.nn.BCEWithLogitsLoss(reduction='none',
                                             pos_weight=self.label_weights)
        base_loss = bce_loss(logits, labels.float())

        probs = torch.sigmoid(logits)
        p_t = probs * labels + (1 - probs) * (1 - labels)
        focal_weight = ((1 - p_t) ** gamma) * alpha

        loss = (focal_weight * base_loss).mean()

        return (loss, outputs) if return_outputs else loss

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    learning_rate=1e-5,  # Lower learning rate for better convergence
    num_train_epochs=50,  # More epochs with early stopping
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=2,
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="steps",
    save_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',
    greater_is_better=True,
    fp16=True,
    gradient_checkpointing=True,
    save_total_limit=2,
    seed=42
)

trainer = MultilabelTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset_train,
    eval_dataset=dataset_val,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    data_collator=DataCollatorWithPadding(tokenizer)
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)


In [17]:

trainer.train()

  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Macro F1,Thresholds
100,0.1318,0.122755,0.433695,"[0.399518758058548, 0.43092551827430725, 0.3949008285999298, 0.4141285717487335, 0.41810283064842224, 0.42024263739585876, 0.41276684403419495, 0.4568943679332733, 0.41739019751548767, 0.4515683352947235, 0.4663594961166382, 0.49342766404151917, 0.4165593087673187, 0.4583183825016022, 0.405213326215744, 0.3509773313999176, 0.4242330491542816, 0.4335017800331116]"
200,0.0964,0.092747,0.628628,"[0.44555971026420593, 0.49305006861686707, 0.4605618417263031, 0.47865068912506104, 0.4370424747467041, 0.5140877366065979, 0.4772800803184509, 0.43020716309547424, 0.4515380859375, 0.5176470875740051, 0.5184547305107117, 0.5031490325927734, 0.5006136894226074, 0.48247480392456055, 0.44601210951805115, 0.37805718183517456, 0.45516830682754517, 0.48556920886039734]"
300,0.075,0.075864,0.729341,"[0.49855995178222656, 0.5035285353660583, 0.46132010221481323, 0.5049284100532532, 0.46790915727615356, 0.49721914529800415, 0.43308210372924805, 0.43320196866989136, 0.4879097640514374, 0.5066753029823303, 0.46648097038269043, 0.5038718581199646, 0.5270731449127197, 0.4831911325454712, 0.4628978967666626, 0.4451074004173279, 0.4628675580024719, 0.5399840474128723]"
400,0.0605,0.064937,0.801285,"[0.544105052947998, 0.4627765417098999, 0.4875742793083191, 0.42662015557289124, 0.44724902510643005, 0.49008306860923767, 0.5060727000236511, 0.5440142154693604, 0.476777583360672, 0.4953424036502838, 0.45498669147491455, 0.526434063911438, 0.5402265787124634, 0.4958840012550354, 0.47787395119667053, 0.5011062622070312, 0.49486178159713745, 0.4368022084236145]"
500,0.0503,0.056427,0.844637,"[0.5678167939186096, 0.4964733421802521, 0.5101915001869202, 0.4370424747467041, 0.46235164999961853, 0.5041007399559021, 0.48153001070022583, 0.5120902061462402, 0.47557488083839417, 0.5179823637008667, 0.49282506108283997, 0.5354627966880798, 0.5696132779121399, 0.5034884810447693, 0.43584150075912476, 0.4949571490287781, 0.5142783522605896, 0.4762294888496399]"
600,0.0423,0.050116,0.873838,"[0.6171088814735413, 0.5658385753631592, 0.5472221374511719, 0.46860820055007935, 0.49315303564071655, 0.5587483644485474, 0.5149338841438293, 0.49501052498817444, 0.5027523040771484, 0.5045965313911438, 0.47799578309059143, 0.5883570909500122, 0.6369178891181946, 0.5252622961997986, 0.5231006145477295, 0.5480386018753052, 0.5472221374511719, 0.5004847049713135]"
700,0.0362,0.045596,0.880752,"[0.5501844882965088, 0.5384068489074707, 0.5132796764373779, 0.4753008782863617, 0.5072360038757324, 0.5627171993255615, 0.5367985367774963, 0.4538971781730652, 0.5097339153289795, 0.6320486664772034, 0.5057942867279053, 0.6135839223861694, 0.6481323838233948, 0.6001883149147034, 0.5414088368415833, 0.5083305239677429, 0.5373448729515076, 0.5310270190238953]"
800,0.0318,0.042273,0.895107,"[0.4877496361732483, 0.5133711695671082, 0.5698526501655579, 0.45102423429489136, 0.5247448086738586, 0.5804709196090698, 0.5519964098930359, 0.42054012417793274, 0.5076593160629272, 0.5041159987449646, 0.5112438201904297, 0.6123676300048828, 0.6512439250946045, 0.6164741516113281, 0.5238466858863831, 0.5177232623100281, 0.5802331566810608, 0.5418634414672852]"
900,0.0283,0.040454,0.904697,"[0.492436021566391, 0.5050657391548157, 0.5460120439529419, 0.4651445150375366, 0.5002237558364868, 0.5727825164794922, 0.5093144178390503, 0.46648097038269043, 0.4569246768951416, 0.44755083322525024, 0.5040168166160583, 0.5907201766967773, 0.6601723432540894, 0.6085940599441528, 0.5229178667068481, 0.46860820055007935, 0.5594705939292908, 0.5124562382698059]"
1000,0.0267,0.038847,0.907791,"[0.457318514585495, 0.512029230594635, 0.5531130433082581, 0.520968496799469, 0.5376483201980591, 0.5494593381881714, 0.47099512815475464, 0.5122427344322205, 0.42590367794036865, 0.4504198133945465, 0.5199174284934998, 0.6183196306228638, 0.6525735855102539, 0.6188380122184753, 0.5150635242462158, 0.4986705482006073, 0.5723045468330383, 0.5531432628631592]"


Trainer is attempting to log a value of "[0.399518758058548, 0.43092551827430725, 0.3949008285999298, 0.4141285717487335, 0.41810283064842224, 0.42024263739585876, 0.41276684403419495, 0.4568943679332733, 0.41739019751548767, 0.4515683352947235, 0.4663594961166382, 0.49342766404151917, 0.4165593087673187, 0.4583183825016022, 0.405213326215744, 0.3509773313999176, 0.4242330491542816, 0.4335017800331116]" of type <class 'list'> for key "eval/thresholds" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
Trainer is attempting to log a value of "[0.44555971026420593, 0.49305006861686707, 0.4605618417263031, 0.47865068912506104, 0.4370424747467041, 0.5140877366065979, 0.4772800803184509, 0.43020716309547424, 0.4515380859375, 0.5176470875740051, 0.5184547305107117, 0.50314903259

TrainOutput(global_step=1250, training_loss=0.05433424139022827, metrics={'train_runtime': 1704.3153, 'train_samples_per_second': 23.059, 'train_steps_per_second': 0.733, 'total_flos': 1.03417499326464e+16, 'train_loss': 0.05433424139022827, 'epoch': 50.0})

In [66]:
optimized_thresholds = [0.5070643424987793, 0.5264949202537537, 0.5755283832550049, 0.48306921124458313, 0.5450738668441772, 0.5368289351463318, 0.5293850302696228, 0.5136227011680603, 0.43008750677108765, 0.6268097758293152, 0.4477621018886566, 0.6254380941390991, 0.4889925718307495, 0.6306849718093872, 0.5403478741645813, 0.5076555013656616, 0.580173671245575, 0.5481595396995544]
test_predictions = trainer.predict(dataset_test)

test_predictions

PredictionOutput(predictions=array([[-0.6665039 , -1.6064453 , -0.7109375 , ..., -1.7900391 ,
        -1.5341797 ,  1.3388672 ],
       [-0.29467773, -1.3876953 , -0.4868164 , ..., -0.86816406,
        -1.4794922 , -0.96484375],
       [-1.4755859 , -1.2558594 , -1.3740234 , ..., -2.0390625 ,
        -1.9238281 ,  0.8857422 ],
       ...,
       [-1.8037109 , -1.8349609 , -1.4179688 , ..., -1.9003906 ,
         1.6083984 , -1.1494141 ],
       [-1.0556641 , -0.9370117 , -1.7685547 , ..., -1.6328125 ,
        -1.4931641 , -0.8696289 ],
       [-0.8017578 ,  0.71484375, -1.4560547 , ..., -1.2724609 ,
        -0.9033203 , -1.1337891 ]], dtype=float32), label_ids=array([[0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0]]), metrics={'test_loss': 0.04075444117188454, 'test_macro_f1': 0.9095461720129643, 'test_thresholds': [0.6345432996749878, 0.5568

In [67]:
# Apply sigmoid to the logits to get probabilities
test_probabilities = torch.sigmoid(torch.tensor(test_predictions.predictions)).numpy()

# Apply the thresholds to get binary predictions
binary_predictions = np.zeros(test_probabilities.shape)
for i in range(len(classes)):  # Iterate over the classes
    binary_predictions[:, i] = (test_probabilities[:, i] >= optimized_thresholds[i]).astype(int)

# Extract the true labels
true_labels = test_predictions.label_ids

In [68]:
# Trainer evaluate
trainer.evaluate(dataset_test)

Trainer is attempting to log a value of "[0.6345432996749878, 0.5568816065788269, 0.5927255153656006, 0.5181804895401001, 0.5538069605827332, 0.540014386177063, 0.46128976345062256, 0.5141106247901917, 0.45898517966270447, 0.4607134461402893, 0.4873989224433899, 0.6761924028396606, 0.640186071395874, 0.6457903385162354, 0.5495197772979736, 0.5265710353851318, 0.4774627983570099, 0.5624168515205383]" of type <class 'list'> for key "eval/thresholds" as a scalar. This invocation of Tensorboard's writer.add_scalar() is incorrect so we dropped this attribute.


{'eval_loss': 0.04075444117188454,
 'eval_macro_f1': 0.9095461720129643,
 'eval_thresholds': [0.6345432996749878,
  0.5568816065788269,
  0.5927255153656006,
  0.5181804895401001,
  0.5538069605827332,
  0.540014386177063,
  0.46128976345062256,
  0.5141106247901917,
  0.45898517966270447,
  0.4607134461402893,
  0.4873989224433899,
  0.6761924028396606,
  0.640186071395874,
  0.6457903385162354,
  0.5495197772979736,
  0.5265710353851318,
  0.4774627983570099,
  0.5624168515205383],
 'eval_runtime': 4.7974,
 'eval_samples_per_second': 54.822,
 'eval_steps_per_second': 3.544,
 'epoch': 50.0}

In [69]:
# Compute f1 metric
final_f1 = f1_score(true_labels, binary_predictions, average='macro')

print("\nFinal Evaluation:")
print(f"Macro F1 Score: {final_f1}")


Final Evaluation:
Macro F1 Score: 0.9018117112046001


In [70]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [71]:
with open('data/test_for_student.json', 'r', encoding='utf-8') as f:
    test_for_student = json.load(f)

In [72]:
def process_test_data(data):
    texts = []
    ids = []
    for id, info in data.items():
        title = clean_text(info['Title'])
        abstract = clean_text(info['Abstract'])
        text = f"{title} [SEP] {abstract}"
        texts.append(text)
        ids.append(id)
    return texts, ids

In [73]:
test_texts, test_ids = process_test_data(test_for_student)

In [74]:
# Tokenize the test data
inputs = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")
inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to the same devic

In [75]:
# Make predictions using the model
model.to(device)
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits

# Convert logits to probabilities using sigmoid function
sigmoid = torch.nn.Sigmoid()
test_probabilities = sigmoid(logits).cpu().numpy()

In [76]:
# Create a binary predictions array based on thresholds
binary_predictions = np.zeros(test_probabilities.shape)
for i in range(len(classes)):  # Iterate over each class
    binary_predictions[:, i] = (test_probabilities[:, i] >= optimized_thresholds[i]).astype(int)

In [77]:
output_predictions = []
for i, test_id in enumerate(test_ids):
    row = [test_id] + binary_predictions[i].tolist()
    output_predictions.append(row)

# Convert the output to a DataFrame for easy saving
columns = ["id"] + classes
submission_df = pd.DataFrame(output_predictions, columns=columns)

# Save to a CSV file (optional)
submission_df.to_csv("submission.csv", index=False)
print("Predictions saved to submission.csv")

Predictions saved to submission.csv
