In [1]:
import torch
import pandas as pd
import numpy as np
from dataset import CustomDataset
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
import torch.nn as nn
from model import CustomClassifier
from utils import calculate_multilabel_metrics
from tqdm import tqdm

In [2]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

train = pd.read_pickle('dataset/train.pkl')
val = pd.read_pickle('dataset/valid.pkl')
test = pd.read_pickle('dataset/test.pkl')

data = pd.concat([train, val, test], ignore_index=True)

tokenizer = AutoTokenizer.from_pretrained('klue/roberta-small')
dataset = CustomDataset(data, tokenizer)
dataloader = DataLoader(dataset, batch_size=64, shuffle=False)  

model = CustomClassifier('klue/roberta-small', 7, device)
model.load_state_dict(torch.load('/home/woongchan/Workspace/지재권/ipc_section_classification/results2/model_state_dict.pth'))

Some weights of RobertaModel were not initialized from the model checkpoint at klue/roberta-small and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


<All keys matched successfully>

In [3]:
total_loss = 0
total_section_acc = []
total_section_precison = []
total_section_recall = []
total_section_f1 = []
y_preds = []

model.eval()
with torch.no_grad():
    with tqdm(dataloader, total=len(dataloader)) as t:
        for batch in t:
            input_ids, attention_mask = batch['input_ids'].to(device), batch['attention_mask'].to(device)
            section_labels = batch['section'].to(device)

            outputs = model(input_ids, attention_mask)
            
            final_outputs = (outputs['section_output'].detach().cpu().numpy() > 0.5).astype(int)
            y_preds.append(final_outputs)
            
            for i in range(len(final_outputs)):
                if final_outputs[i].sum() == 0:
                    final_outputs[i][np.argmax(outputs['section_output'].detach().cpu().numpy()[i])] = 1
                    
            eval_dict = calculate_multilabel_metrics(section_labels.detach().cpu().numpy(), final_outputs)
            total_batch_acc,total_batch_precision,total_batch_recall,total_batch_f1 = eval_dict['accuracy'], eval_dict['precision'], eval_dict['recall'], eval_dict['f1_score']
            total_section_acc.append(total_batch_acc)
            total_section_precison.append(total_batch_precision)
            total_section_recall.append(total_batch_recall)
            total_section_f1.append(total_batch_f1)
            
            t.set_postfix(Acc=total_batch_acc, Prec=total_batch_precision, Rec=total_batch_recall, F1=total_batch_f1)

  1%|          | 147/12870 [05:13<7:32:54,  2.14s/it, Acc=0.625, F1=0.714, Prec=0.781, Rec=0.658]


KeyboardInterrupt: 

In [None]:
y_preds_flat = [item for sublist in y_preds for item in sublist]
test['section_pred'] = y_preds_flat
test.to_pickle('dataset/result.pkl')

Unnamed: 0,abstract,section,section_pred
0,조미료 제조시에 부산물로 얻어지는 발효 폐액과 인산암모니움염의 혼합 현탁액을 이용하...,"[0, 1, 1, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0, 0]"
1,구성본문에 설명하고 도면에 예시한 바와 같이 기록매체로 형성한 정전 잠상을 토너로...,"[0, 0, 0, 0, 0, 0, 1]","[0, 1, 0, 0, 0, 0, 0]"
2,구성불포화 폴리에스테르 수지A 진주 광택 안료B 경화 촉매C 알루미늄 알콕시드 및또...,"[0, 1, 0, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0, 0]"
4,목적 안지오텐신 전환효소를 저해함으로써 안지오텐신 I이 안지오텐신 II로의 전환을 ...,"[0, 0, 1, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0, 0]"
5,디설파이드트리설파이드 혼합물을 함유한 개량된 하이드록실화 액체 모노머를 사용하여 자...,"[0, 0, 1, 0, 0, 0, 0]","[0, 0, 1, 0, 0, 0, 0]"


In [None]:
# Mapping from index to label
index_to_label = {0: 'A', 1: 'B', 2: 'C', 3: 'D', 4: 'E', 5: 'F', 6: 'G'}

# Function to convert binary list to label list
def binary_to_labels(binary_list):
    labels = [index_to_label[idx] for idx, value in enumerate(binary_list) if value == 1]
    return labels

# Apply the function to the 'section' and 'section_pred' columns
data['section_labels'] = data['section'].apply(binary_to_labels)
data['section_pred_labels'] = data['section_pred'].apply(binary_to_labels)


In [None]:
test['section'] = test['section'].apply(np.array)

In [None]:
test.section.apply(lambda x: x.sum()).value_counts()

section
1    187395
2      3499
3        29
Name: count, dtype: int64

In [None]:
test.section_pred.apply(lambda x: x.sum()).value_counts()

section_pred
1    190852
2        71
Name: count, dtype: int64

In [None]:
y_true = np.concatenate(test['section'].values)
y_pred = np.concatenate(test['section_pred'].values)

In [None]:
eval_dict = calculate_multilabel_metrics(y_true, y_pred)
eval_dict

{'accuracy': 0.8699565494241882,
 'precision': 0.8699565494241882,
 'recall': 0.8699565494241882,
 'f1_score': 0.8699565494241882}

In [None]:
from sklearn.metrics import multilabel_confusion_matrix

# Assuming the DataFrame 'df' is already defined and contains the multi-label predictions
# The multilabel_confusion_matrix function can be used directly on the 'section' and 'section_pred' columns

# Convert lists to arrays if not already done
test['section'] = test['section'].apply(np.array)
test['section_pred'] = test['section_pred'].apply(np.array)

# Extract true and predicted labels
y_true = np.array(test['section'].tolist())
y_pred = np.array(test['section_pred'].tolist())

# Calculate the confusion matrix for each label
confusion_matrices = multilabel_confusion_matrix(y_true, y_pred)

confusion_matrices


array([[[149308,   8908],
        [ 19478,  13229]],

       [[ 98160,  39495],
        [ 18448,  34820]],

       [[154339,  10914],
        [  8659,  17011]],

       [[182246,   1176],
        [  5135,   2366]],

       [[171406,   4285],
        [  8848,   6384]],

       [[149963,  10837],
        [ 16187,  13936]],

       [[151403,   9541],
        [ 11887,  18092]]])

In [None]:
test.section_pred.value_counts()