# Preprocessing

In [2]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# download NLTK
nltk.download("stopwords")
nltk.download("punkt")

# load csv
df = pd.read_csv("all.csv")
texts = df["main_content"].values

# stopwords list
stop_words = set(stopwords.words("english"))
r1 = '[0-9’!"#$%&\'()*+,-./:;<=>?@，。?★、…【】《》？“”‘’！[\\]^_`{|}~]+'
# preprocess function
def preprocess_text(text):
    # str_lower
    text = re.sub(r1, "",str(text))
    text = str(text).lower()
    
    # re
    text = re.sub(r"[^a-zA-Z]", " ", text)

    # word segmentation
    tokens = word_tokenize(text)

    # stopwords
    filtered_tokens = [token for token in tokens if token not in stop_words]

    # finish
    text = " ".join(filtered_tokens)

    return text

df["processed_content"] = [preprocess_text(text) for text in texts]
df.drop("main_content")
# new file for FT
#df1 = df[df["Related"].isin([0, 1])]
df.to_csv("all_processed.csv", index=False, encoding='utf_8_sig')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Zifu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Zifu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


KeyError: "['main_content'] not found in axis"

# FT-Classification

In [9]:
import pandas as pd
import cudf
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# read data
df = pd.read_csv('training_data/ki_sample_cases_20231004_master.csv')

# training and test definition
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)

# data
class CustomDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.texts = df["main_content"].values
        self.labels = df["Related"].values
        self.tokenizer = tokenizer
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, index):
        text = str(self.texts[index])
        label = self.labels[index]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            truncation=True,
            padding="max_length",
            return_attention_mask=True,
            return_tensors="pt"
        )

        input_ids = encoding["input_ids"].squeeze()
        attention_mask = encoding["attention_mask"].squeeze()

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "label": torch.tensor(label, dtype=torch.long)
        }

# bert initialization
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# parameters and optimizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

train_dataset = CustomDataset(train_df, tokenizer, max_length=128)
test_dataset = CustomDataset(test_df, tokenizer, max_length=128)

batch_size = 32
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

epochs = 50
optimizer = AdamW(model.parameters(), lr=2e-5)
# Suggested new version for Adam
# optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5) 

# Training loop
for epoch in tqdm(range(epochs)):
    model.train()
    train_loss = 0.0
    correct_predictions = 0
    
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)
        
        optimizer.zero_grad()
        
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )
        
        loss = outputs.loss
        logits = outputs.logits
        
        _, predicted_labels = torch.max(logits, dim=1)
        correct_predictions += (predicted_labels == labels).sum().item()
        
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
    
    train_accuracy = correct_predictions / len(train_dataset)
    avg_train_loss = train_loss / len(train_dataloader)
    
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {avg_train_loss:.4f}  Train Accuracy: {train_accuracy:.4f}")

# Testing
model.eval()
test_loss = 0.0
correct_predictions = 0

with torch.no_grad():
    for batch in test_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        logits = outputs.logits

        _, predicted_labels = torch.max(logits, dim=1)
        correct_predictions += (predicted_labels == labels).sum().item()

        test_loss += loss.item()

test_accuracy = correct_predictions / len(test_dataset)
avg_test_loss = test_loss / len(test_dataloader)

print(f"Test Loss: {avg_test_loss:.4f}  Test Accuracy: {test_accuracy:.4f}")

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/50
Train Loss: 0.3512  Train Accuracy: 0.8914


  4%|▍         | 2/50 [01:41<40:17, 50.37s/it]

Epoch 2/50
Train Loss: 0.1820  Train Accuracy: 0.9324


  6%|▌         | 3/50 [02:29<38:39, 49.36s/it]

Epoch 3/50
Train Loss: 0.1099  Train Accuracy: 0.9612


  8%|▊         | 4/50 [03:15<36:51, 48.09s/it]

Epoch 4/50
Train Loss: 0.0739  Train Accuracy: 0.9767


 10%|█         | 5/50 [04:05<36:38, 48.85s/it]

Epoch 5/50
Train Loss: 0.0487  Train Accuracy: 0.9878


 12%|█▏        | 6/50 [04:53<35:33, 48.49s/it]

Epoch 6/50
Train Loss: 0.0353  Train Accuracy: 0.9889


 14%|█▍        | 7/50 [05:43<35:06, 48.99s/it]

Epoch 7/50
Train Loss: 0.0285  Train Accuracy: 0.9922


 16%|█▌        | 8/50 [06:34<34:39, 49.52s/it]

Epoch 8/50
Train Loss: 0.0156  Train Accuracy: 0.9956


 18%|█▊        | 9/50 [07:21<33:18, 48.75s/it]

Epoch 9/50
Train Loss: 0.0084  Train Accuracy: 0.9989


 20%|██        | 10/50 [08:09<32:21, 48.53s/it]

Epoch 10/50
Train Loss: 0.0067  Train Accuracy: 0.9989


 22%|██▏       | 11/50 [08:57<31:22, 48.28s/it]

Epoch 11/50
Train Loss: 0.0046  Train Accuracy: 0.9989


 24%|██▍       | 12/50 [09:44<30:27, 48.09s/it]

Epoch 12/50
Train Loss: 0.0032  Train Accuracy: 0.9989


 26%|██▌       | 13/50 [10:33<29:45, 48.26s/it]

Epoch 13/50
Train Loss: 0.0020  Train Accuracy: 1.0000


 28%|██▊       | 14/50 [11:23<29:14, 48.72s/it]

Epoch 14/50
Train Loss: 0.0015  Train Accuracy: 1.0000


 30%|███       | 15/50 [12:15<29:06, 49.89s/it]

Epoch 15/50
Train Loss: 0.0012  Train Accuracy: 1.0000


 32%|███▏      | 16/50 [13:01<27:36, 48.71s/it]

Epoch 16/50
Train Loss: 0.0010  Train Accuracy: 1.0000


 34%|███▍      | 17/50 [13:46<26:11, 47.61s/it]

Epoch 17/50
Train Loss: 0.0008  Train Accuracy: 1.0000


 36%|███▌      | 18/50 [14:33<25:12, 47.27s/it]

Epoch 18/50
Train Loss: 0.0007  Train Accuracy: 1.0000


 38%|███▊      | 19/50 [15:20<24:26, 47.29s/it]

Epoch 19/50
Train Loss: 0.0007  Train Accuracy: 1.0000


 40%|████      | 20/50 [16:04<23:04, 46.15s/it]

Epoch 20/50
Train Loss: 0.0006  Train Accuracy: 1.0000


 42%|████▏     | 21/50 [16:50<22:18, 46.17s/it]

Epoch 21/50
Train Loss: 0.0005  Train Accuracy: 1.0000


 44%|████▍     | 22/50 [17:37<21:41, 46.47s/it]

Epoch 22/50
Train Loss: 0.0005  Train Accuracy: 1.0000


 46%|████▌     | 23/50 [18:26<21:17, 47.30s/it]

Epoch 23/50
Train Loss: 0.0005  Train Accuracy: 1.0000


 48%|████▊     | 24/50 [19:11<20:08, 46.47s/it]

Epoch 24/50
Train Loss: 0.0004  Train Accuracy: 1.0000


 50%|█████     | 25/50 [19:58<19:30, 46.84s/it]

Epoch 25/50
Train Loss: 0.0004  Train Accuracy: 1.0000


 52%|█████▏    | 26/50 [20:42<18:20, 45.84s/it]

Epoch 26/50
Train Loss: 0.0003  Train Accuracy: 1.0000


 54%|█████▍    | 27/50 [21:26<17:24, 45.41s/it]

Epoch 27/50
Train Loss: 0.0003  Train Accuracy: 1.0000


 56%|█████▌    | 28/50 [22:10<16:28, 44.93s/it]

Epoch 28/50
Train Loss: 0.0003  Train Accuracy: 1.0000


 58%|█████▊    | 29/50 [22:55<15:43, 44.94s/it]

Epoch 29/50
Train Loss: 0.0003  Train Accuracy: 1.0000


 60%|██████    | 30/50 [23:42<15:09, 45.47s/it]

Epoch 30/50
Train Loss: 0.0003  Train Accuracy: 1.0000


 62%|██████▏   | 31/50 [24:25<14:09, 44.70s/it]

Epoch 31/50
Train Loss: 0.0002  Train Accuracy: 1.0000


 64%|██████▍   | 32/50 [25:07<13:08, 43.82s/it]

Epoch 32/50
Train Loss: 0.0002  Train Accuracy: 1.0000


 66%|██████▌   | 33/50 [25:49<12:16, 43.30s/it]

Epoch 33/50
Train Loss: 0.0002  Train Accuracy: 1.0000


 68%|██████▊   | 34/50 [26:34<11:41, 43.83s/it]

Epoch 34/50
Train Loss: 0.0002  Train Accuracy: 1.0000


 70%|███████   | 35/50 [27:16<10:49, 43.33s/it]

Epoch 35/50
Train Loss: 0.0002  Train Accuracy: 1.0000


 72%|███████▏  | 36/50 [27:54<09:43, 41.66s/it]

Epoch 36/50
Train Loss: 0.0002  Train Accuracy: 1.0000


 74%|███████▍  | 37/50 [28:36<09:05, 41.96s/it]

Epoch 37/50
Train Loss: 0.0002  Train Accuracy: 1.0000


 76%|███████▌  | 38/50 [29:17<08:17, 41.45s/it]

Epoch 38/50
Train Loss: 0.0002  Train Accuracy: 1.0000


 78%|███████▊  | 39/50 [30:02<07:50, 42.78s/it]

Epoch 39/50
Train Loss: 0.0002  Train Accuracy: 1.0000


 80%|████████  | 40/50 [30:41<06:54, 41.47s/it]

Epoch 40/50
Train Loss: 0.0002  Train Accuracy: 1.0000


 82%|████████▏ | 41/50 [31:28<06:28, 43.15s/it]

Epoch 41/50
Train Loss: 0.0001  Train Accuracy: 1.0000


 84%|████████▍ | 42/50 [32:12<05:47, 43.40s/it]

Epoch 42/50
Train Loss: 0.0001  Train Accuracy: 1.0000


 86%|████████▌ | 43/50 [32:53<05:00, 42.87s/it]

Epoch 43/50
Train Loss: 0.0001  Train Accuracy: 1.0000


 88%|████████▊ | 44/50 [33:41<04:24, 44.13s/it]

Epoch 44/50
Train Loss: 0.0001  Train Accuracy: 1.0000


 90%|█████████ | 45/50 [34:28<03:45, 45.18s/it]

Epoch 45/50
Train Loss: 0.0004  Train Accuracy: 1.0000


 92%|█████████▏| 46/50 [35:13<03:00, 45.06s/it]

Epoch 46/50
Train Loss: 0.0433  Train Accuracy: 0.9845


 94%|█████████▍| 47/50 [35:54<02:11, 43.95s/it]

Epoch 47/50
Train Loss: 0.0230  Train Accuracy: 0.9956


 96%|█████████▌| 48/50 [36:37<01:27, 43.61s/it]

Epoch 48/50
Train Loss: 0.0161  Train Accuracy: 0.9956


 98%|█████████▊| 49/50 [37:22<00:43, 43.96s/it]

Epoch 49/50
Train Loss: 0.0013  Train Accuracy: 1.0000


100%|██████████| 50/50 [38:10<00:00, 45.80s/it]

Epoch 50/50
Train Loss: 0.0007  Train Accuracy: 1.0000





Test Loss: 0.1633  Test Accuracy: 0.9604


In [10]:
# save model
torch.save(model.state_dict(), "model_random.pth")

# Use Model for Prediction

In [5]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.load_state_dict(torch.load("model_random.pth"))
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# load processed data
for i in range(29):
    print(i)
    df = pd.read_csv(f'file_split/split_{i+1}.csv')
    prediction_texts = df["main_content"].astype(str).tolist()

    # use BertTokenizer for encoding
    encoded_inputs = tokenizer(prediction_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
    input_ids = encoded_inputs["input_ids"].to(device)
    attention_mask = encoded_inputs["attention_mask"].to(device)

# predict
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        predicted_labels = torch.argmax(logits, dim=1)

    # add & save predicted label to Dataframe
    df["predicted_related"] = predicted_labels.cpu().numpy()
    df.to_csv(f'unduplicate/related_results_{i+1}.csv', encoding='utf_8_sig')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28


FileNotFoundError: [Errno 2] No such file or directory: 'file_split/split_29.csv'

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch
import pandas as pd
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2, ignore_mismatched_sizes=True)
model.load_state_dict(torch.load("model_random.pth"), strict=False, weights_only=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.eval()

# load processed data


df = pd.read_csv(f'training_data/ki_sample_cases_20231004_master.csv')
prediction_texts = df["main_content"].astype(str).tolist()

# use BertTokenizer for encoding
encoded_inputs = tokenizer(prediction_texts, padding=True, truncation=True, max_length=128, return_tensors="pt")
input_ids = encoded_inputs["input_ids"].to(device)
attention_mask = encoded_inputs["attention_mask"].to(device)

# predict
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predicted_labels = torch.argmax(logits, dim=1)

    # add & save predicted label to Dataframe
df["predicted_related"] = predicted_labels.cpu().numpy()
df.to_csv(f'validation_1003.csv', encoding='utf_8_sig')

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load("model_random.pth"), strict=False)


In [4]:
from sklearn.metrics import confusion_matrix

# Assuming binary classification: 1 = related, 0 = unrelated
original_labels = df["Related"].values
predicted_labels = df["predicted_related"].values

# Get confusion matrix components
tn, fp, fn, tp = confusion_matrix(original_labels, predicted_labels).ravel()

# Total counts
actual_related = tp + fn
actual_unrelated = tn + fp
predicted_related = tp + fp
predicted_unrelated = tn + fn

# Calculated metrics
accuracy = (tp + tn) / (tp + tn + fp + fn)
precision = tp / (tp + fp) if (tp + fp) > 0 else 0  # PPV
npv = tn / (tn + fn) if (tn + fn) > 0 else 0         # NPV
sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0 # Recall
specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

# Print confusion-style report
print(f"\t\t\tActual related ({actual_related})\tActual unrelated ({actual_unrelated})\tMeasures")
print(f"Predicted related ({predicted_related})\tTrue Positive ({tp})\t\tFalse Positive ({fp})\t\tPPV: {precision * 100:.1f}%")
print(f"Predicted unrelated ({predicted_unrelated})\tFalse Negative ({fn})\t\tTrue Negative ({tn})\t\tNPV: {npv * 100:.1f}%")
print(f"Measures\t\tSensitivity: {sensitivity * 100:.2f}%\tSpecificity: {specificity * 100:.2f}%\tAccuracy {accuracy * 100:.2f}%")


			Actual related (90)	Actual unrelated (913)	Measures
Predicted related (90)	True Positive (88)		False Positive (2)		PPV: 97.8%
Predicted unrelated (913)	False Negative (2)		True Negative (911)		NPV: 99.8%
Measures		Sensitivity: 97.78%	Specificity: 99.78%	Accuracy 99.60%
