In [1]:
import numpy as np
import pandas as pd
import torch
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, StratifiedKFold
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.model_selection import KFold
import datetime

In [10]:
# Shared inference settings used by the prediction cells below.
batch_size = 16
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def tokenize_data(texts, tokenizer, max_length=512):
    """Encode ``texts`` with ``tokenizer`` and return its output unchanged.

    Args:
        texts: The texts to encode; forwarded as the tokenizer's only
            positional argument.
        tokenizer: Callable that accepts the texts plus the keyword options
            ``padding``, ``truncation``, ``max_length`` and ``return_tensors``.
        max_length: Value forwarded as the tokenizer's ``max_length`` option
            (defaults to 512).

    Returns:
        Whatever ``tokenizer`` returns when called with padding and truncation
        enabled and PyTorch ("pt") tensors requested.
    """
    encode_options = {
        "padding": True,
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **encode_options)

In [11]:
# Predict with the 2024-11-29 checkpoint and save the result next to the
# baseline predictions. `tokenize_data`, `batch_size` and `device` come from
# the configuration cell above.
baseline_csv = "baseline.csv"
output_csv = "baselineVSmodel_predictions.csv"
model_tag = "model_2024-11-29_11-36-12"
model_path = f"{model_tag}.pth"
# Single source of truth for the column name (previously repeated as a literal).
prediction_column = f"{model_tag} Prediction"

baseline_df = pd.read_csv(baseline_csv)

# Fail early, with a readable message, if the CSV lacks a column used below.
required_columns = {'Artifact Id', 'Example Description', 'Prediction'}
missing_columns = required_columns - set(baseline_df.columns)
assert not missing_columns, f"{baseline_csv} is missing columns: {sorted(missing_columns)}"
assert baseline_df['Example Description'].notna().all(), \
    "'Example Description' contains missing values; the tokenizer needs text for every row"

# NOTE(security): this checkpoint carries a pickled tokenizer object alongside
# the weights, so it cannot be read in weights-only mode (the default from
# PyTorch 2.6 on). Full unpickling can execute arbitrary code: only load
# checkpoint files from a trusted source.
checkpoint = torch.load(model_path, map_location=device, weights_only=False)
print(checkpoint.keys())

# Tokenizer and label mapping are taken from the checkpoint itself.
# `filtered_labels_list` is indexed by predicted class id below
# (presumably a list ordered by class index; verify against the training code).
tokenizer = checkpoint['tokenizer']
filtered_labels_list = checkpoint['label_mapping']

# Initialize the model with one output per known label, then restore the
# fine-tuned weights over the freshly initialised classifier head.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=len(filtered_labels_list)
)
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)
model.eval()

encodings = tokenize_data(baseline_df['Example Description'].tolist(), tokenizer)
dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'])
# Sequential sampling keeps predictions aligned with the rows of baseline_df.
data_loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)

# Make predictions
predictions = []
with torch.no_grad():
    for batch in tqdm(data_loader, desc="Making Predictions"):
        input_ids, attention_mask = (tensor.to(device) for tensor in batch)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions.extend(torch.argmax(outputs.logits, dim=-1).cpu().tolist())

assert len(predictions) == len(baseline_df), "expected exactly one prediction per baseline row"

# Map predicted class ids to artifact IDs
baseline_df[prediction_column] = [filtered_labels_list[pred] for pred in predictions]

# Keep baseline predictions for comparison
baseline_df['baseline Prediction'] = baseline_df['Prediction']

# Save the predictions to CSV
baseline_df[['Artifact Id', 'baseline Prediction', prediction_column]].to_csv(output_csv, index=False)
print(f"Predictions saved to {output_csv}")

dict_keys(['model_state_dict', 'optimizer_state_dict', 'scheduler_state_dict', 'tokenizer', 'label_mapping'])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Making Predictions:   0%|          | 0/1 [00:00<?, ?it/s]

Predictions saved to baselineVSmodel_predictions.csv


In [3]:
# Predict with the 2024-11-26 checkpoint and save the result next to the
# baseline predictions. `tokenize_data`, `batch_size` and `device` come from
# the configuration cell above.
baseline_csv = "baseline.csv"
# NOTE: this is the same output path the 2024-11-29 prediction cell writes to,
# so running both cells leaves only this cell's columns in the file.
output_csv = "baselineVSmodel_predictions.csv"
model_tag = "model_2024-11-26_21-46-02"
model_path = f"{model_tag}.pth"
# Single source of truth for the column name (the metrics cell reads it too).
prediction_column = f"{model_tag} Prediction"
# Size of the classifier head; it has to agree with the weights loaded below.
num_labels = 8

baseline_df = pd.read_csv(baseline_csv)

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=num_labels
)
# This checkpoint is used directly as a state dict (no wrapper dictionary).
model.load_state_dict(torch.load(model_path, map_location=device))
model.to(device)
model.eval()

# `tokenize_data` requires a tokenizer argument, and this checkpoint does not
# bundle one. Assumes the model was fine-tuned with the stock tokenizer of the
# same base model — TODO confirm against the training notebook.
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

encodings = tokenize_data(baseline_df['Example Description'].tolist(), bert_tokenizer)
dataset = TensorDataset(encodings['input_ids'], encodings['attention_mask'])
# Sequential sampling keeps predictions aligned with the rows of baseline_df.
data_loader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)

predictions = []
with torch.no_grad():
    for batch in tqdm(data_loader, desc="Making Predictions"):
        input_ids, attention_mask = (tensor.to(device) for tensor in batch)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        predictions.extend(torch.argmax(outputs.logits, dim=-1).cpu().tolist())

assert len(predictions) == len(baseline_df), "expected exactly one prediction per baseline row"

# NOTE(review): each predicted class index is used as a ROW POSITION in
# baseline_df, i.e. class k is reported as the 'Artifact Id' found in row k.
# That is only meaningful if the first `num_labels` rows of the CSV list the
# classes in the training-time label order — TODO confirm, or replace this
# lookup with the index-to-label mapping used during training.
baseline_df[prediction_column] = [baseline_df['Artifact Id'].iloc[i] for i in predictions]

# Keep baseline predictions for comparison
baseline_df['baseline Prediction'] = baseline_df['Prediction']

baseline_df[['Artifact Id', 'baseline Prediction', prediction_column]].to_csv(output_csv, index=False)
print(f"Predictions saved to {output_csv}")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Making Predictions:   0%|          | 0/1 [00:00<?, ?it/s]

Predictions saved to baselineVSmodel_predictions.csv


In [5]:
def _strip_d3f_prefix(labels):
    """Return the labels as strings with every 'd3f:' occurrence removed."""
    return [str(label).replace('d3f:', '') for label in labels]


def _accuracy_percent(y_true, y_pred):
    """Return the percentage of positions where y_pred equals y_true."""
    return np.sum(np.array(y_pred) == np.array(y_true)) / len(y_true) * 100


# Normalise all three label lists the same way so the comparison is fair:
# the baseline predictions were previously compared without the 'd3f:'
# stripping that was applied to the true labels and the model predictions.
true_labels = _strip_d3f_prefix(baseline_df['Artifact Id'].tolist())
baseline_predictions = _strip_d3f_prefix(baseline_df['baseline Prediction'].tolist())
model_predictions = _strip_d3f_prefix(baseline_df['model_2024-11-26_21-46-02 Prediction'].tolist())

# Weighted F1 score for the baseline and for the fine-tuned model
f1_baseline = f1_score(true_labels, baseline_predictions, average='weighted')
f1_model = f1_score(true_labels, model_predictions, average='weighted')

# Accuracy (in percent) for both sets of predictions
accuracy_baseline = _accuracy_percent(true_labels, baseline_predictions)
accuracy_model = _accuracy_percent(true_labels, model_predictions)

# Print out the results
print(f"Baseline Model F1 Score (Weighted): {f1_baseline:.4f}")
print(f"Baseline Model Accuracy: {accuracy_baseline:.2f}%")
print(f"Model 2024-11-26_21-46-02 F1 Score (Weighted): {f1_model:.4f}")
print(f"Model 2024-11-26_21-46-02 Accuracy: {accuracy_model:.2f}%")

Baseline Model F1 Score (Weighted): 0.1667
Baseline Model Accuracy: 18.75%
Model 2024-11-26_21-46-02 F1 Score (Weighted): 0.1500
Model 2024-11-26_21-46-02 Accuracy: 18.75%
