# In [3]:
import json
import torch
import numpy as np
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Label names. Position i in this list pairs with column i of the model
# output and with entry i of the per-class thresholds used further down.
classes = [
    'CE', 'ENV', 'BME', 'PE', 'METAL', 'ME',
    'EE', 'CPE', 'OPTIC', 'NANO', 'CHE', 'MATENG',
    'AGRI', 'EDU', 'IE', 'SAFETY', 'MATH', 'MATSCI',
]

# The fine-tuned classifier and its matching tokenizer both come from the
# same Hugging Face repository.
model_name = "thanadetch/research_classification_bert"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")


# Function to clean text (preprocessing)
def clean_text(text):
    """Normalize a raw title or abstract string before tokenization.

    The substitutions run in a fixed order:

    1. every run of whitespace is collapsed to a single space;
    2. every character that is not a word character, whitespace, period,
       hyphen, or comma is removed.

    The result is then lower-cased and stripped of leading/trailing
    whitespace. Because whitespace is collapsed *before* symbols are removed,
    dropping a free-standing symbol (e.g. ``"a & b"``) leaves two adjacent
    spaces in the output.

    Args:
        text: The string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    substitutions = (
        (r'\s+', ' '),            # collapse whitespace runs
        (r'[^\w\s\.\-\,]', ''),   # drop everything but words, spaces, . - ,
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.lower().strip()


# Test set to score. The path is hard-coded; point it at your own JSON file
# (or swap this for another data source) as needed.
with open('data/test_for_student.json', 'r', encoding='utf-8') as f:
    raw_json = f.read()
    test_for_student = json.loads(raw_json)


# Preprocess the test data (cleaning and preparing the text)
def process_test_data(data):
    """Turn the raw test records into model-ready strings.

    Args:
        data: Mapping of record id to a dict holding at least the keys
            ``'Title'`` and ``'Abstract'``.

    Returns:
        A ``(texts, ids)`` tuple of two parallel lists in the mapping's
        iteration order. Each text is the cleaned title and the cleaned
        abstract joined by `` [SEP] ``; each id is the corresponding key.
    """
    ids = list(data)
    texts = [
        f"{clean_text(record['Title'])} [SEP] {clean_text(record['Abstract'])}"
        for record in data.values()
    ]
    return texts, ids


# Process the test data
test_texts, test_ids = process_test_data(test_for_student)

# Tokenize the test data for the model
# NOTE(review): the whole test set is tokenized and scored in a single forward
# pass; switch to mini-batches if it ever outgrows device memory.
inputs = tokenizer(test_texts, padding=True, truncation=True, return_tensors="pt")
inputs = {key: value.to(device) for key, value in inputs.items()}

# Make predictions on the test data (eval mode, no gradient tracking)
model.to(device)
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits

# Convert logits to probabilities using sigmoid function (applied element-wise,
# so each class gets its own independent probability)
sigmoid = torch.nn.Sigmoid()
test_probabilities = sigmoid(logits).cpu().numpy()

# Per-class decision thresholds; entry i applies to classes[i].
optimized_thresholds = [0.61925291046766, 0.44774172613880536, 0.41414180606778695,
                        0.57874606204562, 0.60049749592320619, 0.45082401185132921,
                        0.53109708623883023, 0.45403000155140193, 0.60902179300296211,
                        0.42565307019191844, 0.50018945232073527, 0.59107437696539019,
                        0.47245547015847508, 0.55197602234063902, 0.46691181553592994,
                        0.62757920445700602, 0.52659046101203611, 0.48735158959652163]
# Alternative threshold set, currently unused:
# optimized_thresholds = [0.5070643424987793, 0.5264949202537537, 0.5755283832550049, 0.48306921124458313, 0.5450738668441772, 0.5368289351463318, 0.5293850302696228, 0.5136227011680603, 0.43008750677108765, 0.6268097758293152, 0.4477621018886566, 0.6254380941390991, 0.4889925718307495, 0.6306849718093872, 0.5403478741645813, 0.5076555013656616, 0.580173671245575, 0.5481595396995544]

# Fail early with a clear message if the model head, the label list and the
# threshold list disagree, instead of hitting an IndexError or a pandas
# column-count error further down.
if not (test_probabilities.shape[1] == len(classes) == len(optimized_thresholds)):
    raise ValueError(
        f"Expected {len(classes)} classes, got {test_probabilities.shape[1]} "
        f"model outputs and {len(optimized_thresholds)} thresholds"
    )

# Create a binary predictions array based on thresholds.
# The thresholds broadcast across rows, so column i is compared with
# threshold i. They are cast to the probabilities' dtype so the comparison
# runs at the same precision as comparing each column with a plain float.
# The result is an integer array, so the CSV contains 0/1 rather than 0.0/1.0.
threshold_row = np.asarray(optimized_thresholds, dtype=test_probabilities.dtype)
binary_predictions = (test_probabilities >= threshold_row).astype(int)

# One output row per test record: the id followed by its 0/1 flag per class.
output_predictions = [
    [test_id, *flags]
    for test_id, flags in zip(test_ids, binary_predictions.tolist())
]

# Convert the output to a DataFrame for easy saving
columns = ["id"] + classes
submission_df = pd.DataFrame(output_predictions, columns=columns)

# Save to a CSV file (optional)
submission_df.to_csv("submission.csv", index=False)
print("Predictions saved to submission.csv")

# Output: Predictions saved to submission.csv
