In [16]:
from transformers import AutoModelForSequenceClassification,  AutoTokenizer
import torch

In [None]:
import os
# Root of the competition data, relative to the notebook's working directory.
DATA_DIR = './eedi-mining-misconceptions-in-mathematics'

# Walk the data directory, printing and recording every file that is found.
files = []
for dirname, _, filenames in os.walk(DATA_DIR):
    for filename in filenames:
        print(os.path.join(dirname, filename))
        files.append(os.path.join(dirname, filename))


def _find_data_file(prefix):
    """Return the one discovered path whose base name starts with ``prefix``.

    The comparison is case-insensitive and looks only at the file name, not
    at the directories leading to it.

    Raises:
        FileNotFoundError: if no discovered file, or more than one, matches.
    """
    matches = [
        path for path in files
        if os.path.basename(path).lower().startswith(prefix)
    ]
    if len(matches) != 1:
        raise FileNotFoundError(
            f"Expected exactly one file starting with {prefix!r} under "
            f"{DATA_DIR!r}, found {len(matches)}: {matches}"
        )
    return matches[0]


# os.walk reports directory entries in arbitrary, filesystem-dependent order,
# so positional picks such as files[0] can silently bind the wrong CSV.
# Select by name instead.
# NOTE(review): the prefixes assume the data files are named
# misconception*/train*/test* -- confirm against the listing printed above.
misconceptions_filename = _find_data_file('misconception')
train_filename = _find_data_file('train')
test_filename = _find_data_file('test')

In [None]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Read the misconception table from the CSV path chosen during file discovery.
misc_con_data = pd.read_csv(filepath_or_buffer=misconceptions_filename)

# Leave a five-row preview as the cell's output.
misc_con_data.head(n=5)

In [None]:
# Read the training table from the CSV path chosen during file discovery.
train_data = pd.read_csv(filepath_or_buffer=train_filename)

# Leave a five-row preview as the cell's output.
train_data.head(n=5)

In [None]:
# Inspect the first training row to see what a single example looks like.
first_row = train_data.iloc[0]
print(first_row)

# Look up the misconception record referenced by this row's answer-D
# misconception id.
misconception_row = misc_con_data[misc_con_data.MisconceptionId == first_row.MisconceptionDId]
print(misconception_row)

# Build one text string from the construct, subject, question and answer D.
questionText = f"{first_row.ConstructName} {first_row.SubjectName} {first_row.QuestionText} {first_row.AnswerDText}"

# str.replace returns a new string and leaves the original untouched, so the
# result has to be assigned back for the newlines to actually be removed.
questionText = questionText.replace('\n', ' ')
print(questionText)

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# NOTE(review): no cell visible in this notebook pushes to the Hub or loads a
# private checkpoint -- confirm that the login is actually required.
from huggingface_hub import notebook_login
notebook_login()

## Import the dataset

In [None]:
from datasets import load_dataset, Dataset

# Wrap the training DataFrame in a Hugging Face `Dataset` so that the
# `datasets` API can be used on it in later cells.
dataset = Dataset.from_pandas(df=train_data)

# Print the resulting dataset summary as a quick check of the conversion.
print(dataset)

## Preprocess data

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("tbs17/MathBERT")

# Every row is padded/truncated to this many tokens. A fixed length is used
# because `padding=True` only pads to the longest row of each `map` batch, so
# rows coming from different batches would end up with different lengths.
# NOTE(review): 512 is the usual BERT position limit -- confirm for this
# checkpoint via `model.config.max_position_embeddings`.
MAX_LENGTH = 512


def preprocess_function(examples):
    """Tokenize a batch of rows taken from the training dataset.

    Args:
        examples: a batch as passed by `Dataset.map(..., batched=True)`; must
            contain a 'QuestionText' column.

    Returns:
        The tokenizer output for the batch, with every row truncated or
        padded to MAX_LENGTH tokens.
    """
    # The training table has no 'text' column; 'QuestionText' is the column
    # that holds the question body.
    return tokenizer(
        examples['QuestionText'],
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
    )


# `batched=True` is part of the Hugging Face `Dataset.map` API, so map over
# `dataset` rather than the pandas DataFrame. The dataset is split first so
# that the result exposes the 'train' and 'test' keys used by the trainer cell.
# NOTE(review): no `label`/`labels` column is created anywhere visible in this
# notebook; the Trainer presumably needs one to compute a loss -- TODO add it.
tokenized_dataset = (
    dataset
    .train_test_split(test_size=0.2, seed=42)
    .map(preprocess_function, batched=True)
)


## Fine-tune the model

In [None]:
from transformers import TrainingArguments, Trainer, AutoModelForSequenceClassification

# Load the MathBERT checkpoint with a two-way sequence-classification head.
# NOTE(review): two labels looks like a placeholder -- confirm it matches the
# label scheme of the training data.
model = AutoModelForSequenceClassification.from_pretrained(
    "tbs17/MathBERT",
    num_labels=2,
)

# Hyper-parameters for the run; evaluation happens once per epoch.
# NOTE(review): newer transformers releases name this option `eval_strategy`
# -- check it against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",
)

# `tokenized_dataset` must expose both a 'train' and a 'test' split.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_dataset['test'],
    train_dataset=tokenized_dataset['train'],
)

# Start fine-tuning.
trainer.train()


In [None]:
# Checkpoint identifier shared by the classifier and the tokenizer below.
model_name = 'tbs17/MathBERT'

# NOTE(review): this rebinds `model` to a freshly loaded checkpoint, so any
# weights fine-tuned earlier in the notebook are no longer referenced by
# `model` -- confirm that this is intended.
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:


# Use the question text assembled earlier as the single example to classify.
expression = questionText

# Tokenize into PyTorch tensors. Truncate to the model's position-embedding
# limit so that an over-long question cannot overflow the embedding table.
inputs = tokenizer(
    expression,
    return_tensors="pt",
    truncation=True,
    max_length=model.config.max_position_embeddings,
)
# Keep the inputs on the same device as the model's weights.
inputs = inputs.to(model.device)
print(inputs)

# Switch off dropout so that the forward pass is deterministic.
model.eval()

# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model(**inputs)

# `outputs` carries the classification logits for the example.
print(outputs)


In [None]:
import torch.nn.functional as F

# Raw, unnormalised class scores produced by the forward pass.
logits = outputs.logits

# Normalise the scores over the class axis so that they sum to 1.
probs = F.softmax(logits, dim=-1)

# Index of the most probable class.
predicted_class = torch.argmax(probs, dim=-1)

# Candidate label names, in the row order of the misconception table.
labels = misc_con_data['MisconceptionName'].tolist()

# The lookup below treats the class index as a row position in `labels`, which
# is only meaningful when the classifier has one output per misconception.
# Say so loudly instead of silently printing an unrelated name.
num_classes = logits.shape[-1]
if num_classes != len(labels):
    print(
        f"Warning: the model has {num_classes} output classes but there are "
        f"{len(labels)} misconception labels; the predicted label below is "
        f"not meaningful."
    )

predicted_label = labels[predicted_class.item()]

print(f"The model predicts: {predicted_label}")