In [13]:
!pip install transformers datasets evaluate accelerate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.10.0->acce

In [31]:
import pandas as pd
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
import evaluate
import numpy as np
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

In [41]:
def get_qas(df,leave_id_out=True):
  """
  Pairs every question with each of its answers.

  Rows whose ``ParentId`` is missing are treated as questions, every other
  row as an answer. Each answer is inner-joined to the question whose ``Id``
  equals the answer's ``ParentId``, so questions without answers (and answers
  whose parent is not present in ``df``) do not appear in the result.

  Only the columns that are actually used (``Id``, ``ParentId``, ``Body``,
  ``Score``) are read. Other metadata columns may be present or absent.

    Args:
        df (DataFrame): The given DataFrame; must contain the columns
            ``Id``, ``ParentId``, ``Body`` and ``Score``.
        leave_id_out (bool): If True (default) the ``question_id`` column is
            removed from the result.

    Returns:
        DataFrame: one row per (question, answer) pair with the columns
        ``question``, ``answer``, ``score`` (the answer's score), preceded by
        ``question_id`` when ``leave_id_out`` is False.

    Raises:
        KeyError: If one of the required columns is missing from ``df``.
  """
  required = ['Id', 'ParentId', 'Body', 'Score']
  missing = [col for col in required if col not in df.columns]
  if missing:
    raise KeyError(f"get_qas: missing required column(s): {missing}")

  # answers point at their question through ParentId; questions have none
  is_answer = df['ParentId'].notna()

  # keep only what is needed and give the columns their final names up front,
  # so the merge produces no suffixed duplicates
  questions = df.loc[~is_answer, ['Id', 'Body']].rename(
      columns={'Id': 'question_id', 'Body': 'question'})
  answers = df.loc[is_answer, ['ParentId', 'Body', 'Score']].rename(
      columns={'Body': 'answer', 'Score': 'score'})

  # merge (inner join: one output row per answer that has a known question)
  merged = pd.merge(questions, answers, left_on='question_id', right_on='ParentId')
  merged = merged[['question_id', 'question', 'answer', 'score']]

  if leave_id_out:
    merged = merged.drop(columns=['question_id'])

  return merged

def normalize_scores(df,leave_max_out=True):
  """
  Normalizes scores based on the highest scored answer per question.

  Rows are grouped by the text in the ``question`` column and every ``score``
  is divided by the largest score within its group. The input frame is not
  modified; a new DataFrame is returned.

  NOTE: the division is not guarded. A group whose highest score is 0 yields
  NaN/inf, and a group whose highest score is negative yields ratios >= 1.

    Args:
        df (DataFrame): The given DataFrame; must contain the columns
            ``question`` and ``score``.
        leave_max_out (bool): If True (default) the helper ``max_score``
            column is removed from the result.

    Returns:
        DataFrame: a copy of ``df`` in which ``score`` holds the normalized
        value (placed as the last column), with ``max_score`` just before it
        when ``leave_max_out`` is False.
  """
  # work on a copy so the caller's DataFrame does not gain helper columns
  out = df.copy()

  # max_score by question
  out['max_score'] = out.groupby('question')['score'].transform('max')

  # normalize
  out['normalized_score'] = out['score'] / out['max_score']

  out = out.drop(columns=['score'])

  if leave_max_out:
    out = out.drop(columns=['max_score'])

  return out.rename(columns={'normalized_score': 'score'})

def preprocess_function(examples):
    """Tokenize question/answer pairs with the module-level ``tokenizer``.

    Args:
        examples: Mapping with the keys ``'question'`` and ``'answer'``.

    Returns:
        Whatever ``tokenizer`` returns for the (question, answer) pair,
        called with ``truncation=True``.
    """
    questions = examples['question']
    answers = examples['answer']
    return tokenizer(questions, answers, truncation=True)

def compute_metrics(eval_pred):
    """Compute accuracy from an evaluation (predictions, labels) pair.

    The index of the largest value along axis 1 of the predictions is taken
    as the predicted class and compared against the labels by the
    module-level ``accuracy`` metric.

    Args:
        eval_pred: Two-element iterable ``(predictions, labels)``.

    Returns:
        The result of ``accuracy.compute``.
    """
    # NOTE(review): the `score` column in this notebook is a continuous
    # ratio; confirm that argmax-based accuracy matches the labels the model
    # is actually trained on.
    raw_outputs, references = eval_pred
    predicted_classes = np.argmax(raw_outputs, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

In [37]:
# Latin StackExchange dump (parsed CSV)
df = pd.read_csv('https://github.com/yarathealmighty/dumps/blob/main/data/parsed/latin.csv?raw=True')

# question/answer pairs, then scores scaled by each question's best answer
# TODO check out different ways to normalize single answers
latin = df.pipe(get_qas).pipe(normalize_scores)
latin

Unnamed: 0,question,answer,score
0,I was taught that one can use the '-que' suffi...,"The way I was taught was that, as a general ru...",1.000000
1,I was taught that one can use the '-que' suffi...,"In Ecclesiastical Latin ""-que"" would be used i...",0.340909
2,I was taught that one can use the '-que' suffi...,"Both et and -que can often translate ""and"". Th...",0.250000
3,I was taught that one can use the '-que' suffi...,James Kingsbery's answer is exactly correct. I...,0.159091
4,I was taught that one can use the '-que' suffi...,The que suffix has a usage example with moment...,0.045455
...,...,...,...
9328,"So, I'm a PhD student working on the history o...",I would advocate for a freer translation: I th...,0.250000
9329,I came across this on google translate and I l...,"No. Anima is the Latin word for soul, apart fr...",1.000000
9330,"In a previous question of mine, What diphthong...","It isn’t traditionally used in normal writing,...",1.000000
9331,"In a previous question of mine, What diphthong...","Most of them don't need to be marked, since th...",0.500000


In [38]:
# `se` is short for StackExchange
se = Dataset.from_pandas(latin)

# hold out 20% of the pairs for testing; fixed seed keeps the split stable
train_test_split = se.train_test_split(test_size=0.2, seed=42)

se = DatasetDict(
    {split: train_test_split[split] for split in ('train', 'test')}
)

In [39]:
# Tokenizer for the DistilBERT checkpoint. It must be created before the
# map() call below, because preprocess_function reads the global `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

# Apply preprocess_function to every split of `se` (called with batched=True).
tokenized_se = se.map(preprocess_function, batched=True)

# Collator built around the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Metric object used by compute_metrics (global `accuracy`).
accuracy = evaluate.load("accuracy")

Map:   0%|          | 0/7466 [00:00<?, ? examples/s]

Map:   0%|          | 0/1867 [00:00<?, ? examples/s]