In [None]:
# !rm -rf /kaggle/working/*

# Importing Libraries

In [None]:
import re
import torch

import pandas as pd
import numpy as np

import datasets
from datasets import Dataset

# !pip install /kaggle/input/pyspellchecker/pyspellchecker-0.8.0-py3-none-any.whl
# from spellchecker import SpellChecker

from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Globals

In [None]:
# --- Kaggle input locations -------------------------------------------------
# Directory both the tokenizer and the classification model are loaded from.
CHECKPOINT_PATH = '/kaggle/input/deberta-v3-large-finetuned/'
# CSV with the essays to score; the notebook reads its `id` and `text` columns.
TEST_CSV_INPUT_PATH = '/kaggle/input/llm-detect-ai-generated-text/test_essays.csv'

# --- Tokenization -----------------------------------------------------------
# Maximum sequence length handed to the tokenizer in `preprocess_function`.
MAX_LENGTH = 512

# Testing

In [None]:
def preprocess_function(ds: Dataset, text_col: str = 'text'):
    """Tokenize the `text_col` entries of `ds` with the notebook-level tokenizer.

    Relies on two module-level names: `tokenizer` and `MAX_LENGTH`. Sequences
    are truncated to `MAX_LENGTH` tokens and padded (`padding=True`).

    Args:
        ds: Object indexable by column name. In this notebook the function is
            passed to `Dataset.map(..., batched=True)`, so it receives whatever
            batch object `map` supplies.
        text_col: Name of the column holding the raw text.

    Returns:
        Whatever the tokenizer call returns for the selected texts.
    """
    texts = ds[text_col]
    tokenizer_options = dict(max_length=MAX_LENGTH, padding=True, truncation=True)
    return tokenizer(texts, **tokenizer_options)

In [None]:
def correct_text_spelling(text, spell=None):
    """Return `text` with misspelled words replaced by the checker's suggestion.

    Words are extracted with the pattern ``\\b\\w+\\b``; those the checker
    reports as unknown are looked up once each and substituted as *whole
    words only*, so a misspelling that happens to be a substring of another
    word (e.g. "wich" inside "sandwich") no longer corrupts that word.
    Because the checker may report unknown words lower-cased, tokens are
    matched case-insensitively and the capitalisation of the original token
    (all caps / leading capital) is re-applied to the suggestion.

    Args:
        text: The string to correct.
        spell: Optional checker exposing `unknown(words)` and
            `correction(word)`. When omitted a new `SpellChecker()` is
            created, which requires the `spellchecker` import (commented out
            at the top of this notebook) to be enabled. Passing a shared
            instance avoids rebuilding the checker for every text.

    Returns:
        The corrected string; `text` itself when nothing needs changing.
    """
    if spell is None:
        spell = SpellChecker()

    word_pattern = re.compile(r'\b\w+\b')
    words = word_pattern.findall(text)

    # Resolve every unknown word exactly once; skip words with no usable
    # suggestion so they are left untouched.
    corrections = {}
    for word in spell.unknown(words):
        suggestion = spell.correction(word)
        if suggestion and suggestion != word:
            corrections[word] = suggestion
    if not corrections:
        return text

    def _fix(match):
        token = match.group(0)
        if token in corrections:
            return corrections[token]
        suggestion = corrections.get(token.lower())
        if suggestion is None:
            return token
        # Carry the original token's capitalisation over to the suggestion.
        if len(token) > 1 and token.isupper():
            return suggestion.upper()
        if token[0].isupper():
            return suggestion[0].upper() + suggestion[1:]
        return suggestion

    return word_pattern.sub(_fix, text)

def correct_spelling(df, text_col: str = 'text') -> pd.DataFrame:
    """Spell-correct one text column of `df` without modifying `df`.

    Args:
        df: Frame containing the column to correct.
        text_col: Name of the column whose values are passed, one by one,
            to `correct_text_spelling`.

    Returns:
        A Series named 'corrected', indexed like `df`, holding the corrected
        texts (the declared return annotation is kept as in the original).
    """
    corrected_texts = df[text_col].apply(correct_text_spelling)
    return corrected_texts.rename('corrected')

In [None]:
# Load the tokenizer from the same directory the model is loaded from
# (CHECKPOINT_PATH), requesting the fast implementation. The resulting
# module-level `tokenizer` is read by `preprocess_function` and passed
# explicitly to `get_probas_direct`.
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_PATH, use_fast=True)

In [None]:
def get_probas_direct(model, tokenizer, ds_enc: 'Dataset', batch_size: int = 16,
                      max_length=None) -> np.ndarray:
    """Score the raw texts of `ds_enc` and return the class-1 probability of each.

    The texts are tokenized and run through the model in mini-batches rather
    than in one giant forward pass, so memory use is bounded by `batch_size`
    instead of by the size of the dataset. Inputs are moved to the model's
    device and results are brought back to the CPU before conversion to NumPy.

    Args:
        model: Callable sequence-classification model whose output exposes
            `.logits` with one row per input text.
        tokenizer: Callable tokenizer accepting a list of strings plus the
            keyword arguments `truncation`, `max_length`, `return_tensors`
            and `padding`, and returning a mapping of tensors.
        ds_enc: Object whose `'text'` entry yields the raw strings. Only that
            column is read; any pre-computed encodings are ignored.
        batch_size: Number of texts per forward pass (must be >= 1).
        max_length: Optional truncation length forwarded to the tokenizer.
            `None` (the default) leaves the tokenizer's own limit in effect,
            which is what the original single-call version did.

    Returns:
        1-D array with the softmax probability of label index 1 per text, in
        input order. Empty input yields an empty array.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    # Materialise as a plain list so slicing works for any column container.
    texts = list(ds_enc['text'])
    if not texts:
        return np.empty(0, dtype=np.float32)

    # Prefer the model's own `device` attribute; fall back to its first
    # parameter, then to CPU for parameter-free callables.
    device = getattr(model, 'device', None)
    if device is None:
        first_param = next(iter(model.parameters()), None) if hasattr(model, 'parameters') else None
        device = first_param.device if first_param is not None else torch.device('cpu')

    # Make sure dropout and similar layers are in inference mode.
    model.eval()

    batch_probas = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            inputs = tokenizer(
                batch_texts,
                truncation=True,
                max_length=max_length,
                return_tensors='pt',
                padding=True,
            )
            inputs = {key: value.to(device) for key, value in inputs.items()}
            logits = model(**inputs).logits
            probs = torch.softmax(logits, dim=-1)
            # `.cpu()` is required before `.numpy()` when running on an accelerator.
            batch_probas.append(probs[:, 1].cpu().numpy())

    return np.concatenate(batch_probas)

In [None]:
# End-to-end inference on the test essays. Every name assigned here is
# module-level state; `df_test_id` and `ds_test_proba` are consumed by the
# submission cell further down.

# Read the raw test essays.
df_test = pd.read_csv(TEST_CSV_INPUT_PATH)

# Optional spelling normalisation, currently disabled. Enabling it also
# requires the `SpellChecker` import that is commented out in the imports cell.
# df_test['text'] = correct_spelling(df_test, 'text')

# Keep only the columns used below and remember the ids for the submission.
df_test = df_test[['id', 'text']]
df_test_id = df_test['id']

# Wrap the frame in a `Dataset` and add tokenizer encodings to it.
# NOTE(review): `get_probas_direct` tokenizes the raw 'text' column itself,
# so the encodings added by this `map` call do not appear to be consumed —
# confirm whether this step is still needed.
ds_test = Dataset.from_pandas(df_test)
ds_test_enc = ds_test.map(preprocess_function, batched=True)

# Load the classification model from the same checkpoint directory as the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(CHECKPOINT_PATH)

# One probability (label index 1) per test essay, in row order.
ds_test_proba = get_probas_direct(model, tokenizer, ds_test_enc)

## Submission

In [None]:
# Assemble the submission: one row per essay id with its predicted
# probability, then write it to disk without the index column.
submission_columns = {'id': df_test_id, 'generated': ds_test_proba}
results = pd.DataFrame(submission_columns)
results.to_csv('submission.csv', index=False)

In [None]:
# Display the submission frame as the cell output for a visual sanity check.
results