In [1]:
import torch
from transformers import AutoTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from torch.utils.data import Dataset, DataLoader, random_split
import gc
from tqdm import tqdm
from sklearn.metrics import accuracy_score

# Check if CUDA is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")

# Load a smaller subset of sentences and labels for initial testing
def read_sentences_and_labels(src_path, tgt_path, limit=5):  # Limit to 20,000 for reduced training time
    with open(src_path, "r", encoding="utf-8") as src_file, open(tgt_path, "r", encoding="utf-8") as tgt_file:
        incorrect_sentences = [line.strip() for line in tqdm(src_file.readlines(), desc="Reading Incorrect Sentences")]
        correct_sentences = [line.strip() for line in tqdm(tgt_file.readlines(), desc="Reading Correct Sentences")]
    sentences = incorrect_sentences[:limit] + correct_sentences[:limit]
    labels = [1] * min(len(incorrect_sentences), limit) + [0] * min(len(correct_sentences), limit)
    return sentences, labels

  warn(f"Failed to load image Python extension: {e}")


Using device: cuda


In [2]:
# Load sentences and labels (limited subset)
src_path = "wikiExtractsData/data/train_merge.src"
tgt_path = "wikiExtractsData/data/train_merge.tgt"
sentences, labels = read_sentences_and_labels(src_path, tgt_path)
print(len(sentences), sentences, labels)




# Tokenize sentences in smaller batches to avoid memory overload
batch_size = 500
input_ids_list = []
attention_mask_list = []

for i in tqdm(range(0, len(sentences), batch_size), desc="Batch Tokenizing Sentences"):
    batch_sentences = sentences[i:i+batch_size]
    tokenized_batch = tokenizer(
        batch_sentences,
        padding="max_length",
        truncation=True,
        max_length=64,  # Reduced max length for faster processing
        return_tensors="pt"
    )
    input_ids_list.append(tokenized_batch["input_ids"])
    attention_mask_list.append(tokenized_batch["attention_mask"])

# Concatenate tokenized tensors
tokenized_inputs = {
    "input_ids": torch.cat(input_ids_list, dim=0),
    "attention_mask": torch.cat(attention_mask_list, dim=0)
}
del input_ids_list, attention_mask_list  # Free up memory after concatenation
gc.collect()  # Explicit garbage collection

# Convert labels to a tensor
labels = torch.tensor(labels, dtype=torch.long)

Reading Incorrect Sentences: 100%|██████████| 2607757/2607757 [00:00<00:00, 3319468.67it/s]
Reading Correct Sentences: 100%|██████████| 2607757/2607757 [00:00<00:00, 3472020.49it/s]


10 ['तब राजा को आभास हुआ कि ब्राह्मण और कोई नहीं बल्कि देवों का वास्तुकार विश्वकर्मा थी .', 'अनेक समुदायों में देह को नदी में प्रवाहित करने की परंपरा हैं , ताकि पानी में रहने वाले विभिन्न जीवों को आहार उपलब्ध हो सके .', 'डीएनए क्षति और उत्परिवर्तन के बीच अंतर करना अत्यंत महत्वपूर्ण हैं .', 'यह खाना बनाने के काम आती है .', 'फ़िल्म का एल्बम अधिकार ज़ी म्यूजिक कंपनी द्वारा अधिगृहीत किए गए थे , और एल्बम को ११ मार्च २०१७ को रिलीज़ किया गया था .', 'तब राजा को आभास हुआ कि ब्राह्मण और कोई नहीं बल्कि देवों का वास्तुकार विश्वकर्मा था .', 'अनेक समुदायों में देह को नदी में प्रवाहित करने की परंपरा है , ताकि पानी में रहने वाले विभिन्न जीवों को आहार उपलब्ध हो सके .', 'डीएनए क्षति और उत्परिवर्तन के बीच अंतर करना अत्यंत महत्वपूर्ण है .', 'यह खाना बनाने के काम आता है .', 'फ़िल्म के एल्बम अधिकार ज़ी म्यूजिक कंपनी द्वारा अधिगृहीत किए गए थे , और एल्बम को ११ मार्च २०१७ को रिलीज़ किया गया था .'] [1, 1, 1, 1, 1, 0, 0, 0, 0, 0]


Batch Tokenizing Sentences: 100%|██████████| 1/1 [00:00<00:00, 29.76it/s]


In [3]:

# Define Dataset with Pre-tokenized Inputs
class PreTokenizedSentenceDataset(Dataset):
    def __init__(self, tokenized_inputs, labels):
        self.tokenized_inputs = tokenized_inputs
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.tokenized_inputs.items()}
        item["labels"] = self.labels[idx]
        return item

# Create the dataset
dataset = PreTokenizedSentenceDataset(tokenized_inputs, labels)

# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])


In [4]:

# Load pre-trained DistilBERT model for sequence classification and move it to the GPU if available
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-multilingual-cased",
    num_labels=2
).to(device)

# Define the compute_metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), axis=-1)
    accuracy = accuracy_score(labels, predictions)
    return {
        'accuracy': accuracy
    }

# Define Training Arguments with Improvements
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=1,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    fp16=True,
    gradient_accumulation_steps=4
)

# Define the Trainer with compute_metrics and early stopping
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=1)]
)

# Train the Model with a Progress Bar
trainer.train()


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.706055,0.5
2,No log,0.787354,0.0


TrainOutput(global_step=2, training_loss=0.1769561767578125, metrics={'train_runtime': 7.7626, 'train_samples_per_second': 3.092, 'train_steps_per_second': 0.386, 'total_flos': 264934797312.0, 'train_loss': 0.1769561767578125, 'epoch': 2.0})

In [5]:
# Evaluate the Model
results = trainer.evaluate()
print("Evaluation Results:", results)

# Save the Model
model.save_pretrained("./error_detection_model")
tokenizer.save_pretrained("./error_detection_model")

Evaluation Results: {'eval_loss': 0.7060546875, 'eval_accuracy': 0.5, 'eval_runtime': 0.0083, 'eval_samples_per_second': 241.747, 'eval_steps_per_second': 120.873, 'epoch': 2.0}


('./error_detection_model/tokenizer_config.json',
 './error_detection_model/special_tokens_map.json',
 './error_detection_model/vocab.txt',
 './error_detection_model/added_tokens.json',
 './error_detection_model/tokenizer.json')

In [6]:
# Make Predictions with the Trained Model
# Load the trained model and tokenizer for inference
model = DistilBertForSequenceClassification.from_pretrained("./error_detection_model").to(device)
tokenizer = AutoTokenizer.from_pretrained("./error_detection_model")

# Predict for a new sentence
sentence = "उसकी प्रतिभा की गहराई किसी अनजाने समुद्र जैसा है"
inputs = tokenizer(sentence, return_tensors="pt", padding="max_length", truncation=True, max_length=64).to(device)

# Perform inference
with torch.no_grad():  # Disable gradient calculations for faster inference
    outputs = model(**inputs)

# Get prediction
prediction = torch.argmax(outputs.logits, dim=-1).item()
if prediction == 1:
    print("Sentence contains errors.")
else:
    print("Sentence is error-free.")


Sentence is error-free.


In [7]:
def read_sentences_and_labels2(src_path, tgt_path, limit=20000000):  # Limit to 20,000 for reduced training time
    with open(src_path, "r", encoding="utf-8") as src_file, open(tgt_path, "r", encoding="utf-8") as tgt_file:
        incorrect_sentences = [line.strip() for line in tqdm(src_file.readlines(), desc="Reading Incorrect Sentences")]
        correct_sentences = [line.strip() for line in tqdm(tgt_file.readlines(), desc="Reading Correct Sentences")]
    sentences = incorrect_sentences[:limit] + correct_sentences[:limit]
    labels = [1] * min(len(incorrect_sentences), limit) + [0] * min(len(correct_sentences), limit)
    return sentences, labels

In [8]:
# Load sentences and labels (limited subset)
src_path = "wikiExtractsData/data/train_merge.src"
tgt_path = "wikiExtractsData/data/train_merge.tgt"
sentences, labels = read_sentences_and_labels2(src_path, tgt_path)


Reading Incorrect Sentences: 100%|██████████| 2607757/2607757 [00:00<00:00, 3441489.78it/s]
Reading Correct Sentences: 100%|██████████| 2607757/2607757 [00:00<00:00, 3369125.86it/s]


In [9]:
import random
numbers = random.sample(range(2607707, 2607857), k=20)
newsentences = []
newlabels = []
for i in numbers:
    if labels[i]!=0:
        newsentences.append(sentences[i])
        newlabels.append(labels[i])

newsentences, newlabels

(['मधु ने एक महल बनाया और उस स्थान की नाम मधुपुरी ( संभवतः मथुरा अब ) रखा .',
  'किसी एक उल्लिखित क्षेत्र में पाई जाने वाली भाषा संबंधी विशेषताओं का व्यवस्थित अध्ययन भाषा भूगोल या बोली भूगोल की अंतर्गत आता है .',
  '" अक्सर पूछे जाने वाली प्रश्न " लॉन्च किया , .',
  'यह जरूरी नहीं है कि सहयोग के लिये नेतृत्व का आवश्यक होता है .',
  'स्कूल के पूर्व छात्रों को आर्केशियन कहा जाता हैं .',
  'अभिनेता सनी देओल को विशेष ज्युरी अवार्ड के तौर पर राष्ट्रीय फिल्म पुरस्कार की ओर से सर्वश्रेष्ठ अभिनेता का खिताब नवाजी गया .',
  'महाभारत के युद्ध में काशिराज ने पांडवों के साथ दिया था .',
  '१९०१ में कोटद्वार को नगर का दर्जा दिया गया था , तथा उसी वर्ष हुई प्रथम जनगणना में नगर का जनसंख्या १०२९ थी .'],
 [1, 1, 1, 1, 1, 1, 1, 1])

In [10]:
# Make Predictions with the Trained Model
# Load the trained model and tokenizer for inference
model = DistilBertForSequenceClassification.from_pretrained("./error_detection_model").to(device)
tokenizer = AutoTokenizer.from_pretrained("./error_detection_model")

# Predict for a new sentence
for i in newsentences:
    sentence =i
    inputs = tokenizer(sentence, return_tensors="pt", padding="max_length", truncation=True, max_length=64).to(device)

    # Perform inference
    with torch.no_grad():  # Disable gradient calculations for faster inference
        outputs = model(**inputs)

    # Get prediction
    prediction = torch.argmax(outputs.logits, dim=-1).item()
    if prediction == 1:
        print("Sentence contains errors.")
    else:
        print("Sentence is error-free.")

Sentence is error-free.
Sentence is error-free.
Sentence contains errors.
Sentence is error-free.
Sentence is error-free.
Sentence is error-free.
Sentence is error-free.
Sentence is error-free.


In [11]:
def read_sentences_and_labels3(src_file, tgt_file, limit = 10):
    with open(src_file,'r', encoding='utf-8') as src, open(tgt_file,'r', encoding = 'utf-8') as tgt:
        IncorrectSentences = [line.strip() for line in src.readlines()]
        CorrectSentences = [line.strip() for line in tgt.readlines()]
        print(IncorrectSentences[:limit])
        print(CorrectSentences[:limit])

In [12]:

src_path = "wikiExtractsData/data/train_merge.src"
tgt_path = "wikiExtractsData/data/train_merge.tgt"
read_sentences_and_labels3(src_path, tgt_path)

['तब राजा को आभास हुआ कि ब्राह्मण और कोई नहीं बल्कि देवों का वास्तुकार विश्वकर्मा थी .', 'अनेक समुदायों में देह को नदी में प्रवाहित करने की परंपरा हैं , ताकि पानी में रहने वाले विभिन्न जीवों को आहार उपलब्ध हो सके .', 'डीएनए क्षति और उत्परिवर्तन के बीच अंतर करना अत्यंत महत्वपूर्ण हैं .', 'यह खाना बनाने के काम आती है .', 'फ़िल्म का एल्बम अधिकार ज़ी म्यूजिक कंपनी द्वारा अधिगृहीत किए गए थे , और एल्बम को ११ मार्च २०१७ को रिलीज़ किया गया था .', 'यहां पहाड़ों के मध्य फैली झील के आसपास किसी गेस्ट हाउस में रुक प्रकृति का मजा उठा सकतीं हैं .', 'रामनाथ उसकी ईमानदारी देख कर खुश हो जाता है और अपने कार्यालय में उसे क्लर्क की नौकरी दे देतीं है .', 'कई दिनों के बाद एक सत्रह वर्षीय केबिन कर्मचारी भूख और समुद्री पानी पी लेनी की वजह से बेहोश हो जाता है .', 'इनकी रहन सहन बहुत ही साधारण था .', 'हैदराबाद , मई 2007 हैदराबाद का मक्का मस्जिद में विस्फोट में 11 की मौत .']
['तब राजा को आभास हुआ कि ब्राह्मण और कोई नहीं बल्कि देवों का वास्तुकार विश्वकर्मा था .', 'अनेक समुदायों में देह को नदी में प्रवाहित करने की प

In [13]:
from difflib import SequenceMatcher
import torch
from transformers import AutoTokenizer, BertForTokenClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Example function to generate token-level labels based on incorrect and correct sentence pairs
def generate_token_labels(incorrect_sentence, correct_sentence):
    # Tokenize sentences
    incorrect_tokens = incorrect_sentence.split()
    correct_tokens = correct_sentence.split()
    
    # Use SequenceMatcher to find matching and differing blocks
    matcher = SequenceMatcher(None, incorrect_tokens, correct_tokens)
    labels = [0] * len(incorrect_tokens)  # Initialize all labels as correct (0)

    # Mark differing tokens as incorrect
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag in ("replace", "delete"):
            for i in range(i1, i2):
                labels[i] = 1  # Mark as incorrect

    return labels

# Read sentences and generate token-level labels
def read_sentences_and_labels3(src_path, tgt_path, limit=2):
    with open(src_path, "r", encoding="utf-8") as src_file, open(tgt_path, "r", encoding="utf-8") as tgt_file:
        incorrect_sentences = [line.strip() for line in tqdm(src_file.readlines(), desc="Reading Incorrect Sentences")]
        correct_sentences = [line.strip() for line in tqdm(tgt_file.readlines(), desc="Reading Correct Sentences")]

    # Limit dataset size for faster processing
    incorrect_sentences = incorrect_sentences[:limit]
    correct_sentences = correct_sentences[:limit]
    print(correct_sentences)
    print(incorrect_sentences)
    
    sentences = []
    labels = []

    # Generate token-level labels for each sentence pair
    for incorrect, correct in tqdm(zip(incorrect_sentences, correct_sentences), desc="Generating Labels", total=len(incorrect_sentences)):
        sentences.append(incorrect)
        token_labels = generate_token_labels(incorrect, correct)
        labels.append(token_labels)

    return sentences, labels

# Load sentences and labels (limited subset)
src_path = "wikiExtractsData/data/train_merge.src"
tgt_path = "wikiExtractsData/data/train_merge.tgt"
sentences, labels = read_sentences_and_labels3(src_path, tgt_path)

Reading Incorrect Sentences: 100%|██████████| 2607757/2607757 [00:00<00:00, 3396297.64it/s]
Reading Correct Sentences: 100%|██████████| 2607757/2607757 [00:00<00:00, 3524000.44it/s]


['तब राजा को आभास हुआ कि ब्राह्मण और कोई नहीं बल्कि देवों का वास्तुकार विश्वकर्मा था .', 'अनेक समुदायों में देह को नदी में प्रवाहित करने की परंपरा है , ताकि पानी में रहने वाले विभिन्न जीवों को आहार उपलब्ध हो सके .']
['तब राजा को आभास हुआ कि ब्राह्मण और कोई नहीं बल्कि देवों का वास्तुकार विश्वकर्मा थी .', 'अनेक समुदायों में देह को नदी में प्रवाहित करने की परंपरा हैं , ताकि पानी में रहने वाले विभिन्न जीवों को आहार उपलब्ध हो सके .']


Generating Labels: 100%|██████████| 2/2 [00:00<00:00, 9157.87it/s]


In [14]:
sentences, labels

(['तब राजा को आभास हुआ कि ब्राह्मण और कोई नहीं बल्कि देवों का वास्तुकार विश्वकर्मा थी .',
  'अनेक समुदायों में देह को नदी में प्रवाहित करने की परंपरा हैं , ताकि पानी में रहने वाले विभिन्न जीवों को आहार उपलब्ध हो सके .'],
 [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
  [0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   1,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0,
   0]])

In [15]:
from difflib import SequenceMatcher
import torch
from transformers import AutoTokenizer, BertForTokenClassification, Trainer, TrainingArguments
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
from sklearn.metrics import precision_recall_fscore_support

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Example function to generate token-level labels based on incorrect and correct sentence pairs
def generate_token_labels(incorrect_sentence, correct_sentence):
    # Tokenize sentences
    incorrect_tokens = incorrect_sentence.split()
    correct_tokens = correct_sentence.split()
    
    # Use SequenceMatcher to find matching and differing blocks
    matcher = SequenceMatcher(None, incorrect_tokens, correct_tokens)
    labels = [0] * len(incorrect_tokens)  # Initialize all labels as correct (0)

    # Mark differing tokens as incorrect
    for tag, i1, i2, j1, j2 in matcher.get_opcodes():
        if tag in ("replace", "delete"):
            for i in range(i1, i2):
                labels[i] = 1  # Mark as incorrect

    return labels

# Read sentences and generate token-level labels
def read_sentences_and_labels(src_path, tgt_path, limit=3000000):
    with open(src_path, "r", encoding="utf-8") as src_file, open(tgt_path, "r", encoding="utf-8") as tgt_file:
        incorrect_sentences = [line.strip() for line in tqdm(src_file.readlines(), desc="Reading Incorrect Sentences")]
        correct_sentences = [line.strip() for line in tqdm(tgt_file.readlines(), desc="Reading Correct Sentences")]

    # Limit dataset size for faster processing
    incorrect_sentences = incorrect_sentences[:limit]
    correct_sentences = correct_sentences[:limit]

    sentences = []
    labels = []

    # Generate token-level labels for each sentence pair
    for incorrect, correct in tqdm(zip(incorrect_sentences, correct_sentences), desc="Generating Labels", total=len(incorrect_sentences)):
        sentences.append(incorrect)
        token_labels = generate_token_labels(incorrect, correct)
        labels.append(token_labels)

    return sentences, labels

# Load sentences and labels (limited subset)
src_path = "wikiExtractsData/data/train_merge.src"
tgt_path = "wikiExtractsData/data/train_merge.tgt"
sentences, labels = read_sentences_and_labels(src_path, tgt_path)

# Tokenize and prepare dataset
class TokenClassificationDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len=64):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        # Tokenize the sentence and align labels with tokens
        sentence = self.sentences[idx]
        label = self.labels[idx]

        # Tokenize with padding and truncation
        encoding = self.tokenizer(
            sentence,
            padding="max_length",
            truncation=True,
            max_length=self.max_len,
            return_tensors="pt"
        )

        # Get input ids and attention mask
        input_ids = encoding["input_ids"].squeeze(0)
        attention_mask = encoding["attention_mask"].squeeze(0)

        # Align the labels with tokens
        word_ids = encoding.word_ids(batch_index=0)  # Map word_ids to tokens

        labels_aligned = []
        for word_id in word_ids:
            if word_id is None:
                labels_aligned.append(-100)  # Ignore these tokens (e.g., padding tokens)
            else:
                labels_aligned.append(label[word_id] if word_id < len(label) else 0)
        
        labels_aligned = torch.tensor(labels_aligned)

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels_aligned
        }

# Create the dataset
dataset = TokenClassificationDataset(sentences, labels, tokenizer)

# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Load pre-trained BERT model for token classification
model = BertForTokenClassification.from_pretrained(
    "bert-base-multilingual-cased",
    num_labels=2  # Binary classification: correct or incorrect
)

training_args = TrainingArguments(
    output_dir="./token_classification_results",
    evaluation_strategy="epoch",    # Evaluate at the end of each epoch
    save_strategy="epoch",          # Save at the end of each epoch
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=50,
    save_total_limit=1,
    load_best_model_at_end=True,
    fp16=True
)


# Define compute_metrics function
def compute_metrics(pred):
    labels = pred.label_ids.flatten()
    preds = torch.argmax(torch.tensor(pred.predictions), axis=-1).flatten()

    # Filter out -100 labels (ignored labels for padding, etc.)
    mask = labels != -100
    labels = labels[mask]
    preds = preds[mask]

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
    return {
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()
print("Evaluation Results:", results)

# Save the model
model.save_pretrained("./token_error_detection_model")
tokenizer.save_pretrained("./token_error_detection_model")

# Make predictions for new sentence
sentence = "यह एक गलत वाक्य है।"
inputs = tokenizer(sentence, return_tensors="pt", padding="max_length", truncation=True, max_length=64).to("cuda" if torch.cuda.is_available() else "cpu")

# Move model to CUDA if available
model.to("cuda" if torch.cuda.is_available() else "cpu")

with torch.no_grad():
    outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, axis=-1)

# Map predictions back to tokens
tokens = tokenizer.tokenize(sentence)
for token, prediction in zip(tokens, predictions[0][:len(tokens)]):
    status = "Incorrect" if prediction == 1 else "Correct"
    print(f"Token: {token}, Status: {status}")


Reading Incorrect Sentences: 100%|██████████| 2607757/2607757 [00:00<00:00, 3390690.16it/s]
Reading Correct Sentences: 100%|██████████| 2607757/2607757 [00:00<00:00, 3512584.87it/s]
Generating Labels: 100%|██████████| 2607757/2607757 [01:20<00:00, 32475.84it/s]
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kern

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 