In [12]:
import csv
import random

# Substitution table used to inject errors into sentences.
# Shape: POS tag -> {original word -> list of replacement candidates}.
# A word is only ever replaced when it appears as a key under its own tag.
# Invariants kept by this table: no candidate list contains its own key
# (that would produce a "replacement" identical to the original), and no
# candidate is listed twice (duplicates would skew a uniform random pick).
POS_TAG_ERRORS = {
    "PRON": {
        "মই": ["তুমি", "তেওঁ", "আপুনি", "তোমাৰ", "তেওঁলোকে"],
        "তুমি": ["মই", "আপুনি", "তেওঁ", "তোমাৰ", "তেওঁলোকে"],
        "তেওঁ": ["মই", "তুমি", "আপুনি", "তেওঁলোকে", "তোৰ"]
    },
    "VERB": {
        "পঢ়ি": ["পঢ়িছিল", "পঢ়িল", "পঢ়ি থাকে", "পঢ়িছো", "পঢ়ি যাব"],
        "গলো": ["গল", "গলৰ", "গলোঁ", "গ’ল", "গলাইছিল"],
        "থাকে": ["ছিল", "থাকিছিল", "থাকিল", "থাকিব", "থাকিবলগীয়া"],
        "লিখি": ["লিখিছিল", "লিখো", "লিখিব", "লিখি গৈছিল"],
        "খেলে": ["খেলি", "খেলোঁ", "খেলিব", "খেলিছো", "খেলিছিল"]
    },
    "NOUN": {
        "কিতাপ": ["কিতাপবোৰ", "কিতাপৰ", "কিতাপটি", "কিতাপৰ পৰা"],
        "স্কুল": ["স্কুলবোৰ", "স্কুলৰ", "স্কুলৰ পৰা"],
        "মানুহ": ["মানুহবোৰ", "মানুহৰ", "মানুহৰ পৰা"],
        "পাঠশালা": ["পাঠশালাবোৰ", "পাঠশালাৰ", "পাঠশালাৰ পৰা"],
    },
    "AUX": {
        "হয়": ["ছিল", "থাকিল", "হ’ব", "হ’বলৈ", "থাকিব"]
    },
    "ADJ": {
        "ধুনীয়া": ["সুন্দৰ", "ডাঙৰ", "নতুন", "মাধুৰ", "চকুতলগা"],
        "সুন্দৰ": ["ধুনীয়া", "ডাঙৰ", "লম্বা", "মুখৰ পৰা"],
        "ডাঙৰ": ["সৰু", "লম্বা", "হালধীয়া", "বাচল"],
    },
    "ADV": {
        "বেছি": ["খুব", "অলপ", "ধীৰে", "সৰ্বদা", "অল্প", "পিছত"],
        "দ্রুত": ["ধীৰে", "সৰ্বদা", "বাহ", "খুব", "অলপ"],
    },
    "PUNCT": {
        "।": [".", "!", "?", ";", ":"]
    },
    # Words tagged "UNK" have no substitutions; an empty mapping keeps the
    # same shape as every other tag entry.
    "UNK": {}
}

def insert_errors(sentence, pos_tags):
    """
    Build an erroneous variant of a sentence from its (word, POS tag) pairs.

    Each word that has replacement candidates in POS_TAG_ERRORS under its tag
    is swapped for one candidate chosen at random; every other word is kept
    unchanged.

    Args:
        sentence: The original sentence. Accepted for interface compatibility;
            the result is assembled from ``pos_tags`` alone.
        pos_tags: Iterable of ``(word, pos_tag)`` pairs in sentence order.

    Returns:
        The (possibly substituted) words joined by single spaces. An empty
        ``pos_tags`` yields an empty string.
    """
    corrupted_words = []

    for word, pos_tag in pos_tags:
        tag_table = POS_TAG_ERRORS.get(pos_tag)
        # A tag entry that is not a mapping (e.g. a bare list placeholder)
        # carries no per-word substitutions, so treat it as "no candidates".
        candidates = tag_table.get(word, ()) if isinstance(tag_table, dict) else ()
        # A candidate equal to the original word would not introduce an error,
        # so it is excluded from the draw.
        candidates = [option for option in candidates if option != word]
        # random.choice raises IndexError on an empty sequence; fall back to
        # the original word when nothing usable is left.
        corrupted_words.append(random.choice(candidates) if candidates else word)

    return " ".join(corrupted_words)

def rule_based_pos_tagging(sentence):
    """
    Assign a POS tag to every whitespace-separated word of a sentence.

    Each word is looked up in the POS_RULES lexicons in iteration order and
    takes the tag of the first lexicon containing it. Words found in no
    lexicon are passed to suffix_based_tagging; if that returns a falsy
    value the word is tagged "UNK".

    Returns a list of (word, tag) tuples in sentence order.
    """
    tagged_words = []

    for token in sentence.split():
        for label, entries in POS_RULES.items():
            if token in entries:
                tagged_words.append((token, label))
                break
        else:
            # No lexicon matched: fall back to the suffix heuristic.
            fallback = suffix_based_tagging(token)
            tagged_words.append((token, fallback or "UNK"))

    return tagged_words

def tag_and_insert_errors(input_file, output_file):
    """
    Tag every sentence of a CSV dataset and write a per-word error dataset.

    The input CSV must provide "Sentence Index" and "Sentence" columns. Each
    sentence is tagged with rule_based_pos_tagging and corrupted with
    insert_errors. One output row is written per (word, tag) pair, with the
    columns: Sentence Index, Sentence, Incorrect Sentence, Word, POS Tag.
    A sentence that yields no tags contributes no rows.

    Args:
        input_file: Path of the CSV file to read.
        output_file: Path of the CSV file to create or overwrite.

    Raises:
        KeyError: If a row lacks the "Sentence Index" or "Sentence" column.
    """
    fieldnames = ["Sentence Index", "Sentence", "Incorrect Sentence", "Word", "POS Tag"]

    # "utf-8-sig" decodes plain UTF-8 as well, but also drops a leading BOM.
    # Without this, a BOM would be glued onto the first header name and the
    # "Sentence Index" lookup below would fail.
    with open(input_file, newline='', encoding='utf-8-sig') as csvfile:
        data = list(csv.DictReader(csvfile))

    tagged_data = []
    for entry in data:
        sentence_index = entry["Sentence Index"]
        sentence = entry["Sentence"]

        pos_tags = rule_based_pos_tagging(sentence)
        incorrect_sentence = insert_errors(sentence, pos_tags)

        # The sentence-level fields are repeated on every word row.
        for word, pos_tag in pos_tags:
            tagged_data.append({
                "Sentence Index": sentence_index,
                "Sentence": sentence,
                "Incorrect Sentence": incorrect_sentence,
                "Word": word,
                "POS Tag": pos_tag
            })

    with open(output_file, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(tagged_data)

    print(f"Dataset with errors saved to {output_file}.")

# Script entry point: runs the tagging + error-insertion pipeline once when
# this code is executed directly (not when it is imported as a module).
if __name__ == "__main__":
    # Hard-coded absolute paths under /kaggle; adjust them when running elsewhere.
    input_csv = "/kaggle/input/tagged/tagged_sentences.csv" 
    output_csv = "/kaggle/working/preprocessed_dataset.csv"
    tag_and_insert_errors(input_csv, output_csv)


Dataset with errors saved to /kaggle/working/final_dataset.csv.
