# In [10]:
import csv

# Lexicon for the rule-based tagger: maps each POS tag to the list of exact
# word forms that receive that tag.
#
# rule_based_pos_tagging walks this dict in insertion order and stops at the
# first category whose list contains the token, so when a word is listed under
# more than one tag the category that appears EARLIER here wins:
#   - "থাকিব" is in both VERB and AUX      -> tagged VERB
#   - "আগতে" is in both ADV and POSTP      -> tagged ADV
#   - several entries are shared by PART and INTJ -> tagged PART
# NOTE(review): confirm these precedences are intended before reordering.
POS_RULES = {
    "PRON": ["তেওঁ", "তুমি", "মই", "আমি", "আপুনি", "তেওঁলোকে", "তোৰ", "মোৰ", "আপোনাৰ"],
    "VERB": ["পঢ়ি", "খেলে", "গলো", "লিখি", "গৈছে", "আছে", "হল", "থাকিব", "চাইছা", "কৰা"],
    "NOUN": ["কিতাপ", "স্কুল", "ঘৰ", "মানুহ", "পাঠশালা", "শিক্ষক", "পঢ়া", "বাৰ্তাৰ", "পথ"],
    "AUX": ["থাকে", "হয়", "ছিল", "হ’ব", "নাই", "কৰিছিল", "কৰে", "হ’বলৈ", "থাকিব"],
    "ADJ": ["ধুনীয়া", "সুন্দৰ", "ডাঙৰ", "সৰু", "লম্বা", "গভীৰ", "দীঘল", "চকুতলগা"],
    "ADV": ["বেছি", "দ্রুত", "ধীৰে", "সৰ্বদা", "পিছত", "আগতে", "খুব", "অলপ", "আলফুলে"],
    "CONJ": ["আৰু", "কিন্তু", "অথবা", "যদি", "যেনেকৈ", "যেতিয়া", "যিহেতু"],
    "DET": ["এই", "সেই", "যি", "সকল", "কোনো", "প্ৰত্যেক", "তেনে", "যিখন"],
    "NUM": ["এটা", "দুটি", "তিনিটা", "এজন", "দশটা", "পাঁচটি", "হাজাৰ", "লাখ", "কিমান"],
    # Punctuation only matches when it is a whitespace-separated token of its
    # own; the tagger does not split punctuation off the end of a word.
    "PUNCT": [".", "।", ",", "?", "!", ";", ":", "-", "—", "..."],
    "PART": ["তো", "নে", "বাহ", "ওহো", "হাঁ", "না", "আহা", "হায়", "কি", "যেন", "বলি"],
    # NOTE(review): "পৰা" is listed twice (harmless), and the last entry
    # contains a space, so it can never equal a token produced by
    # sentence.split() in rule_based_pos_tagging.
    "POSTP": ["লগে", "পাছত", "বাবে", "আগতে", "পৰা", "লাগি", "খাতিৰত", "পৰা", "ৰ পৰা"],
    "INTJ": ["আহা", "অহো", "বাহ", "হাঁ", "নহ", "ধন্যবাদ", "আঁ", "অ", "ওহো", "হায়"],
    "UNK": []  # intentionally empty: "UNK" is assigned by the tagger's fallback, never by lookup
}

def rule_based_pos_tagging(sentence):
    """
    Assign a POS tag to every whitespace-separated token of ``sentence``.

    Each token is looked up in the POS_RULES lexicons; the first category
    (in dict order) that lists the token supplies its tag. Tokens found in
    no lexicon are passed to suffix_based_tagging, and get "UNK" when that
    also yields nothing.

    Returns a list of ``(token, tag)`` tuples in sentence order.
    """
    tagged_tokens = []

    for token in sentence.split():
        # Exact lexicon hit: take the first category that contains the token.
        label = next(
            (category for category, entries in POS_RULES.items() if token in entries),
            None,
        )
        if label is None:
            # No lexicon entry: try the suffix heuristics, else mark unknown.
            label = suffix_based_tagging(token) or "UNK"
        tagged_tokens.append((token, label))

    return tagged_tokens

def suffix_based_tagging(word):
    """
    Infers POS tag for a word based on common Assamese suffixes.

    The multi-character suffixes are tested before the single-character
    ones. Every ADJ suffix ends in "ৰ" (a NOUN suffix) and the ADV suffix
    "খিনি" ends in "ি" (a VERB suffix), so testing the short suffixes first
    would make those specific rules unreachable.

    Args:
        word: A single token.

    Returns:
        "ADJ", "ADV", "NOUN" or "VERB" when a suffix rule matches,
        otherwise None (including for the empty string).
    """
    # Most specific rules first; see the docstring for why order matters.
    if word.endswith(("ডাঙৰ", "সুন্দৰ")):
        return "ADJ"
    if word.endswith(("তকৈ", "খিনি")):
        return "ADV"
    # "ৰ পৰা" must stay ahead of the VERB rule: it ends in "া".
    if word.endswith(("ত", "ৰ", "ৰ পৰা")):
        return "NOUN"
    if word.endswith(("ি", "া", "ল")):
        return "VERB"
    return None

def tag_dataset(input_file, output_file):
    """
    Reads sentences from a CSV file, applies POS tagging,
    and saves results to an output file.

    The first row of ``input_file`` is treated as a header and discarded;
    the first column of every remaining non-blank row is taken as a
    sentence. ``output_file`` receives one row per token with the columns
    "Sentence Index", "Sentence", "Word", "POS Tag", where the index is the
    0-based position of the sentence among the non-blank data rows.

    Args:
        input_file: Path of the UTF-8 CSV file to read.
        output_file: Path of the UTF-8 CSV file to create or overwrite.
    """
    with open(input_file, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        # Drop the header; the default stops an empty file from raising
        # StopIteration.
        next(reader, None)
        # csv.reader yields [] for blank lines, which would make row[0]
        # raise IndexError, so such rows are skipped.
        data = [row[0] for row in reader if row]

    # Tag everything before opening the output so that a failure while
    # tagging does not leave a truncated output file behind.
    tagged_data = [
        [idx, sentence, word, tag]
        for idx, sentence in enumerate(data)
        for word, tag in rule_based_pos_tagging(sentence)
    ]

    with open(output_file, mode='w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Sentence Index", "Sentence", "Word", "POS Tag"])
        writer.writerows(tagged_data)
    print(f"POS tagging completed. Results saved to {output_file}.")

# Script entry point: runs the tagger only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    # Source CSV; tag_dataset skips its first row and reads column 0.
    input_csv = "/kaggle/input/correct-sentences/correct_sentences.csv"
    # Destination CSV; overwritten if it already exists (opened in 'w' mode).
    output_csv = "/kaggle/working/tagged_sentences.csv"  
    tag_dataset(input_csv, output_csv)


# Output: POS tagging completed. Results saved to /kaggle/working/tagged_sentences.csv.
