In [None]:
!pip install stanza

import stanza
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor

# Download the Hindi ("hi") Stanza models, then build the module-level pipeline
# `nlp` that process_src_file_with_pooling uses to tag every batch.
# Only the tokenize and POS processors are loaded, with GPU use requested.
# NOTE(review): sentence splitting is left at Stanza's default here, while
# process_src_file_with_pooling joins input lines with "\n" and treats every
# resulting sentence as one output line. If input/output line alignment matters,
# consider tokenize_no_ssplit=True (with "\n\n"-separated input) — TODO confirm.
stanza.download("hi")
nlp = stanza.Pipeline("hi", batch_size=1024, processors="tokenize,pos", use_gpu=True)

def generate_number_agreement_errors(doc):
    """
    Inject number-agreement errors into one tagged sentence.

    Nouns and adjectives have a final "ा" swapped for "े" (and the reverse),
    the auxiliaries "है"/"हैं" are exchanged, and a small fixed set of
    pronouns is exchanged with its counterpart. Every other token is copied
    through unchanged.

    Args:
        doc: Sentence-like object whose ``words`` items expose ``text`` and
            ``upos`` (e.g. a Stanza sentence).
    Returns:
        str: The rewritten tokens joined by single spaces.
    """
    def flip_ending(token):
        # Toggle the final vowel sign between "ा" and "े"; other endings stay.
        if token.endswith("ा"):
            return token[:-1] + "े"
        if token.endswith("े"):
            return token[:-1] + "ा"
        return token

    # Whole-token exchanges, keyed by the surface form to replace.
    auxiliary_swaps = {"हैं": "है", "है": "हैं"}
    pronoun_swaps = {
        "उसका": "उनका",
        "उनका": "उसका",
        "अपना": "अपने",
        "अपने": "अपना",
    }

    rewritten = []
    for word in doc.words:
        tag = word.upos
        token = word.text
        if tag in ("NOUN", "ADJ"):
            # Nouns and adjectives share the same ending toggle.
            rewritten.append(flip_ending(token))
        elif tag == "AUX":
            rewritten.append(auxiliary_swaps.get(token, token))
        elif tag == "PRON":
            rewritten.append(pronoun_swaps.get(token, token))
        else:
            rewritten.append(token)

    return " ".join(rewritten)


def generate_case_marker_errors(doc):
    """
    Inject case-marker errors into one tagged sentence.

    Tokens tagged ADP that appear in the substitution table are replaced by
    a different marker; all other tokens, including ADP tokens not in the
    table, are copied through unchanged.

    Args:
        doc: Sentence-like object whose ``words`` items expose ``text`` and
            ``upos`` (e.g. a Stanza sentence).
    Returns:
        str: The rewritten tokens joined by single spaces.
    """
    # Marker found in the text -> wrong marker written in its place.
    wrong_marker = {
        "ने": "को",
        "को": "से",
        "से": "का",
        "का": "ने",
        "की": "ने",
        "के": "ने",
    }

    return " ".join(
        wrong_marker.get(word.text, word.text) if word.upos == "ADP" else word.text
        for word in doc.words
    )


def process_batch(documents, error_type='number_agreement'):
    """
    Process a batch of Stanza sentences in parallel to generate errors.

    Each sentence is handed to the generator selected by ``error_type`` via a
    ProcessPoolExecutor; results come back in the same order as the input.

    Args:
        documents: List of Stanza processed sentences.
        error_type: Type of error to generate ('number_agreement' or 'case_marker').
    Returns:
        List of modified sentences with errors (empty if ``documents`` is empty).
    Raises:
        ValueError: If ``error_type` is not a supported value.
    """
    # Reject unknown types up front: previously they fell through and returned
    # an empty list, so a typo silently produced no output at all.
    if error_type not in ('number_agreement', 'case_marker'):
        raise ValueError(
            f"Unknown error_type {error_type!r}; "
            "expected 'number_agreement' or 'case_marker'"
        )

    # Nothing to process: avoid starting worker processes for an empty batch.
    if not documents:
        return []

    if error_type == 'number_agreement':
        generator = generate_number_agreement_errors
    else:
        generator = generate_case_marker_errors

    # The pool pickles each sentence and the generator function to send them
    # to the workers; executor.map preserves input order.
    with ProcessPoolExecutor() as executor:
        return list(executor.map(generator, documents))


def process_src_file_with_pooling(input_file, output_file, batch_size=2048, error_type='number_agreement'):
    """
    Process a .src file to generate errors and save to a new file using pooling.

    The input is read fully into memory, tagged with the module-level Stanza
    pipeline ``nlp`` in batches, and each batch's modified sentences are
    written out as soon as they are produced.

    Args:
        input_file (str): Path to the input .src file.
        output_file (str): Path to the output .src file. Any existing file at
            this path is overwritten.
        batch_size (int): Number of sentences per batch.
        error_type (str): Type of error to generate ('number_agreement' or 'case_marker').
    """
    # Read all sentences from the input .src file
    with open(input_file, "r", encoding="utf-8") as f:
        sentences = [line.strip() for line in f]

    # Open the output once in write mode. Reopening in append mode per batch
    # never truncated the file, so re-running appended to stale output.
    with open(output_file, "w", encoding="utf-8") as out_f:
        # Process sentences in batches
        for i in tqdm(range(0, len(sentences), batch_size), desc="Processing Batches"):
            batch = sentences[i:i + batch_size]

            # Use Stanza to process the batch as one newline-joined text.
            # NOTE(review): the number of sentences Stanza returns is not
            # checked against len(batch); if its sentence splitting differs
            # from the input line breaks, output lines will not align with
            # input lines — TODO confirm against the pipeline configuration.
            doc = nlp("\n".join(batch))

            # Generate errors using pooling
            batch_modified_sentences = process_batch(doc.sentences, error_type=error_type)

            # Write each batch immediately so results are not accumulated in memory
            out_f.writelines(line + "\n" for line in batch_modified_sentences)

    print(f"Processed {len(sentences)} sentences. Output saved to {output_file}")


In [None]:
# Driver cell: generate the case-marker error corpus from the training source.
input_file = "wikiExtractsData/data/train_merge.src"
# Output path for the (currently disabled) number-agreement run below.
output_file = "train_data_with_case_errors.src"

# Disabled number-agreement run:
# process_src_file_with_pooling(input_file, output_file, batch_size=64, error_type='number_agreement')

# Output path for the case-marker run.
output_file2 = "train_data_case_marker_errors.src"

# Active run: write case-marker errors for every sentence, 64 sentences per batch.
process_src_file_with_pooling(input_file, output_file2, batch_size=64, error_type='case_marker')


# NOTE(review): the call below names a function that is not defined in the
# visible source; it looks like a leftover from an earlier version — confirm.
# process_src_file_with_case_marker_errors(input_file, output_file, batch_size=1)


In [None]:
# Specify file paths
input_file = "wikiExtractsData/data/train_merge.src"  # Path to your input .src file
output_file = "train_data_with_errors.src"  # Path to save the modified .src file

# Process the file
# NOTE(review): process_src_file_optimized is not defined in the visible source
# (the batch entry point defined here is process_src_file_with_pooling). Unless
# it is defined elsewhere, this call raises NameError — TODO confirm.
process_src_file_optimized(input_file, output_file, batch_size=1024)
