# In [None]:
import numpy as np
from collections import defaultdict
import Levenshtein
import re
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import os

def load_nepali_words(corpus_path="data/nepali_corpus.txt"):
    """Load the set of unique Nepali words from a corpus file.

    The file is read as UTF-8 with one word per line. Surrounding
    whitespace is stripped from every line and blank lines are ignored.

    Args:
        corpus_path: Path of the corpus file. Defaults to the location the
            rest of this script uses, ``data/nepali_corpus.txt``.

    Returns:
        A ``set`` of the distinct, stripped, non-empty lines of the file.

    Raises:
        OSError: If the file cannot be opened (e.g. it does not exist).
    """
    with open(corpus_path, encoding='utf-8') as corpus:
        stripped_lines = (line.strip() for line in corpus)
        # Build the set while the file is still open: the generator is lazy.
        return {word for word in stripped_lines if word}

def create_character_mapping(words):
    """Build the character <-> index lookup tables used by the model.

    Every distinct character found in ``words`` is numbered from 1 upwards
    in sorted order; index 0 is reserved for the ``'<PAD>'`` token.

    Returns:
        A ``(char_to_idx, idx_to_char)`` pair of dictionaries that are
        inverses of each other.
    """
    alphabet = sorted({char for word in words for char in word})

    # Real characters start at 1 so that 0 stays free for padding.
    char_to_idx = dict(zip(alphabet, range(1, len(alphabet) + 1)))
    char_to_idx['<PAD>'] = 0

    idx_to_char = {}
    for char, idx in char_to_idx.items():
        idx_to_char[idx] = char

    return char_to_idx, idx_to_char

def prepare_training_data(words, char_to_idx, max_len=20):
    """Turn words into (prefix, next character) training examples.

    For each word, every proper non-empty prefix becomes one input sample
    and the character that follows it becomes the target. Words with fewer
    than two characters therefore contribute no samples.

    Args:
        words: Iterable of words to learn from.
        char_to_idx: Mapping from character to integer index; it must
            contain every character that is looked up (``KeyError``
            otherwise).
        max_len: Length every input sequence is padded to (on the left).

    Returns:
        ``(X, y)`` where ``X`` is the output of ``pad_sequences`` for the
        encoded prefixes and ``y`` is the one-hot encoding of the target
        indices with ``len(char_to_idx)`` classes.
    """
    encoded_prefixes = []
    target_indices = []

    for word in words:
        for split, following_char in enumerate(word[1:], start=1):
            prefix_indices = [char_to_idx[char] for char in word[:split]]
            following_index = char_to_idx[following_char]

            encoded_prefixes.append(prefix_indices)
            target_indices.append(following_index)

    # Left-pad so the most recent characters sit at the end of the window.
    X = pad_sequences(encoded_prefixes, maxlen=max_len, padding='pre')
    num_classes = len(char_to_idx)
    y = tf.keras.utils.to_categorical(target_indices, num_classes=num_classes)

    return X, y

def build_model(vocab_size, max_len):
    """Assemble and compile the next-character prediction network.

    The network is an embedding layer followed by two stacked LSTM layers
    and a softmax output over the vocabulary.

    Args:
        vocab_size: Number of distinct indices (embedding input size and
            number of output classes).
        max_len: Length of the input sequences.

    Returns:
        The compiled ``Sequential`` model.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, 64, input_length=max_len))
    # The first LSTM returns the full sequence so the second can consume it.
    model.add(LSTM(128, return_sequences=True))
    model.add(LSTM(64))
    model.add(Dense(vocab_size, activation='softmax'))

    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

def train_model(words, epochs=50, batch_size=32):
    """Train the next-character model on ``words`` and persist it.

    Side effects: prints progress messages, writes the trained model to
    ``nepali_word_model.h5`` and pickles the character mappings to
    ``char_mappings.pkl`` in the current working directory.

    Args:
        words: Iterable of training words.
        epochs: Number of training epochs.
        batch_size: Mini-batch size passed to ``model.fit``.

    Returns:
        ``(model, char_to_idx, idx_to_char)``.
    """
    print("Preparing data for training...")
    mappings = create_character_mapping(words)
    char_to_idx, idx_to_char = mappings
    X, y = prepare_training_data(words, char_to_idx)

    print("Building and training model...")
    vocab_size = len(char_to_idx)
    sequence_length = X.shape[1]
    model = build_model(vocab_size, sequence_length)

    model.fit(X, y, epochs=epochs, batch_size=batch_size, verbose=1)

    # Persist both artefacts so a later run can skip training.
    model.save('nepali_word_model.h5')
    with open('char_mappings.pkl', 'wb') as mapping_file:
        pickle.dump(mappings, mapping_file)

    return model, char_to_idx, idx_to_char

def load_trained_model():
    """Load the saved model and character mappings from the working directory.

    Returns:
        ``(model, char_to_idx, idx_to_char)`` when both
        ``nepali_word_model.h5`` and ``char_mappings.pkl`` exist, otherwise
        ``(None, None, None)``.
    """
    model_path = 'nepali_word_model.h5'
    mappings_path = 'char_mappings.pkl'

    artefacts_present = os.path.exists(model_path) and os.path.exists(mappings_path)
    if not artefacts_present:
        return None, None, None

    model = tf.keras.models.load_model(model_path)
    # NOTE: pickle executes arbitrary code on load; only use mapping files
    # produced by this script.
    with open(mappings_path, 'rb') as mapping_file:
        char_to_idx, idx_to_char = pickle.load(mapping_file)
    return model, char_to_idx, idx_to_char

def get_ai_suggestions(partial_word, model, char_to_idx, idx_to_char, max_len=20, num_suggestions=5, words=None):
    """Suggest dictionary words by greedily extending ``partial_word``.

    Starting from ``partial_word``, the model's most likely next character
    is appended repeatedly (at most 10 times). Each intermediate string
    that is a member of ``words`` is collected as a suggestion.

    Args:
        partial_word: Text typed so far. An empty value yields ``[]``.
        model: Object with a Keras-style ``predict(x, verbose=0)`` method
            returning one probability row per input sequence.
        char_to_idx: Mapping from character to index; unknown characters
            are encoded as 0.
        idx_to_char: Inverse mapping from index to character.
        max_len: Length the encoded input is left-padded/truncated to.
        num_suggestions: Maximum number of suggestions to return.
        words: Vocabulary used to decide whether a generated string is a
            complete word. When omitted, a module-level ``words`` variable
            is used if one exists (the behaviour the previous version
            relied on implicitly).

    Returns:
        Up to ``num_suggestions`` words in the order they were generated
        (shortest first).

    Raises:
        ValueError: If no vocabulary is passed and no module-level
            ``words`` exists.
    """
    if not partial_word:
        return []

    if words is None:
        # Backward compatibility: the vocabulary used to be read from a
        # global named ``words`` (e.g. one defined in another notebook cell).
        words = globals().get("words")
    if words is None:
        raise ValueError(
            "get_ai_suggestions needs a vocabulary; pass it as words=..."
        )

    def encode(text):
        """Encode ``text`` as a single left-padded batch for the model."""
        indices = [char_to_idx.get(char, 0) for char in text]
        return pad_sequences([indices], maxlen=max_len, padding='pre')

    pad_idx = char_to_idx.get('<PAD>', 0)
    suggestions = []
    current_word = partial_word

    for _ in range(10):  # Extend the word by at most 10 characters
        if len(suggestions) >= num_suggestions:
            break

        # Predict the most likely next character
        pred = model.predict(encode(current_word), verbose=0)
        next_char_idx = int(np.argmax(pred[0]))

        # A padding (or unmapped) prediction carries no character; appending
        # the literal '<PAD>' token would corrupt the word, so stop here.
        if next_char_idx == pad_idx or next_char_idx not in idx_to_char:
            break

        current_word += idx_to_char[next_char_idx]

        # Each extension is strictly longer, so no duplicates can occur.
        if current_word in words:
            suggestions.append(current_word)

    return suggestions[:num_suggestions]

def get_word_completions(partial_word, word_list, max_suggestions=5):
    """Return dictionary words that start with ``partial_word``.

    Args:
        partial_word: Prefix to complete.
        word_list: Iterable of candidate words.
        max_suggestions: Maximum number of completions to return.

    Returns:
        The alphabetically first ``max_suggestions`` matching words.
    """
    completions = [word for word in word_list if word.startswith(partial_word)]
    return sorted(completions)[:max_suggestions]

def get_spelling_suggestions(word, word_list, max_distance=2, max_suggestions=5):
    """Return dictionary words within a small edit distance of ``word``.

    Args:
        word: Possibly misspelled input.
        word_list: Iterable of candidate words.
        max_distance: Largest Levenshtein distance still accepted.
        max_suggestions: Maximum number of suggestions to return.

    Returns:
        Up to ``max_suggestions`` words ordered by increasing distance;
        ties are broken alphabetically so the result does not depend on the
        iteration order of ``word_list``.
    """
    scored = []
    for dict_word in word_list:
        distance = Levenshtein.distance(word, dict_word)
        if distance <= max_distance:
            scored.append((distance, dict_word))

    # Tuples sort by distance first, then by the word itself.
    scored.sort()
    return [dict_word for _, dict_word in scored[:max_suggestions]]

def get_suggestions(input_text, words, model=None, char_to_idx=None, idx_to_char=None, max_suggestions=5):
    """Combine prefix completions, spelling corrections and model suggestions.

    Args:
        input_text: Text typed by the user.
        words: Vocabulary of known words.
        model: Optional trained model; model suggestions are only produced
            when ``model``, ``char_to_idx`` and ``idx_to_char`` are all given.
        char_to_idx: Character-to-index mapping for the model.
        idx_to_char: Index-to-character mapping for the model.
        max_suggestions: Maximum number of suggestions to return.

    Returns:
        Up to ``max_suggestions`` distinct words. The three sources are
        interleaved rank by rank (best completion, best spelling match,
        best model suggestion, second-best completion, ...), so the result
        is deterministic and keeps each source's own ranking.
    """
    # Get traditional suggestions
    completions = get_word_completions(input_text, words, max_suggestions)
    spelling_suggestions = get_spelling_suggestions(
        input_text, words, max_suggestions=max_suggestions
    )

    # Get AI-based suggestions if model is available
    ai_suggestions = []
    if model is not None and char_to_idx is not None and idx_to_char is not None:
        ai_suggestions = get_ai_suggestions(
            input_text,
            model,
            char_to_idx,
            idx_to_char,
            num_suggestions=max_suggestions,
            words=words,
        )

    # Round-robin merge with order-preserving de-duplication. Going through
    # a set here would shuffle the candidates and make the cut arbitrary.
    sources = [completions, spelling_suggestions, ai_suggestions]
    merged = []
    seen = set()
    for rank in range(max(len(source) for source in sources)):
        for source in sources:
            if rank < len(source) and source[rank] not in seen:
                seen.add(source[rank])
                merged.append(source[rank])

    return merged[:max_suggestions]

def interactive_demo():
    """Prompt for words in a loop and print suggestions until 'quit' is entered.

    Loads the corpus, then loads the saved model or trains a new one when
    none is found, before entering the prompt loop.
    """
    print("Loading Nepali words...")
    words = load_nepali_words()
    print(f"Loaded {len(words)} unique words")

    # Reuse a saved model when possible; otherwise train from scratch.
    model, char_to_idx, idx_to_char = load_trained_model()
    if model is None:
        print("Training new model...")
        model, char_to_idx, idx_to_char = train_model(words)

    prompt = "\nEnter a Nepali word (or 'quit' to exit): "
    user_input = input(prompt)
    while user_input.lower() != 'quit':
        suggestions = get_suggestions(user_input, words, model, char_to_idx, idx_to_char)
        print(f"\nSuggestions for '{user_input}':")
        for position, suggestion in enumerate(suggestions, start=1):
            print(f"{position}. {suggestion}")
        user_input = input(prompt)

def test_examples():
    """Print suggestions for a fixed list of sample Nepali prefixes.

    Loads the corpus, then loads the saved model or trains a new one when
    none is found, and prints the suggestions for each sample input.
    """
    print("Loading Nepali words...")
    words = load_nepali_words()
    print(f"Loaded {len(words)} unique words")

    # Reuse a saved model when possible; otherwise train from scratch.
    loaded = load_trained_model()
    if loaded[0] is None:
        print("Training new model...")
        loaded = train_model(words)
    model, char_to_idx, idx_to_char = loaded

    sample_inputs = ("ति", "नमस", "काठ", "नेपा", "भाष")
    for sample in sample_inputs:
        suggestions = get_suggestions(sample, words, model, char_to_idx, idx_to_char)
        print(f"\nInput: {sample}")
        print("Suggestions:", suggestions)

if __name__ == "__main__":
    # Simple text menu: show the options, then dispatch on the answer.
    for menu_line in (
        "Nepali Word Completion and Spelling Correction System",
        "1. Run test examples",
        "2. Start interactive demo",
    ):
        print(menu_line)
    choice = input("Enter your choice (1 or 2): ")

    actions = {"1": test_examples, "2": interactive_demo}
    selected = actions.get(choice)
    if selected is None:
        print("Invalid choice!")
    else:
        selected()