In [4]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import random

# Step 1: Load the word list
def load_words(filename):
    with open(filename, 'r') as file:
        words = file.read().splitlines()
    return words

# Step 2: Build N-gram models
def build_ngrams(words):
    unigrams = Counter()
    bigrams = defaultdict(Counter)
    trigrams = defaultdict(lambda: defaultdict(Counter))

    for word in words:
        # Adding boundary symbols for word beginning and end
        padded_word = '^^' + word + '$'  # ^ - start, $ - end

        # Unigrams
        for char in word:
            unigrams[char] += 1

        # Bigrams and Trigrams
        for i in range(len(padded_word) - 2):
            bigrams[padded_word[i]][padded_word[i+1]] += 1
            trigrams[padded_word[i]][padded_word[i+1]][padded_word[i+2]] += 1

    return unigrams, bigrams, trigrams

# Step 3: Guesser function using N-grams
def ngram_guesser(mask, guessed, unigrams, bigrams, trigrams):
    # Calculate character probabilities based on unigram, bigram, and trigram counts
    char_probs = defaultdict(float)
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    for index in range(len(mask)):
        if mask[index] == '_':
            preceding = mask[max(0, index-1)]
            preceding2 = mask[max(0, index-2)]

            # Apply unigram, bigram, and trigram probabilities
            for char in alphabet:
                if char not in guessed:
                    if index > 1 and preceding != '_' and preceding2 != '_':
                        char_probs[char] += trigrams[preceding2][preceding][char]
                    if index > 0 and preceding != '_':
                        char_probs[char] += bigrams[preceding][char]
                    char_probs[char] += unigrams[char]

    # Pick the character with the highest probability that hasn't been guessed
    best_guess = max(char_probs, key=char_probs.get, default=None)
    return best_guess

# Step 4: Hangman game function
def play_hangman(secret_word, unigrams, bigrams, trigrams, max_mistakes=6):
    mask = ['_'] * len(secret_word)
    guessed = set()
    mistakes = 0

    while mistakes < max_mistakes and '_' in mask:
        guess = ngram_guesser(mask, guessed, unigrams, bigrams, trigrams)
        if guess in guessed or guess is None:
            mistakes += 1  # Penalize repeated or invalid guesses
            continue

        guessed.add(guess)
        if guess in secret_word:
            for i, char in enumerate(secret_word):
                if char == guess:
                    mask[i] = char
        else:
            mistakes += 1

    return ''.join(mask) == secret_word

# Step 5: Evaluate model accuracy
def evaluate_accuracy(words, unigrams, bigrams, trigrams, max_tries, num_tests=100):
    random_sample = random.sample(words, num_tests)
    success_count = sum(play_hangman(word, unigrams, bigrams, trigrams, max_tries) for word in random_sample)
    accuracy = success_count / num_tests
    return accuracy

# Example usage
words = load_words('words_corpus.txt')
unigrams, bigrams, trigrams = build_ngrams(words)
accuracy_6_tries = evaluate_accuracy(words, unigrams, bigrams, trigrams, 6)
accuracy_10_tries = evaluate_accuracy(words, unigrams, bigrams, trigrams, 10)

print(f"Accuracy with 6 tries: {accuracy_6_tries:.2f}")
print(f"Accuracy with 10 tries: {accuracy_10_tries:.2f}")


Accuracy with 6 tries: 0.23
Accuracy with 10 tries: 0.56
