In [3]:
import gensim
import inflect
import random
import numpy as np

In [4]:
# Pretrained word2vec embeddings in the binary word2vec format. `limit` caps
# how many word vectors are read from the file.
WORD2VEC_PATH = 'GoogleNews-vectors-negative300.bin.gz'
WORD2VEC_LIMIT = 200000

model = gensim.models.KeyedVectors.load_word2vec_format(
    WORD2VEC_PATH,
    binary=True,
    limit=WORD2VEC_LIMIT,
)

In [5]:
# Load the Codenames master word list: one entry per line of words.txt.
# Surrounding whitespace is stripped and blank lines are skipped, so an empty
# string can never be dealt onto a board.
with open("./words.txt") as f:
    words = [line.strip() for line in f if line.strip()]

In [487]:
def generate_board(word_list):
    """Generates team word lists and a random game board based on the word lists.

    Twenty-five distinct words are drawn without replacement and split into
    9 red, 8 blue, 7 neutral and 1 assassin word. The same 25 words are then
    shuffled and laid out as a 5x5 grid.

    :param word_list: Codenames master word list; repeated entries are
        collapsed so that a word can appear on the board only once
    :return: 5x5 numpy board, red team words, blue team words, neutral words,
        assassin (a one-element list)
    :rtype: tuple
    :raises ValueError: if word_list holds fewer than 25 distinct words
    """
    team_sizes = (9, 8, 7, 1)  # red, blue, neutral, assassin
    board_size = sum(team_sizes)

    # dict.fromkeys drops duplicates while preserving the original order.
    candidates = list(dict.fromkeys(word_list))
    if len(candidates) < board_size:
        raise ValueError(
            "need at least %d distinct words to build a board, got %d"
            % (board_size, len(candidates))
        )

    # One draw without replacement replaces the four rejection-sampling loops
    # and cannot spin forever on a short list.
    drawn = random.sample(candidates, board_size)

    groups = []
    start = 0
    for size in team_sizes:
        groups.append(drawn[start:start + size])
        start += size
    red, blue, neutral, assassin = groups

    # Shuffle a copy so the board layout does not reveal team membership.
    board = red + blue + neutral + assassin
    random.shuffle(board)
    board = np.reshape(board, (5, 5))
    return board, red, blue, neutral, assassin

def guess(clue, words, n, threshold=0.2):
    """Guesses the most similar n words out of given words list based on given clue.

    Every word is scored with model.similarity(clue, word). The n best-scoring
    words are kept, and any of those whose score is not strictly greater than
    `threshold` is then dropped, so fewer than n words may be returned.

    :param clue: given clue
    :param words: given list of words to guess from
    :param n: max number of words to guess
    :param threshold: similarity a word must exceed to be guessed (default 0.2)
    :return: list of length at most n of best guesses, most similar first
    """
    scores = {w: model.similarity(clue, w) for w in words}
    ranked = sorted(scores, key=scores.__getitem__, reverse=True)
    return [w for w in ranked[:n] if scores[w] > threshold]

def clean_clue(word1, word2):
    """Decide whether a generated clue may legally be given for a board word.

    Both words are compared case-insensitively. The clue (word2) is rejected
    when either word is contained in the other, when the clue contains an
    underscore, or when the clue equals the plural of word1 as produced by
    inflect.engine().plural.

    :param word1: a word from the codenames list of words
    :param word2: a model generated clue to be verified
    :return: False if word2 is invalid, True if word2 is valid
    """
    pluralizer = inflect.engine()
    board_word = word1.lower()
    candidate = word2.lower()

    # Substring overlap in either direction gives the board word away.
    if board_word in candidate or candidate in board_word:
        return False
    # Underscores mark multi-word vocabulary entries.
    if "_" in candidate:
        return False
    # Plurals that are not plain substrings of each other.
    return candidate != pluralizer.plural(board_word)

# A triple is preferred over the best pair only if its similarity, scaled by
# this factor, still matches or beats the pair's similarity.
_TRIPLE_DISCOUNT = 0.9
# Number of candidate clues requested from the model per target group, and the
# size of the vocabulary prefix the model may draw those candidates from.
_CLUE_CANDIDATES = 10
_CLUE_VOCAB_LIMIT = 10000


def _ranked_groups(words):
    """Yield target groups (tuples of words) to clue for, most promising first.

    A single word yields just that word. Otherwise every pair is scored with
    model.similarity and every distinct triple with model.n_similarity between
    one word and the remaining pair. The generator then repeatedly yields the
    current best pair or triple and discards it, until none are left.
    """
    if len(words) == 1:
        yield (words[0],)
        return

    pair_scores = {}
    for i in range(len(words)):
        for j in range(i + 1, len(words)):
            pair_scores[(words[i], words[j])] = model.similarity(words[i], words[j])

    triple_scores = {}
    seen = set()
    for w in words:
        for pair in pair_scores:
            if w in pair:
                continue
            # The same three words can be reached from three different pairs;
            # keep only the first ordering encountered.
            canonical = tuple(sorted(pair + (w,)))
            if canonical in seen:
                continue
            triple_scores[pair + (w,)] = model.n_similarity([w], list(pair))
            seen.add(canonical)

    while pair_scores or triple_scores:
        best_pair = max(pair_scores, key=pair_scores.get) if pair_scores else None
        best_triple = max(triple_scores, key=triple_scores.get) if triple_scores else None
        take_triple = best_pair is None or (
            best_triple is not None
            and triple_scores[best_triple] * _TRIPLE_DISCOUNT >= pair_scores[best_pair]
        )
        if take_triple:
            yield best_triple
            # Reached only if the caller asks for another group, i.e. no clue
            # for this group was acceptable.
            del triple_scores[best_triple]
        else:
            yield best_pair
            del pair_scores[best_pair]


def _safe_clue_for(group, bad_words):
    """Return the best clue for `group` that no bad word is closer to, or None.

    Candidates come from model.most_similar and are filtered with clean_clue
    against every word in the group. A candidate is accepted when each group
    word is strictly more similar to it than the most similar bad word is.
    """
    targets = list(group)
    candidates = model.most_similar(
        positive=targets, topn=_CLUE_CANDIDATES, restrict_vocab=_CLUE_VOCAB_LIMIT
    )
    scores = dict(candidates)
    legal = [c for c, _ in candidates if all(clean_clue(t, c) for t in targets)]

    # Try candidates from most to least similar to the group.
    for clue in sorted(legal, key=scores.get, reverse=True):
        if not bad_words:
            # Nothing to avoid, so the best legal clue is already safe.
            return clue
        rival = model.most_similar_to_given(clue, bad_words)
        rival_sim = model.similarity(rival, clue)
        if all(model.similarity(t, clue) > rival_sim for t in targets):
            return clue
    return None


def give_clue(words, bad_words):
    """Gives an optimal clue based on current board state.

    Target groups (pairs and triples of `words`, or the lone word when only
    one is given) are tried from most to least mutually similar. For each
    group the model proposes candidate clues; the first candidate that passes
    clean_clue and is closer to every word of the group than to any bad word
    is returned.

    :param words: list of words to generate a clue for
    :param bad_words: list of words to avoid giving clues for; may be empty
    :return: tuple of optimal clue, the words intended to be guessed
    :raises ValueError: if words is empty, or if every group has been tried
        without finding an acceptable clue
    """
    if not words:
        raise ValueError("give_clue needs at least one word to give a clue for")

    for group in _ranked_groups(words):
        clue = _safe_clue_for(group, bad_words)
        if clue is not None:
            return clue, tuple(group)

    raise ValueError("no acceptable clue found for any group of the given words")

In [457]:
# Play one round: deal a board, let the clue-giver hint at red words, then let
# the guesser pick from the whole board.
board, red, blue, neutral, assassin = generate_board(words)
# give_clue takes exactly two lists (words to hint at, words to avoid), so
# every non-red word goes into the single "avoid" list.
clue, intended = give_clue(red, blue + neutral + assassin)
print("Gave clue", clue, "for", len(intended))
# The guesser gets as many picks as the clue was meant for.
attempt = tuple(guess(clue, red+blue+neutral+assassin, len(intended)))
print("Guessed", attempt)

Gave clue birdie for 2
Guessed ('eagle', 'flute')


In [None]:
#Statistics
# Fraction of simulated rounds in which at least one of the (up to two)
# guessed words is a word the clue was intended for.
n_trials = 1000
correct = 0
for i in range(n_trials):
    board, red, blue, neutral, assassin = generate_board(words)
    clue, intended = give_clue(red, assassin + blue + neutral)
    attempt = tuple(guess(clue, red+blue+neutral+assassin, 2))
    # Set overlap covers every ordering, and stays valid when the guesser
    # returns fewer than two words or the clue targets one or three words.
    if set(intended) & set(attempt):
        correct += 1
# Divide by the real number of trials so the printed value is a fraction in [0, 1].
print("Guessed: ",correct/n_trials, " correct")

In [488]:
#Doubles/Triples Statistics
# For each simulated round, record whether the guesser recovered exactly the
# intended words, separately for two-word and three-word clues.
def _success_rate(successes, failures):
    """Return successes / (successes + failures), or NaN if nothing was attempted."""
    attempts = successes + failures
    return successes / attempts if attempts else float("nan")

triples_success = 0
triples_failure = 0
doubles_success = 0
doubles_failure = 0
for i in range(1000):
    board, red, blue, neutral, assassin = generate_board(words)
    clue, intended = give_clue(red, blue + neutral + assassin)
    attempt = tuple(guess(clue, red+blue+neutral+assassin, len(intended)))
    exact_match = set(attempt) == set(intended)
    if len(intended) == 3:
        if exact_match:
            triples_success += 1
        else:
            triples_failure += 1
    elif len(intended) == 2:
        if exact_match:
            doubles_success += 1
        else:
            doubles_failure += 1

# _success_rate guards against a run that produced no doubles or no triples.
print("Guessed ", doubles_success + doubles_failure, " doubles. Success rate: ", _success_rate(doubles_success, doubles_failure))
print("Guessed ", triples_success + triples_failure, " triples. Success rate: ", _success_rate(triples_success, triples_failure))

Guessed  747  doubles. Success rate:  0.8755020080321285
Guessed  253  triples. Success rate:  0.6363636363636364


In [474]:
# Ask the model for the ten entries most similar to the two probe words together.
# NOTE(review): the output saved with this cell is a single float, which does
# not look like the result of a topn=10 query - presumably stale; re-run to refresh.
probe_words = ["leg", "butter"]
model.most_similar(positive=probe_words, topn=10)

0.15682536