In [13]:
import numpy as np
from sklearn.datasets import fetch_20newsgroups
from collections import Counter
import heapq
import nltk
from nltk.corpus import stopwords
import itertools
%matplotlib inline

In [3]:
# Fetch the NLTK stopword corpus so that stopwords.words('english') can be
# loaded in the following cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/tylorn/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [4]:
# Keep the English stopwords in a set: the preprocessing loop tests every
# token for membership in it.
english_stopwords = set(stopwords.words('english'))

In [5]:
# Load the 20 Newsgroups corpus with headers, footers and quoted replies
# removed from each post.
# No `subset` argument is passed, so scikit-learn's default split is used
# (the preprocessing run below reports 11314 documents).
dataset = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes')
)

In [10]:
# The notebook only ever runs `from gensim.models import Word2Vec` (in a
# later cell), which does not bind the name `gensim`. Import the package
# here so that this cell works on a freshly started kernel.
import gensim

# Build one list of cleaned tokens per document:
#   1. lemmatize the raw text,
#   2. keep the part of each token before the first '/'
#      (presumably tokens look like 'lemma/POS-tag' - see the gensim docs),
#   3. drop English stopwords.
total_documents = len(dataset.data)
pured_documents = []
for i, doc in enumerate(dataset.data):
    lemmas = (token.split('/')[0] for token in gensim.utils.lemmatize(doc))
    pured_documents.append(
        [word for word in lemmas if word not in english_stopwords]
    )
    if i % 500 == 0:
        # Single pre-formatted string: prints the same text under both the
        # Python 2 print statement and the Python 3 print function.
        print('Processed:  %d documents from %d' % (i, total_documents))

Processed:  0 documents from 11314
Processed:  500 documents from 11314
Processed:  1000 documents from 11314
Processed:  1500 documents from 11314
Processed:  2000 documents from 11314
Processed:  2500 documents from 11314
Processed:  3000 documents from 11314
Processed:  3500 documents from 11314
Processed:  4000 documents from 11314
Processed:  4500 documents from 11314
Processed:  5000 documents from 11314
Processed:  5500 documents from 11314
Processed:  6000 documents from 11314
Processed:  6500 documents from 11314
Processed:  7000 documents from 11314
Processed:  7500 documents from 11314
Processed:  8000 documents from 11314
Processed:  8500 documents from 11314
Processed:  9000 documents from 11314
Processed:  9500 documents from 11314
Processed:  10000 documents from 11314
Processed:  10500 documents from 11314
Processed:  11000 documents from 11314


In [35]:
# Flatten all cleaned documents into one token stream and wrap it in an
# nltk.Text; NltkHatPlayer uses this object to look up similar words.
text = nltk.Text(list(itertools.chain.from_iterable(pured_documents)))

In [137]:
class NltkHatPlayer(object):
    """Hat-game player that relies on NLTK's word-context similarity index.

    Explaining a word means listing the words the index considers similar
    to it; guessing means letting every clue vote for its own most similar
    words and ranking the candidates by number of votes.
    """

    # How many of the top similar words each clue contributes in guess().
    VOTES_PER_CLUE = 5

    def __init__(self, context_index=None):
        """Create a player.

        Args:
            context_index: optional object exposing ``similar_words(word)``.
                When omitted, the index of the notebook-level ``text`` is
                used and is built on first use if it does not exist yet.
        """
        self._context_index = context_index

    def _index(self):
        """Return the similarity index, creating it on first use."""
        if self._context_index is None:
            # nltk.Text creates `_word_context_index` lazily, only inside
            # Text.similar(). Reading the attribute directly therefore fails
            # on a fresh kernel unless similar() happened to be called
            # before. Build it here with the same arguments Text.similar()
            # uses and cache it on `text`, so all players share one index.
            index = getattr(text, '_word_context_index', None)
            if index is None:
                index = nltk.text.ContextIndex(
                    text.tokens,
                    filter=lambda x: x.isalpha(),
                    key=lambda s: s.lower(),
                )
                text._word_context_index = index
            self._context_index = index
        return self._context_index

    def guess(self, words):
        """Rank candidate answers for the clue list ``words``.

        Each clue votes for its ``VOTES_PER_CLUE`` most similar words; the
        clues themselves are never returned as candidates.

        Returns:
            list of candidate words, most-voted first.
        """
        votes = Counter()
        for clue in words:
            votes.update(self.explain(clue)[:self.VOTES_PER_CLUE])
        for clue in words:
            votes.pop(clue, None)
        return [candidate for candidate, _ in votes.most_common()]

    def explain(self, word):
        """Return the index's list of words similar to ``word``."""
        return self._index().similar_words(word)

    def guess_result(self, position):
        """Feedback hook called after a guess; this player ignores it."""
        pass

    def explanation_result(self, position):
        """Feedback hook called after an explanation; this player ignores it."""
        pass

In [48]:
from gensim.models import Word2Vec

In [107]:
# Train word embeddings on the cleaned documents: 100-dimensional vectors,
# a context window of 5, words occurring fewer than 5 times ignored,
# 4 worker threads.
# NOTE(review): training with several workers is presumably not
# bit-for-bit reproducible between runs - confirm before comparing scores
# across kernel sessions.
model = Word2Vec(pured_documents, size=100, window=5, min_count=5, workers=4)

In [125]:
class GensimHatPlayer(object):
    """Hat-game player backed by the notebook-level Word2Vec ``model``.

    Explaining and guessing are the same kind of query: ask the model for
    the words most similar to one or several given words.
    """

    def __init__(self):
        """No per-instance state; every query goes to the global ``model``."""

    def _nearest_words(self, positive):
        """Return only the words from ``model.most_similar(positive=...)``."""
        ranked = model.most_similar(positive=positive)
        return [neighbour for neighbour, _score in ranked]

    def guess(self, words):
        """Return the words most similar to all clues in ``words`` combined."""
        return self._nearest_words(words)

    def explain(self, word):
        """Return the words most similar to the single word ``word``."""
        return self._nearest_words([word])

    def guess_result(self, position):
        """Feedback hook called after a guess; this player ignores it."""

    def explanation_result(self, position):
        """Feedback hook called after an explanation; this player ignores it."""

In [126]:
# Vocabulary of the trained model in two shapes: the key collection that
# play() samples target words from, and a set that calc_score() uses to
# filter explanations.
words_universe = model.vocab.keys()
words_universe_set = set(words_universe)

In [127]:
def calc_score(player1, player2, word, max_explanation=10, max_guesses=10,
               decay=0.9, universe=None):
    """Score one explain/guess exchange for a single target word.

    ``player1`` explains ``word``; the explanation is restricted to words
    from ``universe`` and cut to ``max_explanation`` clues. ``player2`` then
    guesses ``max_explanation`` times, seeing the first 1, 2, ... clues.
    If the explanation is shorter than ``max_explanation``, the remaining
    rounds re-submit the full explanation. A hit at 0-based position ``pos``
    within the first ``max_guesses`` guesses earns both players
    ``decay ** pos``.

    After every round both players are notified through
    ``explanation_result`` / ``guess_result`` with the hit position, or
    ``None`` on a miss.

    Args:
        player1: explaining player (``explain``, ``explanation_result``).
        player2: guessing player (``guess``, ``guess_result``).
        word: the target word.
        max_explanation: clue cap and number of guessing rounds.
        max_guesses: how many leading guesses are inspected per round.
        decay: base of the positional reward.
        universe: container of allowed clue words; defaults to the
            notebook-level ``words_universe_set``.

    Returns:
        ``(exp_score, guess_score)``. The explainer additionally loses 1.0
        if the target word itself appears in the explanation.
    """
    if universe is None:
        universe = words_universe_set

    explanation = [
        clue for clue in player1.explain(word) if clue in universe
    ][:max_explanation]

    exp_score = 0.
    guess_score = 0.
    if word in explanation:
        exp_score -= 1.0
    if not explanation:
        return exp_score, guess_score

    for revealed in range(1, max_explanation + 1):
        guess = player2.guess(explanation[:revealed])[:max_guesses]
        # Only the lookup decides hit or miss. The callbacks run outside any
        # try/except so that an error raised inside a player is not silently
        # recorded as a miss.
        pos = guess.index(word) if word in guess else None
        player1.explanation_result(pos)
        player2.guess_result(pos)
        if pos is not None:
            reward = decay ** pos
            guess_score += reward
            exp_score += reward
    return exp_score, guess_score


def play(player1, player2, rounds, seed=42):
    """Play ``rounds`` rounds of the game between two players.

    Each round draws one target word from the notebook-level
    ``words_universe`` with a ``RandomState`` seeded by ``seed``. The word is
    scored twice through ``calc_score``: first with ``player1`` explaining
    and ``player2`` guessing, then with the roles swapped.

    Args:
        player1, player2: player objects accepted by ``calc_score``.
        rounds: number of target words to draw.
        seed: seed for the word sampler.

    Returns:
        Two tuples, one per player, each
        ``(explanation_score, guess_score, guess_score + explanation_score)``.
    """
    random_gen = np.random.RandomState(seed)
    # Convert the vocabulary to an array once rather than handing the raw
    # collection to choice() in every round.
    universe = np.asarray(list(words_universe))

    player1_exp_score = 0.
    player1_guess_score = 0.
    player2_exp_score = 0.
    player2_guess_score = 0.
    for _ in range(rounds):
        word = random_gen.choice(universe)
        player1_exp, player2_guess = calc_score(player1, player2, word)
        player2_exp, player1_guess = calc_score(player2, player1, word)

        player1_exp_score += player1_exp
        player2_exp_score += player2_exp
        player1_guess_score += player1_guess
        player2_guess_score += player2_guess

    return (
        (player1_exp_score, player1_guess_score, player1_guess_score + player1_exp_score),
        (player2_exp_score, player2_guess_score, player2_guess_score + player2_exp_score)
    )

In [130]:
%%time
# Word2Vec player against another Word2Vec player: 1000 rounds, default seed.
play(GensimHatPlayer(), GensimHatPlayer(), 1000)

CPU times: user 29.4 s, sys: 8 ms, total: 29.5 s
Wall time: 29.5 s


((7711.607091618994, 7711.607091618994, 15423.214183237988),
 (7711.607091618994, 7711.607091618994, 15423.214183237988))

In [139]:
%%time
# NLTK player (player1) against the Word2Vec player (player2): 1000 rounds,
# default seed.
play(NltkHatPlayer(), GensimHatPlayer(), 1000)

CPU times: user 19 s, sys: 0 ns, total: 19 s
Wall time: 19 s


((168.55873737700003, 149.20426826800002, 317.763005645),
 (149.20426826800002, 168.55873737700003, 317.763005645))

In [138]:
%%time
# NLTK player against another NLTK player: 1000 rounds, default seed.
play(NltkHatPlayer(), NltkHatPlayer(), 1000)

CPU times: user 2min 28s, sys: 28 ms, total: 2min 28s
Wall time: 2min 28s


((2190.7444470920013, 2190.7444470920013, 4381.488894184003),
 (2190.7444470920013, 2190.7444470920013, 4381.488894184003))