In [82]:
word_meaning = {1, 2, 7}
# this word can refer to entities 1, 2, and 7

In [83]:
class Shorthair:
    def __init__(self, name):
        self.name = name
        self.personality = 'strange'

    def speak(self):
        print(self.name + ' goes, "Meow."')

Sauce = Shorthair('Saucy')
Ketchup = Shorthair('Ketchup')
Mayo = Shorthair('Mayo')

In [84]:
cat = {Sauce, Ketchup, Mayo}
# cat is a noun, and it can refer to the shorthair cats, Sauce, Ketchup, and Mayo
# for this example, the set of possible entities that cat can refer to is Sauce, Ketchup, and Mayo

In [85]:
cat = {Sauce, Ketchup, Mayo, Mayo}
cat

{<__main__.Shorthair at 0x10a170dd0>,
 <__main__.Shorthair at 0x114ab72d0>,
 <__main__.Shorthair at 0x114ab7f50>}

In [86]:
from numpy import prod

%matplotlib inline
import matplotlib.pyplot as plt
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg', 'pdf')

  set_matplotlib_formats('svg', 'pdf')


In [87]:
def normalize_probabilities(probs):
    total = sum(probs)
    normalized_probs = []
    for p in probs:
        normalized_probs.append(p / total)
    return normalized_probs

# sum every probability in a list of probs, e.g. [0.2, 0.3, 0.4, 0.5] 
# for every probability in that list, divide it by the calculated sum
# 0.2 / (0.2 + 0.3 + 0.4 + 0.5) = 0.2 / 0.9 = 0.2 repeating

In [88]:
# learners consider hypotheses of word meanings

class Longhair:
    def __init__(self, name):
        self.name = name
        self.personality = 'fanciful'

Hiccup = Longhair('Hiccup')

cat_hypothesis_space = [{set[Shorthair]}, {set[Shorthair], set[Longhair]}]
# the word cat can refer to the set of Shorthairs, OR the set of shorthairs and the set of longhairs, that includes Hiccup

In [89]:
all_hypotheses = [{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10},
         {0,1},{1,2},{2,3},{3,4},{4,5},{5,6},{6,7},{7,8},{8,9},{9,10},
         {0,1,2},{1,2,3},{2,3,4},{3,4,5},{4,5,6},{5,6,7},{6,7,8},{7,8,9},{8,9,10},
         {0,1,2,3},{1,2,3,4},{2,3,4,5},{3,4,5,6},{4,5,6,7},{5,6,7,8},{6,7,8,9},{7,8,9,10},
         {0,1,2,3,4},{1,2,3,4,5},{2,3,4,5,6},{3,4,5,6,7},{4,5,6,7,8},{5,6,7,8,9},{6,7,8,9,10},
         {0,1,2,3,4,5},{1,2,3,4,5,6},{2,3,4,5,6,7},{3,4,5,6,7,8},{4,5,6,7,8,9},{5,6,7,8,9,10}]

all_hypotheses
# assumptions:
# 1. there are 10 entities in this world, 0 - 10
# 2. a word refers to at least one of those entities, and up to (inclusive) 6 entities
# 3. hypotheses include entities that are clustered; in other words, 0 is clustered with entities 1, 2, 3, 4, 5

[{0},
 {1},
 {2},
 {3},
 {4},
 {5},
 {6},
 {7},
 {8},
 {9},
 {10},
 {0, 1},
 {1, 2},
 {2, 3},
 {3, 4},
 {4, 5},
 {5, 6},
 {6, 7},
 {7, 8},
 {8, 9},
 {9, 10},
 {0, 1, 2},
 {1, 2, 3},
 {2, 3, 4},
 {3, 4, 5},
 {4, 5, 6},
 {5, 6, 7},
 {6, 7, 8},
 {7, 8, 9},
 {8, 9, 10},
 {0, 1, 2, 3},
 {1, 2, 3, 4},
 {2, 3, 4, 5},
 {3, 4, 5, 6},
 {4, 5, 6, 7},
 {5, 6, 7, 8},
 {6, 7, 8, 9},
 {7, 8, 9, 10},
 {0, 1, 2, 3, 4},
 {1, 2, 3, 4, 5},
 {2, 3, 4, 5, 6},
 {3, 4, 5, 6, 7},
 {4, 5, 6, 7, 8},
 {5, 6, 7, 8, 9},
 {6, 7, 8, 9, 10},
 {0, 1, 2, 3, 4, 5},
 {1, 2, 3, 4, 5, 6},
 {2, 3, 4, 5, 6, 7},
 {3, 4, 5, 6, 7, 8},
 {4, 5, 6, 7, 8, 9},
 {5, 6, 7, 8, 9, 10}]

In [90]:
def calculate_prior(possible_hypotheses):
    prior = []
    for h in possible_hypotheses:
        prior.append(1/len(possible_hypotheses))
    return prior

In [91]:
calculate_prior(all_hypotheses)

[0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431372549,
 0.0196078431

In [92]:
input = [7, 7, 7, 7, 8]

In [93]:
def likelihood(data, hypothesis):
    likelihoods = []
    for data_item in data:
        if data_item in hypothesis:
            likelihood_this_item = 1/len(hypothesis)
        else: 
            likelihood_this_item = 0
        likelihoods.append(likelihood_this_item)
    return prod(likelihoods)

In [94]:
print(likelihood([0, 6, 7, 8], {7, 8, 9}))
print(likelihood([6, 7], {7, 8, 9}))
print(likelihood([6, 7], {7, 8, 9}))

0.0
0.0
0.0


In [95]:
def posterior(data, possible_hypotheses, prior):
    posteriors = []
    for i in range(len(possible_hypotheses)):
        h = possible_hypotheses[i]
        prior_h = prior[i]
        likelihood_h = likelihood(data, h)
        posterior_h = prior_h * likelihood_h
        posteriors.append(posterior_h)
    return normalize_probabilities(posteriors)

In [96]:
my_prior = calculate_prior(all_hypotheses)
posterior([0, 0, 1], all_hypotheses, my_prior)

[0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.6568863586599518,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.19463299515850427,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.08211079483249398,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.042040726954236926,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.024329124394813034,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0]

Questions:

1. How does the amount of data influence the posterior distribution? For instance, is the posterior the same after seeing the data [0, 0, 1] and the data [0, 0, 1, 0, 0, 1]? 

In [100]:
my_posterior1 = posterior([0, 0, 1], all_hypotheses, my_prior)
for i in range(len(all_hypotheses)):
    print(all_hypotheses[i], my_posterior1[i])

{0} 0.0
{1} 0.0
{2} 0.0
{3} 0.0
{4} 0.0
{5} 0.0
{6} 0.0
{7} 0.0
{8} 0.0
{9} 0.0
{10} 0.0
{0, 1} 0.6568863586599518
{1, 2} 0.0
{2, 3} 0.0
{3, 4} 0.0
{4, 5} 0.0
{5, 6} 0.0
{6, 7} 0.0
{8, 7} 0.0
{8, 9} 0.0
{9, 10} 0.0
{0, 1, 2} 0.19463299515850427
{1, 2, 3} 0.0
{2, 3, 4} 0.0
{3, 4, 5} 0.0
{4, 5, 6} 0.0
{5, 6, 7} 0.0
{8, 6, 7} 0.0
{8, 9, 7} 0.0
{8, 9, 10} 0.0
{0, 1, 2, 3} 0.08211079483249398
{1, 2, 3, 4} 0.0
{2, 3, 4, 5} 0.0
{3, 4, 5, 6} 0.0
{4, 5, 6, 7} 0.0
{8, 5, 6, 7} 0.0
{8, 9, 6, 7} 0.0
{8, 9, 10, 7} 0.0
{0, 1, 2, 3, 4} 0.042040726954236926
{1, 2, 3, 4, 5} 0.0
{2, 3, 4, 5, 6} 0.0
{3, 4, 5, 6, 7} 0.0
{4, 5, 6, 7, 8} 0.0
{5, 6, 7, 8, 9} 0.0
{6, 7, 8, 9, 10} 0.0
{0, 1, 2, 3, 4, 5} 0.024329124394813034
{1, 2, 3, 4, 5, 6} 0.0
{2, 3, 4, 5, 6, 7} 0.0
{3, 4, 5, 6, 7, 8} 0.0
{4, 5, 6, 7, 8, 9} 0.0
{5, 6, 7, 8, 9, 10} 0.0


In [98]:
my_posterior2 = posterior([0, 0, 1, 0, 0, 1], all_hypotheses, my_prior)
for i in range(len(all_hypotheses)):
    print(all_hypotheses[i], my_posterior2[i])

{0} 0.0
{1} 0.0
{2} 0.0
{3} 0.0
{4} 0.0
{5} 0.0
{6} 0.0
{7} 0.0
{8} 0.0
{9} 0.0
{10} 0.0
{0, 1} 0.9018073901245205
{1, 2} 0.0
{2, 3} 0.0
{3, 4} 0.0
{4, 5} 0.0
{5, 6} 0.0
{6, 7} 0.0
{8, 7} 0.0
{8, 9} 0.0
{9, 10} 0.0
{0, 1, 2} 0.07917101916045172
{1, 2, 3} 0.0
{2, 3, 4} 0.0
{3, 4, 5} 0.0
{4, 5, 6} 0.0
{5, 6, 7} 0.0
{8, 6, 7} 0.0
{8, 9, 7} 0.0
{8, 9, 10} 0.0
{0, 1, 2, 3} 0.014090740470695633
{1, 2, 3, 4} 0.0
{2, 3, 4, 5} 0.0
{3, 4, 5, 6} 0.0
{4, 5, 6, 7} 0.0
{8, 5, 6, 7} 0.0
{8, 9, 6, 7} 0.0
{8, 9, 10, 7} 0.0
{0, 1, 2, 3, 4} 0.0036938030699500374
{1, 2, 3, 4, 5} 0.0
{2, 3, 4, 5, 6} 0.0
{3, 4, 5, 6, 7} 0.0
{4, 5, 6, 7, 8} 0.0
{5, 6, 7, 8, 9} 0.0
{6, 7, 8, 9, 10} 0.0
{0, 1, 2, 3, 4, 5} 0.001237047174382058
{1, 2, 3, 4, 5, 6} 0.0
{2, 3, 4, 5, 6, 7} 0.0
{3, 4, 5, 6, 7, 8} 0.0
{4, 5, 6, 7, 8, 9} 0.0
{5, 6, 7, 8, 9, 10} 0.0


After seeing the data, [0, 0, 1], the hypothesis {0, 1} has a posterior probability of 0.6568863586599518. After seeing the data, [0, 0, 1, 0, 0, 1], the hypothesis {0, 1} has a posterior probability of 0.9018073901245205.
The posterior probability of hypothesis {0, 1} increases after more information (i.e., data) is seen.
The posterior probability of hypothesis {0, 1, 2} decreases after more information: after [0, 0, 1] it is 0.19463299515850427, and after [0, 0, 1, 0, 0, 1] it is 0.07917101916045172.
In effect, the hypothesis that predicts fewer word meanings is preferred after more data.
This reflects the size principle: 

$p(X|h)=[\frac{1}{size(h)}]^n$

Hypotheses that predict the word has fewer meanings (i.e., that have smaller extensions), assign exponentially greater probability to the same data than do hypotheses that predict the word has many more meanings. 

2. When are more specific word meanings preferred? When are more general word meanings preferred?

The most specific word meanings that explain (or, predict) the observed data are always preferred: they have fewer extensions, and if these extensions occur in the observed data, they are preferred over hypotheses that maintain extensions that did not occur in the observed data.

3. This code calculates a probability distribution over possible hypotheses given some data. If you had to commit to a single hypothesis, how would you choose one?

I would choose one that explains the data best. Given that I have no (known) preference for any of the possible hypotheses (in other words, equal priors assigned to each hypothesis), I would pick the hypothesis with the greatest likelihood: given the data, the hypothesis that best predicted the observed data. 

In this case, I pick the hypothesis with the greatest posterior probability. I pick the MAP hypothesis. 

If I do not pick the MAP hypothesis, I could pick a random hypothesis, despite the fact that there is one specific hypothesis with the greatest MAP. However, I would be more likely to select hypotheses that have greater posterior probabilities. In effect, I would more often select from hypotheses with posterior probabilities of 0.8, 0.9, etc, but I could select one with a low posterior probability.

4. Do we have any kind of innateness in our model? Are there word meanings that our model learner could never learn, no matter what kind of data we gave them?

If innateness means what the learner brings to the task of learning word meanings, 