In [1]:
from thinkbayes import Pmf
import csv
from collections import defaultdict

In [2]:
classval = 'play'
smoothing = 0

In [3]:
featuredict = defaultdict(list)
countdict = defaultdict(int)
classpos = 0
featlist = list()

In [4]:
def conc (tag1, tag2, tag3):
    return '+'.join([tag1,tag2,tag3])

In [5]:
def read_data ():
    global classpos
    global countdict
    global featuredict
    global featlist
    
    pos = 0
    with open('weather.csv', 'r') as csvfile:
        datareader = csv.reader(csvfile, delimiter=',', quotechar='|')
        for row in datareader:
            pos += 1
            if pos == 1:
                featlist = row
                for x in range(len(row)):
                    if row[x] == classval:
                        classpos = x
                        featuredict[row[x]] = list()
            else:
                localclassval = row[classpos]
                countdict[localclassval] += 1
                    
                for x in range(len(featlist)):
                    if row[x] in featuredict[featlist[x]]:
                        countdict[conc(featlist[x],row[x],localclassval)] += 1
                    else:
                        featuredict[featlist[x]].append(row[x])
                        countdict[conc(featlist[x],row[x],localclassval)] = 1

In [6]:
# global smoothing    
# read_data() 

In [7]:
# print(featuredict)
# print(countdict)
# print(classpos)
# print(featlist)

In [8]:
def condprob (classval, feat, featval):
## Modify this function to use the value of smoothing
    total = 0+smoothing*(len(featuredict[feat]))  # This will be the total number of instances of class value = classval
    val = 0+smoothing # this will be the number of times that feat was of value featval when class was classval
    
    for fval in featuredict[feat]:
        count = countdict[conc(feat,fval,classval)] 
        total += count
        if fval == featval:
            val += count
            
    # if featval never occred in the dataset, you need to handle this condition here
    if not(featval in featuredict[feat]):
        total += smoothing
        val = smoothing
#     print(val,total)
    val = float(val)/total # here is where you finally compute the conditional probability
    
    return val

In [9]:
# smoothing = 1
# condprob('yes','outlook' ,'overcast')

In [10]:
def condprob_check(data):
    # This function validates calculations to help identify common problems.
    # If a common problem is identified, this will print an error. 
    # However, if no error is printed, that doesn't guarantee correct code.

    correct_condprobs = {0: # no smoothing
    {'no+outlook+overcast': 0.0,
    'no+temperature+hot': 0.4,
    'no+humidity+normal': 0.2,
    'no+windy+TRUE': 0.6,
    'yes+outlook+overcast': 0.4444444444444444,
    'yes+temperature+hot': 0.2222222222222222,
    'yes+humidity+normal': 0.6666666666666666,
    'yes+windy+TRUE': 0.3333333333333333,
    },
                1:
    {'no+outlook+overcast': 0.125,
    'no+temperature+hot': 0.375,
    'no+humidity+normal': 0.25,
    'no+windy+TRUE': 0.5714285714285714,
    'yes+outlook+overcast': 0.4166666666666667,
    'yes+temperature+hot': 0.25,
    'yes+humidity+normal': 0.5833333333333334,
    'yes+windy+TRUE': 0.36363636363636365,
    }}

    for feature, value in zip(featlist, data[:-1]):
        for classval in ['yes', 'no']:
            if correct_condprobs[smoothing][conc(classval,feature,value)] != condprob(classval,feature,value):
                    print(smoothing,classval,feature,value)
                    print("correct: ", correct_condprobs[smoothing][conc(classval,feature,value)])
                    print("mine:",condprob(classval,feature,value))
                    print("Conditional probability validation failed")
#                     return

In [11]:
class Weather(Pmf):
    """A map from whether you play tennis or not to a probablity."""

    def __init__(self, hypos):
        """Initialize self.

        hypos: whether you play tennis or not 
        """
        Pmf.__init__(self)
        for hypo in hypos:
            self.Set(hypo, countdict[hypo])
        self.Normalize()

    def Update(self, data):
        total = 0
        """Updates the PMF with new data.

        data: feature values for outlook, temperature, humidity, and windy
        """
        for hypo in self.Values():
            like = self.Likelihood(data, hypo)
            total += like
            self.Mult(hypo, like)
        if total > 0: self.Normalize()
    # Currently this function computes the likelihood from just one conditional probability
    # You need to modify it to properly take the full set of features into account.
    def Likelihood(self, data, hypo):
        """The likelihood of the data under the hypothesis.

        data: feature values for outlook, temperature, humidity, and windy
        hypo: whether you play tennis or not
        """
        like = 1
        # in the Cookie problem, there was only one feature.  So the likelihood before
        # it multiplied by the prior probability was just the conditional probability
        # of the feature value given the class value, which you saw in the mix variable
        # That would be like just considering one of the 4 features we have, which I have
        # done below.  Change the code so that you take all f4 features into account
        # according to what we discussed in class and you saw in the Witten book for
        # computing the likelihood for the play tennis dataset

        if data == ['overcast','hot','normal','TRUE','?']:
            condprob_check(data)

        for i in range(4):
            like = like * condprob(hypo, featlist[i], data[i]) 
                                                    # this is the conditional probability of
                                                    # the value (in data) of the feature (in
                                                    # featlist) given the class value (in hypo)
                                                    # note that featlist lists all of the features
                                                    # and data lists all the values for this instance
        return like

In [12]:
def test_instance (hypos, data):
    pmf = Weather(hypos)
    pmf.Update(data)
    
    print("--------------------------")
    print(data)
    if smoothing: 
        print("Smoothing")
    else: 
        print("No Smoothing")
        
    for hypo, prob in sorted(pmf.Items(), reverse=True):
        print(hypo, prob)

In [13]:
def main():
    global smoothing
    
    read_data() 
    hypos = featuredict[classval]    
    
    smoothing = 0
    test_instance(hypos, ['overcast','hot','normal','TRUE','?'])
    test_instance(hypos, ['rainy','hot','high','FALSE','?'])
    test_instance(hypos, ['overcast','cool','normal','TRUE','?'])
    test_instance(hypos, ['rainy','mild','low','FALSE','?'])
    smoothing = 1
    test_instance(hypos, ['overcast','hot','normal','TRUE','?'])
    test_instance(hypos, ['rainy','hot','high','FALSE','?'])
    test_instance(hypos, ['overcast','cool','normal','TRUE','?'])
    test_instance(hypos, ['rainy','mild','low','FALSE','?'])

In [14]:
main()

--------------------------
['overcast', 'hot', 'normal', 'TRUE', '?']
No Smoothing
yes 1.0
no 0.0
--------------------------
['rainy', 'hot', 'high', 'FALSE', '?']
No Smoothing
yes 0.3665689149560116
no 0.6334310850439885
--------------------------
['overcast', 'cool', 'normal', 'TRUE', '?']
No Smoothing
yes 0.9999999999999999
no 0.0
--------------------------
['rainy', 'mild', 'low', 'FALSE', '?']
No Smoothing
yes 0.0
no 0.0
1 yes humidity normal
correct:  0.5833333333333334
mine: 0.6363636363636364
Conditional probability validation failed
1 no humidity normal
correct:  0.25
mine: 0.2857142857142857
Conditional probability validation failed
1 yes humidity normal
correct:  0.5833333333333334
mine: 0.6363636363636364
Conditional probability validation failed
1 no humidity normal
correct:  0.25
mine: 0.2857142857142857
Conditional probability validation failed
--------------------------
['overcast', 'hot', 'normal', 'TRUE', '?']
Smoothing
yes 0.8500619578686492
no 0.14993804213135065
--