In [1]:
from arff_parser import *

In [2]:
def count_feature(feature_list, feature_col_num, data):
    """Tally how often each candidate value occurs in one column of ``data``.

    Args:
        feature_list: Candidate values to count.
        feature_col_num: Index of the column inspected in every row
            (negative indices count from the end of the row).
        data: Iterable of indexable rows.

    Returns:
        A list parallel to ``feature_list``; entry ``i`` is the number of rows
        whose ``feature_col_num`` column equals ``feature_list[i]``.
    """
    tallies = [0 for _ in feature_list]
    for record in data:
        for position, candidate in enumerate(feature_list):
            # Every equal candidate is credited, so repeated candidates each
            # receive the full count.
            if record[feature_col_num] == candidate:
                tallies[position] += 1
    return tallies

In [3]:
# Load "lymph_train.arff" via arff_data, then count the rows per value of the
# last attribute, reading the last column of every row.
a = arff_data("lymph_train.arff")
count_feature(a.all_attributes[-1].attribute_list, -1, a.data)

[57, 43]

In [4]:
# Display the list of values declared for the last attribute.
a.all_attributes[-1].attribute_list

['metastases', 'malign_lymph']

In [5]:
def count_joint_probability(feature1_list, feature1_col_num, feature2_list, feature2_col_num, data):
    """Estimate Laplace-smoothed P(feature1 = v | feature2 = c) from ``data``.

    Despite the name, the values returned are conditional probabilities: for
    each value ``c`` of feature 2, the counts of feature 1 are normalised by
    the number of rows where feature 2 equals ``c``.

    Args:
        feature1_list: Possible values of the feature being estimated.
        feature1_col_num: Column index of that feature in each row.
        feature2_list: Possible values of the conditioning feature
            (typically the class label).
        feature2_col_num: Column index of the conditioning feature.
        data: Iterable of indexable rows.

    Returns:
        A list with one entry per value in ``feature2_list``. Entry ``i`` is a
        list parallel to ``feature1_list`` holding
        ``(count(v, c_i) + 1) / (count(c_i) + len(feature1_list))``.
    """
    num_values = len(feature1_list)
    conditional_probabilities = []
    for conditioning_value in feature2_list:
        rows_with_value = 0
        value_counts = [0] * num_values
        for row in data:
            # The conditioning count must come from feature2_col_num itself;
            # reading a fixed column here would make the denominators wrong
            # whenever the conditioning feature is not the last column.
            if row[feature2_col_num] != conditioning_value:
                continue
            rows_with_value += 1
            for j, value in enumerate(feature1_list):
                if row[feature1_col_num] == value:
                    value_counts[j] += 1
        # Laplace smoothing: one pseudo-count per possible feature-1 value.
        smoothed_total = rows_with_value + num_values
        conditional_probabilities.append(
            [(count + 1) / smoothed_total for count in value_counts]
        )
    return conditional_probabilities

In [6]:
# Smoothed estimates for the first attribute (column 0), one list per value of
# the last attribute.
count_joint_probability(a.all_attributes[0].attribute_list, 0, a.all_attributes[-1].attribute_list, -1, a.data)

[[0.01639344262295082,
  0.4426229508196721,
  0.32786885245901637,
  0.21311475409836064],
 [0.02127659574468085,
  0.48936170212765956,
  0.23404255319148937,
  0.2553191489361702]]

In [7]:
# NOTE(review): scratch arithmetic — 23 divided by (row count + 4); presumably
# a hand check of a smoothed estimate for a 4-valued attribute. Confirm what
# it was meant to verify.
23/(len(a.data)+4)

0.22115384615384615

In [8]:
# Display the list of values declared for the last attribute.
a.all_attributes[-1].attribute_list

['metastases', 'malign_lymph']

In [9]:
# Display the list of values declared for the first attribute.
a.all_attributes[0].attribute_list

['normal', 'arched', 'deformed', 'displaced']

In [10]:
def calculate_prior(label_feature_list, data):
    """Return the Laplace-smoothed prior probability of every label value.

    Args:
        label_feature_list: Possible label values.
        data: Sequence of rows; the label is read from the last column.

    Returns:
        A list parallel to ``label_feature_list`` whose entries are
        ``(count(label) + 1) / (len(data) + len(label_feature_list))``.
    """
    # Laplace estimates: one pseudo-count for each possible label value.
    smoothed_total = len(data) + len(label_feature_list)
    return [
        (occurrences + 1) / smoothed_total
        for occurrences in count_feature(label_feature_list, -1, data)
    ]

In [11]:
# Smoothed prior of each label value, counted from the last column of a.data.
calculate_prior(a.label.attribute_list, a.data)

[0.5686274509803921, 0.43137254901960786]

In [12]:
# Appears to be a hand check of the first prior: (57 + 1) / (row count + 2).
58/(len(a.data)+2)

0.5686274509803921

In [13]:
def get_all_joint_probability(arffdata):
    """Build the smoothed probability table of every attribute given the label.

    Args:
        arffdata: Object exposing ``attributes`` (each with an
            ``attribute_list``), ``label.attribute_list`` and ``data``.
            Attribute ``i`` is read from column ``i`` of each row and the
            label from the last column.

    Returns:
        A list with one ``count_joint_probability`` result per entry of
        ``arffdata.attributes``, in the same order.
    """
    return [
        count_joint_probability(
            attribute.attribute_list,
            column,
            arffdata.label.attribute_list,
            -1,
            arffdata.data,
        )
        for column, attribute in enumerate(arffdata.attributes)
    ]

In [14]:
# Build the smoothed probability tables for every entry of a.attributes.
get_all_joint_probability(a)

[[[0.01639344262295082,
   0.4426229508196721,
   0.32786885245901637,
   0.21311475409836064],
  [0.02127659574468085,
   0.48936170212765956,
   0.23404255319148937,
   0.2553191489361702]],
 [[0.2542372881355932, 0.7457627118644068],
  [0.7111111111111111, 0.28888888888888886]],
 [[0.7627118644067796, 0.23728813559322035],
  [0.9111111111111111, 0.08888888888888889]],
 [[0.9661016949152542, 0.03389830508474576],
  [0.9555555555555556, 0.044444444444444446]],
 [[0.7288135593220338, 0.2711864406779661],
  [0.8666666666666667, 0.13333333333333333]],
 [[0.4745762711864407, 0.5254237288135594],
  [0.5111111111111111, 0.4888888888888889]],
 [[0.9661016949152542, 0.03389830508474576],
  [0.9111111111111111, 0.08888888888888889]],
 [[0.423728813559322, 0.576271186440678],
  [0.08888888888888889, 0.9111111111111111]],
 [[0.9661016949152542, 0.03389830508474576],
  [0.9555555555555556, 0.044444444444444446]],
 [[0.08196721311475409,
   0.6065573770491803,
   0.26229508196721313,
   0.04918032

In [15]:
# Load "lymph_test.arff" via arff_data and display its first data row.
t = arff_data("lymph_test.arff")
t.data[0]

['displaced',
 'no',
 'no',
 'no',
 'no',
 'yes',
 'no',
 'yes',
 '1',
 '4',
 'oval',
 'lacunar',
 'lac_central',
 'stripped',
 'vesicles',
 'yes',
 'yes',
 '2',
 'malign_lymph']

In [16]:
def label_to_index(row, arffdata):
    """Translate every value of ``row`` into its position in the attribute's value list.

    Args:
        row: Sequence of attribute values; ``row[i]`` belongs to
            ``arffdata.all_attributes[i]``.
        arffdata: Object whose ``all_attributes[i].attribute_list`` holds the
            declared values of column ``i``.

    Returns:
        A list with exactly one index per column of ``row``: the position of
        the first declared value equal to ``row[i]``.

    Raises:
        ValueError: If a value of ``row`` is not among the declared values of
            its attribute. Skipping it silently would shift every later index
            out of alignment with its column.
    """
    feature_index = []
    for column, value in enumerate(row):
        allowed_values = arffdata.all_attributes[column].attribute_list
        try:
            # ``index`` stops at the first match, so each column contributes
            # exactly one entry even if a declared value is repeated.
            feature_index.append(allowed_values.index(value))
        except ValueError:
            raise ValueError(
                "value %r in column %d is not one of the declared values %r"
                % (value, column, allowed_values)
            )
    return feature_index

In [17]:
# Convert the first test row into per-column value indices, looked up in t's
# attribute lists.
label_to_index(t.data[0], t)

[3, 0, 0, 0, 0, 1, 0, 1, 0, 3, 1, 1, 3, 6, 2, 1, 1, 1, 1]

In [18]:
def naive_bayes_learning(arffdata):
    """Fit the Naive Bayes parameters on ``arffdata``.

    Args:
        arffdata: Training data object exposing ``attributes``,
            ``label.attribute_list`` and ``data``.

    Returns:
        A ``(joint_probabilities, prior)`` tuple: the per-attribute tables
        from ``get_all_joint_probability`` and the label priors from
        ``calculate_prior``.
    """
    likelihood_tables = get_all_joint_probability(arffdata)
    class_priors = calculate_prior(arffdata.label.attribute_list, arffdata.data)
    return (likelihood_tables, class_priors)

In [19]:
def naive_bayes_inference(row, train_arffdata, joint_probabilities, prior):
    """Predict the class index of ``row`` with a Naive Bayes model.

    Scores are accumulated as sums of logarithms rather than products of
    probabilities, so rows with many attributes cannot underflow to zero, and
    no normalising division is needed to pick the best class.

    Args:
        row: Attribute values followed by the class label in the last
            position; the last entry is not used as evidence.
        train_arffdata: Data object whose attribute value lists are used to
            turn the values of ``row`` into indices.
        joint_probabilities: ``joint_probabilities[i][c][v]`` is the estimated
            probability of value index ``v`` of attribute ``i`` given class
            index ``c``.
        prior: ``prior[c]`` is the prior probability of class index ``c``.

    Returns:
        The index of the class with the highest posterior score. Ties go to
        the lowest index.

    Raises:
        ValueError: If ``prior`` is empty.
    """
    import math  # local import keeps this cell self-contained

    def log_probability(probability):
        # A zero probability rules the class out without raising in math.log.
        return math.log(probability) if probability > 0 else float("-inf")

    feature_index = label_to_index(row, train_arffdata)
    # The final entry of the row is the class label itself, not evidence.
    num_features = len(feature_index) - 1

    def log_posterior(class_index):
        score = log_probability(prior[class_index])
        for i in range(num_features):
            score += log_probability(joint_probabilities[i][class_index][feature_index[i]])
        return score

    # max() returns the first maximal element, so ties keep the lowest index.
    return max(range(len(prior)), key=log_posterior)

In [20]:
# Learn the probability tables and priors from `a`, then classify the first
# row of `t`.
j, p = naive_bayes_learning(a)
naive_bayes_inference(t.data[0], a, j, p)

1

In [21]:
# Classify every test row and print the returned class index, one per line.
for row in t.data:
    print(naive_bayes_inference(row, a, j, p))

1
1
1
0
0
1
1
1
1
1
0
0
1
1
1
1
0
1
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
0
1
0


In [22]:
# Display the probability tables returned by naive_bayes_learning(a).
j

[[[0.01639344262295082,
   0.4426229508196721,
   0.32786885245901637,
   0.21311475409836064],
  [0.02127659574468085,
   0.48936170212765956,
   0.23404255319148937,
   0.2553191489361702]],
 [[0.2542372881355932, 0.7457627118644068],
  [0.7111111111111111, 0.28888888888888886]],
 [[0.7627118644067796, 0.23728813559322035],
  [0.9111111111111111, 0.08888888888888889]],
 [[0.9661016949152542, 0.03389830508474576],
  [0.9555555555555556, 0.044444444444444446]],
 [[0.7288135593220338, 0.2711864406779661],
  [0.8666666666666667, 0.13333333333333333]],
 [[0.4745762711864407, 0.5254237288135594],
  [0.5111111111111111, 0.4888888888888889]],
 [[0.9661016949152542, 0.03389830508474576],
  [0.9111111111111111, 0.08888888888888889]],
 [[0.423728813559322, 0.576271186440678],
  [0.08888888888888889, 0.9111111111111111]],
 [[0.9661016949152542, 0.03389830508474576],
  [0.9555555555555556, 0.044444444444444446]],
 [[0.08196721311475409,
   0.6065573770491803,
   0.26229508196721313,
   0.04918032