-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathnaive_bayes.c
124 lines (108 loc) · 3.13 KB
/
naive_bayes.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#include <stdio.h>
#include <stdlib.h>
// Configuration for the iris data set (main reads it from dataset/iris.csv):
// every sample has 4 float features and one of 3 integer class labels.
// Number of feature columns per sample.
#define N_FEATURES 4
// Number of distinct class labels; main iterates labels 0 .. N_LABELS - 1.
#define N_LABELS 3
// Capacity (in samples) of the fixed-size arrays that hold the data set.
#define MAX_LENGTH 1000
// Large sentinel; main uses -INF as the initial "best score so far".
#define INF 1000000000
// Shuffles the first `len` elements of `array` in place using a
// Fisher-Yates pass driven by rand().
//
// array: buffer of at least `len` ints; its contents are permuted.
// len:   number of elements to shuffle; values <= 1 leave the array as is.
//
// The sequence depends on the state of rand(); callers that want a different
// permutation on every run must seed it with srand() beforehand.
void random_shuffle(int *array, int len)
{
    // Walk every index up to and including the last one. Stopping one short
    // would leave the final element permanently in place.
    for (int i = 1; i < len; ++i)
    {
        // Partner is drawn from [0, i], so an element may also stay put.
        int pos = rand() % (i + 1);
        int temp = array[i];
        array[i] = array[pos];
        array[pos] = temp;
    }
}
// Entry point: evaluates a simplified naive Bayes classifier on the iris data.
//
// Reads rows of N_FEATURES comma-separated floats followed by an integer
// label from dataset/iris.csv (at most MAX_LENGTH rows), shuffles the rows
// with random_shuffle, holds out the first 1/n_fold of the shuffled rows as
// the test set, classifies each held-out row from counts taken over the
// remaining rows, and prints the resulting accuracy.
//
// Returns 0 on success and EXIT_FAILURE when the data file cannot be opened
// or contains too few rows to form a test set.
int main(void) {
    float x[MAX_LENGTH][N_FEATURES], x_shuffled[MAX_LENGTH][N_FEATURES];
    int y[MAX_LENGTH], y_shuffled[MAX_LENGTH];
    int seed[MAX_LENGTH];
    int length = 0;

    // Load file from csv
    FILE *in_file = fopen("dataset/iris.csv", "r");
    if (in_file == NULL) {
        perror("dataset/iris.csv");
        return EXIT_FAILURE;
    }
    // Stop at the array capacity so an oversized file cannot overflow x / y.
    while (length < MAX_LENGTH) {
        int row_ok = 1;
        for (int j = 0; j < N_FEATURES && row_ok; j++) {
            row_ok = (fscanf(in_file, "%f,", &x[length][j]) == 1);
        }
        // A row only counts when every feature and the label were parsed;
        // end of file or a malformed row ends the load.
        if (!row_ok || fscanf(in_file, "%d\n", &y[length]) != 1) {
            break;
        }
        length++;
    }
    fclose(in_file);

    // Hold-out split: the first split_length shuffled rows are the test set.
    // Only this single fold is evaluated.
    int n_fold = 5;
    int split_length = length / n_fold;
    if (split_length == 0) {
        fprintf(stderr, "Need at least %d samples, got %d\n", n_fold, length);
        return EXIT_FAILURE;
    }
    int train_length = length - split_length;

    // Shuffle a row-index permutation, then gather rows in that order.
    for (int i = 0; i < length; i++) {
        seed[i] = i;
    }
    random_shuffle(seed, length);
    for (int i = 0; i < length; i++) {
        for (int j = 0; j < N_FEATURES; j++) {
            x_shuffled[i][j] = x[seed[i]][j];
        }
        y_shuffled[i] = y[seed[i]];
    }

    // Start NB(simplified)
    int hit_num = 0;
    for (int i = 0; i < split_length; i++) {
        float max_value = -INF;
        int max_label = -1;
        // Find the label with max probability
        for (int label = 0; label < N_LABELS; label++) {
            // y_counter: training rows carrying this label.
            // x_counter[f]: those rows whose feature f equals the test row's.
            int y_counter = 0;
            int x_counter[N_FEATURES] = {0};
            for (int j = split_length; j < length; j++) {
                if (y_shuffled[j] != label) {
                    continue;
                }
                y_counter++;
                for (int f = 0; f < N_FEATURES; f++) {
                    // Feature values are treated as categories, hence the
                    // exact float comparison.
                    if (x_shuffled[j][f] == x_shuffled[i][f]) {
                        x_counter[f]++;
                    }
                }
            }

            // Score = P(label) * product over features of P(x_f | label).
            float hat_value = (float)y_counter / train_length;
            for (int f = 0; f < N_FEATURES; f++) {
                float px_y;
                if (x_counter[f] == 0) {
                    // Smooth unseen values so one miss does not zero the
                    // whole product (also avoids 0/0 for an absent label).
                    px_y = 1 / (float)(y_counter + 1);
                }
                else {
                    px_y = x_counter[f] / (float)y_counter;
                }
                hat_value *= px_y;
            }
            if (hat_value > max_value) {
                max_value = hat_value;
                max_label = label;
            }
        }
        // max_label is the predicted label for test row i.
        if (max_label == y_shuffled[i]) {
            hit_num++;
        }
    }

    // Calculate accuracy
    printf("Final accuracy is %.2f\n", (float)hit_num / split_length);
    return 0;
}