Skip to content

Commit

Permalink
Use a min heap to store high frequencies
Browse files Browse the repository at this point in the history
  • Loading branch information
nicolaasuni committed Oct 30, 2018
1 parent 696a6bb commit 40e9b9d
Show file tree
Hide file tree
Showing 2 changed files with 94 additions and 81 deletions.
112 changes: 69 additions & 43 deletions src/wordfreq.h
Original file line number Diff line number Diff line change
Expand Up @@ -143,13 +143,12 @@ static inline void free_trie_node(trie_node_t *node)
*/
typedef struct hifreq_item_t
{
trie_node_t *node; //!< Pointers to trie leaf node.
trie_node_t *node; //!< Pointer to trie leaf node.
char word[MAX_WORD_LENGTH]; //!< Word.
} hifreq_item_t;

/**
* Struct containing the list of high-frequency words.
* The list is sorted in descending order.
* Struct containing the list of high-frequency words (min heap).
* While the heap is being built, the minimum frequency word is at position 1;
* order_hifreq() later rearranges the items in descending order.
*/
typedef struct hifreq_t
Expand Down Expand Up @@ -197,47 +196,47 @@ static inline void free_hifreq(hifreq_t *hf)
}

/**
* Add new node in hifreq.
* Swap two hifreq items.
*
* @param hf Pointer to hifreq object.
* @param node Pointer to a trie node.
* @param word Current word.
* @param hf Pointer to hifreq object.
* @param a Position of the first item.
* @param b Position of the second item.
*/
static inline void insert_hifreq_node(hifreq_t *hf, trie_node_t *node, const char *word)
static void swap_items(hifreq_t *hf, uint8_t a, uint8_t b)
{
++(hf->count);
uint8_t i, j;
for (i = 1; ((i < hf->count) && (node->freq < hf->item[i].node->freq)); i++) {};
for (j = hf->count; j > i; j--)
{
hf->item[j] = hf->item[(j - 1)];
hf->item[j].node->hfidx = j;
}
node->hfidx = i;
hf->item[i].node = node;
memcpy(hf->item[i].word, word, MAX_WORD_LENGTH);
hf->item[a].node->hfidx = b;
hf->item[b].node->hfidx = a;
hifreq_item_t tmp = hf->item[a];
hf->item[a] = hf->item[b];
hf->item[b] = tmp;
}


/**
* Update the frequency of an existing hifreq node.
* NOTE: the frequency always increases.
* Heapify the min heap.
*
* @param hf Pointer to hifreq object.
* @param node Pointer to a trie node.
* @param word Current word.
* @param hf Pointer to hifreq object.
* @param idx Item position.
*/
static inline void update_hifreq_node(hifreq_t *hf, trie_node_t *node, const char *word)
static void heapify(hifreq_t *hf, uint8_t idx)
{
uint8_t i = node->hfidx;
while ((i > 1) && (node->freq > hf->item[(i - 1)].node->freq))
uint8_t left, right, small;
left = (2 * idx);
right = (left + 1);
small = idx;
if ((left <= hf->count) && (hf->item[left].node->freq < hf->item[small].node->freq))
{
small = left;
}
if ((right <= hf->count) && (hf->item[right].node->freq < hf->item[small].node->freq))
{
small = right;
}
if (small != idx)
{
hf->item[i] = hf->item[(i - 1)];
hf->item[i].node->hfidx = i;
--i;
swap_items(hf, small, idx);
heapify(hf, small);
}
node->hfidx = i;
hf->item[i].node = node;
memcpy(hf->item[i].word, word, MAX_WORD_LENGTH);
}

/**
Expand All @@ -252,24 +251,52 @@ static inline void update_hifreq(hifreq_t *hf, trie_node_t *node, const char *wo
// update existing node (word)
if (node->hfidx != 0)
{
update_hifreq_node(hf, node, word);
heapify(hf, node->hfidx);
return;
}
// add new node (word) - append at the end and restore the min heap
if (hf->count < hf->size)
{
insert_hifreq_node(hf, node, word);
++(hf->count);
node->hfidx = hf->count;
hf->item[hf->count].node = node;
memcpy(hf->item[hf->count].word, word, MAX_WORD_LENGTH);
for (uint8_t i = (hf->count / 2); i > 0; --i)
{
heapify(hf, i);
}
return;
}
// replace min frequency node (word)
if (node->freq > hf->item[hf->count].node->freq)
if (node->freq > hf->item[1].node->freq)
{
hf->item[hf->count].node->hfidx = 0;
hf->item[hf->count].node = NULL;
hf->item[hf->count].word[0] = 0;
hf->item[1].node->hfidx = 0;
node->hfidx = 1;
hf->item[1].node = node;
memcpy(hf->item[1].word, word, MAX_WORD_LENGTH);
heapify(hf, node->hfidx);
}
}

/**
* Reorder the items in descending order.
*
* @param hf Pointer to hifreq object.
*/
static inline void order_hifreq(hifreq_t *hf)
{
uint8_t count = hf->count;
while (hf->count > 2)
{
swap_items(hf, 1, hf->count);
--(hf->count);
insert_hifreq_node(hf, node, word);
heapify(hf, 1);
}
if (hf->count == 2)
{
swap_items(hf, 1, 2);
}
hf->count = count;
}

/**
Expand Down Expand Up @@ -316,14 +343,13 @@ static inline void parse_data(const uint8_t *src, uint64_t size, trie_node_t *ro
node->isend = true;
++(node->freq);
root->isend = false;
order_hifreq(hf);
}

/**
* Print the high frequency words.
*
* @param node Pointer to a trie node.
* @param str String buffer.
* @param pos Character position inside the string.
* @param hf Pointer to hifreq object.
*/
static inline void print_hifreq(hifreq_t *hf)
{
Expand Down
63 changes: 25 additions & 38 deletions test/test_wordfreq.c
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,10 @@ int test_wordfreq()
return 0;
}

int test_parse_data()
int test_parse_data(const char *file, uint32_t *freq, uint8_t k)
{
// memory-map the input file
mmfile_t mf = {0,0,0};
char file[] = "mobydick.txt";
mmap_file(file, &mf);
if (mf.fd < 0)
{
Expand All @@ -51,7 +50,6 @@ int test_parse_data()
return 1;
}

static const uint8_t k = 100;
hifreq_t *hf = new_hifreq(k);
if (!hf)
{
Expand All @@ -62,20 +60,7 @@ int test_parse_data()
parse_data(mf.src, mf.size, root, hf);

int errors = 0;
uint32_t freq[] =
{
0,
4284, 2192, 2185, 1861, 1685, 1366, 1056, 1024, 889, 821,
783, 616, 603, 595, 577, 564, 551, 542, 541, 458,
452, 419, 410, 384, 366, 362, 362, 347, 345, 342,
327, 320, 288, 274, 264, 263, 259, 254, 234, 231,
227, 214, 212, 211, 199, 199, 192, 192, 191, 189,
186, 186, 184, 182, 179, 178, 178, 176, 171, 168,
167, 163, 161, 156, 154, 152, 149, 147, 147, 143,
142, 138, 132, 129, 127, 126, 126, 125, 124, 124,
123, 123, 122, 122, 119, 119, 117, 112, 111, 109,
108, 107, 104, 104, 101, 101, 100, 98, 97, 97,
};

if (hf->count != k)
{
fprintf(stderr, "%s ERROR: expected (%" PRIu8 ") results, got %" PRIu8 "\n", __func__, k, hf->count);
Expand Down Expand Up @@ -111,27 +96,29 @@ int main()
int errors = 0;

errors += test_wordfreq();
errors += test_parse_data();

return errors;
}


















uint32_t freq[] =
{
0,
4284, 2192, 2185, 1861, 1685, 1366, 1056, 1024, 889, 821,
783, 616, 603, 595, 577, 564, 551, 542, 541, 458,
452, 419, 410, 384, 366, 362, 362, 347, 345, 342,
327, 320, 288, 274, 264, 263, 259, 254, 234, 231,
227, 214, 212, 211, 199, 199, 192, 192, 191, 189,
186, 186, 184, 182, 179, 178, 178, 176, 171, 168,
167, 163, 161, 156, 154, 152, 149, 147, 147, 143,
142, 138, 132, 129, 127, 126, 126, 125, 124, 124,
123, 123, 122, 122, 119, 119, 117, 112, 111, 109,
108, 107, 104, 104, 101, 101, 100, 98, 97, 97,
};
errors += test_parse_data("mobydick.txt", freq, 100);

uint32_t freq2[] =
{
0,
10, 9, 8, 7, 6, 5, 4, 3, 2, 2
};
errors += test_parse_data("test01.txt", freq2, 10);

return errors;
}

0 comments on commit 40e9b9d

Please sign in to comment.