Skip to content

Commit

Permalink
Added count optimization. Counters are built during the training phase and…
Browse files Browse the repository at this point in the history
… not recalculated on each classification.

Added comment
refactored wcount ccount
  • Loading branch information
Oliviergg committed Apr 23, 2012
1 parent 2089747 commit dee912b
Show file tree
Hide file tree
Showing 6 changed files with 85 additions and 40 deletions.
73 changes: 54 additions & 19 deletions lib/stuff-classifier/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,12 @@ def initialize(name, opts={})
purge_state = opts[:purge_state]

@name = name
@wcount = {}
@ccount = {}

@word_count = {}
@category_count = {}
@training_count=0


@ignore_words = nil
@tokenizer = StuffClassifier::Tokenizer.new(opts)

Expand All @@ -28,36 +32,67 @@ def initialize(name, opts={})
end

def incr_word(word, category)
@wcount[word] ||= {}
@wcount[word][category] ||= 0
@wcount[word][category] += 1
@word_count[word] ||= {}

@word_count[word][category] ||= 0
@word_count[word][category] += 1

@word_count[word][:_total_word] ||= 0
@word_count[word][:_total_word] += 1


# Total word count
@word_count[:_total_word]||=0
@word_count[:_total_word]+=1

# word count by category
@category_count[category] ||= {}
@category_count[category][:_total_word] ||= 0
@category_count[category][:_total_word] += 1

end

def incr_cat(category)
@ccount[category] ||= 0
@ccount[category] += 1
@category_count[category] ||= {}
@category_count[category][:_count] ||= 0
@category_count[category][:_count] += 1

@training_count ||= 0
@training_count += 1

end

# return number of time the word appears in a category
def word_count(word, category)
return 0.0 unless @wcount[word] && @wcount[word][category]
@wcount[word][category].to_f
return 0.0 unless @word_count[word] && @word_count[word][category]
@word_count[word][category].to_f
end

def cat_count(category)
@ccount[category] ? @ccount[category].to_f : 0.0

# Total occurrences of +word+ summed over every category, as a Float.
# Yields 0.0 when the word was never seen or has no :_total_word counter.
def total_word_count(word)
  per_word = @word_count[word]
  overall = per_word && per_word[:_total_word]
  overall ? overall.to_f : 0.0
end

def total_in_cat(category)
# this has to be optimized
@wcount.find_all{|k,v| v.member? category}.map{|k,v| v[category]}.inject(0){|a,b| a+b}
# Return the total number of word occurrences recorded for a category.
#
# Reads the :_total_word counter stored under @category_count[cat].
#
# cat - the category key used during training.
#
# Returns a Float; 0.0 when the category is unknown or has no word total.
def total_word_count_in_cat(cat)
  # The leftover debug `p` calls were removed: they printed the category
  # and the whole category table to stdout on every lookup.
  return 0.0 unless @category_count[cat] && @category_count[cat][:_total_word]
  @category_count[cat][:_total_word].to_f
end

# return the total number of training samples seen (all categories combined)
def total_count
@ccount.values.inject(0){|s,c| s + c}.to_f
@training_count
end

# Return the training document count for a category.
#
# Reads the :_count counter stored under @category_count[category].
#
# category - the category key used during training.
#
# Returns a Float; 0.0 when the category is unknown or has no :_count yet.
def cat_count(category)
  # Guard the outer lookup: an unknown category has no entry at all, and
  # indexing nil would raise NoMethodError instead of yielding 0.0.
  counters = @category_count[category]
  return 0.0 unless counters && counters[:_count]
  counters[:_count].to_f
end

def categories
@ccount.keys
@category_count.keys
end

def train(category, text)
Expand All @@ -66,7 +101,7 @@ def train(category, text)
end

def word_prob(word, cat)
total_words_in_cat = total_in_cat(cat)
total_words_in_cat = total_word_count_in_cat(cat)
return 0.0 if total_words_in_cat == 0
word_count(word, cat).to_f / total_words_in_cat
end
Expand All @@ -79,7 +114,7 @@ def word_weighted_average(word, cat, opts={})

# count the number of times this word has appeared in all
# categories
totals = categories.map{|c| word_count(word, c)}.inject(0){|s,c| s + c}
totals = total_word_count(word)

# the final weighted average
(@weight * @assumed_prob + totals * basic_prob) / (@weight + totals)
Expand Down
7 changes: 7 additions & 0 deletions lib/stuff-classifier/bayes.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,13 @@ class StuffClassifier::Bayes < StuffClassifier::Base

attr_writer :thresholds

# opts :
# language
# stemming : true | false
# weight
# assumed_prob
# storage
# purge_state ?
def initialize(name, opts={})
super(name, opts)
@thresholds = {}
Expand Down
27 changes: 15 additions & 12 deletions lib/stuff-classifier/storage.rb
Original file line number Diff line number Diff line change
Expand Up @@ -8,19 +8,22 @@ def initialize

def load_state(classifier)
if @storage.key? classifier.name
_wcount, _ccount = @storage[classifier.name]
_word_count, _category_count,_word_total,_category_total = @storage[classifier.name]
classifier.instance_eval do
@wcount = _wcount
@ccount = _ccount
@word_count = _word_count
@category_count = _category_count

@word_total = _word_total
@category_total = _category_total
end
end
end

def save_state(classifier)
name = classifier.name
wcount = classifier.instance_variable_get :@wcount
ccount = classifier.instance_variable_get :@ccount
@storage[name] = [wcount, ccount]
word_count = classifier.instance_variable_get :@word_count
category_count = classifier.instance_variable_get :@category_count
@storage[name] = [word_count, category_count]
end

def purge_state(classifier)
Expand All @@ -40,19 +43,19 @@ def load_state(classifier)
end

if @storage.key? classifier.name
_wcount, _ccount = @storage[classifier.name]
_word_count, _category_count = @storage[classifier.name]
classifier.instance_eval do
@wcount = _wcount
@ccount = _ccount
@word_count = _word_count
@category_count = _category_count
end
end
end

def save_state(classifier)
name = classifier.name
wcount = classifier.instance_variable_get :@wcount
ccount = classifier.instance_variable_get :@ccount
@storage[name] = [wcount, ccount]
word_count = classifier.instance_variable_get :@word_count
category_count = classifier.instance_variable_get :@category_count
@storage[name] = [word_count, category_count]
_write_to_file
end

Expand Down
2 changes: 1 addition & 1 deletion lib/stuff-classifier/tf-idf.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ def tf_idf(word, cat)
tf = 1.0 * word_cat_nr / cat_nr

total_categories = categories.length
categories_with_word = (@wcount[word] || []).length
categories_with_word = (@word_count[word] || []).length

idf = Math.log10((total_categories + 2) / (categories_with_word + 1.0))
return tf * idf
Expand Down
8 changes: 4 additions & 4 deletions test/test_004_in_memory_storage.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,16 @@ def test_for_persistance
StuffClassifier::Bayes.new("Cats or Dogs").instance_eval do
test.assert @storage.instance_of?(StuffClassifier::InMemoryStorage),
"@storage should be an instance of FileStorage"
test.assert @wcount.length > 0, "Word count should be persisted"
test.assert @ccount.length > 0, "Category count should be persisted"
test.assert @word_count.length > 0, "Word count should be persisted"
test.assert @category_count.length > 0, "Category count should be persisted"
end
end

def test_purge_state
test = self
StuffClassifier::Bayes.new("Cats or Dogs", :purge_state => true).instance_eval do
test.assert @wcount.length == 0, "Word count should be purged"
test.assert @ccount.length == 0, "Category count should be purged"
test.assert @word_count.length == 0, "Word count should be purged"
test.assert @category_count.length == 0, "Category count should be purged"
end
end
end
8 changes: 4 additions & 4 deletions test/test_005_file_storage.rb
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ def test_for_persistance
StuffClassifier::Bayes.new("Cats or Dogs").instance_eval do
test.assert @storage.instance_of?(StuffClassifier::FileStorage),
"@storage should be an instance of FileStorage"
test.assert @wcount.length > 0, "Word count should be persisted"
test.assert @ccount.length > 0, "Category count should be persisted"
test.assert @word_count.length > 0, "Word count should be persisted"
test.assert @category_count.length > 0, "Category count should be persisted"
end
end

Expand All @@ -47,8 +47,8 @@ def test_purge_state
StuffClassifier::Bayes.new("Cats or Dogs", :purge_state => true).instance_eval do
test.assert @storage.instance_of?(StuffClassifier::FileStorage),
"@storage should be an instance of FileStorage"
test.assert @wcount.length == 0, "Word count should be purged"
test.assert @ccount.length == 0, "Category count should be purged"
test.assert @word_count.length == 0, "Word count should be purged"
test.assert @category_count.length == 0, "Category count should be purged"
end
end
end

0 comments on commit dee912b

Please sign in to comment.