## simple naive bayes example

https://monkeylearn.com/blog/practical-explanation-naive-bayes-classifier/

In [1]:
import pandas

In [2]:
# read in a data file that represnts sport and non-sports sentences
df_sentences = pandas.read_csv("sport_text.csv")
df_sentences

Unnamed: 0,Text,Category
0,A great game,Sports
1,The election was over,Not sports
2,Very clean match,Sports
3,A clean but forgettable game,Sports
4,It was a close election,Not sports


In [3]:
# want to know if this sentence is a sport or non-sports commemt
test_sentence = "A very close game"
test_words = test_sentence.lower().split()

test_words

['a', 'very', 'close', 'game']

In [4]:
# number of words
total_words = 0
for ii,rr in df_sentences.iterrows():
    print(rr.Text)
    total_words +=  len(rr.Text.split())

print(total_words)

A great game
The election was over
Very clean match
A clean but forgettable game
It was a close election
20


In [5]:
# get a data frame of just words 
# number of word
total_words = 0
df_words = pandas.DataFrame()
for ii,rr in df_sentences.iterrows():
    print(rr.Text)
    words =  rr.Text.lower().split()
    for w in words:
        dd = { "word": w, "Category": rr.Category, "count": 1}
        df_words = df_words.append(dd, ignore_index=True)
df_words

A great game
The election was over
Very clean match
A clean but forgettable game
It was a close election


Unnamed: 0,Category,count,word
0,Sports,1.0,a
1,Sports,1.0,great
2,Sports,1.0,game
3,Not sports,1.0,the
4,Not sports,1.0,election
5,Not sports,1.0,was
6,Not sports,1.0,over
7,Sports,1.0,very
8,Sports,1.0,clean
9,Sports,1.0,match


In [6]:

df_words_sum = df_words.pivot_table(index='word', values='count', columns='Category', aggfunc=sum, fill_value=0 )
df_words_sum

Category,Not sports,Sports
word,Unnamed: 1_level_1,Unnamed: 2_level_1
a,1,2
but,0,1
clean,0,2
close,1,0
election,2,0
forgettable,0,1
game,0,2
great,0,1
it,1,0
match,0,1


In [7]:
# number of words in each category (we need this sum to normalize)
df_category_sum = df_words_sum.sum()
df_category_sum

Category
Not sports     9
Sports        11
dtype: int64

In [9]:
df_words_sum / df_category_sum

Category,Not sports,Sports
word,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.111111,0.181818
but,0.0,0.090909
clean,0.0,0.181818
close,0.111111,0.0
election,0.222222,0.0
forgettable,0.0,0.090909
game,0.0,0.181818
great,0.0,0.090909
it,0.111111,0.0
match,0.0,0.090909


In [8]:
# get conditional probabilities 
test_filter = df_words_sum.index.isin(test_words)
df_words_sum[test_filter] / df_category_sum

Category,Not sports,Sports
word,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.111111,0.181818
close,0.111111,0.0
game,0.0,0.181818
very,0.0,0.090909


In [None]:
# this is BAD because there are 0's and that doesnt help us.  
# so we have to try a smoothing trick by adding 1 to each word count 
# trick is called "Laplace Smoothing"


In [10]:
df_words_sum2 = df_words_sum + 1
df_words_sum2

Category,Not sports,Sports
word,Unnamed: 1_level_1,Unnamed: 2_level_1
a,2,3
but,1,2
clean,1,3
close,2,1
election,3,1
forgettable,1,2
game,1,3
great,1,2
it,2,1
match,1,2


In [11]:
df_category_sum2 = df_words_sum2.sum()
df_category_sum2

Category
Not sports    23
Sports        25
dtype: int64

In [12]:
test_filter = df_words_sum2.index.isin(test_words)
df_probability2 = df_words_sum2[test_filter] / df_category_sum2
df_probability2


Category,Not sports,Sports
word,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.086957,0.12
close,0.086957,0.04
game,0.043478,0.12
very,0.043478,0.08


In [18]:
df_probability2

Category,Not sports,Sports
word,Unnamed: 1_level_1,Unnamed: 2_level_1
a,0.086957,0.12
close,0.086957,0.04
game,0.043478,0.12
very,0.043478,0.08


In [15]:
# need to multiply all the probabilities to see which on is the highest


sports = 1
non_sports = 1

for ii,rr in df_probability2.iterrows():
    print(ii)
    print(rr)
    sports *= rr['Sports']
    non_sports *= rr['Not sports']
    
print(sports, non_sports)

a
Category
Not sports    0.086957
Sports        0.120000
Name: a, dtype: float64
close
Category
Not sports    0.086957
Sports        0.040000
Name: close, dtype: float64
game
Category
Not sports    0.043478
Sports        0.120000
Name: game, dtype: float64
very
Category
Not sports    0.043478
Sports        0.080000
Name: very, dtype: float64
4.607999999999999e-05 1.4293831139825827e-05


In [None]:
# so we see that NB guesses this is a sport sentence!!


In [14]:
sports /(sports + non_sports), non_sports /(sports + non_sports), 

(0.763244590082062, 0.23675540991793792)

In [None]:

# 0. try "a very close baseball game"
# 1. mess w/ the laplace smoothing number 
# 2. try other sentences
# 3. add some to the training set
