In [1]:
import pandas as pd
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix

In [2]:
df = pd.read_csv('simpsons_dataset.csv')
df.head(10)

Unnamed: 0,raw_character_text,spoken_words
0,Miss Hoover,"No, actually, it was a little of both. Sometim..."
1,Lisa Simpson,Where's Mr. Bergstrom?
2,Miss Hoover,I don't know. Although I'd sure like to talk t...
3,Lisa Simpson,That life is worth living.
4,Edna Krabappel-Flanders,The polls will be open from now until the end ...
5,Martin Prince,I don't think there's anything left to say.
6,Edna Krabappel-Flanders,Bart?
7,Bart Simpson,Victory party under the slide!
8,,
9,Lisa Simpson,Mr. Bergstrom! Mr. Bergstrom!


In [3]:
dfa = df.loc[(df['raw_character_text'] == 'Lisa Simpson') | (df['raw_character_text'] == 'Bart Simpson')]
dfa.head(3)

Unnamed: 0,raw_character_text,spoken_words
1,Lisa Simpson,Where's Mr. Bergstrom?
3,Lisa Simpson,That life is worth living.
7,Bart Simpson,Victory party under the slide!


In [4]:
text = dfa['spoken_words'].values.astype('U')
vect = CountVectorizer(stop_words='english')
vect = vect.fit(text)

In [5]:
matrix = vect.transform(text)
matrix

<25248x14258 sparse matrix of type '<class 'numpy.int64'>'
	with 99014 stored elements in Compressed Sparse Row format>

In [6]:
X = matrix
y = dfa['raw_character_text']
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(X, y, test_size=0.3)

In [7]:
clf = MultinomialNB()
clf.fit(X_train_l, y_train_l)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [18]:
prediction = clf.predict(X_test_l)

In [19]:
clf.score(X_test_l, y_test_l)

0.6418481848184818

In [31]:
cm = confusion_matrix(y_test_l,prediction)
cm = pd.DataFrame(cm, columns=['Bart','Lisa'], index=['Bart','Lisa'])
cm

Unnamed: 0,Bart,Lisa
Bart,3258,897
Lisa,1816,1604


### Precision & Recall

In [45]:
accuracy = (3258 + 1604) / (3258 + 897 + 1816 + 1604)
accuracy

0.6418481848184818

In [50]:
precision = 1064 / (897 + 1604)
precision

0.6420969649191959

In [51]:
precisionb = 3258 / (3258 + 1816)
precisionb

0.6420969649191959

In [36]:
recall = 1604 / (1816 + 1604)
recall

0.46900584795321637

In [53]:
recallb = 897 / (3258 + 897)
recallb

0.21588447653429602

In [71]:
from sklearn.metrics import classification_report
print(classification_report(y_test_l, prediction, clf.classes_))

              precision    recall  f1-score   support

Bart Simpson       0.64      0.78      0.71      4155
Lisa Simpson       0.64      0.47      0.54      3420

 avg / total       0.64      0.64      0.63      7575



### Probabilities text class

In [93]:
print(dfa.iloc[0,1])
clf.predict_proba(X[0])

Where's Mr. Bergstrom?


array([[0.03728865, 0.96271135]])

In [101]:
for i in range(10):
    prob = clf.predict_proba(X[i])
    print(f'Line: {i}. {dfa.iloc[i,1]}')
    print(f'Bart: {prob[0,0]}, Lisa: {prob[0,1]}')

Line: 0. Where's Mr. Bergstrom?
Bart: 0.03728865258316438, Lisa: 0.9627113474168353
Line: 1. That life is worth living.
Bart: 0.7736928813205844, Lisa: 0.2263071186794155
Line: 2. Victory party under the slide!
Bart: 0.748564631623513, Lisa: 0.25143536837648806
Line: 3. Mr. Bergstrom! Mr. Bergstrom!
Bart: 0.0012588723584686674, Lisa: 0.9987411276415321
Line: 4. Do you know where I could find him?
Bart: 0.5392288865603936, Lisa: 0.4607711134396061
Line: 5. The train, how like him... traditional, yet environmentally sound.
Bart: 0.056178846157405544, Lisa: 0.9438211538425949
Line: 6. I see he touched you, too.
Bart: 0.35465334339671495, Lisa: 0.6453466566032856
Line: 7. Hey, thanks for your vote, man.
Bart: 0.9724088956805164, Lisa: 0.027591104319485204
Line: 8. Well, you got that right. Thanks for your vote, girls.
Bart: 0.886144248019463, Lisa: 0.11385575198053415
Line: 9. Well, don't sweat it. Just so long as a couple of people did... right, Milhouse?
Bart: 0.7247621729340845, Lisa: 0

scipy.sparse.csr.csr_matrix