In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from string import digits

In [2]:
def remove_digits(s: str) -> str:
    """Return ``s`` with every ASCII digit (``0``-``9``) stripped out.

    All other characters, whitespace included, are kept in their original
    order.
    """
    # Mapping a code point to None tells str.translate to drop that character.
    digit_deletions = {ord(ch): None for ch in digits}
    return s.translate(digit_deletions)

In [3]:
# Read the training and validation splits from their CSV files.
train, valid = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train.csv', 'data/valid.csv')
)

In [4]:
# Apply the same digit-stripping step to the quote text of both splits.
for frame in (train, valid):
    frame['quotes'] = frame['quotes'].apply(remove_digits)

In [5]:
# Display the last rows of the training frame as a quick sanity check
# of its columns after the digit-stripping step.
train.tail()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,emotion,quotes
3426,2769,2770,god,Do you really mean to tell me the only reason ...
3427,3758,3759,books,Books are no more threatened by Kindle than st...
3428,2631,2632,spirituality,"I can almost picture the disciples faces. ""No,..."
3429,3946,3947,books,Books say: She did this because. Life says: Sh...
3430,4036,4037,knowledge,"Always forgive, but never forget, else you wil..."


In [6]:
# Unigram bag-of-words; binary=True records presence/absence instead of counts,
# and min_df=2 ignores terms seen in fewer than two training documents.
vectorizer = CountVectorizer(
    stop_words=None,
    lowercase=True,
    ngram_range=(1, 1),
    min_df=2,
    binary=True,
)

train_labels, valid_labels = train['emotion'], valid['emotion']

# The vocabulary is learned from the training quotes only; the validation
# quotes are merely projected onto it.
train_features = vectorizer.fit_transform(train['quotes'])
valid_features = vectorizer.transform(valid['quotes'])

In [7]:
# Bernoulli naive Bayes on the binary features; fit() returns the estimator itself.
model = BernoulliNB(fit_prior=True).fit(train_features, train_labels)

# Score the validation split: per-class report first, then overall accuracy.
valid_preds = model.predict(valid_features)
valid_accuracy = accuracy_score(valid_labels, valid_preds)
print(classification_report(valid_labels, valid_preds))
print(f'Accuracy:{valid_accuracy}')

               precision    recall  f1-score   support

        books       0.35      0.83      0.50        30
        death       0.38      0.23      0.29        26
        faith       0.60      0.27      0.37        45
          god       0.36      0.34      0.35        29
    happiness       0.46      0.43      0.44        28
inspirational       0.16      0.44      0.23        34
    knowledge       0.56      0.38      0.45        24
         love       0.38      0.08      0.13        38
 motivational       0.33      0.63      0.44        35
     religion       0.20      0.13      0.16        30
      romance       0.40      0.34      0.37        35
      science       0.47      0.26      0.34        34
 spirituality       0.54      0.17      0.26        41

    micro avg       0.34      0.34      0.34       429
    macro avg       0.40      0.35      0.33       429
 weighted avg       0.40      0.34      0.33       429

Accuracy:0.34032634032634035


In [8]:
# Same unigram bag-of-words as before, but keeping raw term counts
# (no binary flag); min_df=2 still ignores terms seen in fewer than two documents.
vectorizer = CountVectorizer(
    stop_words=None,
    lowercase=True,
    ngram_range=(1, 1),
    min_df=2,
)

train_labels, valid_labels = train['emotion'], valid['emotion']

# Learn the vocabulary from the training quotes, then reuse it for validation.
train_features = vectorizer.fit_transform(train['quotes'])
valid_features = vectorizer.transform(valid['quotes'])

In [9]:
# Multinomial naive Bayes on the count features; fit() returns the estimator itself.
model = MultinomialNB(fit_prior=True).fit(train_features, train_labels)

# Score the validation split: per-class report first, then overall accuracy.
valid_preds = model.predict(valid_features)
valid_accuracy = accuracy_score(valid_labels, valid_preds)
print(classification_report(valid_labels, valid_preds))
print(f'Accuracy: {valid_accuracy}')

               precision    recall  f1-score   support

        books       0.88      0.73      0.80        30
        death       0.32      0.42      0.37        26
        faith       0.57      0.36      0.44        45
          god       0.30      0.48      0.37        29
    happiness       0.45      0.50      0.47        28
inspirational       0.30      0.18      0.22        34
    knowledge       0.38      0.42      0.40        24
         love       0.28      0.21      0.24        38
 motivational       0.53      0.57      0.55        35
     religion       0.23      0.20      0.21        30
      romance       0.35      0.49      0.41        35
      science       0.31      0.44      0.37        34
 spirituality       0.45      0.32      0.37        41

    micro avg       0.40      0.40      0.40       429
    macro avg       0.41      0.41      0.40       429
 weighted avg       0.42      0.40      0.40       429

Accuracy: 0.40093240093240096


In [10]:
# Read the test split from its CSV file.
test = pd.read_csv('data/test.csv')

In [11]:
# Strip digits from the test quotes, mirroring the train/valid preprocessing.
test = test.assign(quotes=test['quotes'].apply(remove_digits))

In [12]:
# transform() (not fit_transform) keeps the vocabulary of the already-fitted
# vectorizer unchanged while encoding the test quotes.
test_labels = test['emotion']
test_features = vectorizer.transform(test['quotes'])

In [13]:
# Score the current model on the test split: per-class report, then accuracy.
test_preds = model.predict(test_features)
test_accuracy = accuracy_score(test_labels, test_preds)
print(classification_report(test_labels, test_preds))
print(f'Accuracy: {test_accuracy}')

               precision    recall  f1-score   support

        books       0.77      0.71      0.74        28
        death       0.40      0.59      0.47        29
        faith       0.57      0.34      0.43        38
          god       0.28      0.39      0.32        28
    happiness       0.61      0.53      0.56        38
inspirational       0.13      0.11      0.12        27
    knowledge       0.64      0.56      0.60        41
         love       0.29      0.23      0.26        43
 motivational       0.62      0.48      0.55        31
     religion       0.24      0.17      0.20        35
      romance       0.20      0.33      0.25        30
      science       0.48      0.56      0.51        36
 spirituality       0.30      0.36      0.33        25

    micro avg       0.41      0.41      0.41       429
    macro avg       0.42      0.41      0.41       429
 weighted avg       0.43      0.41      0.41       429

Accuracy: 0.4125874125874126


In [14]:
# Stack train and valid row-wise (axis 0 is the concat default) so the
# vectorizer can be refit on both splits together.
data = pd.concat([train, valid])

vectorizer = CountVectorizer(
    stop_words=None,
    lowercase=True,
    ngram_range=(1, 1),
    min_df=2,
)

labels = data['emotion']
features = vectorizer.fit_transform(data['quotes'])

# Re-encode the test quotes against the newly learned vocabulary.
test_labels = test['emotion']
test_features = vectorizer.transform(test['quotes'])

In [15]:
# Train multinomial naive Bayes on the combined train+valid features;
# fit() returns the estimator itself.
model = MultinomialNB(fit_prior=True).fit(features, labels)

# Score on the test split: per-class report first, then overall accuracy.
test_preds = model.predict(test_features)
test_accuracy = accuracy_score(test_labels, test_preds)
print(classification_report(test_labels, test_preds))
print(f'Accuracy: {test_accuracy}')

               precision    recall  f1-score   support

        books       0.85      0.79      0.81        28
        death       0.40      0.55      0.46        29
        faith       0.69      0.29      0.41        38
          god       0.24      0.32      0.27        28
    happiness       0.59      0.45      0.51        38
inspirational       0.13      0.15      0.14        27
    knowledge       0.73      0.46      0.57        41
         love       0.29      0.23      0.26        43
 motivational       0.60      0.48      0.54        31
     religion       0.24      0.14      0.18        35
      romance       0.20      0.37      0.26        30
      science       0.43      0.58      0.49        36
 spirituality       0.21      0.32      0.25        25

    micro avg       0.39      0.39      0.39       429
    macro avg       0.43      0.40      0.40       429
 weighted avg       0.44      0.39      0.40       429

Accuracy: 0.3916083916083916


In [16]:
from sklearn.pipeline import Pipeline

# sklearn.externals.joblib was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone joblib package (a scikit-learn dependency) and
# fall back to the vendored copy only on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [17]:
# Bundle the fitted vectorizer and classifier into a single estimator.
# NOTE: the remove_digits preprocessing is not one of the pipeline steps, so
# callers of the saved model must strip digits themselves if they want inputs
# treated exactly as in this notebook.
#
# The isinstance guard keeps this cell safe to re-run: `model` is rebound to
# the pipeline here, so an unguarded second run would nest that pipeline as
# its own 'classifier' step and vectorize the input twice.
if not isinstance(model, Pipeline):
    model = Pipeline([('feature_transformer', vectorizer),
                      ('classifier', model)])

In [18]:
# Serialize `model` to disk; joblib.dump returns the list of file names written.
# NOTE(review): joblib files are pickle-based, so only load them from trusted sources.
joblib.dump(model, 'data/model.pkl')

['data/model.pkl']