In [1]:
# Prototype: Naive Bayes and linear SVM 'Thriller' classifiers on IMDB plot text

In [2]:
from modules import IO
import numpy as np
# Directory holding the pickled inputs; the file names below are appended to it.
data_folder = './data/'
# NOTE(review): read_pickle presumably unpickles the file — only point this at
# trusted data. imdb_genres_list is not referenced again in the cells shown here.
imdb_genres_list = IO(data_folder + 'imdb_genres_list.pkl').read_pickle()
# Movie records; later cells read each record's 'imdb_title', 'imdb_plot'
# and 'imdb_genres' entries.
movies = IO(data_folder + 'top1000_all.pkl').read_pickle()

In [3]:
# Pull the per-movie fields into parallel arrays so that the same index arrays
# (train_idx / test_idx) can be used to select titles, plots and genres.
titles = np.array([m['imdb_title'] for m in movies])
# Each movie's 'imdb_plot' entries are joined with newlines into one document.
plots = np.array(["\n".join(m['imdb_plot']) for m in movies])
# Genre collections can differ in length from movie to movie. Passing such a
# ragged nested list straight to np.array raises ValueError on recent NumPy
# (and silently becomes a 2-D array when all lengths happen to match), so build
# an explicit 1-D object array holding one genre collection per movie.
genres = np.empty(len(titles), dtype=object)
for movie_pos, movie in enumerate(movies):
    genres[movie_pos] = movie['imdb_genres']

In [4]:
# Sanity check: number of movie records that were loaded.
len(titles)

1000

# Bag-of-words (BoW) features

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

In [6]:
# Bag-of-words vectorizer with default settings (token counts per document).
cv = CountVectorizer()

In [7]:
# Fixed seed so the 80/20 split below is the same on every run.
np.random.seed(10)
# Sample 80% of the movie positions, without replacement, for training.
train_idx = np.random.choice(len(plots), replace=False, size=int(len(plots)*0.8))
# Held-out positions are everything that was not sampled. np.setdiff1d returns
# a sorted integer array, so the order does not depend on set iteration order
# and an empty complement is still usable as an index array.
test_idx = np.setdiff1d(np.arange(len(plots)), train_idx)

In [8]:
# Fit the vectorizer on the training plots only, then reuse the fitted
# vectorizer for the held-out plots so both matrices share one set of columns.
X_train = cv.fit_transform(plots[train_idx])
X_test = cv.transform(plots[test_idx])

# TF-IDF weighting

In [9]:
from sklearn.feature_extraction.text import TfidfTransformer

In [10]:
# TF-IDF re-weighting with default settings, applied on top of the count matrices.
tfidf = TfidfTransformer()

In [11]:
# Fit the transformer on the training counts only; the held-out counts are
# transformed with that same fitted transformer.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Binary labels: 'Thriller' vs. everything else

In [12]:
def get_binary_y(genres, class_name):
    """Build a binary label vector for a single genre.

    Parameters
    ----------
    genres : iterable
        One entry per sample; each entry must support the ``in`` operator
        (presumably a list of genre names per movie -- confirm against the
        loaded data).
    class_name : object
        The genre to test for, e.g. ``'Thriller'``.

    Returns
    -------
    numpy.ndarray
        1-D boolean array with one element per entry of ``genres``; an element
        is True when ``class_name in entry`` holds.
    """
    # dtype is forced to bool so that an empty input yields an empty boolean
    # array rather than NumPy's default empty float64 array.
    return np.array([class_name in g for g in genres], dtype=bool)

In [13]:
# Binary 'Thriller' targets, selected with the same index arrays as the
# feature matrices.
Y_train_thriller = get_binary_y(genres[train_idx], 'Thriller')
Y_test_thriller = get_binary_y(genres[test_idx], 'Thriller')

# Naive Bayes

In [14]:
def evaluate(model, tfidf=False):
    """Fit ``model`` on the 'Thriller' task and print train/test accuracy and F1.

    Relies on notebook globals: the feature matrices ``X_train`` / ``X_test``
    (or ``X_train_tfidf`` / ``X_test_tfidf`` when ``tfidf`` is true), the
    targets ``Y_train_thriller`` / ``Y_test_thriller``, and ``f1_score``, which
    must already be imported when this function is called.

    Parameters
    ----------
    model : object
        Estimator exposing ``fit(X, y)``, ``score(X, y)`` and ``predict(X)``.
        It is fitted in place.
    tfidf : bool, optional
        Use the TF-IDF matrices instead of the raw count matrices.

    Returns
    -------
    None
        The four metrics are printed, not returned.
    """
    # Only the selected pair of globals is looked up.
    features_train, features_test = (
        (X_train_tfidf, X_test_tfidf) if tfidf else (X_train, X_test)
    )

    model.fit(features_train, Y_train_thriller)

    # (features, targets) pairs in reporting order: train first, then test.
    splits = (
        (features_train, Y_train_thriller),
        (features_test, Y_test_thriller),
    )
    accuracies = [model.score(features, targets) for features, targets in splits]
    f1_values = [f1_score(targets, model.predict(features)) for features, targets in splits]

    print("Acc: {} (train)   {} (test)".format(*accuracies))
    print("F1: {} (train)   {} (test)".format(*f1_values))

In [15]:
from sklearn.naive_bayes import MultinomialNB
# evaluate() looks up f1_score as a global at call time, so this import has to
# run before evaluate() is first called.
from sklearn.metrics import f1_score
# Multinomial Naive Bayes with default hyper-parameters.
nb = MultinomialNB()

In [16]:
# Naive Bayes on the raw count features (tfidf defaults to False).
evaluate(nb)

Acc: 0.9925 (train)   0.815 (test)
F1: 0.9898648648648649 (train)   0.672566371681416 (test)


In [17]:
# Naive Bayes again, this time refitted on the TF-IDF-weighted features.
evaluate(nb, tfidf=True)

Acc: 0.65875 (train)   0.68 (test)
F1: 0.14420062695924765 (train)   0.0 (test)


  'precision', 'predicted', average, warn_for)


# SVM

In [18]:
from sklearn.svm import SVC

In [19]:
# Support vector classifier with a linear kernel; all other settings are defaults.
svm = SVC(kernel='linear')

In [20]:
# Linear SVM on the raw count features (tfidf defaults to False).
evaluate(svm)

Acc: 1.0 (train)   0.77 (test)
F1: 1.0 (train)   0.6290322580645162 (test)


In [21]:
# Linear SVM again, this time refitted on the TF-IDF-weighted features.
evaluate(svm, tfidf=True)

Acc: 0.98875 (train)   0.785 (test)
F1: 0.9846153846153847 (train)   0.5742574257425742 (test)
