In [21]:
# prototype Naive Bayes on IMDB

In [22]:
from modules import IO
import numpy as np
# Directory holding the pickled inputs, relative to the notebook's working directory.
data_folder = './data/'
# Load both inputs through the project's IO helper.
# NOTE(review): presumably read_pickle() unpickles the file -- only load pickles
# that come from a trusted source.
# imdb_genres_list is not referenced again in the visible part of this notebook.
imdb_genres_list = IO(data_folder + 'imdb_genres_list.pkl').read_pickle()
# `movies` is iterated below as a collection of records indexed by string keys
# ('imdb_title', 'imdb_plot', 'imdb_genres').
movies = IO(data_folder + 'top1000_all.pkl').read_pickle()

In [23]:
# Pull the per-movie fields into NumPy arrays so they can be indexed with the
# train/test index arrays built later in the notebook.
titles = np.array([movie['imdb_title'] for movie in movies])
# 'imdb_plot' holds several strings per movie; join them into one document each.
plots = np.array(["\n".join(movie['imdb_plot']) for movie in movies])
# presumably each movie carries a variable-length list of genres -- TODO confirm.
# dtype=object stores each movie's genres as-is instead of relying on NumPy's
# ragged-array inference, which raises ValueError on NumPy >= 1.24.
genres = np.array([movie['imdb_genres'] for movie in movies], dtype=object)

In [24]:
# Sanity check: number of movies loaded.
len(titles)

1000

# Bag of Words (BoW)

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

In [26]:
# Bag-of-words vectorizer, constructed with no arguments (library defaults).
cv = CountVectorizer()

In [27]:
# Reproducible 80/20 split over movie indices: seed the global NumPy RNG, then
# draw the training indices without replacement.
np.random.seed(10)
train_idx = np.random.choice(len(plots), replace=False, size=int(len(plots)*0.8))
# The test set is every index not drawn for training. np.setdiff1d returns the
# indices sorted and always as an integer array (even when empty), so the result
# does not depend on set iteration order and is always usable as an index.
test_idx = np.setdiff1d(np.arange(len(plots)), train_idx)

In [28]:
# Learn the vocabulary from the training plots only, then reuse that same
# vocabulary to encode the test plots (the vectorizer is never fit on test data).
X_train = cv.fit_transform(plots[train_idx])
X_test = cv.transform(plots[test_idx])

# TF-IDF

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer

In [14]:
# TF-IDF re-weighting applied on top of the bag-of-words counts (library defaults).
tfidf = TfidfTransformer()

In [17]:
# Fit the TF-IDF weights on the training counts only and apply the same
# weights to the test counts.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [29]:
from sklearn.preprocessing import MultiLabelBinarizer
# Encode each movie's genres as a binary indicator row (one column per genre).
# The binarizer is fit on all movies, before the train/test split is applied.
mlb=MultiLabelBinarizer()
Y=mlb.fit_transform(genres)

In [33]:
# Split the label matrix with the same index arrays used for the features.
Y_train = Y[train_idx]
Y_test = Y[test_idx]

# Naive Bayes

In [174]:
def precision_recall(gt,preds):
    """Compute precision and recall for one binary indicator vector.

    Parameters
    ----------
    gt : sequence of 0/1
        Ground-truth indicators.
    preds : sequence of 0/1
        Predicted indicators; read at every index of ``gt``.

    Returns
    -------
    (precision, recall)
        precision = TP / (TP + FP) and recall = TP / (TP + FN).
        A metric whose denominator is zero is reported as 0.
    """
    TP=0
    FP=0
    FN=0
    for i in range(len(gt)):
        if gt[i] ==1 and preds[i] == 1:
            TP+=1
        elif gt[i] ==0 and preds[i] == 1:
            # Predicted positive but actually negative: false positive.
            FP+=1
        elif gt[i] ==1 and preds[i] == 0:
            # Actually positive but missed by the prediction: false negative.
            FN+=1
    # Guard each ratio separately so an empty denominator yields 0 rather than
    # a ZeroDivisionError.
    if TP+FP==0:
        precision=0
    else:
        precision=TP/float(TP+FP)
    if TP+FN==0:
        recall=0
    else:
        recall=TP/float(TP+FN)
    return precision,recall

In [201]:
def evaluate(model, LabelPowerset = False, tfidf=False):
    """Fit ``model`` on the training split and print test precision/recall.

    Reads the notebook-level globals ``X_train``/``X_test`` (or
    ``X_train_tfidf``/``X_test_tfidf``), ``Y_train`` and ``Y_test``, and calls
    ``precision_recall`` once per test sample.

    Parameters
    ----------
    model : object with ``fit(X, Y)`` and ``predict(X)``
        Multi-label classifier; it is (re)fitted in place.
    LabelPowerset : bool, optional
        Kept for backward compatibility. Sparse predictions (anything exposing
        ``toarray``) are densified automatically, so this flag is not needed.
    tfidf : bool, optional
        Use the TF-IDF features instead of the raw bag-of-words counts.

    Returns
    -------
    None
        The mean per-sample precision and recall on the test split are printed.
    """
    if tfidf:
        X_tr, X_te = X_train_tfidf, X_test_tfidf
    else:
        X_tr, X_te = X_train, X_test

    model.fit(X_tr, Y_train)

    predictions = model.predict(X_te)
    # Some wrappers return a sparse matrix; row-wise element access below needs
    # a dense array.
    if hasattr(predictions, "toarray"):
        predictions = predictions.toarray()

    precs=[]
    recs=[]
    # Score each test sample's label row on its own, then average over samples.
    for truth_row, pred_row in zip(Y_test, predictions):
        sample_precision, sample_recall = precision_recall(truth_row, pred_row)
        precs.append(sample_precision)
        recs.append(sample_recall)

    print(" Precision:{} Recall :{}" .format(np.mean(np.asarray(precs)), np.mean(np.asarray(recs))))

In [202]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import LabelPowerset

In [203]:
# One-vs-rest: a separate Multinomial Naive Bayes classifier per label column.
nb = OneVsRestClassifier(MultinomialNB())

In [204]:
# One-vs-rest Naive Bayes on the raw bag-of-words counts.
evaluate(nb)

 Precision:0.5583095238095238 Recall :0.7802500000000001


In [205]:
# Same classifier, refitted on the TF-IDF features.
evaluate(nb, tfidf=True)

 Precision:0.04041666666666666 Recall :0.08


In [206]:
# Multinomial Naive Bayes wrapped in scikit-multilearn's LabelPowerset problem
# transformation, evaluated on the raw bag-of-words counts.
# The LabelPowerset flag makes evaluate() call .toarray() on the predictions.
nb2 = LabelPowerset(MultinomialNB())
evaluate(nb2, LabelPowerset= True)

 Precision:0.40416071428571426 Recall :0.48233333333333334


In [207]:
# Label-powerset Naive Bayes, refitted on the TF-IDF features.
evaluate(nb2, LabelPowerset=True, tfidf=True)

 Precision:0.2820297619047619 Recall :0.33833333333333326


# SVM

In [208]:
from sklearn.svm import SVC

In [209]:
# One-vs-rest: a separate linear-kernel SVC per label column.
svm = OneVsRestClassifier(SVC(kernel='linear'))

In [210]:
# One-vs-rest linear SVM on the raw bag-of-words counts.
evaluate(svm)

 Precision:0.5647797619047619 Recall :0.6698333333333334


In [211]:
# Same classifier, refitted on the TF-IDF features.
evaluate(svm, tfidf=True)

 Precision:0.4798928571428571 Recall :0.7661666666666666


In [212]:
# Linear SVC wrapped in scikit-multilearn's LabelPowerset problem transformation.
svm2 = LabelPowerset(SVC(kernel='linear'))
# First on raw bag-of-words counts, then refitted on the TF-IDF features.
evaluate(svm2, LabelPowerset=True)
evaluate(svm2, LabelPowerset=True, tfidf= True)

 Precision:0.45868452380952385 Recall :0.5179166666666667
 Precision:0.46948214285714285 Recall :0.5383333333333333


In [170]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

# Search grid: linear kernel only, three values of the regularisation parameter C.
parameters = {'kernel':['linear'], 'C':[0.01, 0.1, 1.0]}
# Grid search over a class-weight-balanced SVC, scored with micro-averaged F1.
gridCV = GridSearchCV(SVC(class_weight='balanced'), parameters, scoring=make_scorer(f1_score, average='micro'))
# The grid search is the base estimator of the one-vs-rest wrapper, so a
# separate search is run for every label column.
classif = OneVsRestClassifier(gridCV)

In [213]:
# One-vs-rest, grid-searched, class-balanced linear SVM on raw bag-of-words counts.
evaluate(classif)

 Precision:0.5869940476190476 Recall :0.71675
