In [1]:
# prototype Naive Bayes on TMDB

In [2]:
import numpy as np

In [3]:
import pickle
tmdb_genres_list = pickle.load(open('data/tmdb_genres_list.pkl', "rb" ))

In [4]:
movies = pickle.load(open('data/tmdb_processed.pkl', "rb" ))

In [5]:
movies = movies[130973:]

In [6]:
k = len(tmdb_genres_list)
k

19

In [7]:
titles = np.array([m['title'] for m in movies])
plots = np.array([m['overview'] for m in movies])
genres = np.array([m['genre_ids'] for m in movies])

In [8]:
len(titles)

30000

# BoW

In [9]:
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
cv = CountVectorizer(max_df=0.95)

In [11]:
train_idx = np.random.choice(len(plots), replace=False, size=int(len(plots)*0.8))
test_idx = set(range(len(plots))) - set(train_idx)
test_idx = np.array(list(test_idx))

In [12]:
X_train = cv.fit_transform(plots[train_idx])
X_test = cv.transform(plots[test_idx])

# TFIDF

In [13]:
from sklearn.feature_extraction.text import TfidfTransformer

In [14]:
tfidf = TfidfTransformer()

In [15]:
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [16]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb=MultiLabelBinarizer()
Y=mlb.fit_transform(genres)

In [17]:
Y_train = Y[train_idx]
Y_test = Y[test_idx]

In [18]:
X_train_tfidf

<24000x56241 sparse matrix of type '<class 'numpy.float64'>'
	with 1056188 stored elements in Compressed Sparse Row format>

# Naive Bayes

In [19]:
def precision_recall(gt,preds):
    TP=0
    FP=0
    FN=0
    for i in range(len(gt)):
        if gt[i] ==1 and preds[i] == 1:
            TP+=1
        elif gt[i] ==0 and preds[i] == 1:
            FN+=1
        elif gt[i] ==1 and preds[i] == 0:
            FP+=1
    if TP+FP==0:
        precision=0
    else:
        precision=TP/float(TP+FP)
    if TP+FN==0:
        recall=0
    else:
        recall=TP/float(TP+FN)
    if TP + FP + FN == 0:
        f1 = 0
    else:
        f1 = 2 * TP / float(2 * TP + FP + FN)
    return precision,recall,f1

In [20]:
def hamming_loss(gt, preds):
    err = 0
    for i in range(len(gt)):
        if (gt[i] ==0 and preds[i] == 1) or (gt[i] ==1 and preds[i] == 0):
            err +=1
    return err

In [21]:
def evaluate(model, LabelPowerset = False, tfidf=False):
    if tfidf:
        X_tr = X_train_tfidf
        X_te = X_test_tfidf
    else:
        X_tr = X_train
        X_te = X_test
    
    model.fit(X_tr, Y_train)
#     train_acc = model.score(X_tr,Y_train)
#     test_acc = model.score(X_te,Y_test)
    
    if LabelPowerset:
        train_pred = model.predict(X_tr).toarray()
        test_pred = model.predict(X_te).toarray()
    else:
        train_pred = model.predict(X_tr)
        test_pred = model.predict(X_te)
        
    train_precs=[]
    train_recs=[]
    test_precs=[]
    test_recs=[]
    train_f1=[]
    test_f1=[]
#     train_h_loss = []
#     test_h_loss = []
    
    for i in range(len(Y_test)):
        a,b,c=precision_recall(Y_train[i],train_pred[i])
        train_precs.append(a)
        train_recs.append(b)
        train_f1.append(c)
        a,b,c=precision_recall(Y_test[i],test_pred[i])
        test_precs.append(a)
        test_recs.append(b)
        test_f1.append(c)
#         train_h_loss.append(hamming_loss(Y_train[i],train_pred[i]))
#         test_h_loss.append(hamming_loss(Y_test[i],test_pred[i]))
    
    #print("Acc: {} (train)   {} (test)".format(train_acc, test_acc))
    print(" Training Precision:{} Recall :{} f1 score:{}" .format(np.mean(np.asarray(train_precs)), np.mean(np.asarray(train_recs)), np.mean(np.asarray(train_f1))))
    print(" Test Precision:{} Recall :{} f1 score:{}" .format(np.mean(np.asarray(test_precs)), np.mean(np.asarray(test_recs)), np.mean(np.asarray(test_f1))))
    #print(" Training Hamming Loss:{} Test Hamming Loss :{}" .format(np.mean(np.asarray(train_h_loss)/k), np.mean(np.asarray(test_h_loss)/k)))
    
    
    #print("F1: {} (train)   {} (test)".format(train_f1, test_f1))

In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import LabelPowerset
from sklearn.ensemble import RandomForestClassifier

In [23]:
nb = OneVsRestClassifier(MultinomialNB())

In [24]:
evaluate(nb)

 Training Precision:0.5067916666666666 Recall :0.5699949613295202 f1 score:0.5156456543456543
 Test Precision:0.3336126984126984 Recall :0.3969970492470493 f1 score:0.3467642292004521


In [25]:
evaluate(nb, tfidf=True)

 Training Precision:0.0266 Recall :0.035166666666666666 f1 score:0.029066666666666664
 Test Precision:0.0036805555555555554 Recall :0.005666666666666667 f1 score:0.004261111111111111


In [26]:
nb2 = LabelPowerset(MultinomialNB())
evaluate(nb2, LabelPowerset= True)

 Training Precision:0.3378388888888889 Recall :0.4088333333333333 f1 score:0.358734126984127
 Test Precision:0.26737222222222223 Recall :0.3411666666666667 f1 score:0.28920317460317463


In [None]:
evaluate(nb2, LabelPowerset=True, tfidf=True)

In [48]:
rf = OneVsRestClassifier(RandomForestClassifier())

In [49]:
evaluate(rf)

 Precision:0.07410833333333332 Recall :0.0885


In [50]:
evaluate(rf, tfidf=True)

 Precision:0.08425 Recall :0.106


# SVM

In [27]:
from sklearn.svm import LinearSVC

In [28]:
svm = OneVsRestClassifier(LinearSVC())

In [29]:
evaluate(svm)

 Training Precision:0.8165 Recall :0.8167083333333334 f1 score:0.8165428571428571
 Test Precision:0.41566587301587304 Recall :0.4220027777777777 f1 score:0.400118253968254


In [30]:
evaluate(svm, tfidf=True)

 Training Precision:0.7553083333333334 Recall :0.7869305555555557 f1 score:0.7652718133718136
 Test Precision:0.3739638888888889 Recall :0.43115277777777783 f1 score:0.3861771645021645


In [None]:
svm2 = LabelPowerset(LinearSVC())
evaluate(svm2, LabelPowerset=True)
evaluate(svm2, LabelPowerset=True, tfidf= True)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

parameters = {'kernel':['linear'], 'C':[0.01, 0.1, 1.0]}
gridCV = GridSearchCV(SVC(class_weight='balanced'), parameters, scoring=make_scorer(f1_score, average='micro'))
classif = OneVsRestClassifier(gridCV)

In [None]:
evaluate(classif)

In [26]:
from sklearn.linear_model import LogisticRegressionCV

In [27]:
lr = OneVsRestClassifier(LogisticRegressionCV())

In [28]:
evaluate(lr)

 Training Precision:0.6572777777777777 Recall :0.7179444444444444
 Test Precision:0.35959444444444444 Recall :0.4159555555555555
 Training Hamming Loss:0.018359649122807017 Test Hamming Loss :0.059219298245614026
