In [1]:
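# Prototype: multi-label genre classification on TMDB plot overviews (Naive Bayes baseline, plus random forest, SVM and logistic regression)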

In [45]:
import numpy as np
from sklearn.metrics import f1_score, precision_score, recall_score

In [2]:
import pickle
# Load the TMDB genre list. The context manager closes the file handle, which
# the bare open() call used to leak.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('data/tmdb_genres_list.pkl', "rb") as genres_file:
    tmdb_genres_list = pickle.load(genres_file)

In [3]:
# Load the processed TMDB movie records. The context manager closes the file
# handle, which the bare open() call used to leak.
# NOTE: pickle.load can execute arbitrary code; only load files you trust.
with open('data/tmdb_processed.pkl', "rb") as movies_file:
    movies = pickle.load(movies_file)

In [4]:
# Keep only the records from index 130973 onward (30000 movies, according to
# the len(titles) output recorded further down).
# NOTE(review): this rebinds `movies`, so re-running the cell slices the
# already-sliced data again; reload the pickle before re-executing.
movies = movies[130973:]

In [5]:
# Number of entries in the TMDB genre list.
k = len(tmdb_genres_list)
k

19

In [6]:
# Per-movie fields pulled out of the TMDB records.
titles = np.array([m['title'] for m in movies])
# Strip commas and periods from each overview before vectorizing.
plots = np.array([(m['overview'].replace(',', '')).replace('.', '') for m in movies])
# genre_ids is presumably a variable-length list per movie. NumPy >= 1.24
# refuses to build an array from ragged nested sequences unless dtype=object
# is requested, so ask for it explicitly.
genres = np.array([m['genre_ids'] for m in movies], dtype=object)

In [7]:
# Sanity check: number of movies in the working subset.
len(titles)

30000

# BoW

In [8]:
from sklearn.feature_extraction.text import CountVectorizer

In [161]:
# Bag-of-words vectorizer with document-frequency cut-offs
# (max_df=0.95, min_df=0.003).
cv = CountVectorizer(max_df=0.95, min_df=0.003)

In [162]:
# Reproducible 80/20 split. A fixed seed keeps the split, and therefore every
# metric computed from it, stable across kernel restarts. The outputs recorded
# in this notebook came from an unseeded split, so re-run numbers will differ.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)
train_idx = split_rng.choice(len(plots), replace=False, size=int(len(plots)*0.8))
# Every index not drawn for training goes to the test set; setdiff1d returns
# them sorted and unique instead of in set-iteration order.
test_idx = np.setdiff1d(np.arange(len(plots)), train_idx)

In [163]:
# Fit the vectorizer on the training plots only, then reuse the fitted
# vectorizer to transform the test plots.
X_train = cv.fit_transform(plots[train_idx])
X_test = cv.transform(plots[test_idx])

# TFIDF

In [164]:
from sklearn.feature_extraction.text import TfidfTransformer

In [165]:
# TF-IDF transformer applied to the count matrices in the next cell.
# Note: evaluate() has a parameter also called `tfidf`, which hides this
# object inside that function.
tfidf = TfidfTransformer()

In [166]:
# Fit the TF-IDF weighting on the training counts and apply the same fitted
# transformer to the test counts.
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [167]:
from sklearn.preprocessing import MultiLabelBinarizer
# Binarize the per-movie genre id collections into the label matrix Y
# (fitted on all movies, before the train/test rows are selected).
mlb=MultiLabelBinarizer()
Y=mlb.fit_transform(genres)

In [168]:
# Select label rows with the same indices used for the feature matrices.
Y_train = Y[train_idx]
Y_test = Y[test_idx]

In [169]:
# Display the TF-IDF training matrix summary (shape, dtype, stored elements).
X_train_tfidf

<24000x1958 sparse matrix of type '<class 'numpy.float64'>'
	with 764893 stored elements in Compressed Sparse Row format>

# Naive Bayes

In [170]:
def precision_recall(gt,preds):
    """Compute precision, recall and F1 for one pair of binary label vectors.

    Parameters
    ----------
    gt : sequence of 0/1
        Ground-truth labels.
    preds : sequence of 0/1
        Predicted labels, indexed in parallel with ``gt``.

    Returns
    -------
    tuple
        ``(precision, recall, f1)``. Each value is ``0`` when its denominator
        is zero, otherwise a float.
    """
    TP = 0
    FP = 0
    FN = 0
    for i in range(len(gt)):
        if gt[i] == 1 and preds[i] == 1:
            TP += 1
        elif gt[i] == 0 and preds[i] == 1:
            # Predicted positive but actually negative -> false positive.
            # (This case used to be counted as a false negative, which
            # swapped the reported precision and recall.)
            FP += 1
        elif gt[i] == 1 and preds[i] == 0:
            # Actual positive that was missed -> false negative.
            FN += 1

    if TP + FP == 0:
        precision = 0
    else:
        precision = TP / float(TP + FP)

    if TP + FN == 0:
        recall = 0
    else:
        recall = TP / float(TP + FN)

    # F1 is symmetric in FP and FN, so it was unaffected by the swap.
    if TP + FP + FN == 0:
        f1 = 0
    else:
        f1 = 2 * TP / float(2 * TP + FP + FN)
    return precision, recall, f1

In [171]:
def hamming_loss(gt, preds):
    """Count the positions where 0/1 ground truth and prediction disagree.

    Returns the raw number of mismatches; it is not divided by the number of
    labels. Only the pairs (0, 1) and (1, 0) are counted.
    """
    return sum(
        1
        for idx in range(len(gt))
        if (gt[idx] == 0 and preds[idx] == 1) or (gt[idx] == 1 and preds[idx] == 0)
    )

In [172]:
def evaluate(model, LabelPowerset = False, tfidf=False):
    """Fit ``model`` on the training split and print micro-averaged scores.

    Reads the notebook-level ``X_train``/``X_test`` (or
    ``X_train_tfidf``/``X_test_tfidf`` when ``tfidf`` is true) together with
    ``Y_train``/``Y_test``.

    Parameters
    ----------
    model
        Estimator exposing ``fit`` and ``predict``; it is (re)fitted here.
    LabelPowerset : bool
        When true, ``.toarray()`` is called on each prediction before
        scoring. Inside this function the name hides the imported
        ``LabelPowerset`` class.
    tfidf : bool
        When true, use the TF-IDF matrices instead of the count matrices.
        Inside this function the name hides the notebook-level ``tfidf``
        transformer.

    Returns
    -------
    None
        Two lines are printed: training scores, then test scores.
    """
    if tfidf:
        features_train, features_test = X_train_tfidf, X_test_tfidf
    else:
        features_train, features_test = X_train, X_test

    model.fit(features_train, Y_train)

    predictions = []
    for features in (features_train, features_test):
        predicted = model.predict(features)
        predictions.append(predicted.toarray() if LabelPowerset else predicted)
    train_pred, test_pred = predictions

    # Score in the order f1, precision, recall, each on train and then test.
    (f1_train, f1_test), (prec_train, prec_test), (rec_train, rec_test) = [
        (
            metric(Y_train, train_pred, average='micro'),
            metric(Y_test, test_pred, average='micro'),
        )
        for metric in (f1_score, precision_score, recall_score)
    ]

    print(" Training Precision:{} Recall :{} f1 micro score:{}".format(prec_train, rec_train, f1_train))
    print(" Test Precision:{} Recall :{} f1 micro score:{}".format(prec_test, rec_test, f1_test))

In [173]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import LabelPowerset
from sklearn.ensemble import RandomForestClassifier

In [174]:
# Naive Bayes baseline: MultinomialNB wrapped in a one-vs-rest classifier.
nb = OneVsRestClassifier(MultinomialNB())

In [175]:
# One-vs-rest Naive Bayes on the count features.
evaluate(nb)

 Training Precision:0.4022589858674283 Recall :0.6525951557093426 f1 micro score:0.4977223962456829
 Test Precision:0.3718623636224918 Recall :0.5711941872729404 f1 micro score:0.4504621072088725


In [176]:
# Same model on the TF-IDF features (evaluate() refits it).
evaluate(nb, tfidf=True)

 Training Precision:0.7103494623655914 Recall :0.1272152850910185 f1 micro score:0.21578584734733458
 Test Precision:0.6779661016949152 Recall :0.11250439470291808 f1 micro score:0.1929842195195497


In [177]:
# skmultilearn LabelPowerset wrapper around MultinomialNB, on count features.
# LabelPowerset=True makes evaluate() call .toarray() on the predictions.
nb2 = LabelPowerset(MultinomialNB())
evaluate(nb2, LabelPowerset= True)

 Training Precision:0.6236653183193033 Recall :0.5149390702572589 f1 micro score:0.5641110158876657
 Test Precision:0.5139323731997495 Recall :0.38474159146841674 f1 micro score:0.4400509349239327


In [178]:
# Label-powerset Naive Bayes on the TF-IDF features.
evaluate(nb2, LabelPowerset=True, tfidf=True)

 Training Precision:0.6078927396913736 Recall :0.2169098841582669 f1 micro score:0.3197321151372688
 Test Precision:0.5852233676975945 Recall :0.19957810851986404 f1 micro score:0.29764921786244863


In [48]:
# One-vs-rest random forest with default settings.
# NOTE(review): no random_state is passed, so the forest itself is not
# reproducible between runs.
rf = OneVsRestClassifier(RandomForestClassifier())

In [49]:
# NOTE(review): the output recorded for this cell (" Precision:... Recall :...")
# does not match the two lines evaluate() currently prints, so it appears to
# come from an earlier definition of evaluate(); re-run before relying on it.
evaluate(rf)

 Precision:0.07410833333333332 Recall :0.0885


In [50]:
# NOTE(review): the recorded output format differs from what evaluate()
# currently prints, so this result appears stale; re-run before relying on it.
evaluate(rf, tfidf=True)

 Precision:0.08425 Recall :0.106


# SVM

In [179]:
from sklearn.svm import LinearSVC

In [180]:
# One-vs-rest linear SVM with default LinearSVC settings.
svm = OneVsRestClassifier(LinearSVC())

In [181]:
# Linear SVM on the count features.
evaluate(svm)

 Training Precision:0.7838359128757586 Recall :0.5013389499022115 f1 micro score:0.6115393085223519
 Test Precision:0.48796312105173295 Recall :0.33493495839681237 f1 micro score:0.3972202918693537


In [182]:
# Linear SVM on the TF-IDF features.
evaluate(svm, tfidf=True)

 Training Precision:0.7606759009138456 Recall :0.39822476305100046 f1 micro score:0.5227712604179011
 Test Precision:0.6052821128451381 Recall :0.2954412281729755 f1 micro score:0.39707040478815564


In [None]:
# Label-powerset LinearSVC, scored on count features and then on TF-IDF.
# This cell has no execution count or recorded output in the saved notebook.
svm2 = LabelPowerset(LinearSVC())
evaluate(svm2, LabelPowerset=True)
evaluate(svm2, LabelPowerset=True, tfidf= True)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
# SVC was referenced below without ever being imported (NameError on run).
from sklearn.svm import SVC

# Grid search over C for a class-balanced, linear-kernel SVC, wrapped in a
# one-vs-rest classifier so the search runs once per label.
# NOTE(review): inside one-vs-rest each search sees a single binary target;
# with average='micro' the F1 scorer on a binary target is presumably
# equivalent to accuracy. Confirm this is the intended selection criterion.
parameters = {'kernel':['linear'], 'C':[0.01, 0.1, 1.0]}
gridCV = GridSearchCV(SVC(class_weight='balanced'), parameters, scoring=make_scorer(f1_score, average='micro'))
classif = OneVsRestClassifier(gridCV)

In [None]:
# Grid-searched SVC on the count features.
# This cell has no execution count or recorded output in the saved notebook.
evaluate(classif)

In [26]:
from sklearn.linear_model import LogisticRegressionCV

In [27]:
# One-vs-rest logistic regression with default LogisticRegressionCV settings.
lr = OneVsRestClassifier(LogisticRegressionCV())

In [28]:
# NOTE(review): the recorded output has no f1 score and includes a Hamming
# Loss line that evaluate() no longer prints, so it appears to come from an
# earlier definition of evaluate(); re-run before relying on it.
evaluate(lr)

 Training Precision:0.6572777777777777 Recall :0.7179444444444444
 Test Precision:0.35959444444444444 Recall :0.4159555555555555
 Training Hamming Loss:0.018359649122807017 Test Hamming Loss :0.059219298245614026


In [42]:
# Manual fit/predict of a one-vs-rest LinearSVC on the count features.
# Rebinds `svm` and sets the notebook-level `train_pred`/`test_pred` that the
# scoring cells below read.
svm = OneVsRestClassifier(LinearSVC())
svm.fit(X_train, Y_train)
train_pred = svm.predict(X_train)
test_pred = svm.predict(X_test)

In [31]:
from sklearn.metrics import f1_score
# NOTE(review): this uses average='weighted', while every other score in the
# notebook uses average='micro', so it is not directly comparable with them.
f1_score(Y_train, train_pred, average='weighted')  

0.45951290881205586

In [33]:
# Micro-averaged F1 on the test split for whichever `test_pred` was assigned last.
f1_score(Y_test, test_pred, average='micro')  

0.35960226805818063

In [42]:
# Manual fit/predict of one-vs-rest MultinomialNB on the TF-IDF features.
# Rebinds `nb` and overwrites `train_pred`/`test_pred`.
# NOTE(review): this cell and the manual LinearSVC cell share execution
# count 42, so the saved notebook does not record a clean top-to-bottom run.
nb = OneVsRestClassifier(MultinomialNB())
nb.fit(X_train_tfidf, Y_train)
train_pred = nb.predict(X_train_tfidf)
test_pred = nb.predict(X_test_tfidf)

In [46]:
# Micro-averaged precision on the test split for whichever `test_pred` was assigned last.
precision_score(Y_test, test_pred, average='micro')  

0.5511154694778132

In [None]:
# Micro-averaged F1 on the test split for the current `test_pred`.
# This cell has no execution count or recorded output in the saved notebook.
f1_score(Y_test, test_pred, average='micro')  