In [10]:
from src.glove import *
glove = GloveEmbeddings("src/glove.6B/glove.6B.200d.txt")

In [11]:
from modules import IO
import numpy as np
data_folder = './data/'
imdb_genres_list = IO(data_folder + 'tmdb_genres_list.pkl').read_pickle()
movies = IO(data_folder + 'tmdb_processed.pkl').read_pickle()

In [12]:
titles = np.array([m['title'] for m in movies])
plots = np.array([m['overview'] for m in movies])
genres = np.array([m['genre_ids'] for m in movies])

In [13]:
len(titles)

160973

In [14]:
np.random.seed(10)
train_idx = np.random.choice(len(plots), replace=False, size=int(len(plots)*0.8))
test_idx = set(range(len(plots))) - set(train_idx)
test_idx = np.array(list(test_idx))

# GLOVE

In [15]:
def get_sentence_embedding(weight_vector, freq_matrix, word_map, mean):
    result = np.zeros(glove.emb_size)
    tot_freq = 0
    for key in weight_vector.keys():
        i = key[1]

        if i in __emb_cache:
            emb = __emb_cache[i]
        else:
            word = word_map[i]
            emb = glove.get_embedding(word)
            __emb_cache[i] = emb
        result += emb*weight_vector.get(key)
        tot_freq += freq_matrix.get(key)
    if mean:
        return result/tot_freq
    else:
        return result

def to_sentence_embedding_matrix(weight_matrix, freq_matrix, vectorizer, mean=True):
    result = []
    word_map = vectorizer.get_feature_names()
    for i in range(weight_matrix.shape[0]):
        emb = get_sentence_embedding(weight_vector=weight_matrix[i], freq_matrix=freq_matrix, word_map=word_map, mean=mean)
        result.append(emb)
    return np.array(result)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [17]:
cv = CountVectorizer()
count_train = cv.fit_transform(plots[train_idx])
count_test = cv.transform(plots[test_idx])
__emb_cache = {}

In [None]:
X_train = to_sentence_embedding_matrix(count_train.todok(), count_train.todok(), cv)
X_test = to_sentence_embedding_matrix(count_test.todok(), count_test.todok(), cv)

  app.launch_new_instance()
  app.launch_new_instance()


In [None]:
X_train.shape

# TFIDF

In [154]:
from sklearn.feature_extraction.text import TfidfTransformer

In [155]:
tfidf = TfidfTransformer()

In [156]:
tfidf_train = tfidf.fit_transform(count_train)
tfidf_test = tfidf.transform(count_test)

In [157]:
X_train_tfidf = to_sentence_embedding_matrix(tfidf_train.todok(), count_train.todok(), cv, mean=False)
X_test_tfidf = to_sentence_embedding_matrix(tfidf_test.todok(), count_test.todok(), cv, mean=False)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb=MultiLabelBinarizer()
Y=mlb.fit_transform(genres)

In [None]:
Y_train = Y[train_idx]
Y_test = Y[test_idx]

In [None]:
def accuracy(gt, preds):
    acc = 0
    for i in range(len(gt)):
        if gt[i] == preds[i]:
            acc += 1
            
    return acc / len(gt)
    

In [None]:
def precision_recall(gt,preds):
    TP=0
    FP=0
    FN=0
    for i in range(len(gt)):
        if gt[i] ==1 and preds[i] == 1:
            TP+=1
        elif gt[i] ==0 and preds[i] == 1:
            FN+=1
        elif gt[i] ==1 and preds[i] == 0:
            FP+=1
    if TP+FP==0:
        precision=0
    else:
        precision=TP/float(TP+FP)
    if TP+FN==0:
        recall=0
    else:
        recall=TP/float(TP+FN)
    return precision,recall

In [179]:
def evaluate(model, LabelPowerset = False, tfidf=False):
    if tfidf:
        X_tr = X_train_tfidf
        X_te = X_test_tfidf
    else:
        X_tr = X_train
        X_te = X_test
    
    model.fit(X_tr, Y_train)
    train_acc = model.score(X_tr,Y_train)
    test_acc = model.score(X_te,Y_test)
    
    if LabelPowerset:
        predictions = model.predict(X_te).toarray()
    else:
        predictions = model.predict(X_te)
        
    precs=[]
    recs=[]
    for i in range(len(Y_test)):
        a,b=precision_recall(Y_test[i],predictions[i])
        precs.append(a)
        recs.append(b)
        acc = accuracy(Y_test[i],predictions[i])

    print(" Precision:{} Recall :{}" .format(np.mean(np.asarray(precs)), np.mean(np.asarray(recs))))
    print("accuracy : {}".format(np.mean(acc)))

In [180]:
# from sklearn.metrics import f1_score
# def evaluate(model, tfidf=False):
#     if tfidf:
#         X_tr = X_train_tfidf
#         X_te = X_test_tfidf
#     else:
#         X_tr = X_train
#         X_te = X_test
    
#     model.fit(X_tr, Y_train_thriller)
#     train_acc = model.score(X_tr,Y_train_thriller)
#     test_acc = model.score(X_te,Y_test_thriller)
    
#     train_f1 = f1_score(Y_train_thriller, model.predict(X_tr))
#     test_f1 = f1_score(Y_test_thriller, model.predict(X_te))
    
#     print("Acc: {} (train)   {} (test)".format(train_acc, test_acc))
#     print("F1: {} (train)   {} (test)".format(train_f1, test_f1))

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import LabelPowerset

# SVM

In [None]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC

In [None]:
svm = OneVsRestClassifier(LinearSVC())

In [None]:
evaluate(svm)

In [185]:
evaluate(svm, tfidf=True)

 Precision:0.6142916666666667 Recall :0.5780238095238095
accuracy : 0.55


In [126]:
svm2 = LabelPowerset(SVC(kernel='linear'))
evaluate(svm2, LabelPowerset=True)
evaluate(svm2, LabelPowerset=True, tfidf= True)

 Precision:0.5066607142857142 Recall :0.5864999999999999
 Precision:0.6593630952380952 Recall :0.7204880952380952


In [140]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer

parameters = {'kernel':['linear'], 'C':[0.01, 0.1, 1.0, 10]}
gridCV = GridSearchCV(SVC(class_weight='balanced'), parameters, scoring=make_scorer(f1_score, average='micro'))
ovr = OneVsRestClassifier(gridCV)

In [141]:
evaluate(ovr, tfidf=True)

 Precision:0.7496488095238095 Recall :0.6170476190476191


In [142]:
lps = LabelPowerset(gridCV)
evaluate(lps, tfidf=True, LabelPowerset=True)



 Precision:0.6593630952380952 Recall :0.7204880952380952


In [24]:
#!git commit -a -m "update tfidf"

[master 5d60b14] update tfidf
 1 file changed, 21 insertions(+), 21 deletions(-)


The file will have its original line endings in your working directory.


In [25]:
#!git push

To github.com:xuwd11/Movie_Classification.git
   bf63c30..5d60b14  master -> master
