In [1]:
from src.glove import *
glove = GloveEmbeddings("../../../glove/glove.6B.50d.txt")

In [2]:
from modules import IO
import numpy as np
data_folder = './data/'
imdb_genres_list = IO(data_folder + 'imdb_genres_list.pkl').read_pickle()
movies = IO(data_folder + 'top1000_all.pkl').read_pickle()

In [3]:
titles = np.array([m['imdb_title'] for m in movies])
plots = np.array(["\n".join(m['imdb_plot']) for m in movies])
genres = np.array([m['imdb_genres'] for m in movies])

In [4]:
len(titles)

1000

In [5]:
np.random.seed(10)
train_idx = np.random.choice(len(plots), replace=False, size=int(len(plots)*0.8))
test_idx = set(range(len(plots))) - set(train_idx)
test_idx = np.array(list(test_idx))

# GLOVE

In [6]:
def get_sentence_embedding(weight_vector, freq_matrix, word_map, mean):
    result = np.zeros(glove.emb_size)
    tot_freq = 0
    for key in weight_vector.keys():
        i = key[1]

        if i in __emb_cache:
            emb = __emb_cache[i]
        else:
            word = word_map[i]
            emb = glove.get_embedding(word)
            __emb_cache[i] = emb
        result += emb*weight_vector.get(key)
        tot_freq += freq_matrix.get(key)
    if mean:
        return result/tot_freq
    else:
        return result

def to_sentence_embedding_matrix(weight_matrix, freq_matrix, vectorizer, mean=True):
    result = []
    word_map = vectorizer.get_feature_names()
    for i in range(weight_matrix.shape[0]):
        emb = get_sentence_embedding(weight_vector=weight_matrix[i], freq_matrix=freq_matrix, word_map=word_map, mean=mean)
        result.append(emb)
    return np.array(result)

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

In [8]:
cv = CountVectorizer()
count_train = cv.fit_transform(plots[train_idx])
count_test = cv.transform(plots[test_idx])
__emb_cache = {}

In [9]:
X_train = to_sentence_embedding_matrix(count_train.todok(), count_train.todok(), cv)
X_test = to_sentence_embedding_matrix(count_test.todok(), count_test.todok(), cv)

In [10]:
X_train.shape

(800, 50)

# TFIDF

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer

In [12]:
tfidf = TfidfTransformer()

In [13]:
tfidf_train = tfidf.fit_transform(count_train)
tfidf_test = tfidf.transform(count_test)

In [14]:
X_train_tfidf = to_sentence_embedding_matrix(tfidf_train.todok(), count_train.todok(), cv, mean=False)
X_test_tfidf = to_sentence_embedding_matrix(tfidf_test.todok(), count_test.todok(), cv, mean=False)

# Only for class 'Thriller'

In [15]:
def get_binary_y(genres, class_name):
    return np.array([class_name in g for g in genres])

In [16]:
Y_train_thriller = get_binary_y(genres[train_idx], 'Thriller')
Y_test_thriller = get_binary_y(genres[test_idx], 'Thriller')

# SVM

In [17]:
from sklearn.svm import LinearSVC

In [18]:
svm = LinearSVC()

In [21]:
from sklearn.metrics import f1_score
def evaluate(model, tfidf=False):
    if tfidf:
        X_tr = X_train_tfidf
        X_te = X_test_tfidf
    else:
        X_tr = X_train
        X_te = X_test
    
    model.fit(X_tr, Y_train_thriller)
    train_acc = model.score(X_tr,Y_train_thriller)
    test_acc = model.score(X_te,Y_test_thriller)
    
    train_f1 = f1_score(Y_train_thriller, model.predict(X_tr))
    test_f1 = f1_score(Y_test_thriller, model.predict(X_te))
    
    print("Acc: {} (train)   {} (test)".format(train_acc, test_acc))
    print("F1: {} (train)   {} (test)".format(train_f1, test_f1))

In [22]:
evaluate(svm)

Acc: 0.79125 (train)   0.825 (test)
F1: 0.6890130353817505 (train)   0.6846846846846846 (test)


In [23]:
evaluate(svm, tfidf=True)

Acc: 0.76 (train)   0.76 (test)
F1: 0.5789473684210527 (train)   0.5294117647058824 (test)


In [24]:
!git commit -a -m "update tfidf"

[master 5d60b14] update tfidf
 1 file changed, 21 insertions(+), 21 deletions(-)


The file will have its original line endings in your working directory.


In [25]:
!git push

To github.com:xuwd11/Movie_Classification.git
   bf63c30..5d60b14  master -> master
