In [None]:
import pathlib

import numpy as np
import scipy.sparse
import scipy.io
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
import networkx as nx
import utils.preprocess
from sklearn.model_selection import train_test_split

In [None]:
# NOTE(review): presumably the number of node types in the graph built below
# (movies, directors, actors) -- not referenced elsewhere in the visible cells; confirm.
num_ntypes = 3

In [None]:
# Read the raw IMDB metadata table and keep only the rows that have both a
# first-billed actor and a director; the index is renumbered from 0 afterwards
# so that later cells can use it as a positional movie id.
movies = (
    pd.read_csv('../data/MAGNN/IMDB/movie_metadata.csv', encoding='utf-8')
    .dropna(axis=0, subset=['actor_1_name', 'director_name'])
    .reset_index(drop=True)
)

In [None]:
# extract labels, and delete movies with unwanted genres
# 0 for action, 1 for comedy, 2 for drama, -1 for others
# A movie receives the label of the first of these three genres that appears in
# its '|'-separated genres string; movies matching none of them are removed.
genre_to_label = {'Action': 0, 'Comedy': 1, 'Drama': 2}

# Start everything at -1 ("other") and overwrite on the first matching genre.
# labels is indexed by the DataFrame index, which is 0..len(movies)-1 because
# movies was reset_index'ed when it was loaded.
labels = np.full(len(movies), -1, dtype=int)
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
for movie_idx, genres in movies['genres'].items():
    if not isinstance(genres, str):
        # A missing genres value (NaN) cannot be split; the movie keeps -1 and
        # is dropped together with the other unwanted movies below.
        continue
    for genre in genres.split('|'):
        if genre in genre_to_label:
            labels[movie_idx] = genre_to_label[genre]
            break

# Drop the unlabelled movies from both the table and the label vector so the
# two stay aligned row-for-row.
unwanted_idx = np.where(labels == -1)[0]
movies = movies.drop(unwanted_idx).reset_index(drop=True)
labels = np.delete(labels, unwanted_idx, 0)

In [None]:
# Collect the distinct director names and the distinct actor names (pooled over
# the three actor columns) as alphabetically sorted lists; a name's position in
# its list is used as its node id by the following cells.
directors = sorted(set(movies['director_name'].dropna()))

actor_names = set()
for actor_column in ('actor_1_name', 'actor_2_name', 'actor_3_name'):
    actor_names.update(movies[actor_column].dropna().to_list())
actors = sorted(actor_names)

In [None]:
# build the adjacency matrix for the graph consisting of movies, directors and actors
# 0 for movies, 1 for directors, 2 for actors
# Node ids are laid out as [movies | directors | actors] along both axes.
director_offset = len(movies)
actor_offset = len(movies) + len(directors)
dim = actor_offset + len(actors)

type_mask = np.zeros((dim), dtype=int)
type_mask[director_offset:actor_offset] = 1
type_mask[actor_offset:] = 2

# Name -> position lookups. These replace the repeated `name in list` /
# `list.index(name)` scans, which cost O(len(list)) for every row.
director_to_idx = {name: idx for idx, name in enumerate(directors)}
actor_to_idx = {name: idx for idx, name in enumerate(actors)}

adjM = np.zeros((dim, dim), dtype=int)
for movie_idx, row in movies.iterrows():
    # .get() returns None for names that are not in the lookup (including the
    # NaN of a missing actor_2/actor_3 entry), so such cells add no edge.
    director_idx = director_to_idx.get(row['director_name'])
    if director_idx is not None:
        # edges are undirected: set both (movie, director) and (director, movie)
        adjM[movie_idx, director_offset + director_idx] = 1
        adjM[director_offset + director_idx, movie_idx] = 1
    for actor_column in ('actor_1_name', 'actor_2_name', 'actor_3_name'):
        actor_idx = actor_to_idx.get(row[actor_column])
        if actor_idx is not None:
            adjM[movie_idx, actor_offset + actor_idx] = 1
            adjM[actor_offset + actor_idx, movie_idx] = 1

In [None]:
# extract bag-of-word representations of plot keywords for each movie
# X is a sparse matrix
# Missing plot_keywords are treated as an empty document; min_df=2 drops terms
# that occur in fewer than two movies.
vectorizer = CountVectorizer(min_df=2)
movie_X = vectorizer.fit_transform(movies['plot_keywords'].fillna('').values)

# assign features to directors and actors as the means of their associated movies' features
# adjM_da2m has one row per director/actor node and one column per movie.
adjM_da2m = adjM[len(movies):, :len(movies)]
# Row-normalise by broadcasting instead of multiplying with a dense diagonal
# matrix: same values for a 0/1 adjacency, without allocating a
# (directors+actors) x (directors+actors) array. Clamping the degree to at
# least 1 keeps a node with no movies as an all-zero row rather than inf/NaN.
da_degree = adjM_da2m.sum(axis=1, keepdims=True)
adjM_da2m_normalized = adjM_da2m / np.maximum(da_degree, 1)
director_actor_X = scipy.sparse.csr_matrix(adjM_da2m_normalized).dot(movie_X)

# Stack as [movies; directors; actors]. CSR is requested explicitly so the
# result supports the row slicing done when the per-type features are split out.
full_X = scipy.sparse.vstack([movie_X, director_actor_X], format='csr')

In [None]:
# Split the joint adjacency matrix and feature matrix back into per-type pieces.
# Rows/columns are ordered [movies | directors | actors].
n_movies = len(movies)
n_directors = len(directors)
n_actors = len(actors)
directors_end = n_movies + n_directors
actors_end = directors_end + n_actors

# movie-to-director and movie-to-actor incidence blocks
m_vs_d = adjM[:n_movies, n_movies:directors_end]
m_vs_a = adjM[:n_movies, directors_end:]

# feature rows for each node type
feature_m = full_X[:n_movies, :]
feature_d = full_X[n_movies:directors_end, :]
feature_a = full_X[directors_end:actors_end, :]

In [None]:
# Persist the processed arrays. The output directory is created first because
# np.save / save_npz do not create missing parent directories.
save_dir = pathlib.Path('../input/IMDB_processed')
save_dir.mkdir(parents=True, exist_ok=True)

# dense arrays: movie-director / movie-actor incidence matrices and the labels
dense_outputs = {
    'm_vs_d.npy': m_vs_d,
    'm_vs_a.npy': m_vs_a,
    'labels.npy': labels,
}
for file_name, array in dense_outputs.items():
    np.save(str(save_dir / file_name), array)

# sparse feature matrices, one per node type
sparse_outputs = {
    'feature_m.npz': feature_m,
    'feature_d.npz': feature_d,
    'feature_a.npz': feature_a,
}
for file_name, matrix in sparse_outputs.items():
    scipy.sparse.save_npz(str(save_dir / file_name), matrix)