# Data preprocessing
- We use the TensorFlow framework to do some initial filtering and to transform the data into a sparse matrix.

In [1]:
import csv
import numpy as np
import tensorflow as tf
from scipy.sparse import coo_matrix

In [2]:
%%time

# Minimum number of movies a person must have acted in to be kept.
act_at_least = 3


def _read_first_column(path):
    """Return the first field of every row of the ISO-8859-1 CSV at ``path``."""
    with open(path, encoding='ISO-8859-1') as cfs:
        return [row[0] for row in csv.reader(cfs)]


def _load_actin(path):
    """Build a float32 SparseTensor holding 1.0 at every position stored in ``path``.

    The .npz archive must provide 'actin_pos' (the sparse indices) and
    'shape' (the dense shape). The archive is closed before returning.
    """
    with np.load(path) as loader:
        positions = loader['actin_pos']
        dense_shape = loader['shape']
    return tf.SparseTensor(
        indices=positions,
        # float32 from the start, so no separate tf.to_float cast is needed
        values=np.ones(len(positions), dtype=np.float32),
        dense_shape=dense_shape
    )


# ratings
with open('../data_processed/matrix/mat_ratings.csv') as cfs:
    ratings = list(csv.reader(cfs))

t_ratings = tf.string_to_number(ratings)
# All-ones tensor with the ratings' shape; multiplying the act-in matrix by it
# sums each row of that matrix.
t_ratings_mask = tf.ones(t_ratings.shape)

# actresses
actress_names = _read_first_column('../data_processed/matrix/mat_actresses_names.csv')
t_actress_names = tf.constant(actress_names)
t_actress_actin = _load_actin('../data_processed/matrix/mat_actresses_actin.npz')

# actors
actor_names = _read_first_column('../data_processed/matrix/mat_actors_names.csv')
t_actor_names = tf.constant(actor_names)
t_actor_actin = _load_actin('../data_processed/matrix/mat_actors_actin.npz')

# Stack actors above actresses; the names are concatenated in the same order
# so that row i of the matrix lines up with name i.
t_actin_all_gender = tf.sparse_concat(0, [t_actor_actin, t_actress_actin])
t_names_all_gender = tf.concat([t_actor_names, t_actress_names], 0)

# count number of movies acted
t_count = tf.sparse_tensor_dense_matmul(
    t_actin_all_gender,
    t_ratings_mask
)

t_threshold_mask = tf.greater_equal(
    t_count,
    act_at_least
)

# Column 0 of the mask selects the names to keep (a plain slice instead of a
# per-row tf.map_fn).
t_filtered_names = tf.boolean_mask(t_names_all_gender, t_threshold_mask[:, 0])

# Evaluate everything in one graph execution rather than three separate ones.
with tf.Session() as sess:
    mask_threshold_actors, sparse_actin, names = sess.run(
        [t_threshold_mask, t_actin_all_gender, t_filtered_names]
    )

In [3]:
%%time

# Persist the names that passed the threshold filter under the key 'names'.
# np.savez appends the '.npz' extension to the given path.
filtered_names_path = '../data_processed/filtered_matrix/filtered_names'
np.savez(filtered_names_path, names=names)

CPU times: user 122 ms, sys: 35.4 ms, total: 158 ms
Wall time: 160 ms


In [12]:
%%time

# Rebuild the evaluated sparse tensor as a SciPy matrix and keep only the rows
# selected by the threshold mask. The filtering is done in CSR form so the
# full matrix is never materialised as a dense array.
actin_csr = coo_matrix(
    (sparse_actin.values, (sparse_actin.indices[:, 0], sparse_actin.indices[:, 1])),
    shape=sparse_actin.dense_shape,
    dtype=int
).tocsr()  # the conversion sums duplicate entries, as .toarray() would

keep_rows = mask_threshold_actors[:, 0]  # column 0 of the mask as a 1-D boolean row selector
actin_csr_filtered = actin_csr[keep_rows]
# Normalise so the stored triplets match what a dense round-trip would give:
# no explicit zeros, row-major order with sorted columns.
actin_csr_filtered.eliminate_zeros()
actin_csr_filtered.sort_indices()

sparse_filtered = actin_csr_filtered.tocoo()
data = sparse_filtered.data
row = sparse_filtered.row
col = sparse_filtered.col
shape = sparse_filtered.shape
np.savez('../data_processed/filtered_matrix/filtered_actin_data_for_clustering', data=data, row=row, col=col, shape=shape)