In [7]:
import numpy as np
from scipy.sparse import coo_matrix
from itertools import combinations
from collections import Counter
import networkx as nx
import tensorflow as tf

In [5]:
%%time

print ('Data Loading...')
data_loader = np.load('../data_processed/filtered_matrix/filtered_actin_data_for_clustering.npz')
dense_matrix = coo_matrix((data_loader['data'], (data_loader['row'], data_loader['col'])), shape=data_loader['shape'], dtype=int).toarray()
print ('Data Loading done...')

print ('Create Counter...')

edges = Counter()

for idx_movie in range(dense_matrix.shape[1]):
    movie = dense_matrix[:, idx_movie]
    idx_actors = np.where(movie == 1)[0]
    for comb in combinations(idx_actors, 2):
        edges[comb] += 1

print ('Write to edges file...')

with open('../data_processed/edges/edges.txt', 'w') as fs:
    for edge in edges.elements():
        if edges[edge] >= 3:
            fs.write('%d,%d,%d\n' % (edge[0], edge[1], edges[edge]))

Data Loading...
Data Loading done...
Create Counter...
Write to edges file...
CPU times: user 2min 12s, sys: 16.2 s, total: 2min 28s
Wall time: 2min 30s


In [2]:
%%time

G = nx.read_weighted_edgelist('../data_processed/edges/edges.txt', delimiter=',')

clique_generator = nx.enumerate_all_cliques(G)

curr = next(clique_generator)
while len(curr) < 2:
    curr = next(clique_generator)

with open('../data_actorset/fulldb_actorset_2.csv', 'w') as fs:
    while len(curr) == 2:
        fs.write(','.join(curr) + '\n')
        curr = next(clique_generator)

with open('../data_actorset/fulldb_actorset_3.csv', 'w') as fs:
    while len(curr) == 3:
        fs.write(','.join(curr) + '\n')
        curr = next(clique_generator)

CPU times: user 2min 20s, sys: 3min 11s, total: 5min 31s
Wall time: 6min


## 2-actor set

In [3]:
%%time

from multiprocessing import Pool
import csv
import os
import numpy as np

def find_intersect(actorset):
    co_acted_matrix_indices = np.array([], dtype=int).reshape(0,2)
    data_loader = np.load('../data_processed/filtered_matrix/filtered_actin_data_for_clustering.npz')
    row = data_loader['row']
    col = data_loader['col']
    for actors in actorset:
        co_acted = np.intersect1d(col[row==actors[1]], col[row==actors[2]])
        idx_arr = np.full((1, len(co_acted)), actors[0])
        co_acted_matrix_indices = np.concatenate((co_acted_matrix_indices, np.vstack((idx_arr,co_acted)).T))
    return co_acted_matrix_indices

if __name__ == '__main__':   
    import tensorflow as tf

    act_at_least = 3
    top_k = 15

    # casting data
    data_loader = np.load('../data_processed/filtered_matrix/filtered_actin_data_for_clustering.npz')

    # actorset
    with open('../data_actorset/fulldb_actorset_2.csv') as fs:
        reader = csv.reader(fs, delimiter=',')
        actorset = list(reader)
        actorset = [[int(b) for b in a] for a in actorset]

    t_actorset = tf.constant(actorset)
    actorset = [[i] + [int(b) for b in a] for i, a in enumerate(actorset)]

    pool = Pool()
    chunks = np.array_split(actorset, os.cpu_count())
    co_acted_matrix_indices = np.concatenate(pool.map(
        find_intersect,
        chunks
    ))

    # Ratings
    with open('../data_processed/matrix/mat_ratings.csv') as cfs:
        reader = csv.reader(cfs)
        ratings = list(reader)
    t_ratings = tf.constant(ratings)
    t_ratings = tf.string_to_number(ratings)
    t_ratings_mask = tf.ones(t_ratings.shape)

    t_co_acted_matrix = tf.SparseTensor(
        indices=co_acted_matrix_indices, 
        values=tf.ones(co_acted_matrix_indices.shape[0], dtype=tf.float32), 
        dense_shape=(len(actorset), data_loader['shape'][1]))

    t_product = tf.sparse_tensor_dense_matmul(
        t_co_acted_matrix,
        t_ratings
    )

    #count
    t_count = tf.sparse_tensor_dense_matmul(
        t_co_acted_matrix,
        t_ratings_mask
    )

    #filter out
    t_threshold_mask = tf.greater_equal(
        t_count,
        act_at_least
    )

    t_masked_product = tf.boolean_mask(t_product, t_threshold_mask)
    t_masked_count = tf.boolean_mask(t_count, t_threshold_mask)

    #average
    t_average = tf.divide(
        t_masked_product,
        t_masked_count,
    )

    #top_k
    t_top_k_v, t_top_k_i= tf.nn.top_k(
        tf.transpose(t_average),
        k=top_k,
        sorted=False
    )

    #lookup names

    data_loader = np.load('../data_processed/filtered_matrix/filtered_names.npz')
    t_names = tf.constant(data_loader['names'])

    t_actor_ids = tf.reshape(tf.gather(t_actorset, t_top_k_i), [-1])
    t_top_k_names = tf.reshape(tf.gather(
        t_names
        ,t_actor_ids
    ), [-1, int(t_actorset.shape[1])])
    # back to normal: tf.reshape(t, [-1, k])

    with tf.Session() as sess:
        file_writer = tf.summary.FileWriter('/log', sess.graph)
        print (t_top_k_v.eval())
        print (t_top_k_names.eval())


PermissionDeniedError: /log

## 3-actor set

In [3]:
%%time

from multiprocessing import Pool
import csv
import os
import numpy as np
from functools import reduce

def find_intersect(actorset):
    co_acted_matrix_indices = np.array([], dtype=int).reshape(0,2)
    data_loader = np.load('../data_processed/filtered_matrix/filtered_actin_data_for_clustering.npz')
    row = data_loader['row']
    col = data_loader['col']
    for actors in actorset:
        co_acted = reduce(np.intersect1d, [col[row==actor] for actor in actors[1:]])
        idx_arr = np.full((1, len(co_acted)), actors[0])
        co_acted_matrix_indices = np.concatenate((co_acted_matrix_indices, np.vstack((idx_arr,co_acted)).T))
    return co_acted_matrix_indices

if __name__ == '__main__':   
    import tensorflow as tf

    act_at_least = 3
    top_k = 15

    # casting data
    data_loader = np.load('../data_processed/filtered_matrix/filtered_actin_data_for_clustering.npz')

    # actorset
    with open('../data_actorset/fulldb_actorset_3.csv') as fs:
        reader = csv.reader(fs, delimiter=',')
        actorset = list(reader)
        actorset = [[int(b) for b in a] for a in actorset]

    t_actorset = tf.constant(actorset)
    actorset = [[i] + [int(b) for b in a] for i, a in enumerate(actorset)]

    pool = Pool()
    chunks = np.array_split(actorset, os.cpu_count())
    co_acted_matrix_indices = np.concatenate(pool.map(
        find_intersect,
        chunks
    ))

    # Ratings
    with open('../data_processed/matrix/mat_ratings.csv') as cfs:
        reader = csv.reader(cfs)
        ratings = list(reader)
    t_ratings = tf.constant(ratings)
    t_ratings = tf.string_to_number(ratings)
    t_ratings_mask = tf.ones(t_ratings.shape)

    t_co_acted_matrix = tf.SparseTensor(
        indices=co_acted_matrix_indices, 
        values=tf.ones(co_acted_matrix_indices.shape[0], dtype=tf.float32), 
        dense_shape=(len(actorset), data_loader['shape'][1]))

    t_product = tf.sparse_tensor_dense_matmul(
        t_co_acted_matrix,
        t_ratings
    )

    #count
    t_count = tf.sparse_tensor_dense_matmul(
        t_co_acted_matrix,
        t_ratings_mask
    )

    #filter out
    t_threshold_mask = tf.greater_equal(
        t_count,
        act_at_least
    )

    t_masked_product = tf.boolean_mask(t_product, t_threshold_mask)
    t_masked_count = tf.boolean_mask(t_count, t_threshold_mask)

    #average
    t_average = tf.divide(
        t_masked_product,
        t_masked_count,
    )

    #top_k
    t_top_k_v, t_top_k_i= tf.nn.top_k(
        tf.transpose(t_average),
        k=top_k,
        sorted=False
    )

    #lookup names

    data_loader = np.load('../data_processed/filtered_matrix/filtered_names.npz')
    t_names = tf.constant(data_loader['names'])

    t_actor_ids = tf.reshape(tf.gather(t_actorset, t_top_k_i), [-1])
    t_top_k_names = tf.reshape(tf.gather(
        t_names
        ,t_actor_ids
    ), [-1, int(t_actorset.shape[1])])
    # back to normal: tf.reshape(t, [-1, k])

    with tf.Session() as sess:
        print (t_top_k_v.eval())
        print (t_top_k_names.eval())



[ 9.625       9.625       9.625       9.625       9.625       9.63333321
  9.625       9.69999981  9.63333321  9.63333321  9.625       9.63333321
  9.63333321  9.63333321  9.63333321]
[[b'Flavin, James' b'Scannell, Frank J.' b'Sullivan, Brick']
 [b'Willingham, Travis' b'Inman, Jeremy' b'Clinkenbeard, Colleen']
 [b'Willingham, Travis' b'Schemmel, Sean' b'Minaguchi, Y\xc3\xbbko']
 [b'Willingham, Travis' b'Boat, David' b'Futterman, Nika']
 [b'Willingham, Travis' b'Schemmel, Sean' b'Cook, Justin (I)']
 [b'Altoft, Michael' b'Bracq, Alexander' b'Banks, Richard (VII)']
 [b'Willingham, Travis' b'Downes, Robin Atkin' b"O'Shaughnessey, Colleen"]
 [b'Billingslea, Beau (I)' b'Baker, Troy (II)' b'Lodge, David (IV)']
 [b'Moore, Justin D.' b'Blakeney, Derek' b'Lucio, Kelly V.']
 [b'Moore, Justin D.' b'Humphrey, Alan' b'Dorsainville, Jetto']
 [b'Willingham, Travis' b'Cox, Chris (I)' b'Mathis III, James']
 [b'Moore, Justin D.' b'Quintana, Grizelda' b'Borek, Tina']
 [b'Moore, Justin D.' b'Messer, Casey'