# Test notebook 3

## Graphe par genre

J'essaie de créer un graphe par genre. Pour le moment, j'ai une fonction qui prend le tableau `movies` et qui crée une matrice d'adjacence (adjacency matrix) à partir de celui-ci.
Ce graphe pourrait être utile pour étudier une distribution des genres ou pour mapper les nodes selon les genres.
Pour obtenir l'adjacency : `adjacency = genre_graph.make_genre_adjacency(movies)`

In [None]:
%matplotlib inline

import numpy as np
import pandas as pd
from scipy import sparse
import scipy.sparse.linalg
from matplotlib import pyplot as plt
from pyunlocbox import functions, solvers
import pygsp as pg
import networkx as nx
import logging
import json
import sys
import sklearn.manifold 

# Route timestamped log records of level INFO and above to stdout,
# so that they appear in the cell output.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.INFO,
    format='%(asctime)s | %(levelname)s : %(message)s',
)

import src.load_data as ldata
import src.genre_graph as genre_graph
import src.test_success as tsuccess

In [None]:
# Load the two project dataframes returned by the loader (movies first, people second).
movies, people = ldata.load_dataframes()

In [None]:
# Lookup table of the distinct (genre id, genre name) pairs, indexed by the
# 'genres' column; displayed as the cell output.
list_genres = (
    movies[['genres_names', 'genres']]
    .drop_duplicates()
    .set_index('genres')
)
list_genres

In [None]:
# Restrict the movies to the 1960-01-01 .. 2020-01-01 window.
# Whether the bounds are inclusive is decided by filter_movies_by_years.
first_date = pd.to_datetime('1960-01-01').date()
last_date = pd.to_datetime('2020-01-01').date()
moviesFiltered = ldata.filter_movies_by_years(movies, first_date, last_date)



In [None]:
# Genre-based adjacency matrix built from the year-filtered movies.
# NOTE: the name `adjacency` is rebound further down to the adjacency of the
# budget graph, so re-run this cell before re-plotting the genre matrix.
adjacency = genre_graph.make_genre_adjacency(moviesFiltered)

In [None]:
# Plot the sparsity pattern of the genre adjacency matrix on a square figure.
plt.figure(figsize=(12, 12))
plt.spy(adjacency)
plt.title('adjacency matrix')

In [None]:
#G_test = nx.from_numpy_matrix(adjacency)
#nx.number_connected_components(G_test)
#Gc_test = max(nx.connected_component_subgraphs(G_test), key=len)
#coords_test = nx.spring_layout(G_test,k=0.03)

In [None]:
#labels = moviesFiltered['genres'].iloc[np.sort(nx.nodes(G_test))]

#plt.figure(figsize=(15, 15))

#im=nx.draw_networkx_nodes(G_test, coords_test, node_size=50,node_color=labels, cmap='tab20')

#labels2 = moviesFiltered['budget'].iloc[np.sort(nx.nodes(G_test))]
#im=nx.draw_networkx_nodes(G_test, coords_test,node_size=50, node_color=labels2)

#nx.draw_networkx_edges(G_test, coords_test, alpha=0.5, width=1)
#plt.title('Graph of genres')
#plt.colorbar(im);

## Étude du ROI

Ici j'ai fait un petit histogramme du ROI moyen par genre. C'est intéressant et surprenant de voir que les genres qui rapportent le plus sont l'*horreur* et les *documentaires*.

In [None]:
# Per-genre ROI table for the year-filtered movies. The plotting cell below
# reads its 'ROI_fraction' and 'genres_names' columns.
perGenreROI = genre_graph.compute_ROI_genre(moviesFiltered)

## Visualisation du ROI par genre par décennie

In [None]:
# Bar chart of the ROI per genre: one bar per row of perGenreROI, labelled
# with the genre name. The figure is written to an EPS file before being shown.
roi_values = perGenreROI['ROI_fraction']
genre_names = perGenreROI['genres_names']

plt.figure(figsize=(25, 8))
plt.bar(x=perGenreROI.index, height=roi_values, tick_label=genre_names)
plt.title('Distribution of ROI per genre', fontsize=20)
plt.savefig('distribution_roi_per_genre.eps', format='eps', dpi=1000)
plt.show()

In [None]:
#for startYear in range(1960, 2010):
#    endYear = startYear + 10
#    moviesFiltered = ldata.filter_movies_window_years(movies, startYear, endYear)
#    adjacency = genre_graph.make_genre_adjacency(moviesFiltered)
#    perGenreROI = genre_graph.compute_ROI_genre(moviesFiltered)
#    plt.figure(figsize = (25,8))
#    plt.bar(x=perGenreROI.index,height=perGenreROI.loc[:,'ROI_fraction'],tick_label=perGenreROI.loc[:,'genres_names'])
#    axes = plt.gca()
#    axes.set_ylim([0,300])
#    plt.title(f'Distribution of ROI per genre between {startYear} and {endYear}',fontsize=20)
#    plt.savefig(f'./roi_per_genre/roi_per_genre_{startYear}_{endYear}.png', format='png', dpi=200)

In [None]:
# Budget-based adjacency matrix, together with the movies that it covers.
adjacencyBudget, movies_filtered_by_budget = ldata.make_budget_based_adjacency(moviesFiltered)

# K-means needs numeric features: replace each language string by its integer
# code from pd.factorize, working on a new frame so the source frame is untouched.
language_codes = pd.factorize(movies_filtered_by_budget['original_language'])[0]
movies2 = movies_filtered_by_budget.assign(original_language=language_codes)

# Features used for the clustering. An earlier experiment used
# ('genres', 'ROI') instead.
data_kmeans = movies2[['original_language', 'vote_average', 'vote_count']]

# Two clusters are requested.
predictedLabels = tsuccess.compute_kmeans(data_kmeans.values, 2)

In [None]:
# Reorder the budget adjacency matrix according to the K-means labels.
# To inspect the matrix without reordering, use instead:
#     newAdjacency = adjacencyBudget
newAdjacency = tsuccess.reorder_adjacency(adjacencyBudget, predictedLabels)

plt.figure(figsize=(12, 12))
plt.spy(newAdjacency)
# The title names the features that are actually fed to K-means (original
# language, vote average, vote count); the genre/ROI variant is commented out
# in the clustering cell.
plt.title('Reordered with Kmeans adjacency matrix of the movies connected by their budgets. '
          'The original language, the vote average and the vote count were used to perform the clustering',
          fontsize=12)

## Test d'embedding du graphe des acteurs sur les coordonnées du budget.

1. Make actor graph
1. Make budget graph
1. Compute coordinates for budget graph
1. Plot graph

In [None]:
# Same call as in the K-means section above; it rebinds both names so that this
# section can be run on its own once moviesFiltered exists.
adjacencyBudget, movies_filtered_by_budget = ldata.make_budget_based_adjacency(moviesFiltered)

In [None]:
# Build the actor/crew features for the movies kept by the budget filter.
# NOTE(review): features_actors is not read again in the visible cells;
# presumably create_features also persists what load_features reads - confirm.
features_actors = ldata.create_features(movies_filtered_by_budget, people)

In [None]:
# Load the movie feature matrix and derive the actors/crew adjacency from it.
features_movies = ldata.load_features()
adjacencyActorsCrew = ldata.make_adjacency_from_feature_matrix(features_movies)

In [None]:
#adjacencyActorsCrew = np.load('./data/adjacency_actors_crew.npy');

In [None]:
# One graph per relation between movies. Both adjacency matrices are assumed to
# index the same movies in the same order - TODO confirm against src.load_data.
# NOTE(review): from_numpy_matrix is unavailable in NetworkX >= 3.0
# (from_numpy_array is its replacement there).
graphActorsCrew = nx.from_numpy_matrix(adjacencyActorsCrew)
graphBudget = nx.from_numpy_matrix(adjacencyBudget)

# Keep only the largest connected component of the budget graph, and restrict
# the actors/crew graph to the same nodes so both graphs share one node set.
nodesGc = max(nx.connected_components(graphBudget), key=len)

graphBudget = nx.subgraph(graphBudget, nodesGc)
graphActorsCrew = nx.subgraph(graphActorsCrew, nodesGc)

# Node positions come from the budget graph only.
# spring_layout is randomly initialised, so the picture changes between runs.
coordsBudget = nx.spring_layout(graphBudget, k=0.03)

# Fix the node order explicitly: the colour values must be listed in the same
# order as the nodes being drawn. Without `nodelist` the draw call uses the
# graph's own iteration order, which is not guaranteed to be sorted.
nodelist = sorted(graphActorsCrew.nodes())
labels = movies_filtered_by_budget['revenue'].iloc[nodelist]

plt.figure(figsize=(20, 20))
im = nx.draw_networkx_nodes(graphActorsCrew, coordsBudget, nodelist=nodelist,
                            node_size=10, node_color=labels, cmap='PiYG')
nx.draw_networkx_edges(graphActorsCrew, coordsBudget, alpha=0.1, width=0.7)
plt.title('Movies linked by the actors and crew plotted on a manifold given by the budgets')
plt.colorbar(im);
plt.savefig('./pictures/graph_movies_plot_on_budget.png', dpi=300)

In [None]:
# Adjacency of the budget graph, with an explicit row/column order so that the
# eigenvector rows can be mapped back to node ids below.
budget_nodes = list(graphBudget)
adjacency = nx.adjacency_matrix(graphBudget, nodelist=budget_nodes)

# We calculate the number of nodes
n_nodes = adjacency.shape[0]
# (edge count, currently unused) n_edges = int(np.count_nonzero(adjacency)/2)

# We calculate the combinatorial laplacian L = D - A.
# adjacency.sum(axis=0) on a SciPy sparse matrix is a 1 x n matrix, and np.diag
# of a 2-D input *extracts* a diagonal instead of building a diagonal matrix.
# The degrees are therefore flattened to 1-D first, which yields a proper
# n x n degree matrix.
degrees = np.asarray(adjacency.sum(axis=0)).ravel()
degree = np.diag(degrees)
# todense() keeps the matrix type of the sparse input, so U (and coordslap)
# stay matrix-like for the inspection cell that follows.
laplacian = degree - adjacency.todense()
e, U = np.linalg.eigh(laplacian)

# Laplacian Eigenmaps coords: the two eigenvectors after the first one. eigh
# returns eigenvalues in ascending order, and graphBudget was restricted to a
# single connected component in the previous cell.
coordslap = U[:, 1:3]

# Map every node id to its 2-D coordinate row, in the adjacency row order.
coordsNxLap = dict(zip(budget_nodes, np.asarray(coordslap)))

# `labels` (from the previous cell) is ordered by sorted node id, so the nodes
# are drawn in that same order.
nodelist = sorted(graphActorsCrew.nodes())
im = nx.draw_networkx_nodes(graphActorsCrew, coordsNxLap, nodelist=nodelist,
                            node_size=10, node_color=labels, cmap='PiYG')
nx.draw_networkx_edges(graphActorsCrew, coordsNxLap, alpha=0.1, width=0.7)
plt.title('Movies linked by the actors and crew plotted on a manifold given by the budgets')
plt.colorbar(im);

In [None]:
# Show the Laplacian-eigenmap coordinates of the first row as a plain ndarray.
np.asarray(coordslap)[0]

In [None]:
#G = pg.graphs.Graph(adjacency)
#coordslap = pg.reduction_laplacian_eigenmaps(G, 2);

In [None]:
# Position (row index) of the vertex that carries the Dirac impulse.
DIRAC_POSITION = 920

# One explicit node order shared by the coordinates, the adjacency matrix, the
# signal and the drawing, so that position i always refers to the same movie.
nodelist = list(graphActorsCrew)

# Spring-layout coordinates as an (n, 2) array in that order, as expected by
# the `coords` argument of the PyGSP graph.
coordsBudgetGSP = np.array([coordsBudget[node] for node in nodelist], dtype=float).reshape(-1, 2)
print(coordsBudgetGSP.shape)
print(len(coordsBudget))

graphActorsCrewGSP = pg.graphs.Graph(nx.adjacency_matrix(graphActorsCrew, nodelist=nodelist),
                                     coords=coordsBudgetGSP)

# Dirac impulse: zero everywhere except at one vertex.
signal = np.zeros(len(nodelist))
signal[DIRAC_POSITION] = 1.5

# The text label is keyed by node id, whereas the signal is indexed by
# position. Translate the position into the node id so the label sits on the
# vertex that actually carries the impulse.
label = {nodelist[DIRAC_POSITION]: "Dirac"}

#plt.figure(figsize=(20, 20))
#axes = plt.gca()
#pg.plotting.plot_signal(graphActorsCrewGSP, signal, vertex_size=1, ax=axes)

# Signal-dependent node sizes; not passed to the draw call below at the moment.
sizes = 2*(signal+1)

plt.figure(figsize=(25, 25))
im = nx.draw_networkx_nodes(graphActorsCrew, coordsBudget, nodelist=nodelist,
                            node_size=0.1, node_color=signal, cmap='rainbow', alpha=1)
nx.draw_networkx_edges(graphActorsCrew, coordsBudget, alpha=0.1, width=0.1)
nx.draw_networkx_labels(graphActorsCrew, coordsBudget, labels=label)
plt.title('Movies linked by the actors and crew plotted on a manifold given by the budgets')
plt.colorbar(im);
plt.savefig('./pictures/graph_movies_plot_on_budget_dirac.png', dpi=300)

In [None]:
# Diffuse the Dirac signal over the actors/crew graph with a heat kernel.
# The Fourier basis is computed up front, before the filter is applied.
graphActorsCrewGSP.compute_fourier_basis()
heat_kernel = pg.filters.Heat(graphActorsCrewGSP, tau=100)
filtered_signal = heat_kernel.filter(signal)

# Alternative rendering through PyGSP, kept for reference:
#plt.figure(figsize=(30, 30))
#axes = plt.gca()
#pg.plotting.plot_signal(graphActorsCrewGSP, filtered_signal, save_as='./pictures/movies_dirac_on_actor.png', vertex_size=1, ax=axes)

# Draw the diffused signal on the budget layout and save the picture.
plt.figure(figsize=(25, 25))
im = nx.draw_networkx_nodes(
    graphActorsCrew,
    coordsBudget,
    node_size=0.1,
    node_color=filtered_signal,
    cmap='rainbow',
    alpha=1,
)
nx.draw_networkx_edges(graphActorsCrew, coordsBudget, alpha=0.1, width=0.1)
plt.title('Movies linked by the actors and crew plotted on a manifold given by the budgets')
plt.colorbar(im);
plt.savefig('./pictures/graph_movies_plot_on_budget_heat_diffused_dirac.png', dpi=300)


In [None]:
# Spectral embedding of the budget adjacency matrix with scikit-learn's default
# parameters. The result is only displayed as the cell output, not stored.
sklearn.manifold.spectral_embedding(adjacencyBudget)