In [76]:
import pandas as pd
import networkx as nx
from tqdm.notebook import tqdm
import numpy as np

In [2]:
df = pd.read_csv('imdb_dataset.tsv', sep='\t', header=None)

In [3]:
edges = df.to_records(index=False)

In [4]:
#https://pandas.pydata.org/docs/reference/api/pandas.Series.str.extract.html
# I think the simple regex is sufficient
df[2] = df[1].str.extract(r'(\d{4})', expand=True).fillna(10000).astype(int) #FIXME fillna is pretty ugly rn
df = df.rename(columns={0: "actor", 1: "movie", 2: "year"})

In [7]:
print(df[df['year'].isnull()])

Empty DataFrame
Columns: [actor, movie, year]
Index: []


In [8]:
df[1450:1455]

Unnamed: 0,actor,movie,year
1450,"Aamschot, Michael",Honeyz (2007),2007
1451,"Aamund, Asger",Hj?lp krigens ofre (2003) (TV),2003
1452,"Aamundson, John",ODC,10000
1453,Aanaahad,Lahore (2010),2010
1454,"Aanderaa, Torgny Gerhard",Citizen X (2007),2007


In [9]:
actors = df.actor.unique()
movies = df.movie.unique()
print(f"Number of actors is: {actors.size} \nNumber of movies is: {movies.size} \nTotal nodes will be: {actors.size + movies.size}")
print(f"Number of edges is will be: {len(edges)}")

Number of actors is: 2364796 
Number of movies is: 745941 
Total nodes will be: 3110737
Number of edges is will be: 8104335


In [10]:
movies_dict = df.drop(columns='actor').drop_duplicates().set_index('movie').to_dict('index')
movies_tuples_list = [(k, v) for k, v in movies_dict.items()] #ugly but convenient for what networkx expects

In [11]:
movies_tuples_list

[('Nykytaiteen museo (1986)', {'year': 1986}),
 ('Suuri illusioni (1985)', {'year': 1985}),
 ('E.R. Sluts (2003) (V)', {'year': 2003}),
 ('American Pimp (1999)', {'year': 1999}),
 ('Beats, Rhymes & Life: The Travels of a Tribe Called Quest (2011)',
  {'year': 2011}),
 ('Gangsta Rap: The Glockumentary (2007)', {'year': 2007}),
 ('Get It Where You Fit in 1 (2003) (V)', {'year': 2003}),
 ('Ghetto Physics (2010)', {'year': 2010}),
 ('Ghostride the Whip (2008) (V)', {'year': 2008}),
 ('Hip Hop Uncensored Vol. 4: Miami Vice (2002) (V)', {'year': 2002}),
 ('Menace II Society (1993)', {'year': 1993}),
 ('Ozone West 3 (2009) (V)', {'year': 2009}),
 ('Pimpalation: Return of the Trill (2006)', {'year': 2006}),
 ('Planet Rock: The Story of Hip-Hop and the Crack Generation (2011) (TV)',
  {'year': 2011}),
 ('Porndogs: The Adventures of Sadie (2009)', {'year': 2009}),
 ('Rhyme & Reason (1997)', {'year': 1997}),
 ('Scarface: Greatest Hits on DVD (2003) (V)', {'year': 2003}),
 ('Stop Pepper Palmer (20

In [12]:
oriGinal = nx.Graph()
oriGinal.add_nodes_from(actors, bipartite = 0) #attribute bipartite following documentation recommendations. In this case 0 is actors, 1 is movies
print(f"Number of nodes after adding actors is {oriGinal.number_of_nodes()}")
oriGinal.add_nodes_from(movies_tuples_list, bipartite = 1)
print(f"Number of nodes after adding movies is {oriGinal.number_of_nodes()}") #???????
      

Number of nodes after adding actors is 2364796
Number of nodes after adding movies is 3110735


In [13]:
oriGinal.add_edges_from(edges)

In [14]:
G = nx.convert_node_labels_to_integers(oriGinal, label_attribute='original_name')

In [15]:
actor_nodes = {n for n, d in G.nodes(data=True) if d["bipartite"] == 0}
movies_nodes = set(G) - actor_nodes

In [16]:
#TODO asserts
print(f"Number of actor nodes: {len(actor_nodes)}")
print(f"Number of movies nodes: {len(movies_nodes)}")
print(f"Total number of nodes: {len(actor_nodes) + len(movies_nodes)}")

print(f"#Nodes? {oriGinal.number_of_nodes() == G.number_of_nodes()}")
print(f"#Edges? {oriGinal.number_of_edges() == G.number_of_edges()}")

Number of actor nodes: 2364794
Number of movies nodes: 745941
Total number of nodes: 3110735
#Nodes? True
#Edges? True


In [17]:
movies_nodes

{2151046,
 2364796,
 2364797,
 2364798,
 2364799,
 2364800,
 2364801,
 2364802,
 2364803,
 2364804,
 2364805,
 2364806,
 2364807,
 2364808,
 2364809,
 2364810,
 2364811,
 2364812,
 2364813,
 2364814,
 2364815,
 2364816,
 2364817,
 2364818,
 2364819,
 2364820,
 2364821,
 2364822,
 2364823,
 2364824,
 2364825,
 2364826,
 2364827,
 2364828,
 2364829,
 2364830,
 2364831,
 2364832,
 2364833,
 2364834,
 2364835,
 2364836,
 2364837,
 2364838,
 2364839,
 2364840,
 2364841,
 2364842,
 2364843,
 2364844,
 2364845,
 2364846,
 2364847,
 2364848,
 2364849,
 2364850,
 2364851,
 2364852,
 2364853,
 2364854,
 2364855,
 2364856,
 2364857,
 2364858,
 2364859,
 2364860,
 2364861,
 2364862,
 2364863,
 2364864,
 2364865,
 2364866,
 2364867,
 2364868,
 2364869,
 2364870,
 2364871,
 2364872,
 2364873,
 2364874,
 2364875,
 2364876,
 2364877,
 2364878,
 2364879,
 2364880,
 2364881,
 2364882,
 2364883,
 2364884,
 2364885,
 2364886,
 2364887,
 2364888,
 2364889,
 2364890,
 2364891,
 2364892,
 2364893,
 2364894,


In [18]:
print(oriGinal["'t Hoen, Dani?l"])
print(oriGinal["'Kid Niagara' Kallet, Harry"])

print(G[2151046])

nodes_data = G.nodes.data(True)
print(nodes_data[0])

{'Zonde (2010)': {}}
{'Drug Demon Romance (2012)': {}}
{25695: {}, 207265: {}, 457200: {}, 472789: {}, 1009276: {}, 1140040: {}, 1486377: {}, 1751991: {}, 2036767: {}, 2816090: {}, 2313367: {}}
{'bipartite': 0, 'original_name': '$, Homo'}


In [19]:
original_data = oriGinal.nodes.data(True)
print(original_data['To Meet It with Awe (2011)'])

{'bipartite': 1, 'year': 2011}


In [20]:
G.size

<bound method Graph.size of <networkx.classes.graph.Graph object at 0x11f6351b0>>

In [21]:
degree_sequence = sorted((d for n, d in G.degree()), reverse=True)
dmax = max(degree_sequence)

## Question 1
G) Considering only the movies up to year x with x in {1930,1940,1950,1960,1970,1980,1990,2000,2010,2020}, write a function which, given x, computes the average number of movies per actor up to year x. 

In [77]:
def avgMoviesPerActorUpToYear(G, act_nodes, mv_nodes, year):
    movies_up_to_year = {x for x,y in G.nodes(data=True) if y['bipartite'] == 1 and y['year'] <= year}
    print(len(movies_up_to_year))
    nodes_subset = movies_up_to_year.union(act_nodes) 
    print(len(nodes_subset))
    # We have two ways of interpreting the question. One is to consider actors even when they've zero movies, the 
    # other is to consider actors only when they have a non zero counter. Regardless, this is considered later
    subgraph = G.subgraph(nodes_subset)
    assert subgraph.number_of_nodes() == len(nodes_subset)
    
    subgraph_actor_nodes = {n for n, d in subgraph.nodes(data=True) if d["bipartite"] == 0} #in this case it's not necessary because actors are first nodes (in order), but what I said is not a given
    
    degrees = subgraph.degree(nbunch = subgraph_actor_nodes)
    deg_data = pd.DataFrame(degrees)
    print(f"Mean: {deg_data[1].mean()}")
    print(f"Mean2: {deg_data[1].replace(0, np.NaN).mean()}")
    
    #print(a)
    
avgMoviesPerActorUpToYear(G, actor_nodes, movies_nodes, 1930)

71218
2436012
Mean: 0.16629989758093094
Mean2: 6.18496791645697


In [44]:
i = 0
for x, y in G.nodes(): 
    print(f"x is {x}, y is {y.degree()}")
    i+=1
    if i > 20:
        break

TypeError: cannot unpack non-iterable int object

## Question 2
3) Considering only the movies up to year x with x in {1930,1940,1950,1960,1970,1980,1990,2000,2010,2020} and restricting to the largest connected component of the graph. Approximate the closeness centrality for each node. Who are the top-10 actors?

## Question 3
III) Which is the pair of movies that share the largest number of actors?

## Question 4
Build also the actor graph, whose nodes are only actors and two actors are connected if they did a movie together. Answer to the following question:

Which is the pair of actors who collaborated the most among themselves?

### Notes
- [NetworkX docs on bipartite graphs](https://networkx.org/documentation/stable/reference/algorithms/bipartite.html) However, if the input graph is not connected, there are more than one possible colorations. This is the reason why we require the user to pass a container with all nodes of one bipartite node set as an argument to most bipartite functions.
- Networkx uses a dictionary of dictionaries of dictionaries, as specified in the docs. NetworkX uses a “dictionary of dictionaries of dictionaries” as the basic network data structure. This allows fast lookup with reasonable storage for large sparse networks. The keys are nodes so G[u] returns an adjacency dictionary keyed by neighbor to the edge attribute dictionary. A view of the adjacency data structure is provided by the dict-like object G.adj as e.g. for node, nbrsdict in G.adj.items():. The expression G[u][v] returns the edge attribute dictionary itself. A dictionary of lists would have also been possible, but not allow fast edge detection nor convenient storage of edge data.