In [13]:
import re
import numpy as np
import networkx as nx
import random as rnd
import itertools
from networkx.utils.decorators import py_random_state
import bisect
import time
import sys

--------------------------------------
<h3>Creation graph from dataset</h3>
<h5>ID odd -> film <br>
    ID even -> actor</h5>

In [14]:
file=open('original.tsv','r')
EOF=False
id_actor=0
id_film=1
count_actor=0
count_film=1
actor_to_key={}
key_to_actor={}
film_to_key={}
key_to_film={}
G=nx.Graph()
while not EOF:
    line=file.readline().split('\n')
    record=line[0].split('\t')
    if  len(record)<2:
        print(record)
        EOF=True
    else:
        film=record[1]
        year=re.search("\s\([0-9]{4}\)",film)
        actor=record[0]
        if year:
            year=np.int16(year.group(0).replace(" (",'').replace(')',''))
        else:
            year=re.search("\s\([0-9]{4}\/",film)
            if year:
                year=np.int16(year.group(0).replace(" (",'').replace('/',''))
            else:
                year=3000
        if actor not in actor_to_key:
            id_actor=count_actor
            count_actor=count_actor+2
            actor_to_key[actor]=id_actor
            key_to_actor[id_actor]=actor
        else:
            id_actor=actor_to_key[actor]
        if film not in film_to_key:
            id_film=count_film
            count_film=count_film+2
            film_to_key[film]=id_film
            key_to_film[id_film]=(film,year)
        else:
            id_film=film_to_key[film]
        G.add_edge(id_actor,id_film,year=year)
del film_to_key
del actor_to_key
del id_actor
del id_film
del count_actor
del count_film

--------------------------------------
<h3>Which is the movie with the largest number of actors, considering only the movies up to year x?</h3>

In [5]:
x = [1930,1940,1950,1960,1970,1980,1990,2000,2010,2020]
# up_to_year=x[rnd.randint(0,len(x)-1)]
for up_to_year in x:
    time_start=time.time()
    best_result={'count':0,'movies':[]}
    for film,name_year in key_to_film.items():
        if name_year[1]<=up_to_year:
            tot_actors=len(G[film])
            if(tot_actors>=best_result['count']):
                if tot_actors>best_result['count']:
                    best_result['movies']=[]
                    best_result['count']=tot_actors
                best_result['movies'].append(film)
    print('up to year -> '+str(up_to_year))
    print('Number of actors -> '+str(best_result['count']))
    for movie in best_result['movies']:
        print('Film -> '+key_to_film[movie][0])

up to year -> 1930
Number of actors -> 171
Film -> The King of Kings (1927)
up to year -> 1940
Number of actors -> 219
Film -> The Buccaneer (1938)
up to year -> 1950
Number of actors -> 290
Film -> Gone to Earth (1950)
up to year -> 1960
Number of actors -> 1298
Film -> Around the World in Eighty Days (1956)
up to year -> 1970
Number of actors -> 1298
Film -> Around the World in Eighty Days (1956)
up to year -> 1980
Number of actors -> 1298
Film -> Around the World in Eighty Days (1956)
up to year -> 1990
Number of actors -> 1298
Film -> Around the World in Eighty Days (1956)
up to year -> 2000
Number of actors -> 1298
Film -> Around the World in Eighty Days (1956)
up to year -> 2010
Number of actors -> 1298
Film -> Around the World in Eighty Days (1956)
up to year -> 2020
Number of actors -> 1298
Film -> Around the World in Eighty Days (1956)


----------------------------------------------------------
<h3>Considering only the movies up to year x with x in {1930,1940,1950,1960,1970,1980,1990,2000,2010,2020} and restricting to the largest connected component of the graph.<br> 
Compute exactly the diameter of G</h3>


In [6]:
@py_random_state(1)
def two_sweep(Graph, seed):
    # select a random source node
    rnd_node = seed.choice(list(Graph))
    # get the distances to the other nodes
    # node_a,*_ = max(nx.single_source_shortest_path_length(Graph, source).items(),key=lambda k:k[1])
    source= list(nx.single_source_shortest_path_length(Graph, rnd_node))[-1] #____
    # take a node that is (one of) the farthest nodes from the source
    distances_b = list(nx.single_source_shortest_path_length(Graph, source).items())
    # print(distances_b)
    index_start_node=bisect.bisect_left(distances_b,int(distances_b[-1][1]/2),key=lambda k:k[1]) #start all nodes at distance/2
    index_end_node=bisect.bisect_right(distances_b,int(distances_b[-1][1]/2),key=lambda k:k[1]) #end 
    # best_node=max(distances[index_start_node::index_end_node],key=lambda k:len(H_small[k[0]]))
    # node_b,*_=max(distances_b.items(),key=lambda k:k[1])
    # node_b=distances_b[-1]
    return max(distances_b[index_start_node::index_end_node],key=lambda k:len(H_small[k[0]])) #highest degree

In [7]:
def iFUB(G,node_start):
    i=nx.eccentricity(G,node_start)
    lb=i
    ub=2*i
    # distances=nx.single_source_shortest_path_length(G,node_start,cutoff=i)
    list_fringe=itertools.groupby(reversed(nx.single_source_shortest_path_length(G,node_start,cutoff=i).items()),key=lambda k:k[1])
    max_ecc=0
    while ub>lb:
        for _,group in list_fringe:
            group=list(group)
            for element in group:
                ecc=nx.eccentricity(G,element).get(element[0])
                # print(str(ecc)+';'+str(2*(i-1)))
                if ecc>(2*(i-1)):
                    return ecc
                if ecc>max_ecc:
                    max_ecc=ecc
            break
        lb=max(lb,max_ecc)
        ub=2*(i-1)
        i=i-1
    # print(str(ub)+';'+str(lb))
    return lb

In [8]:
x = [1930,1940,1950,1960,1970,1980,1990,2000,2010,2020]
# year=2000
for year in x:
    time_start=time.time()
    list_nodes=set()
    for u,v in G.edges():
        if G[u][v]['year'] <= year:
            list_nodes.add(u)
            list_nodes.add(v)
    largest_cc=max(nx.connected_components(G.subgraph(list_nodes)),key=len)
    H_small=G.subgraph(largest_cc)
    del(largest_cc)
    del(list_nodes)
    print('Only films up to '+str(year))
    node_start=two_sweep(H_small,None)
    # print(node_start)
    # print('start iFUB')
    print(node_start)
    diameter=iFUB(H_small,node_start[0])
    time_end=time.time()
    print('Diameter -> '+str(diameter))
    print('Time -> '+str(time_end-time_start))

Only films up to 1930
(2039584, 18)
Diameter -> 32
Time -> 53.97123384475708
Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\franc\Developer\venvs\3.10-general\lib\site-packages\IPython\core\interactiveshell.py", line 3397, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\franc\AppData\Local\Temp\ipykernel_17736\1494886860.py", line -1, in <cell line: 3>
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\franc\Developer\venvs\3.10-general\lib\site-packages\IPython\core\interactiveshell.py", line 1992, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "c:\Users\franc\Developer\venvs\3.10-general\lib\site-packages\IPython\core\ultratb.py", line 1118, in structured_traceback
    return FormattedTB.structured_traceback(
  File "c:\Users\franc\Developer\venvs\3.10-general\lib\site-packages\IPython\core\ultratb.py", line 1012, in structured_traceback
    return VerboseTB.structured_traceback(
  File "c:\Users\franc\De

--------------------------------------------------
<h3>Which is the movie with the largest number of popular actors, i.e. such that the sum of the number of movies its actors participated in is maximum? For example movie M contains actors A and B. A participated in 5 movies, B in 3. The score of M is 8.</h3>

In [None]:
largest_movie={'count':0,'movies':[]}  
for film,_ in key_to_film.items():
    edges=G.edges(film)
    count=sum(G.degree(act) for _,act in G.edges(film))
    if count>=largest_movie['count']:
        if count>largest_movie['count']:
            largest_movie['count']=count
            largest_movie['movies']=[]
        largest_movie['movies'].append(film)
key_to_film[largest_movie['movies']]
largest_movie

100000
200000
300000
400000
500000
600000
700000


{'count': 34181, 'movies': [254783]}

-------------------------------------------
<h3>Build also the actor graph, whose nodes are only actors and two actors are connected if they did a movie together.<br>
Which is the pair of actors who collaborated the most among themselves?</h3>


In [None]:
#ordered list
def create_graph_actors_ordered(): 
    collaborations={}
    best_collaborators=[0,[]]
    count=0
    G_actor=nx.Graph()
    for actor_1 in key_to_actor:
        list_films=itertools.chain(G[actor_1])
        count+=1
        for film in list_films:
            list_actor_for_film=itertools.chain(G[film])
            for actor_2 in list_actor_for_film:
                if actor_2>actor_1: # avoid duplication 
                    if actor_2 in collaborations:
                        collaborations[actor_2]+=1
                        if collaborations[actor_2]>=best_collaborators[0]:
                            if collaborations[actor_2]>best_collaborators[0]:
                                best_collaborators[0]=collaborations[actor_2]
                                best_collaborators[1]=[]
                            best_collaborators[1].append(actor_2)
                    else:
                        collaborations[actor_2]=1
        for element in (collaborations.items()): # pick max items and add it as first/last edge
            if element[0] not in best_collaborators[1]:
                G_actor.add_edge(actor_1,element[0],weight=element[1])
        collaborations.clear()
        for element in itertools.chain(best_collaborators[1]):
            # print(element)
            G_actor.add_edge(actor_1,element,weight=best_collaborators[0])
        best_collaborators=[0,[]]
        if(count%100_000==0):
            print(count)
    return G_actor

In [None]:
G_actor=create_graph_actors_ordered()

100000
200000
300000
400000
500000
600000
700000
800000
900000
1000000
1100000
1200000
1300000
1400000
1500000
1600000
1700000
1800000
1900000
2000000
2100000
2200000
2300000


In [12]:
# G_actor ordered
best_result={'count':0,'pairs':[]} #3:35 min
count=0
jump=0
# table={'list':[],'search':[]}
for actor in itertools.chain(G_actor):
    count+=1
    for actor_2 in itertools.chain(reversed(list(G_actor[actor]))):
        colls=G_actor[actor][actor_2]['weight']
        if colls>=best_result['count']:
            if colls>best_result['count']:
                best_result['count']=colls
                best_result['pairs']=[]
            best_result['pairs'].append((actor,actor_2))
        else:
            jump+=1
            break
    if count%100_000==0:
        print(count)
best_result

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "c:\Users\franc\Developer\venvs\3.10-general\lib\site-packages\IPython\core\interactiveshell.py", line 3397, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\franc\AppData\Local\Temp\ipykernel_17736\4083202141.py", line -1, in <cell line: 6>
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\franc\Developer\venvs\3.10-general\lib\site-packages\IPython\core\interactiveshell.py", line 1992, in showtraceback
    stb = self.InteractiveTB.structured_traceback(
  File "c:\Users\franc\Developer\venvs\3.10-general\lib\site-packages\IPython\core\ultratb.py", line 1118, in structured_traceback
    return FormattedTB.structured_traceback(
  File "c:\Users\franc\Developer\venvs\3.10-general\lib\site-packages\IPython\core\ultratb.py", line 1012, in structured_traceback
    return VerboseTB.structured_traceback(
  File "c:\Users\franc\De