In [8]:
import pandas as pd
import networkx as nx
import re
# Function to populate a bipartite graph containing actors, movies and their relationships
def populateGraph(filename):
    """Build a bipartite actor/movie graph from a tab-separated file.

    The file is read with ``pd.read_csv(filename, sep="\\t")``. The first
    column is taken as the actor name and the second as the raw movie string
    (title plus a parenthesised year, e.g. ``"Some Title (1999)"``).

    Every distinct actor and every distinct raw movie string becomes one node,
    identified by an integer taken from a single shared counter (so actor and
    movie IDs never collide). Actor nodes carry ``bipartite=0``; movie nodes
    carry ``bipartite=1`` and a ``year`` attribute. Each row adds an edge
    between its actor node and its movie node.

    Parameters
    ----------
    filename : str or path-like
        Location of the tab-separated input file.

    Returns
    -------
    tuple
        ``(graph, actorDic, actorInverseDic, moviesDic, moviesInverseDic)``
        where ``graph`` is the ``nx.Graph``, ``actorDic`` / ``moviesDic`` map
        name -> node ID, and the two inverse dictionaries map node ID -> name.

    Notes
    -----
    * ``year`` is the four-digit *string* found right after the opening
      parenthesis when exactly one ``(dddd...)`` region is found in the movie
      string, and the integer ``0`` otherwise. The mixed str/int typing is
      kept on purpose so existing consumers of the attribute are unaffected.
    * ``pd.read_csv`` is called with its default header handling, so the first
      line of the file is consumed as column names and never becomes a node.
      NOTE(review): confirm the dataset really starts with a header row.
    """
    df = pd.read_csv(filename, sep="\t")  # dataframe holding the raw file

    graph = nx.Graph()  # bipartite graph of actors, movies and their relationships

    # dictionaries keeping track of the match { actorName : actorGraphID } (same for movies)
    actorDic = {}
    moviesDic = {}
    globalCounter = 0  # next free node ID, shared by actors and movies

    # Raw-string patterns (no invalid-escape warnings), compiled once outside the loop.
    yearRegionPattern = re.compile(r"\(\d{4}.*\)")  # "(dddd ... )" region
    yearPattern = re.compile(r"\d{4}")              # the four digits themselves

    # Columns are selected by position with .iloc: the actor is the first
    # column and the movie the second, whatever the header names are. This
    # avoids integer keys on a label-indexed row Series (deprecated in pandas)
    # and the per-row Series construction of iterrows().
    for actor, movieRaw in zip(df.iloc[:, 0], df.iloc[:, 1]):
        # the movie has to be split into year and movie name: first spot the
        # parenthesised year region, then pull the actual year out of it
        parenthesesString = yearRegionPattern.findall(movieRaw)
        if len(parenthesesString) == 1:
            year = yearPattern.findall(parenthesesString[0])[0]
        else:
            year = 0  # no (or ambiguous) year region

        # register the actor and the movie in the dictionaries and in the graph
        if actor not in actorDic:
            actorDic[actor] = globalCounter
            graph.add_node(globalCounter, bipartite=0)
            globalCounter += 1
        if movieRaw not in moviesDic:
            moviesDic[movieRaw] = globalCounter
            graph.add_node(globalCounter, year=year, bipartite=1)
            globalCounter += 1
        graph.add_edge(actorDic[actor], moviesDic[movieRaw])

    # reverse lookups: node ID -> name
    actorInverseDic = {v: k for k, v in actorDic.items()}
    moviesInverseDic = {v: k for k, v in moviesDic.items()}

    return (graph, actorDic, actorInverseDic, moviesDic, moviesInverseDic)

In [9]:
# Build the actor/movie graph from the dataset; only the graph is kept here,
# the four lookup dictionaries returned alongside it are discarded.
G, _, _, _, _ = populateGraph('imdb_dataset.tsv')