### In this project, we will study various properties of the Internet Movie Database (IMDb). In the first part of the project, we will explore the properties of a directed actor/actress network. In the second part of the project, we will explore the properties of an undirected movie network.

In [None]:
import re
import itertools
from multiprocessing import Pool, Manager, Process
import csv
import pickle
import time
import random

def save_obj(obj, name):
    """Pickle ``obj`` to ``obj/<name>.pkl`` under the current working directory.

    The target directory is created when it is missing, so the first save in a
    fresh checkout no longer fails with ``FileNotFoundError``.

    Args:
        obj: Any picklable object.
        name: File name without the ``obj/`` prefix or the ``.pkl`` suffix.
    """
    import os  # local import keeps this helper self-contained

    path = 'obj/' + name + '.pkl'
    # dirname is never empty here because the path always starts with 'obj/'.
    os.makedirs(os.path.dirname(path), exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)

def load_obj(name):
    """Return the object unpickled from ``obj/<name>.pkl``.

    Args:
        name: File name without the ``obj/`` prefix or the ``.pkl`` suffix.

    Returns:
        Whatever object was stored in the pickle file.

    Note:
        Unpickling can execute arbitrary code; only load files you created.
    """
    pickle_path = ''.join(('obj/', name, '.pkl'))
    with open(pickle_path, 'rb') as handle:
        restored = pickle.load(handle)
    return restored
    
# Number of worker processes; in the visible part of this file it is only
# referenced by the commented-out multiprocessing variants in later cells.
processNum = 2


### 1 Actor/Actress network
In this part of the project, we will create the network using the data from the following text files:

 - actor_movies.txt

 - actress_movies.txt

The text files can be downloaded from the following link:

https://ucla.box.com/s/z45q3g5zrpay8b8gtbql6ojaecb7kj2u

In order to create the network in a consistent manner, you will need to do some data preprocessing. The preprocessing consists of 2 parts:

1. Merging the two text files into one and then removing every actor/actress who has acted in fewer than 10 movies

2. Cleaning the merged text file

The cleaning part is necessary to avoid inconsistency in the network
creation. If you analyze the merged text file, you will observe
that the same movie might be counted multiple times due to the role of the
actor/actress in that movie. For example, we might have

 - Movie X (voice)

 - Movie X (as uncredited)

If you don’t clean the merged text file, then Movie X (voice) and Movie X (as uncredited) will be considered as different movies. Therefore, you will need to perform some cleaning operations to remove inconsistencies of various types.

#### Question 1: Perform the preprocessing on the two text files and report the total number of actors and actresses and total number of unique movies that these actors and actresses have acted in.

In [None]:
# Start the wall-clock timer; elapsed time is printed at the end of this cell.
start_time = time.time()

# Minimum number of distinct cleaned movie titles a cast member must have.
filmNumBar = 10

## Locations of the raw actor and actress listings
actorFileName = "./project4_data-selected/actor_movies.txt"
actressFileName = "./project4_data-selected/actress_movies.txt"

# Maps cast name -> set of cleaned movie titles.
cast_movie_mod = {}

# Matches a "(19xx)"/"(20xx)" year tag together with any parenthesised groups
# directly before or after it; the substitution below keeps only " (YYYY)".
target = re.compile(r"(?:\s*\([^()]*\))*\s*(\((?:19|20){1}\d{2}\))\s*(?:\([^()]*\)\s*)*")

# Actor file first, then actress file, so a name present in both keeps the
# actress entry (same overwrite order as processing the files one by one).
for castFileName in (actorFileName, actressFileName):
    with open(castFileName, 'r', encoding='latin-1') as castFD:
        for line in castFD:
            # Each record is "<name>\t\t<movie>\t\t<movie>...".
            cast, *movies = line.strip().split("\t\t")
            cleaned = (target.sub(r' \1', movie) for movie in movies)
            movies = {title for title in cleaned if title}
            if len(movies) >= filmNumBar:
                cast_movie_mod[cast] = movies

print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# def prosMovieName(d, key):
#     d[key] = list(filter(None, [target.sub('', movie) for movie in cast_movie[key]]))

# manager = Manager()
# cast_movie_mod = manager.dict()


# if __name__ == '__main__':
#     p = Pool(5)
#     multiple_results = [p.apply_async(prosMovieName, (cast_movie_mod, key)) for key in cast_movie.keys()]
#     p.close()
#     p.join()


# # cast_movie_mod = {}
# # for key in cast_movie.keys():
# #     cast_movie_mod[key] = list(filter(None, [target.sub('', movie) for movie in cast_movie[key]]))

In [None]:
# Build integer IDs for every movie title and cast name.
# The sets are enumerated in sorted order so the IDs are identical on every
# run; enumerating a set of strings directly follows hash order, which changes
# between interpreter sessions and would make the exported edge list
# irreproducible.
movieSet = {movie for movies in cast_movie_mod.values() for movie in movies}
numOfMovie = len(movieSet)
movieID = {movie: movieId for movieId, movie in enumerate(sorted(movieSet))}
IDtoMovie = {movieId: movie for movie, movieId in movieID.items()}

castSet = set(cast_movie_mod)
numOfCast = len(castSet)
castID = {cast: castId for castId, cast in enumerate(sorted(castSet))}
IDtoCast = {castId: cast for cast, castId in castID.items()}

# Number of unique movies, then number of actors/actresses.
print(numOfMovie)
print(numOfCast)

In [None]:
## Parallel version (multiprocessing) of converting movie strings to IDs

# def IDlizeCast_movie(d, castSet):
#     for cast in castSet:
#         d[castID[cast]] = set(movieID[movie] for movie in cast_movie_mod[cast])


# totalWorkSize = numOfCast
# processSetSize = totalWorkSize // processNum
# castSets = [{} for i in range(processNum)]
# tempCastSet = castSet
# for i in range(processNum - 1):
#     castSets[i] = set(random.sample(tempCastSet, processSetSize))
#     tempCastSet = tempCastSet - castSets[i]
    
# castSets[processNum - 1] = tempCastSet

# manager = Manager()

# if __name__ == '__main__':
#     cast_moviesInId = manager.dict()
#     job = [Process(target=IDlizeCast_movie, args=(cast_moviesInId, castSet)) for castSet in castSets]
#     _ = [p.start() for p in job]
#     _ = [p.join() for p in job]


## Convert the cast -> movie-title mapping to integer IDs (castID -> set of movieIDs).
# Iterate cast_movie_mod directly: the ``castSets`` partition is only created by
# the commented-out multiprocessing variant, so looping over it raised NameError.
cast_moviesInId = {
    castID[cast]: {movieID[movie] for movie in movies}
    for cast, movies in cast_movie_mod.items()
}

In [None]:
## Parallel version (multiprocessing) of generating the edge list


# def castIntersect(d, fromIDSet):
#     for fromID in fromIDSet:
#         fromIDMovieSet = cast_moviesInId[fromID]
#         setLen = len(fromIDMovieSet)
#         tempToList = [[toID, len(fromIDMovieSet.intersection(cast_moviesInId[toID]))] for toID in IDtoCast.keys()]
#         castIntersect[fromID] = [[sublist[0], sublist[1]/setLen] for sublist in tempToList if sublist[1] != 0 and sublist[0] != fromID]
# start_time = time.time()   
# totalWorkSize = numOfCast
# processSetSize = totalWorkSize // processNum
# fromIDSets = [{} for i in range(processNum)]
# castIDSet = set(castId for castId in IDtoCast.keys())
# tempCastIDSet = castIDSet

# for i in range(processNum - 1):
#     fromIDSets[i] = set(random.sample(tempCastIDSet, processSetSize))
#     tempCastIDSet = tempCastIDSet - fromIDSets[i]
    
# fromIDSets[processNum - 1] = tempCastIDSet        

# cast_Intersection = manager.dict()
# job = [Process(target=castIntersect, args=(cast_Intersection, fromIDSet)) for fromIDSet in fromIDSets]
# _ = [p.start() for p in job]
# _ = [p.join() for p in job]


## Build the directed, weighted edge list between cast members.
# cast_Intersection[fromID] is a list of [toID, weight] pairs where
# weight = (movies shared by fromID and toID) / (movies of fromID).
#
# Inverted index movieID -> cast IDs, so each cast member is only compared
# with people they actually share a movie with instead of with everyone.
movie_castInId = {}
for castId, movieIds in cast_moviesInId.items():
    for movieId in movieIds:
        movie_castInId.setdefault(movieId, []).append(castId)

# Iterate cast_moviesInId directly: ``fromIDSets`` is only created by the
# commented-out multiprocessing variant, and results must be stored in
# cast_Intersection (``castIntersect`` is not defined anywhere).
cast_Intersection = {}
for fromID, fromIDMovieSet in cast_moviesInId.items():
    setLen = len(fromIDMovieSet)
    sharedCount = {}
    for movieId in fromIDMovieSet:
        for toID in movie_castInId[movieId]:
            if toID != fromID:  # no self-loops
                sharedCount[toID] = sharedCount.get(toID, 0) + 1
    # Sorted so every edge list is ordered by ascending toID.
    cast_Intersection[fromID] = [
        [toID, shared / setLen] for toID, shared in sorted(sharedCount.items())
    ]


print("--- %s seconds ---" % (time.time() - start_time))

In [None]:
# ## reverse the movieID to castID dict

# movie_castInId = {movieID : set() for movieID in IDtoMovie.keys()}

# for castID in IDtoCast.keys():
#     for movieID in cast_moviesInId[castID]:
#         movie_castInId[movieID] = movie_castInId[movieID] | set([castID])
        


In [None]:
# Write the edge list as "fromID,toID,weight" rows.
# newline='' hands line-ending control to the csv module; without it, Windows
# output gets an extra blank line after every row.
with open('eggs.csv', 'w', newline='') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',',
                            quotechar='"', quoting=csv.QUOTE_MINIMAL)
    for fromID, edges in cast_Intersection.items():
        spamwriter.writerows([fromID, toID, weight] for toID, weight in edges)



In [None]:
# Report the wall-clock time elapsed since start_time was set.
print("--- %s seconds ---" % (time.time() - start_time))