# IMDB Person Bio 

In [None]:

import os
import sys
import time
import logging
import numpy as np
import pandas as pd
from imdb import IMDb
import json
from datetime import datetime
# Create the shared IMDb client. The person lookups in the cells below
# (ia.search_person / ia.get_person) all go through this instance.
ia = IMDb()


In [None]:
# Load the tab-separated actor list; the first row is the header and every
# column is read as text.
topActors = pd.read_csv(
    './top100Actors.txt',
    dtype=str,
    header=0,
    sep='\t',
)
print(type(topActors))
topActors.head(15)

In [None]:
# Reduce the actor table to [Rank, Full Name] pairs.
# An explicit copy is taken so that adding the 'Full Name' column does not
# write into a slice of topActors (avoids pandas' chained-assignment warning).
actorsDF = topActors[['Rank', 'First Name', 'Last Name']].copy()
actorsDF['Full Name'] = actorsDF['First Name'] + ' ' + actorsDF['Last Name']
actorsDF = actorsDF.drop(['First Name', 'Last Name'], axis=1)
# Each entry of `actors` is a two-item list: [Rank, Full Name].
actors = actorsDF.values.tolist()

# Search by actor and return biographical data

In [None]:
def imdbSearchPerson(match_actor):
    """Return the IMDb ID of the person whose name equals *match_actor*.

    Runs ``ia.search_person(match_actor)`` and returns ``getID()`` of the
    first result whose ``'name'`` is exactly equal to *match_actor*.

    Returns ``None`` when no result matches exactly. In that case the first
    search result (if any) is printed once as a hint of what was found.
    """
    persons = ia.search_person(match_actor)
    for person in persons:
        if person['name'] == match_actor:
            return person.getID()

    # No exact match: report the top candidate a single time instead of once
    # per non-matching result.
    if persons:
        print(persons[0])
    return None

In [None]:
def imdbPersonBio(actorID):
    """Return the biography fields of an IMDb person as a plain dict.

    Fetches the person *actorID* through ``ia.get_person`` with only the
    ``'biography'`` info set, then copies every key listed under
    ``infoset2keys['biography']`` from the person's ``data`` mapping.

    Keys that are listed but missing from ``data`` are reported on stdout and
    skipped; they do not abort the call.
    """
    personBio = ia.get_person(actorID, info=['biography'])
    personBioDic = {}

    for key in personBio.infoset2keys['biography']:
        try:
            # TODO : clean out special IMDB chars
            personBioDic[key] = personBio.data[key]
        except KeyError:
            # Only a missing key is expected here; anything else should
            # surface to the caller rather than be silently swallowed.
            print('Error adding key :', key)

    return personBioDic


In [None]:
# For every [rank, full name] pair: look the actor up on IMDb, fetch the
# biography and write it to ./data/person/imdb/imdb_p_<rank>.json.
# A failure for one actor is reported and the loop moves on to the next.
for actor in actors:
    # Unpack outside the try block so the error message below can always
    # refer to the actor being processed.
    _actor_id = actor[0]
    actorFullName = actor[1]
    try:
        actorID = imdbSearchPerson(actorFullName)
        if actorID is None:
            # No exact name match on IMDb: nothing to fetch for this actor.
            print('Error', _actor_id)
            continue

        personBio = imdbPersonBio(actorID)
        personBio.update({
            '_actorID': _actor_id,
            '_source': 'IMDB',
            'actorID': actorID,
            'created_utc_dt': datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
        })

        # Serialise first so a non-serialisable value does not leave an
        # empty or truncated file behind.
        serialized = json.dumps(personBio)
        with open('./data/person/imdb/imdb_p_' + _actor_id + '.json', 'w+') as f:
            f.write(serialized)
    except Exception:
        # Best effort: keep going with the remaining actors. Unlike a bare
        # `except:`, this still lets Ctrl-C interrupt the run.
        print('Error', _actor_id)

# Retrieve Movie info from IMDB 

In [None]:
import os
import sys
import time
from datetime import datetime
import urllib.parse
import http.client
import json
import logging
# Named logger used by the movie extraction loop to report per-movie failures;
# its threshold is set to INFO.
logger = logging.getLogger('movieInfo')
logger.setLevel(logging.INFO)

In [None]:
# Read every *.json file in the TheMovieDB cast folder (all values as text)
# and stack the results into a single frame with a fresh index.
path_to_json = './data/cast/themoviedb/'
json_files = [
    path_to_json + file_name
    for file_name in os.listdir(path_to_json)
    if file_name.endswith('.json')
]
len(json_files)
df = pd.concat(
    [pd.read_json(json_path, dtype=str) for json_path in json_files],
    ignore_index=True,
)
df

In [None]:
# Build the list of distinct (movieID, movieTitle) pairs found in the cast data.
# drop_duplicates() keeps the first occurrence of each pair; the previous
# keep=False removed *every* movie that appeared more than once, and
# inplace=True operated on a slice of df.
movieDF = df[['movieID', 'movieTitle']].drop_duplicates()
movieDF.head()

movieDF.to_csv('./data/movies/movie_list.tsv', sep='\t')

In [None]:
# Write the movie list as a tab-separated file (same target path as the
# export at the end of the previous cell).
movieDF.to_csv('./data/movies/movie_list.tsv', sep='\t')

In [None]:
from imdb import IMDb
# Instantiate the IMDb client, then print the info sets it reports for
# movies, people and companies (in that order).
ia = IMDb()
for list_infosets in (
    ia.get_movie_infoset,
    ia.get_person_infoset,
    ia.get_company_infoset,
):
    print(list_infosets())


# Extract IMDB Movie metadata 

In [None]:
# Attribute groups used to split an IMDb movie record into plain metadata,
# people and companies.
main_attributes = [
    'genres', 'runtimes', 'countries', 'country codes', 'language codes',
    'color info', 'aspect ratio', 'sound mix', 'certificates',
    'original air date', 'rating', 'votes', 'cover url', 'plot outline',
    'languages', 'title', 'year', 'kind',
]
person_attributes = ['cast', 'directors', 'writers', 'producers', 'composers']
company_atttributes = ['production companies', 'distributors', 'special effects']
object_attributes = set(person_attributes + company_atttributes)
person_nodes = []
company_nodes = []
person_edges = []
company_edges = []
user_reviews = []
from collections import defaultdict
# Running collections over ALL processed movies: attribute -> unique names.
movie_person = defaultdict(list)
movie_companies = defaultdict(list)


def _names_by_attribute(movie, available_keys, attributes):
    """Map each of *attributes* present in *available_keys* to the sorted,
    de-duplicated ``'name'`` values of the entries in ``movie.data``."""
    names = {}
    for attribute in set(available_keys).intersection(attributes):
        found = set()
        for entry in movie.data[attribute]:
            try:
                found.add(entry['name'])
            except (KeyError, TypeError):
                # Entry without a usable name: skip it, keep the rest.
                continue
        names[attribute] = sorted(found)
    return names


# For every movie in movieDF: take the first IMDb search hit for its title,
# collect its metadata and write ./data/movies/imdb/imdb_m_<movieID>.json.
# A failure for one movie is logged and the loop continues with the next.
for index, row in movieDF.iterrows():
    movieInfo = {}
    movieID = row['movieID']
    movieTitle = row['movieTitle']
    try:
        # The search is inside the try block so that a failed lookup only
        # skips this movie instead of stopping the whole run.
        movies = ia.search_movie(movieTitle)
        if not movies:
            continue

        IMDB_movieID = movies[0].movieID
        movie = ia.get_movie(IMDB_movieID)
        movieInfo.update({
            '_id': movieID,
            'source': 'IMDB',
            'created_utc_dt': datetime.utcnow().strftime("%m/%d/%Y, %H:%M:%S"),
        })

        # Keys delivered by the 'main' info set, minus the person/company
        # attributes, which are handled separately below.
        movie_keys = set(movie.infoset2keys['main'])
        matched_keys = movie_keys.difference(object_attributes)

        for key in matched_keys:
            value = movie.data[key]
            # Lists whose items are imdb.* objects are skipped; an empty list
            # is stored as-is instead of raising IndexError.
            if isinstance(value, list) and value and 'imdb.' in str(type(value[0])):
                continue
            movieInfo[key] = value

        # Names for THIS movie only, so one movie's file no longer carries
        # the people and companies of every movie processed before it.
        companies = _names_by_attribute(movie, movie_keys, company_atttributes)
        cast = _names_by_attribute(movie, movie_keys, person_attributes)
        movieInfo['cast'] = cast
        movieInfo['companies'] = companies

        # Keep the cross-movie collections up to date as before.
        for attribute, names in companies.items():
            movie_companies[attribute] = list(set(movie_companies[attribute]).union(names))
        for attribute, names in cast.items():
            movie_person[attribute] = list(set(movie_person[attribute]).union(names))

        # Vote details must be requested with the IMDb ID; movieID is the
        # identifier taken from the cast data, not an IMDb ID.
        movieInfo['votes'] = ia.get_movie(IMDB_movieID, info=['vote details']).data

        # Serialise before opening the file so a failure does not leave an
        # empty JSON file behind.
        serialized = json.dumps(movieInfo)
        with open('./data/movies/imdb/imdb_m_' + str(movieID) + '.json', 'w+') as movie_file:
            movie_file.write(serialized)

    except Exception as e:
        logger.error('ERROR processing movie %s: %s', movieID, e)
