In [1]:
import requests as rq
import regex as re
import json

In [2]:

def collect_rss_movies(genres,countries):
    """Query the iTunes RSS "top-movies" feed for every country/genre pair.

    Parameters
    ----------
    genres : iterable
        Genre slugs substituted into the feed URL.
    countries : iterable
        Country codes substituted into the feed URL.

    Returns
    -------
    dict
        Maps each country to the concatenation, across all genres, of the
        ``['feed']['results']`` lists returned by the feed.

    A per-country count of collected entries is printed as a side effect.
    """
    feed_template = 'https://rss.itunes.apple.com/api/v1/%s/movies/top-movies/%s/200/explicit.json'
    movies_by_country = {}

    for country_code in countries:
        collected = []
        for genre_slug in genres:
            payload = rq.get(feed_template % (country_code, genre_slug)).json()
            collected.extend(payload['feed']['results'])
        movies_by_country[country_code] = collected

        print('%d movies indexed from %s' % (len(collected), country_code.upper()))

    return movies_by_country



In [130]:
# Collect the top-movie feed entries for two genres in each of five countries.
# The feed URL requests 200 entries per genre, so each country yields at most 400.

genres = ['action-and-adventure','documentary']
countries=['us','sg','my','id','au']
movies_apac_us=collect_rss_movies(genres,countries)


399 movies indexed from US
400 movies indexed from SG
399 movies indexed from MY
400 movies indexed from ID
400 movies indexed from AU


In [3]:
# Gets all movie info from itunes API by ID ## 

def get_info_by_id(movies_dict):
    """Fetch the iTunes lookup record(s) for every movie id in ``movies_dict``.

    Parameters
    ----------
    movies_dict : dict
        Maps a country code to a list of entries, each carrying an ``'id'`` key.

    Returns
    -------
    list
        One flat list holding the ``'results'`` of every lookup call, in
        country order and then in id order within each country.
    """
    lookup_template = 'https://itunes.apple.com/lookup?id=%s&country=%s'
    all_records = []

    for country_code in movies_dict.keys():
        # Collect the ids up front so a malformed entry fails before any request is sent.
        movie_ids = [entry['id'] for entry in movies_dict[country_code]]

        records_for_country = []
        for movie_id in movie_ids:
            reply = rq.get(lookup_template % (movie_id, country_code)).json()
            records_for_country.extend(reply['results'])

        all_records.extend(records_for_country)

    return all_records

    
    

In [132]:
# Look up the full iTunes record for every collected movie, then cache the
# combined list as JSON in the current working directory.
# NOTE(review): the cache-loading cell reads './data/apac_us_itunes_data.json',
# which is a different path from the file written here - confirm how the file gets there.

apac_us_results=get_info_by_id(movies_apac_us)

with open('data_apac_us_itunes.json', 'w') as outfile:
    json.dump(apac_us_results, outfile)


In [5]:
# Load the locally cached iTunes lookup results.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('./data/apac_us_itunes_data.json', 'r') as file:
    d=json.load(file)

apac_us_results=d
    

In [6]:
# Number of iTunes records loaded from the cache.
len(apac_us_results)

1998

In [7]:
def get_search_dat(data):
    """Build de-duplicated search terms from raw iTunes lookup records.

    Parameters
    ----------
    data : iterable of dict
        iTunes lookup records. A record whose ``'wrapperType'`` is ``'track'``
        must carry ``'trackName'``, ``'releaseDate'`` and ``'trackId'``. Every
        other record is treated as a collection bundle and skipped.

    Returns
    -------
    dict
        Four parallel lists under the keys ``'titles'``, ``'years'``,
        ``'search_strs'`` and ``'itunes_id'``. Only the first record for each
        distinct search string is kept.

    Two summary lines (bundles dropped, duplicates removed) are printed.
    """
    titles=[]
    years=[]
    itunes_id=[]
    dropped=0

    for record in data:
        if record.get('wrapperType')=='track':
            titles.append(record['trackName'])
            # releaseDate is sliced to its first four characters (the year).
            years.append(record['releaseDate'][:4])
            itunes_id.append(record['trackId'])
        else:
            # Drop all "bundle" type movies, eg. sequel bundles. Only the count
            # is reported, so no field of the bundle record is required.
            dropped+=1
    print( '%s collection bundles dropped from search' % str(dropped) )

    # Parsing - remove bracketed suffixes and characters not wanted in the search string.
    search_str=[]
    for title in titles:
        cleaned=re.sub(r' \([^)]*\)', '', title)
        # Keep word characters, whitespace, apostrophes and hyphens. The hyphen
        # sits last in the class so it is a literal under both `re` and `regex`
        # (the earlier `\d-'` form is an invalid range for the stdlib `re`).
        cleaned=re.sub(r"[^\w\s'-]", '', cleaned)
        # split()/join collapses whitespace runs of any length and trims the
        # ends, so removed punctuation can never leave '++' or a stray '+'.
        search_str.append('+'.join(cleaned.split()))

    # Keep the index of the first occurrence of every search string.
    seen=set()
    seen_ind=[]
    for i, term in enumerate(search_str):
        if term not in seen:
            seen.add(term)
            seen_ind.append(i)

    search_dat={
        'titles':[titles[i] for i in seen_ind],
        'years':[years[i] for i in seen_ind],
        'search_strs':[search_str[i] for i in seen_ind],
        'itunes_id':[itunes_id[i] for i in seen_ind]
    }
    print( '%s duplicates removed from search' % str(len(search_str)-len(seen_ind)))
    return search_dat

In [8]:
# Build the de-duplicated search terms (titles, years, search strings, iTunes ids).
search_dat=get_search_dat(apac_us_results)

43 collection bundles dropped from search
761 duplicates removed from search


In [23]:
# NOTE(review): hard-coded API credential stored in the notebook. Prefer reading it
# from the environment (e.g. os.environ) and rotating this key before sharing.
tmdb_api_key='0a1cfe6a5ac9d8391730241102367452'

def get_TMDB_ids(search_dat,api_key,specify_year=True):
    """Search TMDb for every entry of ``search_dat`` and record the matching id(s).

    One request per search string is sent to the TMDb movie-search endpoint,
    with the entry's year appended when ``specify_year`` is true. The value
    stored for each entry is:

    * a single id when a returned title equals the entry's title exactly, or
      when the search reports exactly one result;
    * the list of returned ids when the search reports more than one result;
    * the list of returned ids (expected to be empty) when it reports zero;
    * ``None`` when the HTTP status is not 200.

    The values are written to ``search_dat['TMdb_id']`` (the input dict is
    modified in place) and that same dict is returned. A summary of the four
    outcome counts is printed.
    """
    search_url="https://api.themoviedb.org/3/search/movie?api_key=%s&query=%s"
    found_ids=[]
    n_exact=n_multiple=n_none=n_bad=0

    for pos,query in enumerate(search_dat['search_strs']):
        request_url=search_url%(api_key,query)
        if specify_year==True:
            request_url+='&year='+str(search_dat['years'][pos])

        response=rq.get(request_url)

        # Record Bad Response #
        if response.status_code!=200:
            found_ids.append(None)
            n_bad+=1
            continue

        payload=response.json()
        hit_titles=[hit['title'] for hit in payload['results']]
        hit_ids=[hit['id'] for hit in payload['results']]
        wanted_title=search_dat['titles'][pos]

        if wanted_title in hit_titles:
            # Exact title match: take the id of the first result with that title.
            found_ids.append(hit_ids[hit_titles.index(wanted_title)])
            n_exact+=1
        elif payload['total_results']==1:
            # A single result is accepted as an exact hit.
            found_ids.append(payload['results'][0]['id'])
            n_exact+=1
        elif payload['total_results']>1:
            # Several candidates: keep all returned ids for later resolution.
            found_ids.append(hit_ids)
            n_multiple+=1
        elif payload['total_results']==0:
            # Nothing found: store the returned id list as the "no match" marker.
            found_ids.append(hit_ids)
            n_none+=1

    search_dat['TMdb_id']=found_ids

    print('Exact Matches = %d, Multiple_matches=%d, No_matches=%d, Bad Response=%d' %(n_exact,n_multiple,n_none,n_bad))
    return search_dat


In [24]:
# NOTE(review): repeats the hard-coded TMDb key assigned in the cell that defines
# get_TMDB_ids; the credential should come from the environment instead.
tmdb_api_key='0a1cfe6a5ac9d8391730241102367452'
# First pass: search with the release year as a filter (specify_year defaults to True).
# get_TMDB_ids adds 'TMdb_id' to search_dat in place and returns that same dict.
dat_with_TMdb_id=get_TMDB_ids(search_dat,tmdb_api_key)


Exact Matches = 866, Multiple_matches=44, No_matches=284, Bad Response=0


In [25]:
# Cache the first-pass TMdb search results as JSON under ./data so the
# search does not have to be repeated.

with open('./data/TMdb_search_results.json', 'w') as f:
    json.dump(dat_with_TMdb_id,f)


In [26]:
# Load the locally cached TMdb search results.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('./data/TMdb_search_results.json', 'r') as file:
    d=json.load(file)

dat_with_TMdb_id=d


In [28]:
## Partition the first-pass results by the shape of each 'TMdb_id' entry ##
# non-empty list -> multiple candidates, empty list -> no match, int -> exact match

mult_ind=[isinstance(entry,list) and len(entry)>0 for entry in dat_with_TMdb_id['TMdb_id']]
multiple_IDs_run1={
    key:[value for value,keep in zip(column,mult_ind) if keep]
    for key,column in dat_with_TMdb_id.items()
}

no_Ind=[isinstance(entry,list) and len(entry)==0 for entry in dat_with_TMdb_id['TMdb_id']]
no_matched_ids_run1={
    key:[value for value,keep in zip(column,no_Ind) if keep]
    for key,column in dat_with_TMdb_id.items()
}

exact_ind=[isinstance(entry,int) for entry in dat_with_TMdb_id['TMdb_id']]
exact_matches_run1={
    key:[value for value,keep in zip(column,exact_ind) if keep]
    for key,column in dat_with_TMdb_id.items()
}

print(
    '%.2f percent of queries returned exact matches for TMdb IDs'
    % float(len(exact_matches_run1['TMdb_id'])*100/len(dat_with_TMdb_id['TMdb_id']))
)




72.53 percent of queries returned exact matches for TMdb IDs


In [30]:
# Run 2: drop the year filter and search TMdb again for the entries that had no match.
# get_TMDB_ids overwrites 'TMdb_id' on the dict it is given and returns that same
# dict, so no_matches_results and no_matched_ids_run1 refer to one object after this call.

no_matches_results=get_TMDB_ids(no_matched_ids_run1,api_key=tmdb_api_key,specify_year=False)

Exact Matches = 147, Multiple_matches=14, No_matches=123, Bad Response=0


In [31]:
# Partition the second-pass results the same way as the first pass #
# non-empty list -> multiple candidates, empty list -> no match, int -> exact match

mult_ind=[isinstance(entry,list) and len(entry)>0 for entry in no_matches_results['TMdb_id']]
multiple_IDs_run2={
    key:[value for value,keep in zip(column,mult_ind) if keep]
    for key,column in no_matches_results.items()
}

no_Ind=[isinstance(entry,list) and len(entry)==0 for entry in no_matches_results['TMdb_id']]
no_matched_ids_run2={
    key:[value for value,keep in zip(column,no_Ind) if keep]
    for key,column in no_matches_results.items()
}

exact_ind=[isinstance(entry,int) for entry in no_matches_results['TMdb_id']]
exact_matches_run2={
    key:[value for value,keep in zip(column,exact_ind) if keep]
    for key,column in no_matches_results.items()
}

# Cumulative exact matches over both passes, relative to all first-pass queries.
num_exact_matches_run2=len(exact_matches_run1['TMdb_id'])+len(exact_matches_run2['TMdb_id'])
percent_exact_match=float(num_exact_matches_run2*100/len(dat_with_TMdb_id['TMdb_id']))

print('%.2f percent of queries returned exact matches for TMdb IDs' % percent_exact_match )


84.84 percent of queries returned exact matches for TMdb IDs


In [32]:
# Function to combine the data that is to be passed through search #

def combine_search_results(results):
    """Concatenate several column-oriented result dicts into one new dict.

    Parameters
    ----------
    results : iterable of dict
        Each dict maps a key to a list of values.

    Returns
    -------
    dict
        For every key seen in any input dict, the concatenation of that key's
        lists in input order. An empty ``results`` yields ``{}``. The input
        dicts and their lists are not modified.

    Note: if the inputs do not all share the same keys, the combined lists
    will have different lengths; callers relying on positional alignment
    should pass dicts with identical keys.
    """
    combined={}
    for d in results:
        for k,v in d.items():
            # setdefault creates the column on first sight, so neither an empty
            # input nor a key missing from the first dict raises.
            combined.setdefault(k,[]).extend(v)
    return combined

# Set up the search data for entries with multiple matches: merge the
# multiple-match subsets of both TMdb passes into one dict.

multi_matched_search_results =[multiple_IDs_run1,multiple_IDs_run2]

matched_google= combine_search_results(multi_matched_search_results)

print ('Need to Google for %d movies that return multiple TMdb hits and %d movies that returned no TMdb hits' %((len(matched_google['TMdb_id']),len(no_matched_ids_run2['TMdb_id']))))


Need to Google for 58 movies that return multiple TMdb hits and 123 movies that returned no TMdb hits


In [33]:
from bs4 import BeautifulSoup

def google_for_imdb_id(search_dat):
    """Look up an IMDb id for each entry by scraping the first Google result.

    For every entry, ``<search string>+<year>+imdb`` is sent to Google search.
    The first ``<h3 class="r">`` element of the returned HTML is scanned for
    the pattern ``title/<id>/&`` and the captured id is stored.

    Parameters
    ----------
    search_dat : dict
        Must hold parallel lists under ``'search_strs'`` and ``'years'``.

    Returns
    -------
    dict
        The same ``search_dat`` object, modified in place: ``'IMdb_id'`` is set
        to a list with one entry per search, holding either the captured id or
        the string ``'NaN'`` when no id could be extracted.

    A summary of successful and unsuccessful lookups is printed.
    """
    IMDB_ids_list=[]
    successful_hits=0
    no_hits=0

    for search_str,year in zip(search_dat['search_strs'],search_dat['years']):

        # Google 'Movie Name' + 'imdb' using BS #
        r=rq.get('https://www.google.com/search?q=%s+%s+imdb'% (search_str,year))
        p=BeautifulSoup(r.text,'html.parser')

        # Take the first search result heading, if the page has one. A page
        # without any matching heading must count as "no hit" rather than
        # abort the whole run with an IndexError.
        headings=p.find_all('h3', {'class':'r'})
        m=re.search('title/(.+?)/&',str(headings[0])) if headings else None

        # If successful, store the IMdb id; if not, store a 'NaN' marker #
        if m:
            IMDB_ids_list.append(m.group(1))
            successful_hits+=1
        else:
            IMDB_ids_list.append('NaN')
            no_hits+=1

    print('Successful IMDB_ids_indexed = ' + str(successful_hits) + ' No results = '+ str(no_hits))

    # Store in the original dictionary with key 'IMdb_id' #
    search_dat['IMdb_id']=IMDB_ids_list

    return search_dat
    

In [34]:
# Google for the entries that TMdb could not match in either pass.
# google_for_imdb_id adds 'IMdb_id' to no_matched_ids_run2 in place and returns it.

google_results_nomatch=google_for_imdb_id(no_matched_ids_run2)

Successful IMDB_ids_indexed = 89 No results = 34


In [37]:
# Google for the entries where TMdb returned several candidate ids.
# google_for_imdb_id adds 'IMdb_id' to matched_google in place and returns it.

google_results_multi_match=google_for_imdb_id(matched_google)

Successful IMDB_ids_indexed = 57 No results = 1


In [38]:
# Function for getting IMdb ID from TMdb API # 

def get_imdb_id_from_tmdb_id(search_dat,api_key):
    """Fetch the TMDb movie-details record for every id in ``search_dat['TMdb_id']``.

    Parameters
    ----------
    search_dat : dict
        Must hold a list of TMDb ids under ``'TMdb_id'``.
    api_key : str
        TMDb API key inserted into the request URL.

    Returns
    -------
    list of dict
        Exactly one entry per input id, in input order. A successful lookup
        contributes the decoded JSON record. A non-200 response contributes the
        placeholder ``{'id': <tmdb id>, 'imdb_id': ''}`` so that the list stays
        positionally aligned with the columns of ``search_dat``.

    'Bad Response' is printed for each failed lookup, followed by a total.
    """
    results=[]
    bad_response=0

    for tmdb_id in search_dat['TMdb_id']:
        r = rq.get('https://api.themoviedb.org/3/movie/%s?api_key=%s' % (str(tmdb_id),api_key))

        if r.status_code== 200:
            results.append(r.json())
        else:
            print('Bad Response')
            bad_response+=1
            # Without a placeholder every later record would shift up by one
            # and be attributed to the wrong movie.
            results.append({'id': tmdb_id, 'imdb_id': ''})

    print('Number of Bad Responses = ' + str(bad_response))
    return results


In [39]:
# Merge the exact matches of both TMdb passes, then fetch the TMdb
# movie-details record for each one (the IMdb id is read from it later).

exact_matches_results=combine_search_results([exact_matches_run1,exact_matches_run2])

exact_results=get_imdb_id_from_tmdb_id(exact_matches_results,tmdb_api_key)

Number of Bad Responses = 0


In [42]:
# Cache the TMdb movie-details records as JSON in the current working directory #

with open('data_tmdb_exact_matches.json', 'w') as outfile:
    json.dump(exact_results, outfile)
    

In [43]:
# Load the locally cached TMdb movie-details records.
# The `with` block closes the file on exit, so no explicit close() is needed.
with open('data_tmdb_exact_matches.json', 'r') as file:
    d=json.load(file)

exact_results=d
    

In [44]:
# Store the IMdb ids taken from the TMdb movie-details records #
# .get() yields None when a record carries no 'imdb_id' field at all.

imdb_ids=[a.get('imdb_id') for a in exact_results]

exact_matches_results['IMdb_id']=imdb_ids

# Check for unsuccessful queries for IMdb_id #
# Any falsy value counts as missing, so a null id (None after JSON decoding)
# is flagged as well as an empty string.

missed_ind=[not a for a in exact_matches_results['IMdb_id']]

print('%d missing IMdb IDs from querying TMdb database' %sum(missed_ind) )

26 missing IMdb IDs from querying TMdb database


In [45]:
# Split the exact-match records on whether an IMdb id was obtained #

missed_tmdb_ids={
    key:[value for value,is_missed in zip(column,missed_ind) if is_missed]
    for key,column in exact_matches_results.items()
}
leftover_ind=[not flag for flag in missed_ind]
leftover_exact_matches={
    key:[value for value,is_kept in zip(column,leftover_ind) if is_kept]
    for key,column in exact_matches_results.items()
}

In [None]:
# Google for the IMdb ids that the TMdb lookup left empty.
# google_for_imdb_id overwrites 'IMdb_id' on missed_tmdb_ids in place and returns it.
# NOTE(review): this cell has no execution count in the saved notebook, yet the
# merge cell that follows needs google_missed_tmdb_results - run it before merging.

google_missed_tmdb_results=google_for_imdb_id(missed_tmdb_ids)

In [47]:
# Merge the four subsets that now carry an 'IMdb_id' column back into one
# dict, to be passed through the OMdb API.

to_merge=[google_missed_tmdb_results,leftover_exact_matches,google_results_multi_match,google_results_nomatch]

OMdb_search_dat=combine_search_results(to_merge)

In [49]:

def get_data_from_omdb(search_dat,api_key,api_key2):
    """Fetch the OMdb record for every IMdb id in ``search_dat['IMdb_id']``.

    Parameters
    ----------
    search_dat : dict
        Must hold a list of IMdb ids under ``'IMdb_id'``; ``None`` and the
        string ``'NaN'`` mark entries without an id.
    api_key : str
        Primary OMdb API key.
    api_key2 : str
        Fallback key, tried when the request with ``api_key`` does not return
        HTTP 200 (e.g. in case the rate limit is reached).

    Returns
    -------
    list
        Exactly one entry per input id, in input order: the decoded JSON record
        on success, or an empty list when the id is missing or both requests
        fail. Keeping a placeholder for failures preserves positional
        alignment with the columns of ``search_dat``.

    'Bad Response for <id>' is printed when both keys fail.
    """
    # NOTE(review): the API key travels in a plain-http URL - confirm whether
    # the service can be called over https instead.
    url='http://www.omdbapi.com/?apikey=%s&i=%s'
    results=[]

    for imdb_id in search_dat['IMdb_id']:
        if imdb_id is None or imdb_id=='NaN':
            results.append([])
            continue

        record=[]
        for key in (api_key,api_key2):
            r=rq.get(url % (key, imdb_id))
            if r.status_code==200:
                record=r.json()
                break
        else:
            # Neither key produced a 200 response.
            print('Bad Response for ' + str(imdb_id))
        results.append(record)

    return results

In [50]:
# NOTE(review): hard-coded API credentials stored in the notebook. Prefer reading
# them from the environment (e.g. os.environ) and rotating these keys.
omdb_api_key='cd67fc6d'
omdb_api_key2='89ae04fd'
# Query OMdb by IMdb id; the second key is the fallback used when the first fails.
OMdb_results=get_data_from_omdb(OMdb_search_dat,omdb_api_key,omdb_api_key2)


In [51]:
# Check for bad results from OMdb database #
# Entries of length 2 are counted as failed lookups - presumably OMdb's error
# payload has exactly two fields; TODO confirm against a sample response.
# Note: missed_ind holds integer positions here, unlike the boolean mask of the
# same name built for the TMdb results earlier in the notebook.

missed_ind=[i for i,a in enumerate(OMdb_results) if len(a)==2]

print('%d IMdb IDs returned no results' %len(missed_ind))

12 IMdb IDs returned no results


In [52]:
# Cache the OMdb results as JSON under ./data #

with open('./data/OMdb_data.json', 'w') as outfile:
    json.dump(OMdb_results,outfile)