In [2]:
import pandas as pd
import numpy as np
import sys
import os
import time
import pickle as pkl
import codecs
import re

from amazon.api import AmazonAPI
from imdb import IMDb
from importlib import reload
import omdb

from IPython.display import clear_output

% matplotlib inline
pd.options.display.max_columns = 30

# 02A - Web Scraping
Factorization machines (FM's) are a newer class of models that are able to consider additional information beyond what is already present in a ratings matrix when predicting ratings for new user-item pairs. Because the ratings data from Amazon contains very little additional information about each item, we need to manually collect relevant data on the movies and TV shows present within the dataset. We look towards three sources for such data:
1. Amazon
2. OMDb (Open Movie Database)
3. IMDb (Internet Movie Database)


In [3]:
# Directories holding the ratings sample (data) and the scraped data sets (data-2).
data_path_1 = os.path.join('..', '..', 'data')
data_path_2 = os.path.join('..', '..', 'data-2')

# Read the ratings sample, discard the saved index and review-time columns,
# then give the remaining columns short names.
df = (
    pd.read_csv(os.path.join(data_path_1, 'reviews_sample_100.csv'))
    .drop(columns=['Unnamed: 0', 'reviewTime'])
)
df.columns = ['item', 'user', 'rating']

# Every column except the rating.
X_user_item = df.drop(columns='rating')

# Amazon

Because the original dataset provides only the ASIN (Amazon Standard Identification Number) for each item, by necessity, we must first query Amazon's database to match each ASIN to a product title. Fortunately, a large wealth of additional attributes, such as actors, studio, release date, etc. can be collected at the same time. This data, while perhaps not as reliable in quality as that of IMDB's or OMDB's, has the advantage of a virtually guaranteed accurate matching to the ASIN, an advantage not available when collecting from IMDB or OMDB as there exists no shared system of movie identification between the latter two sources and Amazon.

In [4]:
# Amazon Product Advertising API credentials are read from the environment so
# that no secret is stored in the notebook. Any key pair that was previously
# committed in this cell should be treated as exposed and revoked.
aws_key = os.environ.get('AWS_ACCESS_KEY_ID')
aws_secret_key = os.environ.get('AWS_SECRET_ACCESS_KEY')
aws_associate_tag = os.environ.get('AWS_ASSOCIATE_TAG')

# Only build the client when all three values are present; `amazon` is left as
# None otherwise so the notebook can still run against pre-scraped data.
if aws_key and aws_secret_key and aws_associate_tag:
    amazon = AmazonAPI(aws_key=aws_key, aws_secret=aws_secret_key,
                       aws_associate_tag=aws_associate_tag, region='US')
else:
    amazon = None
    print('Amazon credentials not found in the environment '
          '(AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_ASSOCIATE_TAG); '
          'live Amazon lookups are unavailable.')

In [5]:
# Preview five randomly sampled rows of the ratings frame (no random_state is
# set, so the rows differ between runs).
df.sample(5)

Unnamed: 0,item,user,rating
302047,B00008NFR4,ABO2ZI2Y5DQ9T,4.0
771,0790745399,A2A7CI7OWBOI3A,5.0
332657,B00005AFQ9,A2C5LA07JCDWTN,5.0
428302,B002N5N4M6,ANW7ZQSOJQ2A7,5.0
82573,B00003CYLJ,AD0J5KK4WQXNS,3.0


Below, we simply collect all relevant data for every ASIN in the dataset. Any ASIN that continues to return errors after 100 requests is considered failed. Fortunately, only 20 out of the 7000+ ASIN's meet this condition.

In [6]:
# set to True to use previously scraped data
use_prescraped = True

# collecting data for 7000 ASIN's requires approximately 3 hours, so the results were saved as a CSV
if use_prescraped:
    amazon_df = pd.read_csv(os.path.join(data_path_2, 'amazon_data.csv'))

else:
    # number of failed requests tolerated per ASIN before it is recorded as failed
    max_attempts = 100

    # product attributes to extract, in the column order of the saved CSV
    amazon_columns = ['title', 'asin', 'actors', 'binding', 'brand', 'directors', 'ean', 'edition',
                      'formatted_price', 'genre', 'isbn', 'label', 'manufacturer', 'price_and_currency',
                      'product_group', 'product_type_name', 'publication_date', 'publisher', 'region',
                      'release_date', 'running_time', 'sales_rank', 'studio', 'upc']

    # a stack of ASIN's to be attempted
    to_scrape = list(df['item'].unique())

    # collects successfully received item data
    product_list = list()

    # collects ASIN's that failed
    fail_list = list()

    try_counter = 0
    while len(to_scrape):
        asin = to_scrape[-1]

        # give up on an ASIN once it has used all of its attempts
        if try_counter >= max_attempts:
            fail_list.append(asin)
            print('failed:', asin)
            to_scrape.pop()
            try_counter = 0
            continue

        try:
            product_list.append(amazon.lookup(ItemId=asin))
        # `Exception` (not a bare except) so the long-running loop can still be
        # stopped with a keyboard interrupt
        except Exception:
            print('error:', asin, '| items remaining:', len(to_scrape))
            try_counter += 1
        else:
            print(asin)
            to_scrape.pop()
            try_counter = 0

    # data is collected in the form of objects with information attributes;
    # read each attribute listed in amazon_columns into one row per product
    amazon_data = [[getattr(product, column) for column in amazon_columns]
                   for product in product_list]

    # convert data to tabular format and save it ONCE, after every row has been
    # built (passing `columns` also keeps an empty result well-formed)
    amazon_df = pd.DataFrame(amazon_data, columns=amazon_columns)
    amazon_df.to_csv(os.path.join(data_path_2, 'amazon_data.csv'), header=True, index=False, encoding='utf-8')

In [7]:
# Preview five randomly sampled rows of the Amazon product data.
amazon_df.sample(5)

Unnamed: 0,title,asin,actors,binding,brand,directors,ean,edition,formatted_price,genre,isbn,label,manufacturer,price_and_currency,product_group,product_type_name,publication_date,publisher,region,release_date,running_time,sales_rank,studio,upc
5675,House Of The Dead,B0000YEE6C,"[Jonathan Cherry, Tyron Leitso, Clint Howard, ...",DVD,Lions Gate,[Uwe Boll],12236150000.0,,$2.95,,,Lions Gate,Lions Gate,"(7.99, USD)",DVD,ABIS_DVD,2004-01-01,Lions Gate,US,2004-01-27,90.0,42672.0,Lions Gate,12236150000.0
816,Down in the Valley,B000GDH9NM,"[Edward Norton, Evan Rachel Wood, David Morse,...",DVD,Velocity / Thinkfilm,[David Jacobson],821575500000.0,,$7.05,,,Velocity / Thinkfilm,Velocity / Thinkfilm,"(17.98, USD)",DVD,ABIS_DVD,2006-09-01,Velocity / Thinkfilm,US,2006-09-26,114.0,38799.0,Velocity / Thinkfilm,821575500000.0
292,Exodus (Widescreen) [VHS],6302453224,"[Paul Newman, Eva Marie Saint, Ralph Richardso...",VHS Tape,,[Otto Preminger],9786302000000.0,,$7.94,,792829018.0,MGM (Video & DVD),MGM (Video & DVD),"(7.94, USD)",Video,ABIS_VIDEO,,MGM (Video & DVD),US,1995-02-08,208.0,441264.0,MGM (Video & DVD),27616540000.0
4861,Serpico [VHS],6300216543,"[Al Pacino, John Randolph, Jack Kehoe, Biff Mc...",VHS Tape,,[Sidney Lumet],9780792000000.0,,$3.00,,6300216543.0,Paramount,Paramount,"(3, USD)",Video,ABIS_DVD,,Paramount,US,1998-01-01,130.0,250328.0,Paramount,97360870000.0
4316,The Tarzan Collection Starring Johnny Weissmul...,B0001NBLYA,[Johnny Weissmuller],DVD,WEA,[],9780791000000.0,,$199.99,Action & Adventure,790791269.0,WarnerBrothers,WarnerBrothers,"(199.99, USD)",DVD,ABIS_DVD,2005-11-15,WarnerBrothers,US,2005-11-15,527.0,22768.0,WarnerBrothers,12569600000.0


## Extending data collection to IMDB and OMDB

Unfortunately, even though Amazon provides UPC, ISBN, and EAN codes for each item, IMDB and OMDB do not support querying for such ID's. Furthermore, Amazon's API does not support extracting IMDB ID's. Thus, we must make do with the item titles that have been collected as a means to gather additional data from IMDB and OMDB. Because IMDB and OMDB both use IMDB ID's to uniquely index movies, matching ASIN's to one set of data solves the matching problem for the remaining set.

IMDB's API offers a convenient search functionality that can return mostly accurate results for movie titles regardless of format (ex. *The Matrix* vs. *Matrix, The* vs. *The Matrix (1999)*). However, the downside is that because IMDB often returns multiple results, the first result supplied may sometimes not be the correct one. OMDB on the other hand has stricter requirements for how a title is queried, but includes the option of specifying a release year as well. This suggests that while fewer queries will be made successfully with OMDB, each individual query is more precise. Thus, we query OMDB first using the item title (and possibly year), and if no match is found, we search IMDB using the item title, accept the first result, and use said result to extract an IMDB ID with which to re-query OMDB.

### Formatting Amazon product titles
Amazon product titles contain much extraneous information within parentheticals and square brackets. To successfully use these titles for searching IMDB we need to remove all such text except for the film or TV series' release year. The release year in the title indicates when the film was released while the release year collected as an attribute indicates when the DVD or VHS was made available. Thus, the title release year is important for searching IMDB, especially when films with the same title have been released over time.

In [8]:
def clean_title_with_year(title):
    """Strip parenthesised and bracketed text from a title, keeping the year.

    Every "(...)" group is removed unless it contains a four-digit number in
    parentheses such as "(1999)"; every "[...]" group is removed. Leading and
    trailing whitespace is then stripped (whitespace inside the title is left
    untouched).

    Parameters
    ----------
    title : str
        Product title, e.g. "The Matrix (1999) (Widescreen) [VHS]".

    Returns
    -------
    str
        Cleaned title, e.g. "The Matrix (1999)".
    """
    # Raw strings are used for the patterns: "\(" and "\[" are not valid escape
    # sequences in ordinary string literals.
    find_parans = re.compile(r".*?(\(.*?\))")
    year_parans = re.compile(r"\(\d{4}\)")
    find_brackets = re.compile(r".*?(\[.*?\])")

    # remove all parentheses except those containing a year
    for match in find_parans.findall(title):
        if not year_parans.search(match):
            title = title.replace(match, '')

    # remove all brackets
    for match in find_brackets.findall(title):
        title = title.replace(match, '')

    # remove all white space padding
    return title.strip()

In [9]:
# Add a search-friendly title that keeps only the release year in parentheses,
# then preview ten randomly sampled results.
amazon_df['title_with_year'] = amazon_df.title.apply(clean_title_with_year)
amazon_df['title_with_year'].sample(n=10)

7252                      The Godfather
3065                      Frankenweenie
4226                           Timeline
3144                         Crossroads
2225                      Wayne's World
1976                      Chopping Mall
6928                           The Firm
3888    Me and You and Everyone We Know
7232               Your Sister's Sister
1835       Guess Who's Coming to Dinner
Name: title_with_year, dtype: object

While querying with a title that includes the release year in parentheses is crucial for IMDB, searching OMDB requires only the title without the year included. Release year information must be supplied separately to OMDB's interface, so the functions below generate additional features: 1) title without year and 2) release year.

In [10]:
def clean_title_without_year(title):
    """Strip ALL parenthesised and bracketed text from a title, year included.

    Every "(...)" group and every "[...]" group is removed, then leading and
    trailing whitespace is stripped (whitespace inside the title is left
    untouched).

    Parameters
    ----------
    title : str
        Product title, e.g. "The Matrix (1999) (Widescreen) [VHS]".

    Returns
    -------
    str
        Cleaned title, e.g. "The Matrix".
    """
    # Raw strings are used for the patterns: "\(" and "\[" are not valid escape
    # sequences in ordinary string literals.
    find_parans = re.compile(r".*?(\(.*?\))")
    find_brackets = re.compile(r".*?(\[.*?\])")

    # remove all parentheses
    for match in find_parans.findall(title):
        title = title.replace(match, '')

    # remove all brackets
    for match in find_brackets.findall(title):
        title = title.replace(match, '')

    # remove all white space padding
    return title.strip()

In [130]:
def extract_title_year(title):
    """Return the first four-digit number found inside parentheses in a title.

    Parameters
    ----------
    title : str
        Product title, e.g. "The Matrix (1999) [VHS]".

    Returns
    -------
    int or float
        The year as an int (e.g. 1999), or ``np.nan`` when the title contains
        no "(dddd)" group.
    """
    # Raw string: "\(" and "\d" are not valid escape sequences in an ordinary
    # string literal.
    year_match = re.search(r"\((\d{4})\)", title)
    if year_match:
        return int(year_match.group(1))
    # return NAN if product title does not contain release year
    return np.nan

In [133]:
# Derive the two OMDB query inputs from the raw Amazon title: the title with
# all parentheticals removed, and the release year parsed from the title.
amazon_df['title_without_year'] = amazon_df.title.apply(clean_title_without_year)
amazon_df['title_year'] = amazon_df.title.apply(extract_title_year)

# Preview the three derived title columns side by side.
amazon_df.loc[:, ['title_with_year', 'title_without_year', 'title_year']].sample(n=5)

Unnamed: 0,title_with_year,title_without_year,title_year
4229,Dracula:Dead and Loving It,Dracula:Dead and Loving It,
4173,Riddick Trilogy,Riddick Trilogy,
3411,Face/Off,Face/Off,
1924,Happythankyoumoreplease,Happythankyoumoreplease,
4773,A.I. Artificial Intelligence,A.I. Artificial Intelligence,


Fix a year mis-labelled by Amazon.

In [13]:
# Hard-coded correction: set title_year to 2014 for the row with index label 1.
# NOTE(review): this depends on the row order of amazon_data.csv -- confirm
# that label 1 still refers to the intended product after any re-scrape.
amazon_df.at[1, 'title_year'] = 2014

# OMDB

OMDB's API returns dictionary-like objects, which we store in their entirety before later extracting data from them. We query OMDB with a clean title that does not include the release year, and provide the release year as a separate parameter if it is available. If the query to OMDB fails, then we search IMDB for a match. If such a match is found, we use the match's IMDB ID to query OMDB.

In [14]:
# The OMDB API key is read from the OMDB_API_KEY environment variable so that it
# is not stored in the notebook.
oa = omdb.Client(apikey=os.environ.get('OMDB_API_KEY'))
ia = IMDb()

In [15]:
def collect_from_omdb(df, key):
    """Query OMDB for every row of ``df``, falling back to an IMDB title search.

    For each row OMDB is queried with ``title_without_year`` (plus
    ``title_year`` when it is not NaN). If that returns an empty result, IMDB
    is searched with ``title_with_year`` and the first hit's ID is used to
    query OMDB again.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'asin', 'title_without_year', 'title_year'
        and 'title_with_year'.
    key : str
        OMDB API key passed to ``omdb.Client``.

    Returns
    -------
    item_dict : dict
        Maps each ASIN to the OMDB response, or to None when neither OMDB nor
        the IMDB search produced a match.
    fail_list : list of tuple
        ``(asin, title_without_year, title_year, title_with_year)`` for every
        row whose queries raised an exception, to be retried by the caller.
    """
    oa = omdb.Client(apikey=key)
    ia = IMDb()

    # a dictionary of OMDB movie data using ASIN's as keys
    item_dict = dict()

    # a list of data for each product unsuccessfully queried
    fail_list = list()

    rows = zip(df['asin'], df['title_without_year'], df['title_year'], df['title_with_year'])
    for asin, title, year, title_with_year in rows:
        try:
            # query with title and release year if the year is available,
            # otherwise with the title only
            if not np.isnan(year):
                item = oa.get(title=title, year=int(year))
            else:
                item = oa.get(title=title)
            item_dict[asin] = item

            if item:
                print('found omdb match')
            else:
                # OMDB returned nothing: search IMDB using "title with year"
                print('no omdb match, trying imdb')
                imdb_matches = ia.search_movie(title_with_year)

                # if IMDB matches found, retrieve IMDB ID of the first match and use ID to re-query OMDB
                if imdb_matches:
                    print('found imdb match')
                    imdb_id = 'tt' + imdb_matches[0].getID()
                    item_dict[asin] = oa.get(imdbid=imdb_id)
                else:
                    item_dict[asin] = None
                    print('no imdb match')

        # any query error puts the row on fail_list to be attempted later;
        # `Exception` (not a bare except) keeps the loop interruptible
        except Exception:
            print('failed:', title)
            fail_list.append((asin, title, year, title_with_year))

        # wait=True defers the clear until the next message is printed, so the
        # latest status stays visible instead of being erased immediately
        clear_output(wait=True)

    return item_dict, fail_list

While querying OMDB's API is a smooth and error-free process, IMDB frequently returns random exceptions, so all products that fail the initial query do so due to IMDB errors. We therefore re-query IMDB for the failed items, and save all data as a dictionary of OMDB data objects keyed by ASIN.

In [16]:
use_prescraped = True

if use_prescraped:
    # NOTE: pickle.load can execute arbitrary code -- only load a file that was
    # produced by this notebook or comes from a trusted source.
    with open(os.path.join(data_path_2, 'omdb_items.list'), 'rb') as file_in:
        omdb_items = pkl.load(file_in)

else:
    # initial queries to OMDB also return a list of failed items; the API key
    # is read from the environment rather than stored in the notebook
    omdb_items, omdb_fail = collect_from_omdb(amazon_df, key=os.environ['OMDB_API_KEY'])

    # number of failed retries tolerated per item, so one permanently failing
    # title cannot keep the loop running forever
    max_attempts = 100
    attempts = 0

    # retry the IMDB search for each failed item until a query succeeds
    item_dict = dict()
    while len(omdb_fail) > 0:
        asin, title, year, title_with_year = omdb_fail[-1]

        # out of attempts: record the item as unmatched and move on
        if attempts >= max_attempts:
            item_dict[asin] = None
            print('giving up:', asin, title, year)
            omdb_fail.pop()
            attempts = 0
            continue

        try:
            imdb_matches = ia.search_movie(title_with_year)
            if imdb_matches:
                print('found imdb match')
                imdb_id = 'tt' + imdb_matches[0].getID()
                # store None rather than an empty OMDB response
                item_dict[asin] = oa.get(imdbid=imdb_id) or None
            else:
                item_dict[asin] = None
                print('no imdb match')
        # `Exception` (not a bare except) so the loop can be interrupted
        except Exception:
            attempts += 1
            print('failed')
            continue

        print(asin, title, year)
        omdb_fail.pop()
        attempts = 0
        clear_output()

    # merge data from second attempt with that of initial attempt
    omdb_items.update(item_dict)

    # save all OMDB data objects (a dict keyed by ASIN; the '.list' file name is
    # kept so the pre-scraped branch above keeps working)
    with open(os.path.join(data_path_2, 'omdb_items.list'), 'wb') as file_out:
        pkl.dump(omdb_items, file_out)

After collecting all the necessary data from OMDB, we convert it into a tabular form and save it as a CSV.

In [193]:
# OMDB fields copied into the table, in column order.
features = ['title', 'actors', 'box_office','country', 'director', 'genre', 'language', 'metascore', 'production',
           'rated', 'released', 'runtime', 'type', 'writer', 'year', 'imdb_id', 'imdb_rating', 'imdb_votes']

# One row per ASIN that has a non-empty OMDB result: the ASIN followed by each
# feature value, with NaN standing in for any field the result lacks.
omdb_data = list()
for asin, product in omdb_items.items():
    if not product:
        continue
    keys = product.keys()
    omdb_data.append(
        [asin] + [product[feature] if feature in keys else np.nan for feature in features]
    )

# Tabulate and persist the result.
omdb_df = pd.DataFrame(omdb_data, columns = ['asin'] + features)
omdb_df.to_csv(os.path.join(data_path_2, 'omdb_data.csv'), header=True, index=False, encoding='utf-8')

In [194]:
# Preview five randomly sampled rows of the OMDB table.
omdb_df.sample(5)

Unnamed: 0,asin,title,actors,box_office,country,director,genre,language,metascore,production,rated,released,runtime,type,writer,year,imdb_id,imdb_rating,imdb_votes
4727,6300270122,Superman III,"Christopher Reeve, Richard Pryor, Jackie Coope...",,"Netherlands, UK, USA",Richard Lester,"Action, Comedy, Sci-Fi","English, Italian, Spanish",42.0,WARNER BROTHERS PICTURES,PG,17 Jun 1983,125 min,movie,"Joe Shuster (character created by: Superman), ...",1983,tt0086393,4.9,54373
4218,B001SGEUYW,The Day the Earth Stood Still,"Keanu Reeves, Jennifer Connelly, Kathy Bates, ...","$79,136,963","USA, Canada",Scott Derrickson,"Drama, Sci-Fi, Thriller","English, Mandarin",40.0,20th Century Fox,PG-13,12 Dec 2008,104 min,movie,"David Scarpa (screenplay), Edmund H. North",2008,tt0970416,5.5,147016
4159,B00HNGZHDE,RoboCop,"Peter Weller, Nancy Allen, Dan O'Herlihy, Ronn...",,USA,Paul Verhoeven,"Action, Crime, Sci-Fi",English,67.0,Orion Pictures Corporation,R,17 Jul 1987,102 min,movie,"Edward Neumeier, Michael Miner",1987,tt0093870,7.5,198240
5732,B00005R874,Evolution,"David Duchovny, Julianne Moore, Orlando Jones,...","$37,571,347",USA,Ivan Reitman,"Comedy, Sci-Fi",English,40.0,Dreamworks Pictures,PG-13,08 Jun 2001,101 min,movie,"Don Jakoby (story), David Diamond (screenplay)...",2001,tt0251075,6.1,107068
4136,B0032UYFA6,Breaking Bad Season 3: Silent But Deadly - The...,"Vince Gilligan, Daniel Moncada, Luis Moncada",,USA,Stuart Richardson,"Documentary, Short",English,,,,06 Jun 2011,9 min,movie,,2011,tt2380191,8.4,115


# IMDB
To collect data from IMDB, we first check if an ASIN has a matching IMDB ID from the OMDB collection process. If so, the ID is used to query for an item. Otherwise, a search is conducted using the item's title that includes its release year.

Because saving all IMDB data per ASIN requires 600+MB of space, only relevant features are extracted and saved as a CSV.

In [10]:
use_prescraped = True

if use_prescraped:
    imdb_df = pd.read_csv(os.path.join(data_path_2, 'imdb_data.csv'))
else:
    # number of unsuccessful IMDB queries tolerated per ASIN
    max_attempts = 100

    # column order of the saved IMDB table
    imdb_columns = ['title', 'asin', 'year', 'vfx', 'genres', 'runtimes', 'production_companies',
                    'rating', 'votes', 'directors']

    # a dictionary of IMDB movie data using ASIN's as keys
    imdb_items = dict()
    items_to_collect = list(zip(amazon_df['asin'], amazon_df['title_without_year'], amazon_df['title_year'],
                           amazon_df['title_with_year']))

    # counts consecutive failed queries for the ASIN currently being collected
    attempts = 0
    while len(items_to_collect):
        asin, title_without_year, title_year, title_with_year = items_to_collect[-1]
        # .get: an ASIN missing from the OMDB data is treated like "no OMDB match"
        omdb_item = omdb_items.get(asin)

        # once an ASIN has used all its attempts it is dropped without a query
        if attempts < max_attempts:
            try:
                # given an ASIN, if no IMDB ID can be found from the OMDB data, search IMDB
                # using the item's title
                if not omdb_item:
                    print('no omdb match, trying imdb', asin)
                    imdb_matches = ia.search_movie(title_with_year)
                    if imdb_matches:
                        print('found imdb match', asin)
                        imdb_items[asin] = imdb_matches[0]
                    else:
                        print('no imdb match', asin)
                        imdb_items[asin] = None
                # given an ASIN, if an IMDB ID can be found from the OMDB data, use the ID
                # to search IMDB for the item
                else:
                    print('found omdb match', asin)
                    imdb_id = omdb_item['imdb_id'].replace('tt','')
                    imdb_items[asin] = ia.get_movie(imdb_id)
                    print(imdb_items[asin]['title'])
            # `Exception` (not a bare except) so the loop can be interrupted
            except Exception:
                attempts += 1
                print('failed attempt {}:'.format(attempts), asin)
                continue

        attempts = 0
        items_to_collect.pop()
        clear_output()

    # extract relevant features
    imdb_data = list()
    for asin, product in imdb_items.items():
        # skip ASINs with no IMDB item or with no information loaded for it
        if not (product and product.infoset2keys):
            continue
        print(product, asin)
        keys = product.infoset2keys['main']

        if 'production companies' in keys:
            companies = [company['name'] for company in product['production companies']]
        else:
            companies = np.nan

        if 'director' in keys:
            directors = [director['name'] for director in product['director']]
        else:
            directors = np.nan

        imdb_data.append([
            product['title'],
            asin,
            product['year'] if 'year' in keys else np.nan,
            # vfx flag: whether a special effects department is listed
            'special effects department' in keys,
            product['genres'] if 'genres' in keys else np.nan,
            product['runtimes'][0] if 'runtimes' in keys else np.nan,
            companies,
            product['rating'] if 'rating' in keys else np.nan,
            product['votes'] if 'votes' in keys else np.nan,
            directors,
        ])

    # convert data to tabular format and save as CSV ONCE, after all rows are
    # built; doing this inside the loop rewrote the file for every item and
    # raised a length-mismatch error while imdb_data was still empty
    imdb_df = pd.DataFrame(imdb_data, columns=imdb_columns)
    imdb_df.to_csv(os.path.join(data_path_2, 'imdb_data.csv'), header=True, index=False, encoding='utf-8')