# IMDb Crawler

#### Import Library

In [350]:
# Library
import pandas as pd
import re
import nltk

from requests import get
from bs4 import BeautifulSoup
from time import time, sleep
from random import randint
from IPython.core.display import clear_output
from collections import Counter

#### Define Additional Function

In [188]:
def short(x):
    lst = []
    for i in x:
        y = i.replace(' ','')
        z = y.lower()
        lst.append(z)
    return lst

In [190]:
class Preprocessor:
    def __init__(self):
        self.stop_words = set(nltk.corpus.stopwords.words('english'))
        self.ps = nltk.stem.PorterStemmer()

    # word tokenize text using nltk lib
    def tokenize(self, text):
        return nltk.word_tokenize(text)

    # stem word using provided stemmer
    def stem(self, word, stemmer):
        return stemmer.stem(word)

    # check if word is appropriate - not a stop word and isalpha, 
    # i.e consists of letters, not punctuation, numbers, dates
    def is_apt_word(self, word):
        return word not in self.stop_words and word.isalpha()

    # combines all previous methods together
    # tokenizes lowercased text and stems it, ignoring not appropriate words
    def preprocess(self, text):
        tokenized = self.tokenize(text.lower())
        return [self.stem(w, self.ps) for w in tokenized if self.is_apt_word(w)]

In [192]:
def build_inverted_index_orig_forms(documents):
    inverted_index = {}
    for no, strings in enumerate(documents):
        s = re.sub(r'([^\w\s])','',strings)
        tokens = nltk.word_tokenize(s.lower())
        file_index = Counter(tokens)
        # update global index
        for term in file_index.keys():
            file_freq = file_index[term]
            if term not in inverted_index:                
                inverted_index[term] = [file_freq, (no, file_freq)]
            else:
                inverted_index[term][0] += file_freq
                inverted_index[term].append((no, file_freq))
    return inverted_index

def generate_wildcard_options(wildcard, k, inverted_index): 
    list_word = []
    for term in inverted_index.keys():
        string = wildcard.replace('*','$')
        pad = '$' + string + '$'
        result = nltk.ngrams(pad, k)
        for i in list(result):
            tri = "".join(i)
            if re.search(tri, term):
                list_word.append(term)
    
    s_wildcard = wildcard.replace('*', '.+')
    return [string for string in list_word if re.match(s_wildcard, string)]

def search_wildcard(wildcard, k, index, docs):
    wildcard_options = generate_wildcard_options(wildcard, k, index)
    list_fact = []
    for term in wildcard_options:
        for line in docs:
            if re.search(term, line, flags=re.I):
                list_fact.append(line)      
    return list_fact

#### Crawling IMDB Web

In [441]:
total_movies = 5000
start = [str(i+1) for i in range(0, total_movies, 250)]

In [442]:
# Declaring the lists to store data in
image_link = []
movie_name = []
movie_year = []
certificate = []
duration = []
genre = []
imdb_ratings = []
overview = []
directors = []
stars = []
votes = []

# Preparing the monitoring of the loop
start_time = time()
requests = 0
i = 0
# For every start number of movie
for st in start:
    # Make a get request
    # for 100k movie title without filter
    #url = 'https://www.imdb.com/search/title/?moviemeter=,101357&sort=num_votes,desc&count=250&start='+st+'&ref_=adv_nxt'
    # with filter: Feature Film/TV Series, Released at least 1990-01-01
    url = 'https://www.imdb.com/search/title/?title_type=feature,tv_series&release_date=1990-01-01,&sort=num_votes,desc&count=250&start='+st+'&ref_=adv_nxt'
    headers = {"Accept-Language": "en-US, en;q=0.5"}
    response = get(url, headers = headers)
    
    # Pause the loop
    sleep(randint(8,15))
    
    # Monitor the requests
    requests += 1
    elapsed_time = time() - start_time
    print('Request: {}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
    clear_output(wait = True)
    
    # Parse the content of the request with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')
    
    # Select all the 250 movie containers from a single page
    mv_containers = page_html.find_all('div', class_ = 'lister-item mode-advanced')
    
    # For every movie
    for container in mv_containers:
        # If the movie has Metascore, then extract:
        #if container.find('div', class_ = 'ratings-metascore') is not None:
        # Image link
        link = container.find('img')
        image_link.append(link.get('loadlate'))

        # The name of movie
        name = container.h3.a.text
        movie_name.append(name)

        # The year of movie
        year = container.h3.find('span', class_ = 'lister-item-year').text
        movie_year.append(year)
        
        # Certificate
        try:
            cert = container.p.find('span', class_ = 'certificate').text
            certificate.append(cert.replace('\n', '').strip())
        except:
            certificate.append('Not Rated')

        # Duration
        try:
            runtime = container.p.find('span', class_ = 'runtime').text
            duration.append(runtime.replace('\n', '').strip())
        except:
            duration.append('Unknown')
        
        # The genre of movie
        gen = container.p.find('span', class_ = 'genre').text
        genre.append(gen.replace('\n', '').strip())
        
        # The IMDB rating
        imdb = float(container.strong.text)
        imdb_ratings.append(imdb)

        # Description
        view = container.find('div', class_ = "lister-item-content").text     
        res = view.replace('\n', '')
        if 'Metascore' in res:
            if 'Director' in res:
                result = re.search('Metascore(.*)Director', res)
            else:
                result = re.search('Metascore(.*)Star', res)
        else:
            if 'Director' in res:
                result = re.search('X(.*)Director', res)
            else:
                result = re.search('X(.*)Star', res)
        overview.append(result.group(1).strip())

        # The director of movie
        try:
            if 'Director' in res:
                direct = container.find('p', class_ = "").text
                movie_dir = direct.split('|')[0].rstrip()
                movie_dirs = movie_dir.split('\n')[2:]
                movie_director = [movie_dir.replace(",","").strip() for movie_dir in movie_dirs]
                director_name = ', '.join(str(name) for name in movie_director)
                directors.append([director_name])
            else:
                directors.append('')
        except:
            directors.append('')

        # Stars
        try:
            if 'Director' in res:
                star = container.find('p', class_ = "").text
                m_star = star.split('|')[1].rstrip()
                movie_st = m_star.split('\n')[2:]
                movie_star = [m_star.replace(",","").strip() for m_star in movie_st]
                star_name = ', '.join(str(name) for name in movie_star)
                stars.append([star_name])
            else:
                star = container.find('p', class_ = "").text
                m_star = star.split('|')[0].rstrip()
                movie_st = m_star.split('\n')[2:]
                movie_st.pop(0)
                movie_star = [m_star.replace(",","").strip() for m_star in movie_st]
                star_name = ', '.join(str(name) for name in movie_star)
                stars.append([star_name])
        except:
            stars.append('')
        
        # The number of votes
        vote = container.find('span', attrs = {'name':'nv'})['data-value']
        votes.append(int(vote))
        
        print(i)
        clear_output(wait = True)
        i += 1

crawl_time = time() - start_time
print("total time getting data is {} ms".format(crawl_time))

total time getting data is 423.5388810634613 ms


#### Preprocessing Data

In [443]:
k = 27
img_size = 'V1_SY1000_CR0,0,675,1000_AL_.jpg'

img_poster = [sub[:-k] + img_size for sub in image_link]  

In [444]:
fix_year = []
s = '-'
for year in movie_year:
    temp = re.findall(r'\d+', year) 
    if len(temp) > 1:
        res = s.join(temp)
        fix_year.append(res)
        #print('1', type(res))
    else:
        res = list(map(int, temp))
        fix_year.append(*res)
        #print('2', type(res))
    #print(res)

In [445]:
print(fix_year)

[1994, 2008, 2010, 1999, 1994, 1994, '2011-2019', 1999, 2001, 2003, 2012, 2002, 2014, 1995, '2008-2013', 2012, 2000, 2005, 2012, 1991, 2009, 1998, 1993, 2006, 2006, 2013, 1999, 2009, 2000, 2010, 1999, 2014, 1997, 1998, 2005, 2003, 1994, 1990, 2003, 1991, 2008, 1995, 1995, 2003, 2008, 1994, 2009, 1998, 1999, 1992, 2004, 2016, 1995, 2015, 2015, 1993, 2010, 1997, 2012, 2014, 2001, 2007, 2001, 2002, 2019, 2008, 2018, 2010, '1994-2004', 2000, 2013, 2012, 2000, 2016, 2011, 2013, 2005, 2010, 2015, 2011, 2011, 2001, 2006, 2019, 2011, 2015, 2014, 1999, 1998, 2008, 2009, 2010, 2002, '2007-2019', 2005, 2001, 2010, 2002, 2014, 2014, 2015, 2013, 2002, 2007, 2004, 2014, 2014, 2012, 2010, 2006, 2004, 2016, 2011, 2009, '2006-2013', 2010, 2002, 1996, 2007, 2012, 2004, 2017, 2013, 2001, 2001, 2016, 2006, 2010, 2013, 1997, 2007, 2001, 1996, 2013, 2007, 2013, 2015, 2018, 2007, 2013, 2014, 2016, 2013, '2005-2014', 2014, 2009, 2009, 2013, 2008, 2016, 2006, 2012, 1993, 2015, 2016, 1995, 2012, 2012, 2000, 201

In [447]:
movie_rating = pd.DataFrame({'movie_img': img_poster,
                             'movie': movie_name,
                             'year': fix_year,
                             'certificate': certificate,
                             'duration': duration,
                             'genre': genre,
                             'description': overview,
                             'director': directors,
                             'stars': stars,
                             'imdb': imdb_ratings,
                             'votes': votes
                            })

In [448]:
movie_rating.head(10)

Unnamed: 0,movie_img,movie,year,certificate,duration,genre,description,director,stars,imdb,votes
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,R,142 min,Drama,Two imprisoned men bond over a number of years...,[Frank Darabont],"[Tim Robbins, Morgan Freeman, Bob Gunton, Will...",9.3,2235501
1,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,PG-13,152 min,"Action, Crime, Drama",When the menace known as the Joker wreaks havo...,[Christopher Nolan],"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",9.0,2206475
2,https://m.media-amazon.com/images/M/MV5BMjAxMz...,Inception,2010,PG-13,148 min,"Action, Adventure, Sci-Fi",A thief who steals corporate secrets through t...,[Christopher Nolan],"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elle...",8.8,1959988
3,https://m.media-amazon.com/images/M/MV5BMmEzNT...,Fight Club,1999,R,139 min,Drama,An insomniac office worker and a devil-may-car...,[David Fincher],"[Brad Pitt, Edward Norton, Meat Loaf, Zach Gre...",8.8,1779638
4,https://m.media-amazon.com/images/M/MV5BNGNhMD...,Pulp Fiction,1994,R,154 min,"Crime, Drama","The lives of two mob hitmen, a boxer, a gangst...",[Quentin Tarantino],"[John Travolta, Uma Thurman, Samuel L. Jackson...",8.9,1752001
5,https://m.media-amazon.com/images/M/MV5BNWIwOD...,Forrest Gump,1994,PG-13,142 min,"Drama, Romance","The presidencies of Kennedy and Johnson, the e...",[Robert Zemeckis],"[Tom Hanks, Robin Wright, Gary Sinise, Sally F...",8.8,1723690
6,https://m.media-amazon.com/images/M/MV5BYTRiND...,Game of Thrones,2011-2019,TV-MA,57 min,"Action, Adventure, Drama",Nine noble families fight for control over the...,,"[Emilia Clarke, Peter Dinklage, Kit Harington,...",9.3,1671365
7,https://m.media-amazon.com/images/M/MV5BNzQzOT...,The Matrix,1999,R,136 min,"Action, Sci-Fi",A computer hacker learns from mysterious rebel...,"[Lana Wachowski, Lilly Wachowski]","[Keanu Reeves, Laurence Fishburne, Carrie-Anne...",8.7,1604802
8,https://m.media-amazon.com/images/M/MV5BN2EyZj...,The Lord of the Rings: The Fellowship of the Ring,2001,PG-13,178 min,"Action, Adventure, Drama",A meek Hobbit from the Shire and eight compani...,[Peter Jackson],"[Elijah Wood, Ian McKellen, Orlando Bloom, Sea...",8.8,1594652
9,https://m.media-amazon.com/images/M/MV5BNzA5ZD...,The Lord of the Rings: The Return of the King,2003,PG-13,201 min,"Adventure, Drama, Fantasy",Gandalf and Aragorn lead the World of Men agai...,[Peter Jackson],"[Elijah Wood, Viggo Mortensen, Ian McKellen, O...",8.9,1581507


In [449]:
movie_rating.to_csv('movie_data_5k.csv')

In [450]:
print(movie_rating.info())
movies = movie_rating

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 11 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   movie_img    5000 non-null   object 
 1   movie        5000 non-null   object 
 2   year         5000 non-null   object 
 3   certificate  5000 non-null   object 
 4   duration     5000 non-null   object 
 5   genre        5000 non-null   object 
 6   description  5000 non-null   object 
 7   director     5000 non-null   object 
 8   stars        5000 non-null   object 
 9   imdb         5000 non-null   float64
 10  votes        5000 non-null   int64  
dtypes: float64(1), int64(1), object(9)
memory usage: 429.8+ KB
None
