# Data Collection and Analysis of Movie Data
### Authors: Jan Hanzal, Eric Zila

In [1]:
import json
import re
import requests
from bs4 import BeautifulSoup

In [2]:
class ImdbDownloader:
    """
    Downloader of movie data starting from an IMDb list page.

    Walks the list page by page, opens every movie page linked from it and
    keeps the scraped result as Movie objects in self.movies.
    """

    def __init__(self, link, timeout=10):
        """
        Initiates ImdbDownloader class.

        @param link -> link to the first IMDb list page to be downloaded from
        @param timeout -> number of seconds to wait for the server on each request
        """
        self.original_link = link  # original link passed to the downloader
        self.timeout = timeout  # passed to every requests.get call
        self.movies = []  # Movie objects collected so far, in download order

    def start(self, pages=-1):
        """
        Controls the downloading process.

        Movies are appended to self.movies; a progress line is printed after
        every fifth movie downloaded during this call.

        @param pages -> number of pages to be downloaded, if not defined, all pages are downloaded
        """
        count = 0  # counter of the number of movies downloaded in this call
        cur_link = self.original_link  # link to the list page currently downloaded from

        # A negative page budget never reaches zero while being decremented,
        # so the default of -1 means "keep going until there is no next page".
        while cur_link is not None and pages != 0:
            list_scraper = ImdbListScraper(self.get_soup(cur_link))

            for movie_link in list_scraper.get_movie_links():
                movie_scraper = ImdbMovieScraper(self.get_soup(movie_link))
                self.movies.append(Movie(
                    movie_scraper.get_title(),
                    movie_scraper.get_original_title(),
                    movie_scraper.get_worldwide_gross(),
                    movie_scraper.get_rating(),
                ))

                count += 1
                if count % 5 == 0:
                    print("We have downloaded {} movies so far!".format(count))

            # None here signals that the last page has been reached
            cur_link = list_scraper.get_next_page_link()
            pages -= 1

    def get_soup(self, link):
        """
        Obtains the soup of a web page.

        @param link -> link to the web page a soup should be downloaded from

        @return BeautifulSoup object containing requested soup

        @raise requests.RequestException if the request times out or the
               server answers with an error status
        """
        response = requests.get(link, timeout=self.timeout)
        # Fail loudly on 4xx/5xx instead of scraping an error page
        response.raise_for_status()
        response.encoding = 'UTF-8'

        # Naming the parser keeps the result independent of which optional
        # parsers happen to be installed and avoids bs4's parser warning.
        return BeautifulSoup(response.text, 'html.parser')

    def print_movies(self):
        """
        Prints information about all movies that were scraped so far.
        """
        print("\nAbout to print {} movies!\n".format(len(self.movies)))

        for movie in self.movies:
            print(movie.to_string())

In [3]:
class ImdbListScraper:
    """
    Scraper of information from an IMDb list page.
    """

    # Host prefix prepended to every href read from the list page
    BASE_URL = 'https://www.imdb.com'

    def __init__(self, soup):
        """
        Initiates ImdbListScraper class.

        @param soup -> soup of an IMDb list page to be scraped
        """
        self.soup = soup

    def get_movie_links(self):
        """
        Obtains all movie links from an IMDb list soup.

        The link of a movie is taken from the parent element of each image
        with class 'loadlate'; images whose parent carries no href are skipped.

        @return Array object containing movie links
        """
        links = []

        for image in self.soup.find_all('img', {'class': 'loadlate'}):
            anchor = image.parent
            href = anchor.get('href') if anchor is not None else None

            # An image that is not wrapped in a link has nothing to follow
            if href:
                links.append(self.BASE_URL + href)

        return links

    def get_next_page_link(self):
        """
        Obtains a link to the next page from an IMDb list soup.

        @return String object containing link to the next page, None object if not found
        """
        next_page = self.soup.find('a', {'class': 'lister-page-next next-page'})

        if next_page is None:
            return None

        return self.BASE_URL + next_page['href']

In [4]:
class ImdbMovieScraper:
    """
    Scraper of information from an IMDb movie page.

    Every getter returns None when the element it looks for is not present in
    the soup, so a page with a missing field does not abort the scrape.
    """

    def __init__(self, soup):
        """
        Initiates ImdbMovieScraper class.

        @param soup -> soup of an IMDb movie page to be scraped
        """
        self.soup = soup

    def get_title(self):
        """
        Obtains the title of a movie on an IMDb movie page.

        @return String object containing title of a movie, None object if not found
        """
        wrapper = self.soup.find('div', {'class': 'title_wrapper'})

        if wrapper is None or wrapper.h1 is None:
            return None

        # recursive=False limits the search to text sitting directly inside
        # the <h1>, so text of nested tags is not picked up
        return wrapper.h1.find(text=True, recursive=False)

    def get_original_title(self):
        """
        Obtains the original title of a movie on an IMDb movie page.

        @return String object containing original title of a movie, None object if not found
        """
        original_title = self.soup.find('div', {'class': 'originalTitle'})

        if original_title is None:
            return None

        return original_title.text

    def get_worldwide_gross(self):
        """
        Obtains the worldwide gross of a movie on an IMDb movie page.

        @return String object containing only the digits of the worldwide gross
                of a movie, None object if not found
        """
        heading = self.soup.find('h4', text='Cumulative Worldwide Gross:')

        if heading is None:
            return None

        # The amount is read from the text of the heading's parent element;
        # everything that is not a digit (label, currency sign, separators)
        # is dropped
        return re.sub(r"[^0-9]", "", heading.parent.text)

    def get_rating(self):
        """
        Obtains the rating of a movie on an IMDb movie page.

        @return String object containing rating of a movie, None object if not found
        """
        rating = self.soup.find('span', itemprop="ratingValue")

        if rating is None:
            return None

        return rating.text

In [5]:
class Movie:
    """
    Encapsulates information about a movie.
    """

    def __init__(self, title, original_title, worldwide_gross, rating):
        """
        Initiates Movie class.

        @param title -> title of the movie
        @param original_title -> original title of the movie
        @param worldwide_gross -> worldwide gross of the movie
        @param rating -> rating of the movie
        """
        self.title = title
        self.original_title = original_title
        self.worldwide_gross = worldwide_gross
        self.rating = rating

    def to_string(self):
        """
        Builds a string containing all available information about the movie.

        One "label: value" line is produced per field, in the order title,
        original title, worldwide gross, rating. Fields that are None are left
        out; values are formatted with str.format, so non-string values are
        accepted as well.

        @return String object containing all information
        """
        fields = (
            ("title", self.title),
            ("original title", self.original_title),
            ("worldwide gross", self.worldwide_gross),
            ("rating", self.rating),
        )

        return "".join(
            "{}: {}\n".format(label, value)
            for label, value in fields
            if value is not None
        )

    def __str__(self):
        """
        Returns the same text as to_string so that print(movie) is readable.
        """
        return self.to_string()

In [6]:
# Entry point of the notebook: feature films and TV movies with at least
# 100000 votes, newest release first, in IMDb's simple list view.
START_URL = (
    'https://www.imdb.com/search/title/'
    '?title_type=feature,tv_movie'
    '&num_votes=100000,'
    '&sort=release_date,desc'
    '&view=simple'
)

imdb = ImdbDownloader(START_URL)
imdb.start(pages=5)  # download the first five list pages only
imdb.print_movies()

We have downloaded 5 movies so far!
We have downloaded 10 movies so far!
We have downloaded 15 movies so far!
We have downloaded 20 movies so far!
We have downloaded 25 movies so far!
We have downloaded 30 movies so far!
We have downloaded 35 movies so far!
We have downloaded 40 movies so far!
We have downloaded 45 movies so far!
We have downloaded 50 movies so far!
We have downloaded 55 movies so far!
We have downloaded 60 movies so far!
We have downloaded 65 movies so far!
We have downloaded 70 movies so far!
We have downloaded 75 movies so far!
We have downloaded 80 movies so far!
We have downloaded 85 movies so far!
We have downloaded 90 movies so far!
We have downloaded 95 movies so far!
We have downloaded 100 movies so far!
We have downloaded 105 movies so far!
We have downloaded 110 movies so far!
We have downloaded 115 movies so far!
We have downloaded 120 movies so far!
We have downloaded 125 movies so far!
We have downloaded 130 movies so far!
We have downloaded 135 movies so