In [None]:
from bs4 import BeautifulSoup
import pandas as pd
import requests
import time
import re
from rotten_tomatoes_scraper.rt_scraper import MovieScraper

In [None]:
# IMDb advanced-search URL template. The {batch_size}, {year} and {start}
# placeholders are filled in with str.format() by get_movies().
url = (
    'http://www.imdb.com/search/title'
    '?count={batch_size}'
    '&view=simple'
    '&title_type=feature'
    '&release_date={year}'
    '&start={start}'
    '&countries=us'
)

# HTTP headers passed to requests.get() by get_movies() and go_to_movie().
headers = {
    'Accept-Language': 'en-US',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/91.0.4472.124 Safari/537.36'
    ),
}

In [None]:
def get_movies(year, start=1, batch_size=50):
    '''Fetch one page of IMDb search results for movies released in <year>.

    year: value substituted into the release_date part of the search URL.
    start: 1-based rank of the first result on the page.
    batch_size: number of results requested for the page.

    Returns a list of (absolute_url, link_text) tuples, one per <a> tag
    whose href matches 'adv_li_tt'.
    '''
    search_url = url.format(batch_size=batch_size, start=start, year=year)
    response = requests.get(search_url, headers=headers, timeout=15)
    page = BeautifulSoup(response.content, 'html.parser')

    results = []
    for link in page.find_all('a', href=re.compile('adv_li_tt')):
        # hrefs on the page are site-relative, so prefix the host
        results.append(('http://www.imdb.com' + link['href'], link.text))
    return results

In [None]:
def get_all_movies(year):
    '''Collect every (url, title) pair returned by get_movies() for <year>.

    Pages of 250 results are requested one after another (start = 1, 251,
    501, ...) until a page comes back empty. After each non-empty page the
    start index of the next page is printed and the loop sleeps for one
    second before the next request.
    '''
    page_size = 250
    collected = []
    next_start = 1
    while True:
        page = get_movies(year, next_start, page_size)
        if not page:
            # an empty page marks the end of the result list
            return collected
        collected.extend(page)
        next_start += page_size
        print(next_start)
        time.sleep(1)

In [None]:
def go_to_movie(url):
    '''Download the IMDb page of a movie.

    url: absolute address of the movie page.

    Returns the raw response body (bytes) as delivered by requests.
    Raises whatever requests.get raises, including a timeout error when
    the server does not answer within 15 seconds.
    '''
    # Bound the wait exactly like get_movies() does: without a timeout a
    # stalled connection would block the whole scraping loop indefinitely.
    return requests.get(url, headers=headers, timeout=15).content

In [None]:
def get_country(soup):
    '''Return the countries of origin listed on a movie page.

    Looks for a <span> labelled 'Country of origin' (or, failing that,
    'Countries of origin'), moves to the next <div> and returns the text
    of every <a> inside it. Returns [] when the label or the <div> is
    missing.
    '''
    label = (soup.find('span', text='Country of origin')
             or soup.find('span', text='Countries of origin'))
    try:
        container = label.findNext('div')
        return [link.text for link in container.find_all('a')]
    except AttributeError:
        # label or container was None: the page has no such section
        return []

In [None]:
def get_genre(soup):
    '''Return the list of genres shown on a movie page.

    First tries the plural 'Genres' label and returns the text of every
    <a> in the <div> that follows it. If that label (or its <div>) is
    missing, falls back to the singular 'Genre' label and returns a
    one-element list with the first <a> of its <div>. Returns [] when
    neither form is present.
    '''
    try:
        container = soup.find('span', text="Genres").find_next('div')
        return [link.text for link in container.find_all('a')]
    except AttributeError:
        # A missing element shows up as None.<attr>; only that case is
        # handled so that KeyboardInterrupt/SystemExit still propagate.
        pass

    try:
        container = soup.find('span', text="Genre").find_next('div')
        return [container.find('a').text]
    except AttributeError:
        return []

In [None]:
def get_votes(soup):
    '''Return the vote count shown on a movie page as a number.

    Reads the text of the first <div> whose class matches
    'sc-7ab21ed2-3 dPVcnq'. A 'K' in the text multiplies the remaining
    number by 1000 and an 'M' by 1000000; otherwise the text is converted
    as is. Returns 0 when the <div> is not found.
    '''
    try:
        label = soup.find('div', {'class': re.compile('sc-7ab21ed2-3 dPVcnq')}).text
    except AttributeError:
        return 0

    # 'K' is checked before 'M', first match wins
    for suffix, scale in (('K', 1000), ('M', 1000000)):
        if suffix in label:
            return float(label.replace(suffix, '')) * scale
    return float(label)

In [None]:
def get_money(soup, type):
    '''Return the raw money string shown under a labelled section.

    soup: parsed movie page.
    type: exact text of the <span> label to look for (callers in this
          notebook pass 'Budget' and 'Gross worldwide'). The parameter
          name shadows the builtin but is kept because callers pass it
          by keyword.

    Returns the text of the first <span> inside the <div> following the
    label, or None when any of those elements is missing.
    '''
    try:
        return soup.find('span', text=type).findNext('div').find('span').text
    except AttributeError:
        return None

In [None]:

def get_company(soup):
    '''Return the production companies listed on a movie page.

    Looks for an <a> labelled 'Production companies' (or, failing that,
    'Production company') and returns the text of every <a> inside the
    <div> that follows it. Returns [] when the label or the <div> is
    missing.
    '''
    label = soup.find('a', text='Production companies')
    if not label:
        label = soup.find('a', text='Production company')

    try:
        return [link.text for link in label.findNext('div').find_all('a')]
    except AttributeError:
        return []

In [None]:
def get_release_date(soup):
    '''Return the release date shown on a movie page.

    Reads the first <a> inside the <div> that follows the 'Release date'
    link, drops everything from the first '(' onwards (the page appends
    the country in parentheses) and parses the rest with pandas.

    Returns a pandas Timestamp, pd.NaT when the remaining text is empty
    or cannot be parsed as a date, or None when the section is missing.
    '''
    try:
        container = soup.find('a', text='Release date').find_next('div')
        raw_date = container.find('a').text
    except AttributeError:
        return None

    date_text = raw_date.split('(', 1)[0].strip()
    # errors='coerce' turns an unparsable string into NaT instead of raising;
    # scrap_titlebar() already guards against a null date with pd.isnull().
    return pd.to_datetime(date_text, errors='coerce')

In [None]:
def get_runtime(soup):
    '''Return the runtime of a movie in minutes.

    Reads the text of the <div> following the 'Runtime' label and splits
    it on whitespace:
      * four tokens ("2 hours 5 minutes") -> first * 60 + third
      * second token starting with "minute" ("95 minutes", "1 minute")
        -> first token
      * anything else ("1 hour", "2 hours") -> first token * 60

    Returns None when the section is missing or the text is empty or
    does not start with the expected numbers.
    '''
    try:
        container = soup.find('span', text='Runtime').find_next('div')
        tokens = container.text.split()
    except AttributeError:
        return None

    try:
        if len(tokens) == 4:
            return int(tokens[0]) * 60 + int(tokens[2])
        # startswith() also accepts the singular "minute", which an exact
        # comparison with "minutes" would have counted as an hour.
        if tokens[1].startswith('minute'):
            return int(tokens[0])
        return int(tokens[0]) * 60
    except (IndexError, ValueError):
        # fewer than two tokens, or a non-numeric amount
        return None

In [None]:
def get_star(soup):
    '''Return the names of the featured cast members on a movie page.

    Collects the text of every <a> whose class matches
    'sc-11eed019-1 jFeBIw'. Returns [] when there are none, or when
    <soup> cannot be searched (e.g. it is None).
    '''
    try:
        links = soup.find_all('a', {'class': re.compile('sc-11eed019-1 jFeBIw')})
        return [link.text for link in links]
    except AttributeError:
        # Only the "not a searchable page" case is absorbed; interrupts
        # and unrelated errors are no longer swallowed.
        return []

In [None]:
def get_writer(soup):
    '''Return the list of writers credited on a movie page.

    The text of the first <a> whose href matches 'tt_ov_wr' decides the
    layout:
      * any text other than 'Writers' is taken to be the single writer
        and returned as a one-element list;
      * 'Writers' is treated as the section label; the names are then
        read from every <a> in the <div> following that label (looked up
        as an <a> first, then as a <span>).

    Returns [] when no matching link exists or when the label / name
    container cannot be located.
    '''
    try:
        first_credit = soup.find('a', {'href': re.compile('tt_ov_wr')}).text
    except AttributeError:
        return []

    if first_credit != 'Writers':
        return [first_credit]

    label = soup.find('a', text=first_credit) or soup.find('span', text=first_credit)
    try:
        container = label.find_next('div')
        return [link.text for link in container.find_all('a')]
    except AttributeError:
        # Neither label form (or no following <div>) was found; report
        # "no writers" instead of letting the error abort the scrape.
        return []

In [None]:
def get_imdb_score(soup):
    '''Return the IMDb rating shown on a movie page as a float.

    Reads the first <span> whose class matches 'sc-7ab21ed2-1 jGRxWM'.
    Returns None when no such element exists.
    '''
    rating = soup.find('span', {'class': re.compile('sc-7ab21ed2-1 jGRxWM')})
    return float(rating.text) if rating else None

In [None]:
def get_rotten_score(title):
    '''Return the Rotten Tomatoes audience score for <title>, or None.

    title: movie title handed to MovieScraper. An empty or None title is
           not looked up at all.

    The score is read from the 'Score_Audience' entry of the scraper's
    metadata and converted to int. Any ordinary failure of the lookup
    (no match, missing key, non-numeric value, network error, ...)
    yields None.
    '''
    if not title:
        return None

    try:
        scraper = MovieScraper(movie_title=title)
        scraper.extract_metadata()
        return int(scraper.metadata['Score_Audience'])
    except Exception:
        # Deliberately broad: the exceptions MovieScraper can raise are not
        # visible from here and a missing score must not abort the scrape.
        # `Exception` (rather than a bare except) still lets
        # KeyboardInterrupt through, so the run can be stopped mid-lookup.
        return None

In [None]:
def scrap_titlebar(soup, movie_title, year):
    '''Get name, rating, genre, year, release date, scores and votes of a movie.

    soup: parsed movie page.
    movie_title: title from the links list; stored unchanged under 'name'.
    year: fallback year, used when the page yields no usable release date.

    Returns a dict with the keys 'name', 'rating', 'genre', 'released',
    'year', 'imdb_score', 'rotten_score' and 'votes'.
    '''
    # The title as printed on the page is only used for the Rotten Tomatoes
    # lookup; the returned 'name' is always <movie_title>.
    try:
        name = soup.find('h1', {'data-testid': re.compile('hero-title-block__title')}).text.strip()
    except AttributeError:
        name = None

    genre = get_genre(soup)
    imdb_score = get_imdb_score(soup)
    rotten_score = get_rotten_score(name)
    votes = get_votes(soup)
    released = get_release_date(soup)
    # pd.notnull() is False for both None and NaT
    if pd.notnull(released):
        year = released.year

    try:
        rating = soup.find('a', {'href': re.compile('tt_ov_pg')}).text
    except AttributeError:
        rating = None

    return {
        'name': movie_title,
        'rating': rating,
        'genre': genre,
        'released': released,
        'year': year,
        'imdb_score': imdb_score,
        'rotten_score': rotten_score,
        'votes': votes
    }

In [None]:
def scrap_crew(soup):
    '''Get directors, writers and stars of a movie.

    Directors are the text of every <a> whose href matches 'tt_ov_dr'
    inside the <div> with class 'sc-fa02f843-0 fjLeDR'; the list is empty
    when that <div> is missing. Writers and stars come from get_writer()
    and get_star().

    Returns a dict with the keys 'director', 'writer' and 'star'.
    '''
    try:
        credits_box = soup.find('div', {'class': 'sc-fa02f843-0 fjLeDR'})
        director_links = credits_box.find_all('a', {'href': re.compile('tt_ov_dr')})
        directors = [link.text for link in director_links]
    except AttributeError:
        # credits_box is None when the page has no such block
        directors = []

    return {
        'director': directors,
        'writer': get_writer(soup),
        'star': get_star(soup)
    }

In [None]:
def _parse_dollars(amount, first_token_only=False):
    '''Turn a "$1,234,567"-style string into a float, or None if it is not a dollar figure.'''
    if not amount or '$' not in amount:
        return None
    if first_token_only:
        # keep only the figure itself, e.g. drop a trailing "(estimated)"
        amount = amount.split()[0]
    try:
        return float(amount.replace('$', '').replace(',', ''))
    except ValueError:
        return None


def scrap_details(soup):
    '''Get country, budget, gross, production co. and runtime of a movie.

    Budget and worldwide gross are converted to floats when they are
    expressed in dollars; amounts without a '$', missing amounts and
    amounts that cannot be parsed become None. For the budget only the
    first whitespace-separated token is parsed.

    Returns a dict with the keys 'country', 'budget', 'gross', 'company'
    and 'runtime'.
    '''
    countries = get_country(soup)
    gross_text = get_money(soup, type='Gross worldwide')
    budget_text = get_money(soup, type='Budget')
    company = get_company(soup)
    runtime = get_runtime(soup)

    return {
        'country': countries,
        'budget': _parse_dollars(budget_text, first_token_only=True),
        'gross': _parse_dollars(gross_text),
        'company': company,
        'runtime': runtime
    }

In [None]:
def try_open_csv(year):
    '''Load the saved progress and the link list for <year>.

    Returns a tuple (df, movies):
      df     - the DataFrame stored in PYBD/projet/data/<year>.pkl, or an
               empty DataFrame when that file is missing or unreadable;
      movies - the contents of PYBD/projet/data/<year>_links.csv.

    Raises whatever pandas.read_csv raises when the links file is missing.
    '''
    pickle_path = "PYBD/projet/data/" + str(year) + ".pkl"
    # NOTE: unpickling executes code embedded in the file; only load
    # pickles written by this notebook (see write_csv).
    try:
        df = pd.read_pickle(pickle_path)
    except FileNotFoundError:
        # first run for this year: nothing saved yet
        df = pd.DataFrame()
    except Exception as error:
        # Stay best-effort, but make the problem visible instead of hiding it.
        print("Could not read " + pickle_path + " (" + repr(error) + "); starting from an empty table.")
        df = pd.DataFrame()

    movies = pd.read_csv("PYBD/projet/data/" + str(year) + "_links.csv")
    return (df, movies)

In [None]:
def write_csv(data, name, df):
    '''Append a list of dicts to <df> and save the result as a pickle.

    data: list of dicts, one per movie.
    name: file stem; the table is written to PYBD/projet/data/<name>.pkl.
    df: DataFrame holding the rows saved so far.

    Despite the function name, the file written is a pickle, not a CSV.
    Prints "Saved!" and returns the combined DataFrame.
    '''
    new_rows = pd.DataFrame(data)
    combined = pd.concat([df, new_rows])
    combined.to_pickle("PYBD/projet/data/" + str(name) + ".pkl")
    print("Saved!")
    return combined

In [None]:
def scrap_links(year):
    '''Save the (url, title) list of every movie of <year> as a CSV file.

    The pairs come from get_all_movies(year) and are written, without the
    index, to PYBD/projet/data/<year>_links.csv with the columns 'url'
    and 'title'.
    '''
    rows = [{'url': link, 'title': title} for (link, title) in get_all_movies(year)]
    links_table = pd.DataFrame(rows)
    links_table.to_csv("PYBD/projet/data/" + str(year) + "_links.csv", index=False)

In [None]:
def scrap(year, batch_size):
      '''Scrape every movie listed for <year> and save the results in batches.

      year: selects the links file and the pickle read by try_open_csv(),
            and is the fallback year handed to scrap_titlebar().
      batch_size: number of newly scraped movies collected before the
            table is written with write_csv().

      Movies whose title already appears in the 'name' column of the saved
      table are skipped, so an interrupted run continues where it stopped.
      '''
      all_movie_data = []
      (df, movies) = try_open_csv(year)
      for i in range(len(movies)):
          (movie_url, movie_title) = (movies.iloc[i]['url'], movies.iloc[i]['title'])
          # Already saved under this title: skip it.
          if not df.empty and len(df[df['name'] == movie_title]) > 0:
            continue
          print("starting: " + movie_title + " " + movie_url, end='')
          movie_data = {}
          movie_html = go_to_movie(movie_url)
          soup = BeautifulSoup(movie_html, 'html.parser')
          movie_data.update(scrap_titlebar(soup, movie_title, year))
          # NOTE(review): scrap_titlebar() stores <movie_title> under 'name', not
          # the title parsed from the page, so this branch presumably never runs
          # for a page whose title could not be parsed - confirm the intent.
          if movie_data['name'] == None:
            with open("PYBD/projet/data/title_missing.txt", "a+") as file_:
              file_.write(movie_url + " " + str(year)  + " \n")
          movie_data.update(scrap_crew(soup))
          movie_data.update(scrap_details(soup))
          all_movie_data.append(movie_data)
          print(" - done")
          # Pause for one second between movie pages.
          time.sleep(1)
          # Checkpoint: flush the pending rows to disk and start a new batch.
          if (len(all_movie_data) >= batch_size):
            df = write_csv(all_movie_data, year, df)
            all_movie_data = []
            print("COMPLETION: " + str(round(i/len(movies) * 100, 1)) + "%")
      print(year, 'done.')
      # Save whatever is left over from the last, incomplete batch.
      write_csv(all_movie_data, year, df)

In [None]:
# Build the <year>_links.csv files for 1999 down to 1990 (inclusive).
# NOTE(review): scrap_all() iterates 1999 down to 1970 and scrap() reads the
# links file of each year; the files for 1970-1989 are not produced by this
# loop - presumably they come from an earlier run with another range; confirm.
for i in range(1999, 1989, -1):
  scrap_links(i)

In [None]:
def scrap_all(first_year=1999, last_year=1970, batch_size=10, retry_delay=5):
    '''Scrape every year from <first_year> down to <last_year> (inclusive).

    first_year, last_year: range of years, processed in descending order.
    batch_size: passed to scrap(); number of movies between two saves.
    retry_delay: seconds to wait before retrying a year after an error.

    The defaults reproduce the previously hard-coded values. A year is
    retried until scrap() finishes without raising; each failure is
    printed so a persistent problem is visible. A KeyboardInterrupt,
    whether raised during scraping or during the wait between retries,
    stops the whole run and makes the function return.
    '''
    try:
        for year in range(first_year, last_year - 1, -1):
            while True:
                try:
                    scrap(year, batch_size)
                    break
                except Exception as error:
                    # scrap() skips movies that are already saved, so a retry
                    # continues from the last checkpoint.
                    print("scrap(" + str(year) + ") failed: " + repr(error)
                          + " - retrying in " + str(retry_delay) + "s")
                    time.sleep(retry_delay)
    except KeyboardInterrupt:
        return

In [None]:
# Start the full scrape. Movies already saved in a year's pickle are skipped
# by scrap(), so re-running this cell continues a previous run.
scrap_all()

[1;30;43mLe flux de sortie a été tronqué et ne contient que les 5000 dernières lignes.[0m
starting: Mission Hill http://www.imdb.com/title/tt0084337/?ref_=adv_li_tt - done
starting: Tennessee Stallion http://www.imdb.com/title/tt0078376/?ref_=adv_li_tt - done
starting: Seraphita's Diary http://www.imdb.com/title/tt0081484/?ref_=adv_li_tt - done
starting: Rude http://www.imdb.com/title/tt0362100/?ref_=adv_li_tt - done
Saved!
COMPLETION: 81.6%
starting: Plainsong http://www.imdb.com/title/tt0084511/?ref_=adv_li_tt - done
starting: Lenz http://www.imdb.com/title/tt8760392/?ref_=adv_li_tt - done
starting: Boss Lady http://www.imdb.com/title/tt0310791/?ref_=adv_li_tt - done
starting: A Different Image http://www.imdb.com/title/tt0180645/?ref_=adv_li_tt - done
starting: Greater Than Gold http://www.imdb.com/title/tt1229782/?ref_=adv_li_tt - done
starting: Uncensored Cartoons http://www.imdb.com/title/tt0158300/?ref_=adv_li_tt - done
starting: Whitewater Sam http://www.imdb.com/title/tt0076