# Парсинг данных


На парсинг 50000 фильмов уходит ~20 минут. Данные не всегда причёсаны (например, год выхода); это обрабатывается в следующих лабах (хотя конкретно для года обычно берётся год начала производства).


In [None]:
from requests import get
import requests
from bs4 import BeautifulSoup
import warnings
from time import sleep
from random import randint
import numpy as np
import seaborn as sns
import pandas as pd
from tqdm import tqdm
from google.colab import drive
import re

import json

import asyncio
import aiohttp

In [None]:
warnings.filterwarnings("ignore")

In [None]:
# Lift the pandas display limits so frames are shown with all columns and rows.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

In [None]:
# Mount Google Drive at /content/drive (uses the google.colab `drive` helper).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!mkdir '/content/drive/MyDrive/Lab1'

mkdir: cannot create directory ‘/content/drive/MyDrive/Lab1’: File exists


In [None]:
# Load crawl settings from config.json when the file is present.
try:
    with open('config.json', 'r', encoding='utf-8') as f:
        params = json.load(f)
except FileNotFoundError:
    # A missing file is treated as "no external settings" rather than a fatal
    # error, so the notebook can continue with the settings defined inline.
    params = {}
    print("config.json not found; falling back to the settings defined in the notebook")

FileNotFoundError: ignored

In [None]:
# Lab1 directory on the mounted Google Drive; the trailing slash lets a file
# name be appended directly.
path_data: str = '/content/drive/MyDrive/Lab1/'

In [None]:
# Crawl settings: the search URL to page through and how many movies to request
# (kept as a string here; it is converted with int() where the crawler is built).
params = dict(
    url="https://www.imdb.com/search/title/?genres=drama",
    movies_to_parse="50000",
)

In [None]:
class Crawler:
    """Page through a search-results URL and collect movie fields into a table.

    Each page is downloaded, wrapped in a ``Parser`` and queried for ten
    fields. The per-page results are appended to the ``all_*`` accumulators
    (one ``pd.Series`` per field), which ``get_dataset`` finally joins
    column-wise into a single ``pd.DataFrame``.

    Args:
        movies_to_parse: Upper bound for the ``start`` offsets that are
            requested (offsets run 1, 51, 101, ... below this value).
        path_url: Base search URL; paging query parameters are appended to it.
    """

    # Increment of the ``start`` query parameter between consecutive requests.
    PAGE_SIZE = 50
    # Seconds to wait for the server before a page request is abandoned;
    # without a timeout a stalled connection would block the crawl forever.
    REQUEST_TIMEOUT = 30

    # Accumulator attribute behind each output column, in ``self.columns`` order.
    _SERIES_ATTRS = (
        'all_titles',
        'all_genres',
        'all_years',
        'all_certificates',
        'all_runtimes',
        'all_votes',
        'all_imdb_scores',
        'all_metacritic_scores',
        'all_descriptions',
        'all_stars',
    )

    def __init__(self, movies_to_parse, path_url):
        self.movies_to_parse = movies_to_parse
        self.path_url = path_url
        self.columns = ['title', 'genre', 'year', 'certificate', 'runtime', 'user-votes', 'imdb-scores', 'metacritic-scores', 'descriptions', 'stars']

        # Explicit dtype: a bare pd.Series() has a version-dependent default
        # dtype and triggers a deprecation warning on older pandas.
        for attr in self._SERIES_ATTRS:
            setattr(self, attr, pd.Series(dtype=object))

    def get_dataset(self):
        """Crawl all pages and return the collected fields as a DataFrame.

        Returns:
            A ``pd.DataFrame`` whose columns are ``self.columns``.
        """
        self.iterate_over_pages()
        return self._build_frame()

    def _build_frame(self):
        """Join the accumulators column-wise and label them with ``self.columns``."""
        data = pd.concat(
            [getattr(self, attr) for attr in self._SERIES_ATTRS],
            axis=1,
            ignore_index=True,
        )
        data.columns = self.columns
        return data

    def iterate_over_pages(self):
        """Download every results page and append its fields to the accumulators."""
        for start in tqdm(range(1, self.movies_to_parse, self.PAGE_SIZE)):
            url = f"{self.path_url}&start={start}&ref_=adv_nxt"
            parser = Parser(self.get_bs(url))

            self.get_titles(parser)
            self.get_years(parser)
            self.get_genres(parser)
            self.get_age_rating(parser)
            self.get_runtimes(parser)
            self.get_imdb_scores(parser)
            self.get_metacritic_scores(parser)
            self.get_votes(parser)
            self.get_descriptions(parser)
            self.get_stars(parser)

            # The columns are accumulated independently, so a field that came
            # back shorter on this page would shift every later row. Padding
            # after each page keeps the damage local to that page.
            padded = self._pad_columns()
            if padded:
                tqdm.write(f"start={start}: padded shorter columns {padded}")

    def _pad_columns(self):
        """Extend shorter accumulators with missing values to a common length.

        Returns:
            The names of the accumulator attributes that had to be padded.
        """
        lengths = {attr: len(getattr(self, attr)) for attr in self._SERIES_ATTRS}
        target = max(lengths.values())
        short = [attr for attr, size in lengths.items() if size < target]
        for attr in short:
            setattr(self, attr, getattr(self, attr).reindex(range(target)))
        return short

    def get_bs(self, url):
        """Fetch ``url`` and return its body parsed with ``html.parser``."""
        response = requests.get(url=url, timeout=self.REQUEST_TIMEOUT)
        return BeautifulSoup(response.text, "html.parser")

    def _append(self, attr, page_values):
        """Append one page worth of values to the accumulator named ``attr``."""
        combined = pd.concat([getattr(self, attr), page_values], ignore_index=True)
        setattr(self, attr, combined)

    def get_titles(self, parser):
        """Append the page's titles to ``all_titles``."""
        self._append('all_titles', parser.unique_movie_title())

    def get_genres(self, parser):
        """Append the page's genres to ``all_genres``."""
        self._append('all_genres', parser.unique_movie_genre())

    def get_years(self, parser):
        """Append the page's years to ``all_years``."""
        self._append('all_years', parser.unique_movie_year())

    def get_age_rating(self, parser):
        """Append the page's age certificates to ``all_certificates``."""
        self._append('all_certificates', parser.unique_movie_age_rating())

    def get_runtimes(self, parser):
        """Append the page's runtimes to ``all_runtimes``."""
        self._append('all_runtimes', parser.unique_movie_runtime())

    def get_imdb_scores(self, parser):
        """Append the page's IMDb scores to ``all_imdb_scores``."""
        self._append('all_imdb_scores', parser.unique_movie_imdb_score())

    def get_votes(self, parser):
        """Append the page's vote counts to ``all_votes``."""
        self._append('all_votes', parser.unique_movie_vote())

    def get_metacritic_scores(self, parser):
        """Append the page's Metacritic scores to ``all_metacritic_scores``."""
        self._append('all_metacritic_scores', parser.unique_movie_metacritic_score())

    def get_descriptions(self, parser):
        """Append the page's descriptions to ``all_descriptions``."""
        self._append('all_descriptions', parser.unique_movie_description())

    def get_stars(self, parser):
        """Append the page's stars text to ``all_stars``."""
        self._append('all_stars', parser.unique_stars())

In [None]:
class Parser:
    """Extract per-movie fields from one parsed search-results page.

    Every ``unique_*`` method walks the ``div.lister-item-content`` cards of
    ``self.soup`` in document order and returns a ``pd.Series`` holding one
    entry per card, so the Series produced by different methods line up row
    by row.

    Args:
        soup: Parsed page (a BeautifulSoup object) to read the cards from.
    """

    # Placeholder stored when a card does not carry an optional field.
    NO_DATA = "No data"

    def __init__(self, soup):
        self.soup = soup

    def _cards(self):
        """Return all movie cards (``div.lister-item-content``) of the page."""
        return self.soup.find_all("div", class_="lister-item-content")

    @staticmethod
    def _clean_year(raw_year):
        """Reduce a raw year label to its first four-digit year.

        A range such as ``(2019–2022)`` yields ``2019``. The previous
        implementation removed every non-digit first, which also removed the
        separator and glued both years together. When no four-digit run is
        present, the digits-only form of the label is returned.
        """
        match = re.search(r'\d{4}', raw_year)
        if match:
            return match.group()
        return re.sub(r'[^0-9]+', '', raw_year)

    # 1. Title
    def unique_movie_title(self):
        """Return the link text of each card's header as the movie title."""
        titles = [
            card.find("h3", class_="lister-item-header").find("a").text
            for card in self._cards()
        ]
        return pd.Series(titles)

    # 2. Production year
    def unique_movie_year(self):
        """Return the start year of each card as a string of digits."""
        raw_years = [
            card.find("h3", class_="lister-item-header")
            .find("span", class_="lister-item-year text-muted unbold")
            .text
            for card in self._cards()
        ]
        return pd.Series([self._clean_year(raw_year) for raw_year in raw_years])

    # 3. Genre
    def unique_movie_genre(self):
        """Return each card's genre text with all spaces removed."""
        raw_genres = [
            card.find("p", class_="text-muted").find("span", class_="genre").text
            for card in self._cards()
        ]
        return pd.Series([genre.replace(' ', '').strip() for genre in raw_genres])

    # 4. Age certificate
    def unique_movie_age_rating(self):
        """Return each card's certificate, or ``"No data"`` when it is absent."""
        ratings = []
        for card in self._cards():
            try:
                certificate = card.find("p", class_="text-muted").find("span", class_="certificate").text
            except AttributeError:
                # find() returned None somewhere along the chain.
                certificate = self.NO_DATA
            ratings.append(certificate)
        return pd.Series(ratings)

    # 5. Runtime
    def unique_movie_runtime(self):
        """Return each card's runtime without the `` min`` suffix, or ``"No data"``."""
        runtimes = []
        for card in self._cards():
            try:
                runtime_tag = card.find("p", class_="text-muted").find("span", class_="runtime")
                runtime = runtime_tag.text.removesuffix(" min")
            except AttributeError:
                # The paragraph or the runtime span is missing.
                runtime = self.NO_DATA
            runtimes.append(runtime)
        return pd.Series(runtimes)

    # 6. IMDb score
    def unique_movie_imdb_score(self):
        """Return each card's IMDb rating text (stripped), or ``"No data"``."""
        scores = []
        for card in self._cards():
            try:
                rating_tag = card.find("div", class_="ratings-bar").find("div", class_="inline-block ratings-imdb-rating")
                score = rating_tag.text
            except AttributeError:
                # The ratings bar or the rating block is missing.
                score = self.NO_DATA
            scores.append(score.strip())
        return pd.Series(scores)

    # 7. Number of votes
    def unique_movie_vote(self):
        """Return the first run of digits found in each card's votes paragraph.

        The search runs over the paragraph's serialized markup, so digits in
        attributes count as well. A missing paragraph serializes to ``"None"``,
        which contains no digits and therefore yields ``"No data"``.
        """
        votes = []
        for card in self._cards():
            votes_tag = card.find("p", class_="sort-num_votes-visible")
            match = re.search(r'[0-9]+', str(votes_tag))
            votes.append(match.group() if match else self.NO_DATA)
        return pd.Series(votes)

    # 8. Metacritic score
    def unique_movie_metacritic_score(self):
        """Return the first number in each card's metascore block, or ``"No data"``."""
        scores = []
        for card in self._cards():
            try:
                ratings_bar = card.find("div", class_="ratings-bar")
                metascore_text = ratings_bar.find("div", class_="inline-block ratings-metascore").text
                score = re.findall(r'\d+', metascore_text)[0]
            except (AttributeError, IndexError):
                # Either a block is missing or it contains no digits.
                score = self.NO_DATA
            scores.append(score)
        return pd.Series(scores)

    # 9. Descriptions
    def unique_movie_description(self):
        """Return the text of each card's second paragraph when it is ``text-muted``.

        The lookup is done per card, so a card without such a paragraph
        contributes ``None`` instead of silently shortening the result, which
        would misalign this column with the others.
        """
        descriptions = []
        for card in self._cards():
            paragraphs = card.find_all("p", recursive=False)
            description = None
            if len(paragraphs) >= 2 and "text-muted" in (paragraphs[1].get("class") or []):
                description = paragraphs[1].text.strip('\n')
            descriptions.append(description)
        return pd.Series(descriptions)

    # 10. Stars
    def unique_stars(self):
        """Return the text of each card's third paragraph, or ``None`` when absent.

        As with descriptions, the per-card lookup guarantees one entry per
        card so the column stays aligned with the others.
        """
        stars = []
        for card in self._cards():
            paragraphs = card.find_all("p", recursive=False)
            star_text = paragraphs[2].text.strip('\n') if len(paragraphs) >= 3 else None
            stars.append(star_text)
        return pd.Series(stars)

In [None]:
# Build the crawler from the configured movie count and search URL, then run
# the full crawl to obtain the movie table.
crawl = Crawler(
    movies_to_parse=int(params["movies_to_parse"]),
    path_url=params["url"],
)
df_movies = crawl.get_dataset()


100%|██████████| 1000/1000 [19:57<00:00,  1.20s/it]


In [None]:
# Write through path_data (the absolute Lab1 directory on the mounted Drive)
# so the export does not depend on the current working directory.
df_movies.to_csv(path_data + 'movies_50000.csv')