## Scraping Phase

In [3]:
import pandas as pd
import numpy as np
import re

import seaborn as sns
import matplotlib.pyplot as plt

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import requests
from bs4 import BeautifulSoup
from time import sleep
import os
from dotenv import load_dotenv

load_dotenv()


True

In [3]:
mdb = os.getenv("moviedb_token")
mdb2 = os.getenv("moviedb_token2")

In [6]:
url_types = "https://moviesdatabase.p.rapidapi.com/titles/utils/titleTypes"

headers = {
	"x-rapidapi-key": mdb,
	"x-rapidapi-host": "moviesdatabase.p.rapidapi.com"
}

response_types = requests.get(url_types, headers=headers)

print(response_types.json())

{'results': [None, 'movie', 'musicVideo', 'podcastEpisode', 'podcastSeries', 'short', 'tvEpisode', 'tvMiniSeries', 'tvMovie', 'tvPilot', 'tvSeries', 'tvShort', 'tvSpecial', 'video', 'videoGame']}


In [225]:
def get_movies(types, genres, years, n_pages):
    movies = []
    key = mdb
    for title_type in types:
        print(title_type)
        for genre in genres:
            print(genre)
            for year in tqdm(years):
                for page in range(1,n_pages+1):
                    url = "https://moviesdatabase.p.rapidapi.com/titles"
                    querystring = {"genre":genre,"year":f"{year}","titleType":title_type,"limit":"50", "page" : page}
                    headers = {
                        "x-rapidapi-key": key,
                        "x-rapidapi-host": "moviesdatabase.p.rapidapi.com"
                    }

                    response = requests.get(url, headers=headers, params = querystring)
                    if response.status_code != 200:
                        print("Request limit reached, changing key...")
                        key = mdb2
                        headers = {
                        "x-rapidapi-key": key,
                        "x-rapidapi-host": "moviesdatabase.p.rapidapi.com"
                        }   
                        response = requests.get(url, headers=headers, params = querystring)

                    response = response.json()
                    result_len = len(response["results"])
                    if response["page"] == 1 or result_len == 0:
                        print(f"Limit page reached, page : {page}")
                        break
                    movie_year = [response["results"][i]["releaseYear"]["year"] for i in range(result_len)]
                    movie_month = [response["results"][i]["releaseDate"]["month"] if response["results"][i]["releaseDate"] else None  for i in range(result_len)]
                    movie_title = [response["results"][i]["titleText"]["text"] for i in range(result_len)]
                    movie_id = [response["results"][i]["id"] for i in range(result_len)]
                    
                    movies_page_list = [(title_type,my,mm,mt,mi,genre) for my, mm, mt, mi in zip(movie_year, movie_month, movie_title, movie_id)]
                    movies += movies_page_list
    return movies


In [226]:
title_types = ["movie", "short"]
genres = ["Drama", "Comedy", "Action", "Fantasy", "Horror", "Mystery", "Romance", "Thriller"]
years = range(1990,2025)


response = get_movies(title_types, genres, years, n_pages=1)

movie
Drama


  0%|          | 0/35 [00:00<?, ?it/s]

Request limit reached, changing key...


100%|██████████| 35/35 [00:13<00:00,  2.51it/s]


Comedy


100%|██████████| 35/35 [00:13<00:00,  2.67it/s]


Action


100%|██████████| 35/35 [00:12<00:00,  2.83it/s]


Fantasy


100%|██████████| 35/35 [00:13<00:00,  2.65it/s]


Horror


100%|██████████| 35/35 [00:13<00:00,  2.67it/s]


Mystery


100%|██████████| 35/35 [00:12<00:00,  2.89it/s]


Romance


100%|██████████| 35/35 [00:12<00:00,  2.69it/s]


Thriller


100%|██████████| 35/35 [00:10<00:00,  3.21it/s]


short
Drama


100%|██████████| 35/35 [00:12<00:00,  2.85it/s]


Comedy


100%|██████████| 35/35 [00:12<00:00,  2.78it/s]


Action


100%|██████████| 35/35 [00:11<00:00,  3.16it/s]


Fantasy


100%|██████████| 35/35 [00:12<00:00,  2.72it/s]


Horror


100%|██████████| 35/35 [00:11<00:00,  3.01it/s]


Mystery


100%|██████████| 35/35 [00:13<00:00,  2.66it/s]


Romance


100%|██████████| 35/35 [00:11<00:00,  3.09it/s]


Thriller


100%|██████████| 35/35 [00:12<00:00,  2.73it/s]


In [251]:
df_response = pd.DataFrame(response, columns = ["type", "year", "month", "name", "id", "genre"])
df_response.to_csv("datos/temporal_result.csv")

In [252]:
df_response.head()

Unnamed: 0,type,year,month,name,id,genre
0,movie,1990,10.0,Jahrgang 45,tt0059325,Drama
1,movie,1990,10.0,"Wenn du groß bist, lieber Adam",tt0059900,Drama
2,movie,1990,10.0,"Vojtech, receny sirotek",tt0065188,Drama
3,movie,1990,8.0,Domo Arigato,tt0068494,Drama
4,movie,1990,3.0,Spy Story,tt0075259,Drama


In [16]:
def get_movie_info(id):
    options = webdriver.ChromeOptions()
    # options.add_argument("--headless") 
    options.add_argument("--lang=en")
    driver = webdriver.Chrome(options = options)
    driver.get(f"https://www.imdb.com/title/{id}/")
    WebDriverWait(driver, 5).until(EC.visibility_of_element_located((By.XPATH, "//*[@data-testid='hero-rating-bar__aggregate-rating__score']")))
    imdb_container = driver.find_element(By.XPATH, "//*[@data-testid='hero-rating-bar__aggregate-rating__score']")
    info_container = driver.find_element(By.XPATH, '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[1]/section/div[2]/div/ul')
    info_ele = info_container.find_elements(By.XPATH, './li')
    staff_dict = {e.text.split('\n')[0] :re.sub(r'(?<![A-Z\W])(?=[A-Z])', '|', e.text.split('\n')[1]).split("|")[1:] for e in info_ele}
    description = driver.find_element(By.XPATH, "//*[@data-testid='plot']").text
    score = imdb_container.text.split()[0]
    try:
        duration = re.search("(\d+h\s)*\d{2}m", driver.find_element(By.XPATH, '//*[@id="__next"]/main/div/section[1]/section/div[3]/section/section/div[2]/div[1]/ul').text).group(0)
    except:
        duration = None
    movie_info = (id, score, list(staff_dict.items())[0][1], list(staff_dict.items())[1][1], duration, description)
    driver.quit()
    return movie_info

In [8]:
infos = []
for id in df_response["id"].unique()[:4]:
    movie_info = get_movie_info(id)
    infos.append(movie_info)


In [9]:
def to_minutes(text):
    text = text.replace("h ", ":").replace("m", "")
    splitted = text.split(":")
    try:
        return int(splitted[0])*60+int(splitted[1])
    except:
        return int(splitted[0])
     

In [10]:
df_response = pd.read_csv("datos/temporal_result.csv", index_col=0)

In [11]:
df_sample = df_response[(df_response["type"] == "movie") & (df_response["genre"] == "Drama")].reset_index(drop=True)

In [None]:
df_info = pd.DataFrame()
with ThreadPoolExecutor(max_workers=10) as executor:
    futures = []
    for id in df_sample["id"].unique()[:40]:
        futures.append(executor.submit(get_movie_info, id=id))
    for future in as_completed(futures):
        result = pd.DataFrame(future.result(), index = ["id", "rating", "directors", "writers", "duration", "description"]).T
        df_info = pd.concat([df_info, result], axis = 0)

In [11]:
df_info.reset_index(drop = True, inplace=True)

In [15]:
df_response

Unnamed: 0,type,year,month,name,id,genre
0,movie,1990,10.0,Jahrgang 45,tt0059325,Drama
1,movie,1990,10.0,"Wenn du groß bist, lieber Adam",tt0059900,Drama
2,movie,1990,10.0,"Vojtech, receny sirotek",tt0065188,Drama
3,movie,1990,8.0,Domo Arigato,tt0068494,Drama
4,movie,1990,3.0,Spy Story,tt0075259,Drama
...,...,...,...,...,...,...
27018,short,2024,2.0,Under the Influence,tt30062866,Thriller
27019,short,2024,2.0,Poor Ray,tt30097808,Thriller
27020,short,2024,1.0,Another,tt30101647,Thriller
27021,short,2024,,Resolute,tt30185836,Thriller


In [18]:
df_info["duration"] = df_info["duration"].apply(to_minutes)
df_info["rating"] = df_info["rating"].astype(float)

df_info.head()

Unnamed: 0,id,rating,directors,writers,duration,description
0,tt0059900,6.5,[Egon Günther],"[Egon Günther, Helga Schütz]",78,Adam receives a flashlight with special powers...
1,tt0093210,5.9,[Fernando Durán Rojas],[Jorge Barragán],102,A hobo (Julio Alemán) is given the power to kn...
2,tt0081721,6.2,[Michael Rubbo],[Michael Rubbo],100,Jo meets a mysterious art dealer who buys some...
3,tt0093989,6.4,[Cheh Chang],[Cheh Chang],115,The friendship between a thief turned Chinese ...
4,tt0090665,6.7,[Férid Boughedir],"[Férid Boughedir, Nouri Bouzid, Taoufik Jebali]",98,"Noura struggles to reconcile two worlds, Musli..."


In [20]:
df_info.shape

(30, 6)