# Webscaping - downloading movie subtitles
### Szabolcs Márton Vetési
#### The code below is based on a subtitle downloader program which is used for my thesis about Movie analysis with NLP techniques. <br>The complete program can be found at https://github.com/vetszabolcs/movie_analysis/tree/main.<br>

#### The original dataset can be found at https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset.


In [1]:
import os
from os.path import join
from urllib.parse import urlencode
from requests import get
from bs4 import BeautifulSoup
import re
from sqlalchemy import create_engine
from sqlalchemy import text
import pandas as pd
import re
import sys
from time import sleep
from random import randint
from zipfile import ZipFile
from zipfile import BadZipfile
import shutil

### Constants

In [2]:
# Connections
URL = "https://yifysubtitles.org"  # subtitle website
SQL_CON = f"postgresql://postgres:{os.environ.get('SQL_PASS')}@localhost:5432/postgres"

# Locations
DATA_DIR = "./data"
SUBTITLES_DIR = join(DATA_DIR, "subtitles")
TEMP_DIR = join(SUBTITLES_DIR, "temp")

# Downloader
forbidden_chars = "[\\\\/:\\*\\?\"<>\\|]"  # characters that are not allowed in file names
header = {  # request header (~mimics a human made request)
    "authority": "yifysubtitles.org",
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-language": "hu,hu-HU;q=0.9,kn-IN;q=0.8,kn;q=0.7,en-US;q=0.6,en;q=0.5",
    "referer": "https://yifysubtitles.org/movie-imdb/tt7286456",
    "sec-ch-ua": "\".Not/A)Brand\";v=\"99\",\"Google Chrome\";v=\"103\",\"Chromium\";v=\"103\"",
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": "\"Windows\"",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36"
}

### Creating folders for the downloaded files

In [3]:
def create_folders():
    dirs = [DATA_DIR, SUBTITLES_DIR, TEMP_DIR]
    for d in dirs:
        if not os.path.exists(d):
            os.mkdir(d)

            
create_folders()

### Downloader functions

In [4]:
def get_movie_site(title):
    search = URL + "/search?" + urlencode({"q": title})
    soup = BeautifulSoup(get(search, headers=header).text, features="lxml")
    body = str(soup.find("div", class_="media-body"))
    movie_endp = re.search("href=\"(.*)\"", body)[1]  # finding the endpoint of the movie's site
    return URL + movie_endp


movie_site = get_movie_site("Joker (2019)")
movie_site

'https://yifysubtitles.org/movie-imdb/tt7286456'

In [5]:
# Get title AND download site if it's possible - if not then skip the actual movie since title validation is impossible
def get_title_and_download_site(movie_site) -> tuple:
    try:
        soup = BeautifulSoup(get(movie_site, headers=header).text, features="lxml")
        title = soup.find(class_="movie-main-title").text
        table = soup.find(class_="table other-subs")
        sub_link = re.search("\"/subtitle.*-english-yify.*?\"", str(table))[0].replace("\"", "")
        download_site = URL + sub_link
        return title, download_site
    except (TypeError, AttributeError):
        return None, None

    
title, download_site = get_title_and_download_site(movie_site)
    
print(f"Title: {title} Download site: {download_site}")

Title: Joker (2019) Download site: https://yifysubtitles.org/subtitles/joker-2019-english-yify-2599


In [6]:
# Get the link of the download button
def get_download_link(download_site):
    try:
        soup = BeautifulSoup(get(download_site, headers=header).text, features="html.parser")
        download_endp = soup.find(class_="download-subtitle").get("href")
        download_link = URL + download_endp
        return download_link
    except TypeError:
        return None
    
    
download_link = get_download_link(download_site)
download_link  # It's a zip file so decompressing is needed

'https://yifysubtitles.org/subtitle/joker-2019-english-yify-2599.zip'

In [7]:
# Downloading the zip file

download_path = os.path.join(TEMP_DIR, title + ".zip")

def download_file(url, download_path):
    r = get(url, headers=header)
    with open(download_path, 'wb') as out:
        out.write(r.content)
        
        
download_file(download_link, download_path)
print(f"File(s) in TEMP_DIR: {os.listdir(TEMP_DIR)}")

File(s) in TEMP_DIR: ['Joker (2019).zip']


### Processing the zip file

In [8]:
# Extracting the subtitle file
def extractor(zip_name, dest):
    re.sub(forbidden_chars, " ", zip_name).strip()
    # Possible extensions of a subtitle
    extensions = ("ass", "mkv", "mmc", "mpl2", "sami", "smi", "sbv", "scc", "srt", "ssa", "stl", "sub", "txt", "xml")
    with ZipFile(zip_name, "r") as zipf:
        files = zipf.namelist()
        for f in files:
            if f.endswith(extensions):
                zipf.extract(f, dest)
                break
                
                
extractor(download_path, TEMP_DIR)
print(f"File(s) in TEMP_DIR: {os.listdir(TEMP_DIR)}")

File(s) in TEMP_DIR: ['Joker (2019).zip', 'Joker.2019.720p.WEBRip.x264-[YTS.LT]-English.srt']


In [9]:
# Renaming the extracted subtitle
def renamer(zip_name, dest, temp_dir):
    extracted = [f for f in os.listdir(temp_dir) if not f.endswith(".zip")][0]  # glob does not recognize leading dot
    extension = "." + extracted.split(".")[-1]
    new_name = re.sub(".zip$", extension, zip_name.split("\\")[-1])
    new_name = re.sub(forbidden_chars, " ", new_name).strip()
    new_name = os.path.join(dest, new_name)
    if not os.path.exists(new_name):
        os.rename(os.path.join(temp_dir, extracted), new_name)
        sleep(1)
        
renamer(download_path, SUBTITLES_DIR, TEMP_DIR)
print(f"File(s) in SUBTITLES_DIR: {os.listdir(SUBTITLES_DIR)}")

File(s) in SUBTITLES_DIR: ['A Little Water (2019).srt', 'Asbury Park  Riot, Redemption, Rock & Roll (2019).srt', 'Full Count (2019).srt', 'Joker (2019).srt', 'Khalid  Free Spirit (2019).srt', 'No Ordinary Love (2019).srt', 'Running with the Devil (2019).srt', 'Ruta Madre (2019).srt', 'See You Yesterday (2019).srt', 'temp', 'The Soul Collector (2019).srt']


In [10]:
# Cleaning the temp folder
def cleanup(temp_dir):
    shutil.rmtree(temp_dir)
    os.mkdir(temp_dir)
    
cleanup(TEMP_DIR)
print(f"File(s) in TEMP_DIR: {os.listdir(TEMP_DIR)}")
print(f"File(s) in SUBTITLES_DIR: {os.listdir(SUBTITLES_DIR)}")

File(s) in TEMP_DIR: []
File(s) in SUBTITLES_DIR: ['A Little Water (2019).srt', 'Asbury Park  Riot, Redemption, Rock & Roll (2019).srt', 'Full Count (2019).srt', 'Joker (2019).srt', 'Khalid  Free Spirit (2019).srt', 'No Ordinary Love (2019).srt', 'Running with the Devil (2019).srt', 'Ruta Madre (2019).srt', 'See You Yesterday (2019).srt', 'temp', 'The Soul Collector (2019).srt']


### Database connection
#### More on the sql part: [SQL integration with Python](notebooks/sql_integration_with_python.html)

In [11]:
engine = create_engine(SQL_CON)

pd.read_sql(text('select * from movies.searched where "startYear" = 2019 limit 10'), SQL_CON)

Unnamed: 0,original_title_year,primary_title_year,startYear,searched,downloaded
0,Professor Wall im Bordell (2019),Professor Wall im Bordell (2019),2019,0,0
1,Loopers: The Caddie's Long Walk (2019),Loopers: The Caddie's Long Walk (2019),2019,1,0
2,The Lion King (2019),The Lion King (2019),2019,0,0
3,Skin in the Game (2019),Skin in the Game (2019),2019,0,0
4,Robbed (2019),Robbed (2019),2019,0,0
5,37 Seconds (2019),37 Seconds (2019),2019,0,0
6,The Runaways (2019),The Runaways (2019),2019,0,0
7,Eve (2019),Eve (2019),2019,0,0
8,A Shaun the Sheep Movie: Farmageddon (2019),A Shaun the Sheep Movie: Farmageddon (2019),2019,0,0
9,Celluloid (2019),Celluloid (2019),2019,0,0


In [12]:
def slice_sql(year, limit=None):
    """Getting titles to be searched"""
    if limit:
        query = text(f'select original_title_year, primary_title_year, "startYear"\
                         from movies.searched\
                         where searched = 0 and "startYear" = {year}\
                         order by "startYear" desc\
                         limit {limit}')
    else:
        query = text(f'select original_title_year, primary_title_year, "startYear"\
                         from movies.searched\
                         where searched = 0 and "startYear" = {year}\
                         order by "startYear" desc')
    return pd.read_sql(query, SQL_CON)


def check_download_count(year):
    query = text(f'select count(*)\
                     from movies.searched\
                     where downloaded = 1\
                     and "startYear" = {year}')
    res = engine.execute(query)
    count = [x[0] for x in res][0]
    return count


def update_searched(cond_val):
    query = f'update movies.searched\
                set searched = 1\
                where original_title_year = \'{cond_val}\''
    engine.execute(text(query))


def update_downloaded(cond_val):
    query = f'update movies.searched\
                set downloaded = 1\
                 where original_title_year = \'{cond_val}\''
    engine.execute(text(query))

### Combining everything

In [13]:
df = slice_sql(2019, limit=10)[["original_title_year", "primary_title_year"]]
df

Unnamed: 0,original_title_year,primary_title_year
0,Professor Wall im Bordell (2019),Professor Wall im Bordell (2019)
1,The Lion King (2019),The Lion King (2019)
2,Skin in the Game (2019),Skin in the Game (2019)
3,Robbed (2019),Robbed (2019)
4,37 Seconds (2019),37 Seconds (2019)
5,The Runaways (2019),The Runaways (2019)
6,Eve (2019),Eve (2019)
7,A Shaun the Sheep Movie: Farmageddon (2019),A Shaun the Sheep Movie: Farmageddon (2019)
8,Celluloid (2019),Celluloid (2019)
9,Dolorosa Gioia (2019),Dolorosa Gioia (2019)


In [14]:
#  Note: if the searched title was not found but the engine has other results I download the subtitle anyway
# (The purpose is to collect subtitles and their corresponding year not finding the subtitle of a specific movie)
for o, p in df.values:
    print(f"Searching {o}")
    sleep(randint(1, 4))  # wait some secs between searches to avoid overloading the server and mimic human behavior
    
    cond_val = o.replace("\'", "\'\'")  # reformat to sql readable
    update_searched(cond_val)  # update the db's searched column
    
    try:
        movie_site = get_movie_site(o)  # first try to find with the original title
    except TypeError:
        try:
            movie_site = get_movie_site(p)  # if it fails try with the primary one
        except TypeError:
            continue
    title, download_site = get_title_and_download_site(movie_site)
    print(f"Found subtitle for {o}")
    
    if download_site:  # if the download site is valid get the download link
        title = re.sub(forbidden_chars, " ", title).strip()
        print("Getting download link...")
        download_link = get_download_link(download_site)
    else:
        continue
        
    if download_link:  # If the download link is valid try to download the subtitle
        download_path = join(TEMP_DIR, title + ".zip")
        try:
            print("Downloading subtitle...")
            download_file(download_link, download_path)
            extractor(download_path, TEMP_DIR)
            renamer(download_path, SUBTITLES_DIR, TEMP_DIR)
            if title.lower() == p.lower() or title.lower() == o.lower():
                update_downloaded(cond_val)
                print("Updated download column")
            print(f"Downloaded - {title}")
        except (TypeError, IndexError, BadZipfile):
            continue

Searching Professor Wall im Bordell (2019)
Searching The Lion King (2019)
Found subtitle for The Lion King (2019)
Getting download link...
Downloading subtitle...
Updated download column
Downloaded - The Lion King (2019)
Searching Skin in the Game (2019)
Found subtitle for Skin in the Game (2019)
Searching Robbed (2019)
Searching 37 Seconds (2019)
Searching The Runaways (2019)
Found subtitle for The Runaways (2019)
Getting download link...
Downloading subtitle...
Updated download column
Downloaded - The Runaways (2019)
Searching Eve (2019)
Found subtitle for Eve (2019)
Getting download link...
Downloading subtitle...
Updated download column
Downloaded - Eve (2019)
Searching A Shaun the Sheep Movie: Farmageddon (2019)
Found subtitle for A Shaun the Sheep Movie: Farmageddon (2019)
Getting download link...
Downloading subtitle...
Downloaded - A Shaun the Sheep Movie  Farmageddon (2019)
Searching Celluloid (2019)
Searching Dolorosa Gioia (2019)


### Verifying that the database is updated

In [15]:
pd.read_sql(text(
    "select * from movies.searched\
    where original_title_year = 'The Lion King (2019)'\
    or original_title_year = 'Dolorosa Gioia (2019)'"), SQL_CON)

Unnamed: 0,original_title_year,primary_title_year,startYear,searched,downloaded
0,Dolorosa Gioia (2019),Dolorosa Gioia (2019),2019,1,0
1,The Lion King (2019),The Lion King (2019),2019,1,1
