# Beautiful Soup for Web Scraping

Beginner-friendly introduction to Beautiful Soup, a library for pulling data out of HTML and XML files.

In [1]:
import re
from pathlib import Path

import requests
from bs4 import BeautifulSoup as Soup

## Getting A Response

In [32]:
# Base URL of the site; the cells below build page URLs by appending paths to it.
root = 'https://subslikescript.com'

In [None]:
script_url = f'{root}/movie/Titanic-120338'  # page of a single movie transcript
# Send the GET request. The timeout keeps the cell from hanging forever when the
# server never answers (requests applies no timeout unless one is given).
result = requests.get(script_url, timeout=10)
content = result.text  # response body (the page HTML) decoded to a string
print(result)  # shows the Response object, e.g. <Response [200]> on success

## Printing The Response Text

In [None]:
soup = Soup(content, 'lxml') # parse the HTML text into a tree using the lxml parser
print(soup.prettify()) # prettified (indented) markup of the parsed page

## Finding A Single Element

In [12]:
# find() returns the first <article> element carrying the CSS class
# 'main-article'; the following cells search inside this element only.
box = soup.find('article', class_='main-article')

In [None]:
# Text of the first <h1> inside the article box; later used as the file name.
title = box.find('h1').get_text()
title  # bare expression so the notebook displays the value

In [None]:
# Text of the 'full-script' <div>: each text fragment is stripped of surrounding
# whitespace and the fragments are joined with single spaces.
transcript = box.find('div', class_='full-script').get_text(strip=True, separator=' ')
transcript  # bare expression so the notebook displays the value

## Saving Outputs to File

In [None]:
# Make sure the output folder exists; open() does not create missing directories.
out_dir = Path('data/scripts')
out_dir.mkdir(parents=True, exist_ok=True)
# The title is free text scraped from the page: replace path separators and
# characters that are not allowed in file names so it can be used as one.
safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)
# Explicit UTF-8 so non-ASCII characters in the transcript are written
# regardless of the platform's default encoding.
with open(out_dir / f'{safe_title}.txt', 'w', encoding='utf-8') as file:
    file.write(transcript)

## Scraping Multiple Links


In [None]:
url = f'{root}/movies'  # listing page whose links are followed in the cells below
# Send the GET request. The timeout keeps the cell from hanging forever when the
# server never answers (requests applies no timeout unless one is given).
result = requests.get(url, timeout=10)
content = result.text  # response body (the page HTML) decoded to a string
print(result)  # shows the Response object, e.g. <Response [200]> on success

In [None]:
soup = Soup(content, 'lxml') # parse the listing page's HTML using the lxml parser
print(soup.prettify()) # prettified (indented) markup of the parsed page

In [37]:
# Restrict the search to the main article of the listing page.
box = soup.find('article', class_='main-article')

# href value of every <a> tag inside the box that actually has an href attribute.
links = [anchor['href'] for anchor in box.find_all('a', href=True)]

In [None]:
# Show the href values collected from the listing page.
print(links)

In [53]:
# Fetch a page and extract its main article box.
def get_box(url, timeout=10):
    """Download ``url`` and return its ``<article class="main-article">`` element.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` waits indefinitely, which would stall a long
            scraping loop on a single unresponsive page.

    Returns:
        The first matching ``article`` tag, or ``None`` when the page has no
        such element.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on error responses instead of parsing an error page as content.
    response.raise_for_status()
    return Soup(response.text, 'lxml').find('article', class_='main-article')

# Save the transcript found in a movie page's article box to a text file.
def save_transcripts(box):
    """Write the transcript contained in ``box`` to ``data/scripts/<title>.txt``.

    Args:
        box: Parsed element containing the page's ``<h1>`` title and the
            ``<div class="full-script">`` transcript text.

    The output folder is created when missing. Characters in the title that
    are path separators or are not allowed in file names are replaced with
    ``_``. The file is written as UTF-8.
    """
    # Extract the specific elements.
    title = box.find('h1').get_text()
    transcript = box.find(
        'div', class_='full-script'
    ).get_text(strip=True, separator=' ')

    # The title is free text scraped from the page, so make it safe to use as
    # a file name before building the path.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)

    # open() does not create missing directories.
    out_dir = Path('data/scripts')
    out_dir.mkdir(parents=True, exist_ok=True)

    # Explicit UTF-8 so non-ASCII characters are written regardless of the
    # platform's default encoding.
    with open(out_dir / f'{safe_title}.txt', 'w', encoding='utf-8') as file:
        file.write(transcript)

In [None]:
# Fetch every page linked from the listing and print its <h1> text.
for link in links:
    # hrefs are appended to root, i.e. they are treated as site-relative paths
    url = f'{root}{link}'
    movie_box = get_box(url)
    print(movie_box.find('h1').get_text())
    # Saving is disabled here; uncomment the next line to also write each
    # transcript to a file.
    # save_transcripts(movie_box)

## Pagination

In [None]:
# Locate the pagination list and read the number of the last page from it.
pagination = soup.find('ul', class_='pagination')
pages = pagination.find_all('li', class_='page-item')
# Takes the second-to-last item; presumably the final item is a "next" control
# rather than a page number — verify against the page markup.
last_page = pages[-2].text

In [None]:
# Walk the remaining listing pages and print the <h1> text of every linked page.
# Starts at 2, presumably because the plain /movies URL fetched earlier is page 1.
for page_num in range(2, int(last_page)+1):
    page_url = f'{root}/movies?page={page_num}'
    list_box = get_box(page_url)
    # href of every <a> tag inside this listing page's main article
    # (note: this rebinds the notebook-level `links` on each iteration)
    links = [link['href'] for link in list_box.find_all('a', href=True)]
    for link in links:
        # hrefs are appended to root, i.e. they are treated as site-relative paths
        movie_url = f'{root}{link}'
        movie_box = get_box(movie_url)
        print(movie_box.find('h1').get_text())