In [1]:
#############
# LIBRARIES #
#############

import os

import requests
from selenium import webdriver
# Build the Chrome options used for the Selenium session.
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')  # do not abort on invalid TLS certificates
options.add_argument('--incognito')  # use a private browsing profile
options.add_argument('--headless')  # run without opening a visible browser window
# Creating the driver launches Chrome as a side effect of running this cell.
# NOTE(review): `driver` is not used by the cells visible here (the scraper below
# goes through `requests`) -- confirm it is still needed, and call `driver.quit()`
# when finished so the browser process is not left running.
driver = webdriver.Chrome(options=options)

from bs4 import BeautifulSoup as bs
import time
import bs4.element
from urllib.parse import urljoin

import pandas as pd
import string

Main URL: `https://kids-in-mind.com/{title_starts_with_letter}.htm` — one index page per starting letter of the movie title (e.g. https://kids-in-mind.com/a.htm, https://kids-in-mind.com/b.htm, etc.)

...we assign each film three distinct, category-specific ratings: one for SEX & NUDITY, one for VIOLENCE & GORE and one for LANGUAGE. Each rating is on a scale of zero to ten, depending on quantity (more F-words, for instance, will mean a higher Language rating, and so on) as well as context (especially when it comes to the categories of sex, nudity, violence and gore, since they are not as easily quantifiable as profanity).

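We'll collect the title, year, MPAA rating, content ratings, and description for each movie. The first four come from a single pipe-delimited line on the movie page, for example: `<div class="et_pb_text_inner"><p>Abandon <span style="font-size:14px !important">| 2002 | PG-13 | - 4.4.4</span></p></div>`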



In [49]:
#####################################################
# Function to scrape the content of the KIM website #
#####################################################

def get_movie_info(letter: str, delay: float = 1, timeout: float = 30) -> pd.DataFrame:
    """Scrape the Kids-In-Mind ratings of every movie listed on one index page.

    The index page ``https://kids-in-mind.com/{letter}.htm`` is fetched, every
    link in its third ``et_pb_text_inner`` block is followed, and the movie
    info line (``Title | Year | MPAA | - S.V.L``) and description are read from
    each movie page.

    Parameters
    ----------
    letter : str
        Index page to scrape (the part of the URL before ``.htm``).
    delay : float, default 1
        Seconds to wait after each movie-page request. Use 0 to disable.
    timeout : float, default 30
        Timeout in seconds passed to every ``requests.get`` call.

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed movie with the string columns
        ``movie_title``, ``movie_year``, ``movie_rating``, ``sex_nudity``,
        ``violence_gore``, ``language`` and ``movie_description``. Fields are
        stripped of surrounding whitespace. The frame is empty (but still has
        these columns) when the index page cannot be read or no movie parses.

    Notes
    -----
    Pages that return an error status or do not follow the expected layout are
    skipped with a printed message instead of being silently dropped.
    Exceptions raised by ``requests.get`` itself are not caught.
    """
    columns = ['movie_title', 'movie_year', 'movie_rating',
               'sex_nudity', 'violence_gore', 'language', 'movie_description']
    records = []
    URI = f'https://kids-in-mind.com/{letter}.htm'

    response = requests.get(URI, timeout=timeout)
    if not response.ok:
        print(f'Request for {URI} failed with status {response.status_code}')
        return pd.DataFrame(records, columns=columns)
    print('Success')

    index_blocks = _kim_text_blocks(response.content)
    if len(index_blocks) < 3:
        print(f'Unexpected layout for {URI}: movie list not found')
        return pd.DataFrame(records, columns=columns)

    # The third "et_pb_text_inner" block is the one whose <a> tags are followed;
    # hrefs are resolved against the index URL.
    movie_links = index_blocks[2].find_all('a', href=True)
    movie_urls = [urljoin(URI, link['href']) for link in movie_links]

    for movie_url in movie_urls:
        print(f'Getting info for {movie_url}')
        movie_page = requests.get(movie_url, timeout=timeout)
        record = _kim_movie_record(movie_page.content) if movie_page.ok else None
        if record is None:
            print(f'Skipping {movie_url}: page unavailable or not in the expected format')
        else:
            # Info and description are added together as one record, so a
            # partially parsed page can never shift later rows out of alignment.
            records.append(record)
        if delay > 0:
            time.sleep(delay)  # be polite to the server between requests

    return pd.DataFrame(records, columns=columns)


def _kim_text_blocks(html):
    """Return every <div class="et_pb_text_inner"> element found in ``html``."""
    return bs(html, 'html.parser').find_all('div', class_="et_pb_text_inner")


def _kim_movie_record(html):
    """Build one output row (a dict) from a movie page, or None if it cannot be parsed."""
    blocks = _kim_text_blocks(html)
    try:
        movie_info = blocks[1].find('p').text
        movie_description = blocks[2].find('p').text
    except (IndexError, AttributeError):
        # Too few blocks, or a block without a <p> element.
        return None
    record = _split_movie_info(movie_info)
    if record is None:
        return None
    record['movie_description'] = movie_description
    return record


def _split_movie_info(movie_info):
    """Split 'Title | Year | MPAA | - S.V.L' into named fields, or return None if malformed."""
    # Split from the right so a "|" inside the title does not break the parse.
    parts = [part.strip() for part in movie_info.rsplit('|', 3)]
    if len(parts) != 4:
        return None
    title, year, rating, kim_ratings = parts
    scores = [score.strip() for score in kim_ratings.strip('- ').split('.')]
    if len(scores) != 3:
        return None
    return {
        'movie_title': title,
        'movie_year': year,
        'movie_rating': rating,
        'sex_nudity': scores[0],
        'violence_gore': scores[1],
        'language': scores[2],
    }


In [50]:
# Scrape every movie linked from the "z" index page, then display the resulting table.
movies_df = get_movie_info('z')
movies_df

Success
Getting info for https://kids-in-mind.com/z/zackandmirimakeaporno.htm
Getting info for https://kids-in-mind.com/z/zack-snyders-justice-league-parents-guide-movie-review-rating.htm
Getting info for https://kids-in-mind.com/z/zathura.htm
Getting info for https://kids-in-mind.com/z/zerodarkthirty.htm
Getting info for https://kids-in-mind.com/z/zero_effect_1998__247.htm
Getting info for https://kids-in-mind.com/z/zeros-and-ones-parents-guide-movie-review-rating.htm
Getting info for https://kids-in-mind.com/z/zerotheorem.htm
Getting info for https://kids-in-mind.com/z/zeus_and_roxanne_1997__111.htm
Getting info for https://kids-in-mind.com/z/zforzachariah.htm
Getting info for https://kids-in-mind.com/z/zodiac.htm
Getting info for https://kids-in-mind.com/z/zombieland.htm
Getting info for https://kids-in-mind.com/z/zombieland-double-tap-parents-guide-movie-review-rating.htm
Getting info for https://kids-in-mind.com/z/zombiestrippers.htm
Getting info for https://kids-in-mind.com/z/zoo

Unnamed: 0,movie_title,movie_year,movie_rating,sex_nudity,violence_gore,language,movie_description
0,Zack and Miri Make a Porno,2008,R,10,2,10,Two friends (Seth Rogen and Elizabeth Banks) h...
1,Zathura,2005,PG,1,5,2,"Sequel of sorts to 1995's ""Jumanji"": Two bicke..."
2,Zero Dark Thirty,2012,R,4,7,9,The events leading up to the killing of Osama ...
3,Zero Effect,1998,R,2,4,7,Mystery/comedy involving an agoraphobic P.I. (...
4,The Zero Theorem,2014,R,6,4,6,Struggling to focus on his mission to uncover ...
5,Zeus and Roxanne,1997,PG,1,1,1,A dog and a dolphin form a friendship as the h...
6,Z for Zachariah,2015,PG-13,6,4,5,"After a nuclear event destroys most of Earth, ..."
7,Zodiac,2007,R,3,8,6,Thriller based on the case files about a seria...
8,Zombieland,2009,R,4,8,8,After the planet has been overrun by a virus t...
9,Zombie Strippers,2008,R,8,10,8,A super-virus is spread to the dancers at an e...
