In [1]:
#############
# LIBRARIES #
#############

import os

import requests
from selenium import webdriver
# Headless Chrome session (incognito, certificate errors ignored).
# NOTE(review): `driver` is not referenced by any other cell visible here; the
# scraper function fetches pages with `requests`. Confirm whether the browser is
# still needed, and if so call driver.quit() once finished so the Chrome
# process is released.
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

from bs4 import BeautifulSoup as bs
import time
import bs4.element
from urllib.parse import urljoin

import pandas as pd
import string

from dotenv import load_dotenv
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# API key handed to the themoviedb.org helper functions as `api_key`.
# os.getenv returns None when the variable is not set.
# NOTE(review): "TMFB" looks like a typo for "TMDB"; renaming it would also
# require renaming the entry in the .env file.
TMFB_API_KEY = os.getenv("TMFB_API_KEY")

Main URL: `https://kids-in-mind.com/{title_starts_with_letter}.htm`, i.e. one index page per first letter of the movie title (e.g. https://kids-in-mind.com/a.htm, https://kids-in-mind.com/b.htm, etc.).

...we assign each film three distinct, category-specific ratings: one for SEX & NUDITY, one for VIOLENCE & GORE and one for LANGUAGE. Each rating is on a scale of zero to ten, depending on quantity (more F-words, for instance, will mean a higher Language rating, and so on) as well as context (especially when it comes to the categories of sex, nudity, violence and gore, since they are not as easily quantifiable as profanity).

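We'll collect the title, year, MPAA rating, the three content ratings, and a short description for each movie. On a movie page the title, year, MPAA rating and content ratings sit together in one element, for example `<div class="et_pb_text_inner"><p>Abandon <span style="font-size:14px !important">| 2002 | PG-13 | - 4.4.4</span></p></div>`, where `4.4.4` reads as SEX & NUDITY . VIOLENCE & GORE . LANGUAGE.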



In [28]:
#####################################################
# Function to scrape the content of the KIM website #
#####################################################

def _parse_movie_info(movie_info):
    """Parse one header line such as 'Abandon | 2002 | PG-13 | - 4.4.4'.

    Returns a dict keyed by movie_title, movie_year, movie_rating, sex_nudity,
    violence_gore and language (values are whitespace-stripped strings), or
    None when the line does not contain the four '|'-separated fields.
    """
    # Split from the right so a title that itself contains '|' stays in one piece.
    fields = [field.strip() for field in movie_info.rsplit('|', 3)]
    if len(fields) != 4:
        return None
    movie_title, movie_year, movie_rating, kim_ratings = fields

    # The ratings field looks like '- 4.4.4': drop the leading dash, then split
    # into sex & nudity / violence & gore / language.
    scores = [score.strip() for score in kim_ratings.strip('- ').split('.')]
    if len(scores) != 3:
        # Keep the movie, but do not guess which rating is which.
        scores = [None, None, None]

    return {
        'movie_title': movie_title,
        'movie_year': movie_year,
        'movie_rating': movie_rating,
        'sex_nudity': scores[0],
        'violence_gore': scores[1],
        'language': scores[2],
    }


def get_movie_info(letter, delay=1, timeout=30):
    """Scrape every movie linked from the kids-in-mind index page of one letter.

    Parameters
    ----------
    letter : str
        Index page to crawl; the page fetched is
        ``https://kids-in-mind.com/{letter}.htm``.
    delay : float, optional
        Seconds to pause after each movie-page request (default 1).
    timeout : float, optional
        Timeout in seconds applied to every HTTP request (default 30).

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed movie with the columns movie_title,
        movie_year, movie_rating, sex_nudity, violence_gore, language and
        movie_description. All values are strings with surrounding whitespace
        removed from the parsed fields. The frame is empty (but has these
        columns) when the index request fails or no movie page can be parsed.

    Notes
    -----
    A movie is recorded only when both its info line and its description were
    read, so the two can never end up paired with a different movie. Pages that
    do not match the expected layout are reported and skipped; the layout that
    keeps the info in an H1 inside a span is not handled. A failure of the
    index request itself (timeout, connection error) still propagates.
    """
    columns = ['movie_title', 'movie_year', 'movie_rating',
               'sex_nudity', 'violence_gore', 'language', 'movie_description']
    records = []
    URI = f'https://kids-in-mind.com/{letter}.htm'

    response = requests.get(URI, timeout=timeout)
    if not response:
        print(f'Request for {URI} failed with status {response.status_code}')
        return pd.DataFrame(records, columns=columns)
    print('Success')

    # The list of movies is the set of <a> tags inside the third
    # class="et_pb_text_inner" div of the index page.
    sections = bs(response.content, 'html.parser').find_all('div', class_="et_pb_text_inner")
    if len(sections) < 3:
        print(f'Unexpected layout for {URI}: movie list not found')
        return pd.DataFrame(records, columns=columns)
    movie_urls = [urljoin(URI, link['href']) for link in sections[2].find_all('a', href=True)]

    for movie in movie_urls:
        print(f'Getting info for {movie}')
        try:
            movie_page = requests.get(movie, timeout=timeout)
            results = bs(movie_page.content, 'html.parser').find_all('div', class_="et_pb_text_inner")
            # Read BOTH pieces before recording anything, so a page with a
            # missing description cannot leave a half-written entry behind.
            movie_info = results[1].find('p').text
            movie_description = results[2].find('p').text
        except (requests.RequestException, IndexError, AttributeError) as error:
            print(f'Skipping {movie}: {error!r}')
        else:
            record = _parse_movie_info(movie_info)
            if record is None:
                print(f'Skipping {movie}: unexpected info line {movie_info!r}')
            else:
                record['movie_description'] = movie_description
                records.append(record)
        # Be polite to the server after every request, not only after successes.
        time.sleep(delay)

    return pd.DataFrame(records, columns=columns)


In [29]:
# Try the scraper on a single index page and display the resulting frame.
movies_df = get_movie_info('x')
movies_df

Success
Getting info for https://kids-in-mind.com/x/x-parents-guide-movie-review-rating.htm
Getting info for https://kids-in-mind.com/x/xfiles_1998__163.htm
The X-Files | 1998 | PG-13 | - 1.6.3
['X', 'X-Files, The', 'X-Files, The: I Want to Believe', 'X-Men', 'X-Men: Apocalypse', 'X-Men: Days of Future Past', 'X-Men: First Class', 'X-Men Origins: Wolverine', 'X-Men: The Last Stand', 'X2: X-Men United', 'XXX', 'xXx: Return of Xander Cage', 'XXX: State of the Union']
Getting info for https://kids-in-mind.com/x/xfilesiwanttobelieve.htm
The X-Files: I Want to Believe | 2008 | PG-13 | - 3.7.4
['X', 'X-Files, The', 'X-Files, The: I Want to Believe', 'X-Men', 'X-Men: Apocalypse', 'X-Men: Days of Future Past', 'X-Men: First Class', 'X-Men Origins: Wolverine', 'X-Men: The Last Stand', 'X2: X-Men United', 'XXX', 'xXx: Return of Xander Cage', 'XXX: State of the Union']
Getting info for https://kids-in-mind.com/x/xmen_2000.htm
X-Men | 2000 | PG-13 | - 1.6.3
['X', 'X-Files, The', 'X-Files, The: I W

Unnamed: 0,movie_title,movie_year,movie_rating,sex_nudity,violence_gore,language,movie_description
0,The X-Files,1998,PG-13,1,6,3,Mulder and Scully (David Duchovny and Gillian ...
1,The X-Files: I Want to Believe,2008,PG-13,3,7,4,"Former FBI agents Fox Mulder (David Duchovny),..."
2,X-Men,2000,PG-13,1,6,3,"In the not-too-distant future, a race of mutan..."
3,X-Men: Days of Future Past,2014,PG-13,3,7,5,Wolverine (Hugh Jackman) is sent to the past t...
4,X-Men: First Class,2011,PG-13,5,7,5,"Prequel to the popular sci-fi series, tracing ..."
5,X-Men Origins: Wolverine,2009,PG-13,4,6,4,The fourth entry in the series provides the ba...
6,X-Men: The Last Stand,2006,PG-13,4,6,3,The third film in the series revolves around a...
7,X2: X-Men United,2003,PG-13,4,5,3,The X-Men return to do battle against the mega...
8,XXX: State of the Union,2005,PG-13,4,7,5,In this sequel Ice Cube picks up the role of s...


In [53]:
# Crawl the index pages 'a' through 'z'. Each letter is written to its own CSV
# inside the loop, so letters already scraped are on disk even if a later
# letter fails. `movies_df` is rebound on every iteration and therefore only
# holds the last letter once the loop finishes.
alphabet = list(string.ascii_lowercase)
for letter in alphabet:
    movies_df = get_movie_info(letter)
    # Saving each letter to a csv file
    movies_df.to_csv(f'KIM_movies_{letter}.csv', index=False)

# # Saving the data to a csv file
# movies_df.to_csv('KIM_movies.csv', index=False)

Success
Getting info for https://kids-in-mind.com/a/abandon.htm
Getting info for https://kids-in-mind.com/a/abduction.htm
Getting info for https://kids-in-mind.com/a/abominable-parents-guide-movie-review-rating.htm
Getting info for https://kids-in-mind.com/a/aboutaboy.htm
Getting info for https://kids-in-mind.com/a/aboutlastnight.htm
Getting info for https://kids-in-mind.com/a/aboutschmidt.htm
Getting info for https://kids-in-mind.com/a/abouttime.htm
Getting info for https://kids-in-mind.com/a/above_the_rim_1994__069.htm
Getting info for https://kids-in-mind.com/a/abrahamlincolnvampirehunter.htm
Getting info for https://kids-in-mind.com/a/absolutelyfabulousthemovie.htm
Getting info for https://kids-in-mind.com/a/absolute_power_1997__575.htm
Getting info for https://kids-in-mind.com/a/abyss-parents-guide-movie-review-rating.htm
Getting info for https://kids-in-mind.com/a/accepted.htm
Getting info for https://kids-in-mind.com/a/accountant.htm
Getting info for https://kids-in-mind.com/a/a

In [54]:
# Read the per-letter CSV files back in and stitch them into one combined file
letter_frames = [pd.read_csv(f'KIM_movies_{letter}.csv') for letter in alphabet]
pd.concat(letter_frames).to_csv('KIM_movies.csv', index=False)

In [64]:
def search_movie(api_key, query, timeout=10):
    """Search themoviedb.org for movies matching a free-text title.

    Parameters
    ----------
    api_key : str
        API key sent as the ``api_key`` query parameter.
    query : str
        Title to search for. It is passed through ``params`` so that spaces,
        ``&``, ``#`` and non-ASCII characters are URL-encoded instead of
        corrupting the query string.
    timeout : float, optional
        Request timeout in seconds (default 10).

    Returns
    -------
    list
        The ``results`` list of the JSON response; empty when the response has
        no such key.

    Raises
    ------
    requests.HTTPError
        When the API answers with an error status (for example a missing or
        invalid key), instead of a KeyError on the error payload.
    """
    url = "https://api.themoviedb.org/3/search/movie"
    response = requests.get(
        url,
        params={'api_key': api_key, 'query': query},
        timeout=timeout,
    )
    response.raise_for_status()
    return response.json().get('results', [])

def get_movie_details(api_key, movie_id, timeout=10):
    """Fetch the themoviedb.org detail record of one movie.

    Parameters
    ----------
    api_key : str
        API key sent as the ``api_key`` query parameter.
    movie_id : int or str
        Movie identifier, as found in the ``id`` field of a search result.
    timeout : float, optional
        Request timeout in seconds (default 10).

    Returns
    -------
    dict
        The decoded JSON body of the response.

    Raises
    ------
    requests.HTTPError
        When the API answers with an error status, so an error payload is
        never returned as if it were a movie record.
    """
    url = f"https://api.themoviedb.org/3/movie/{movie_id}"
    response = requests.get(url, params={'api_key': api_key}, timeout=timeout)
    response.raise_for_status()
    return response.json()



query = "The Shawshank Redemption"

# Search for a movie
search_results = search_movie(TMFB_API_KEY, query)

# Get movie details using the first result's ID
if search_results:
    movie = get_movie_details(TMFB_API_KEY, search_results[0]['id'])
    # A movie record can carry an empty (or missing) genre list; indexing [0]
    # unconditionally would raise instead of reporting that.
    genres = movie.get('genres') or []
    if genres:
        # Getting the movie genre (first one listed)
        genre = genres[0]['name']
        print(genre)
    else:
        print("No genre listed.")
else:
    print("No results found.")


{'adult': False, 'backdrop_path': '/wPU78OPN4BYEgWYdXyg0phMee64.jpg', 'belongs_to_collection': None, 'budget': 25000000, 'genres': [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name': 'Crime'}], 'homepage': '', 'id': 278, 'imdb_id': 'tt0111161', 'original_language': 'en', 'original_title': 'The Shawshank Redemption', 'overview': 'Framed in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.', 'popularity': 82.356, 'poster_path': '/hBcY0fE9pfXzvVaY4GKarweriG2.jpg', 'production_companies': [{'id': 97, 'logo_path': '/7znWcbDd4PcJzJUlJxYqAlPPykp.png', 'name': 'Castle Rock Entertainment', 'origin_country': 'US'}], 'production_countries': [{'iso_3166_1': 'US', 'name': 'Unite

In [65]:
def get_movie_genre(api_key, query):
    """Return the first genre name of the best search match for a title.

    Looks the title up with ``search_movie``, fetches the details of the first
    result with ``get_movie_details`` and returns the ``name`` of the first
    entry in its ``genres`` list.

    Parameters
    ----------
    api_key : str
        API key forwarded to both helper calls.
    query : str
        Movie title to search for.

    Returns
    -------
    str or None
        The genre name, or None when the search returns nothing or the matched
        movie has no genres listed.
    """
    search_results = search_movie(api_key, query)
    if not search_results:
        return None

    movie = get_movie_details(api_key, search_results[0]['id'])
    # Guard the [0] lookup: an empty or absent genre list means "unknown",
    # the same outcome as finding no movie at all.
    genres = movie.get('genres') or []
    if not genres:
        return None
    return genres[0]['name']

In [68]:
# Spot-check the genre lookup with a single title.
get_movie_genre(TMFB_API_KEY, "The Goonies")

'Adventure'