In [1]:
#############
# LIBRARIES #
#############

import os

import requests
from selenium import webdriver
# Configure and start a headless, incognito Chrome session that ignores certificate errors.
# NOTE(review): `driver` is not referenced by any other cell visible in this notebook (all
# scraping goes through `requests`) — confirm it is still needed, and call `driver.quit()`
# when finished if it is.
options = webdriver.ChromeOptions()
options.add_argument('--ignore-certificate-errors')
options.add_argument('--incognito')
options.add_argument('--headless')
driver = webdriver.Chrome(options=options)

from bs4 import BeautifulSoup as bs
import time
import bs4.element
from urllib.parse import urljoin

import pandas as pd
import string

from dotenv import load_dotenv
# Load variables from a local .env file so the API key never has to appear in the notebook.
load_dotenv()

# Key handed to the api.themoviedb.org helpers; os.getenv returns None when the variable is unset.
TMFB_API_KEY = os.getenv("TMFB_API_KEY")

Main URL -- https://kids-in-mind.com/{title_starts_with_letter}.htm (e.g. https://kids-in-mind.com/a.htm, https://kids-in-mind.com/b.htm, etc.)

...we assign each film three distinct, category-specific ratings: one for SEX & NUDITY, one for VIOLENCE & GORE and one for LANGUAGE. Each rating is on a scale of zero to ten, depending on quantity (more F-words, for instance, will mean a higher Language rating, and so on) as well as context (especially when it comes to the categories of sex, nudity, violence and gore, since they are not as easily quantifiable as profanity).

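We'll collect the title, year, MPAA rating, and content ratings for each movie. Each movie page exposes them in a single header of the form `Title | Year | MPAA rating | - sex.violence.language`, for example: `<div class="et_pb_text_inner"><p>Abandon <span style="font-size:14px !important">| 2002 | PG-13 | - 4.4.4</span></p></div>`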



In [2]:
#####################################################
# Function to scrape the content of the KIM website #
#####################################################

# Columns of the returned frame, in the order the scraper has always produced them.
_KIM_COLUMNS = ['movie_title', 'movie_year', 'movie_rating',
                'sex_nudity', 'violence_gore', 'language', 'movie_description']

# Seconds to wait for kids-in-mind.com before giving up on a request.
_KIM_REQUEST_TIMEOUT = 30


def _pad(values, size):
    """Right-pad the list ``values`` with None so it has at least ``size`` items."""
    return values + [None] * (size - len(values))


def _parse_movie_info(movie_info):
    """Split a 'Title | Year | MPAA | - S.V.L' header into six whitespace-stripped fields.

    Missing trailing fields come back as None instead of raising.
    """
    # rsplit keeps any '|' that appears inside the title attached to the title.
    fields = [field.strip() for field in movie_info.rsplit('|', 3)]
    title, year, rating, kim_ratings = _pad(fields, 4)
    scores = []
    if kim_ratings is not None:
        # The ratings are prefixed with a hyphen on some pages and an en/em dash on others.
        scores = [score.strip() for score in kim_ratings.strip('-\u2013\u2014 ').split('.', 2)]
    return [title, year, rating] + _pad(scores, 3)


def get_movie_info(letter):
    """Scrape every kids-in-mind.com review listed under one index letter.

    NOTE: a later cell in this notebook defines a TMDB helper that is also called
    ``get_movie_info``; once that cell has run, this scraper is shadowed.

    Args:
        letter: Index page to crawl, e.g. ``'a'`` for https://kids-in-mind.com/a.htm.

    Returns:
        pandas.DataFrame with the string columns ``movie_title``, ``movie_year``,
        ``movie_rating``, ``sex_nudity``, ``violence_gore``, ``language`` and
        ``movie_description`` — one row per review page that could be parsed.
        The frame is empty (but keeps these columns) when the index request fails
        or no page could be parsed.

    Raises:
        requests.RequestException: if a request cannot be completed or times out.
        IndexError: if the index page does not contain the expected third content block.
    """
    URI = f'https://kids-in-mind.com/{letter}.htm'
    rows = []
    response = requests.get(URI, timeout=_KIM_REQUEST_TIMEOUT)
    # A Response is falsy for 4xx/5xx status codes.
    if response:
        print('Request successful')
        content_blocks = bs(response.content, 'html.parser').find_all('div', class_="et_pb_text_inner")
        # The third content block of the index page holds the list of review links.
        links = content_blocks[2].find_all('a', href=True)
        movie_urls = [urljoin(URI, link['href']) for link in links]

        for movie in movie_urls:
            movie_page = requests.get(movie, timeout=_KIM_REQUEST_TIMEOUT)
            print(f'Getting info for {movie}')
            results = bs(movie_page.content, 'html.parser').find_all('div', class_="et_pb_text_inner")

            try:
                movie_info = results[1].find('p').text
                movie_description = results[2].find('p').text
            except (IndexError, AttributeError):
                # Pages with a different layout are reported instead of silently dropped.
                print(f'Skipping {movie}: unexpected page layout')
            else:
                # Header and description are stored together, so a page that yields
                # only one of them can never shift the pairing of later rows.
                rows.append(_parse_movie_info(movie_info) + [movie_description])
            # Throttle every request, not only the ones that parsed successfully.
            time.sleep(1)
    else:
        print(f'Request failed with status {response.status_code}')

    return pd.DataFrame(rows, columns=_KIM_COLUMNS)


In [78]:
# Try the scraper on a single index page ('x') and display the resulting frame.
movies_df = get_movie_info('x')
movies_df

Request successful
Getting info for https://kids-in-mind.com/x/x-parents-guide-movie-review-rating.htm
Getting info for https://kids-in-mind.com/x/xfiles_1998__163.htm
Getting info for https://kids-in-mind.com/x/xfilesiwanttobelieve.htm
Getting info for https://kids-in-mind.com/x/xmen_2000.htm
Getting info for https://kids-in-mind.com/x/xmenapocalypse.htm
Getting info for https://kids-in-mind.com/x/xmendaysoffuturepast.htm
Getting info for https://kids-in-mind.com/x/xmenfirstclass.htm
Getting info for https://kids-in-mind.com/x/xmenoriginswolverine.htm
Getting info for https://kids-in-mind.com/x/xmenthelaststand.htm
Getting info for https://kids-in-mind.com/x/x2xmenunited.htm
Getting info for https://kids-in-mind.com/x/xxx.htm
Getting info for https://kids-in-mind.com/x/xxxreturnofxandercage.htm
Getting info for https://kids-in-mind.com/x/xxxstateoftheunion.htm


Unnamed: 0,movie_title,movie_year,movie_rating,sex_nudity,violence_gore,language,movie_description
0,The X-Files,1998,PG-13,1,6,3,Mulder and Scully (David Duchovny and Gillian ...
1,The X-Files: I Want to Believe,2008,PG-13,3,7,4,"Former FBI agents Fox Mulder (David Duchovny),..."
2,X-Men,2000,PG-13,1,6,3,"In the not-too-distant future, a race of mutan..."
3,X-Men: Days of Future Past,2014,PG-13,3,7,5,Wolverine (Hugh Jackman) is sent to the past t...
4,X-Men: First Class,2011,PG-13,5,7,5,"Prequel to the popular sci-fi series, tracing ..."
5,X-Men Origins: Wolverine,2009,PG-13,4,6,4,The fourth entry in the series provides the ba...
6,X-Men: The Last Stand,2006,PG-13,4,6,3,The third film in the series revolves around a...
7,X2: X-Men United,2003,PG-13,4,5,3,The X-Men return to do battle against the mega...
8,XXX: State of the Union,2005,PG-13,4,7,5,In this sequel Ice Cube picks up the role of s...


In [53]:
# Crawl every index page a-z and persist each letter separately, so a failure part-way
# through does not lose the letters that were already scraped.
# NOTE(review): the files are written to the working directory, while the enrichment cells
# read them from movies_db/ — confirm they are moved there in between.
alphabet = list(string.ascii_lowercase)
for letter in alphabet:
    movies_df = get_movie_info(letter)
    # Saving each letter to a csv file
    movies_df.to_csv(f'KIM_movies_{letter}.csv', index=False)

# # Saving the data to a csv file
# movies_df.to_csv('KIM_movies.csv', index=False)

Success
Getting info for https://kids-in-mind.com/a/abandon.htm
Getting info for https://kids-in-mind.com/a/abduction.htm
Getting info for https://kids-in-mind.com/a/abominable-parents-guide-movie-review-rating.htm
Getting info for https://kids-in-mind.com/a/aboutaboy.htm
Getting info for https://kids-in-mind.com/a/aboutlastnight.htm
Getting info for https://kids-in-mind.com/a/aboutschmidt.htm
Getting info for https://kids-in-mind.com/a/abouttime.htm
Getting info for https://kids-in-mind.com/a/above_the_rim_1994__069.htm
Getting info for https://kids-in-mind.com/a/abrahamlincolnvampirehunter.htm
Getting info for https://kids-in-mind.com/a/absolutelyfabulousthemovie.htm
Getting info for https://kids-in-mind.com/a/absolute_power_1997__575.htm
Getting info for https://kids-in-mind.com/a/abyss-parents-guide-movie-review-rating.htm
Getting info for https://kids-in-mind.com/a/accepted.htm
Getting info for https://kids-in-mind.com/a/accountant.htm
Getting info for https://kids-in-mind.com/a/a

In [54]:
# Concatenate the per-letter csv files (in `alphabet` order) into one KIM_movies.csv.
# Relies on `alphabet` having been defined by the crawl cell.
pd.concat([pd.read_csv(f'KIM_movies_{letter}.csv') for letter in alphabet]).to_csv('KIM_movies.csv', index=False)

In [3]:
def search_movie(api_key, query):
    """Search TMDB (api.themoviedb.org) for movies matching ``query``.

    Args:
        api_key: TMDB API key.
        query: Free-text movie title. It is sent as a URL parameter, so characters
            such as ``&``, ``#`` or ``?`` in a title are encoded rather than
            cutting the query short.

    Returns:
        The list stored under ``'results'`` in the response, or an empty list when
        the response has no such key.

    Raises:
        requests.HTTPError: if TMDB answers with a 4xx/5xx status (e.g. a bad key).
        requests.RequestException: if the request cannot be completed or times out.
    """
    response = requests.get(
        "https://api.themoviedb.org/3/search/movie",
        params={"api_key": api_key, "query": query},
        timeout=30,
    )
    # Surface API errors as an HTTP error instead of an opaque KeyError on 'results'.
    response.raise_for_status()
    return response.json().get('results', [])

def get_movie_details(api_key, movie_id):
    """Fetch the TMDB record for one movie.

    Args:
        api_key: TMDB API key.
        movie_id: TMDB id of the movie.

    Returns:
        The decoded JSON body of the response, exactly as the endpoint sent it
        (the HTTP status is not checked).
    """
    endpoint = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={api_key}"
    return requests.get(endpoint).json()


# Exercise the two TMDB helpers end to end on a single, well-known title.
query = "The Shawshank Redemption"
search_results = search_movie(TMFB_API_KEY, query)

# Get movie details using the first result's ID
if search_results:
    movie = get_movie_details(TMFB_API_KEY, search_results[0]['id'])
    # Dumps the complete payload so the available fields can be inspected.
    print(movie)
    # Getting the movie genre
    # NOTE: the first two genres are indexed directly, so a movie that lists fewer
    # than two genres would raise IndexError here.
    genre_1 = movie['genres'][0]['name']
    genre_2 = movie['genres'][1]['name']
    overview = movie['overview']
    vote_average = movie['vote_average']
    run_time = movie['runtime']
    # Movie director
    # director = movie['credits']['crew'][0]['name']
    print(genre_1)
    print(genre_2)
    print(overview)
    print(vote_average)
    print(run_time)
else:
    print("No results found.")
    

# def get_movie_info(api_key, query):
#     time.sleep(2)
#     search_results = search_movie(api_key, query)
#     if search_results:
#         try:
#             movie = get_movie_details(api_key, search_results[0]['id'])
#             genre_1 = movie['genres'][0]['name']
#             genre_2 = movie['genres'][1]['name']
#             overview = movie['overview']
#             vote_average = movie['vote_average']
#             run_time = movie['runtime']
#             return genre_1, genre_2, overview, vote_average, run_time
#         except:
#             return None
#     else:
#         return None
    




# Search for a movie



{'adult': False, 'backdrop_path': '/wPU78OPN4BYEgWYdXyg0phMee64.jpg', 'belongs_to_collection': None, 'budget': 25000000, 'genres': [{'id': 18, 'name': 'Drama'}, {'id': 80, 'name': 'Crime'}], 'homepage': '', 'id': 278, 'imdb_id': 'tt0111161', 'original_language': 'en', 'original_title': 'The Shawshank Redemption', 'overview': 'Framed in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufresne begins a new life at the Shawshank prison, where he puts his accounting skills to work for an amoral warden. During his long stretch in prison, Dufresne comes to be admired by the other inmates -- including an older prisoner named Red -- for his integrity and unquenchable sense of hope.', 'popularity': 82.356, 'poster_path': '/hBcY0fE9pfXzvVaY4GKarweriG2.jpg', 'production_companies': [{'id': 97, 'logo_path': '/7znWcbDd4PcJzJUlJxYqAlPPykp.png', 'name': 'Castle Rock Entertainment', 'origin_country': 'US'}], 'production_countries': [{'iso_3166_1': 'US', 'name': 'Unite

In [10]:
def get_movie_info(api_key, query):
    """Look up a title on TMDB and return the fields used to enrich the KIM dataset.

    NOTE: this function shares its name with the kids-in-mind.com scraper
    ``get_movie_info(letter)`` defined earlier in this notebook and replaces it
    once this cell has run.

    Args:
        api_key: TMDB API key.
        query: Movie title to search for; the first search hit is used.

    Returns:
        A tuple ``(genre_1, genre_2, overview, vote_average, run_time)``, or None
        when the search has no hits or the details cannot be fetched or read.
        ``genre_1`` / ``genre_2`` are None when the movie lists fewer genres, so a
        single-genre movie still keeps its overview, vote and runtime.
    """
    # Pause before every lookup to stay gentle on the API.
    time.sleep(2)
    search_results = search_movie(api_key, query)
    if not search_results:
        return None

    try:
        movie = get_movie_details(api_key, search_results[0]['id'])
        genre_names = [genre['name'] for genre in movie['genres']]
        overview = movie['overview']
        vote_average = movie['vote_average']
        run_time = movie['runtime']
    except (KeyError, IndexError, TypeError, ValueError, requests.RequestException):
        # Best-effort lookup: an unusable record yields None. Unlike a bare
        # `except`, this still lets KeyboardInterrupt stop a long enrichment run.
        return None

    genre_1, genre_2 = _first_two(genre_names)
    return genre_1, genre_2, overview, vote_average, run_time


def _first_two(values):
    """Return the first two items of ``values``, padding with None when it is shorter."""
    padded = list(values[:2]) + [None, None]
    return padded[0], padded[1]
    

In [36]:
# Spot-check the TMDB lookup on a single title.
info = get_movie_info(TMFB_API_KEY, "Z for Zachariah")

In [37]:
# Display the lookup result; the notebook renders nothing here when it is None.
info

In [None]:
# Already done: o,p,q,r,s,t,u,v,w,x,y,z


In [45]:
# Enrich the per-letter files for a-n with TMDB data and write them back in place.
for letter in list('abcdefghijklmn'):
    movies_df = pd.read_csv(f'movies_db/KIM_movies_{letter}.csv')
    # Echo every title that is about to be looked up.
    for title in movies_df['movie_title']:
        print(title)
    movie_info = movies_df['movie_title'].apply(lambda x: get_movie_info(TMFB_API_KEY, x))
    print(movie_info)
    # Each lookup is either None or a 5-tuple; spread the tuple positions over
    # their columns and leave None where the lookup came back empty.
    for position, column in enumerate(['genre_1', 'genre_2', 'overview', 'vote_average', 'run_time']):
        movies_df[column] = movie_info.apply(lambda x, position=position: x[position] if x else None)
    movies_df.to_csv(f'movies_db/KIM_movies_{letter}.csv', index=False)

Abandon 
Abduction 
About A Boy 
About Last Night 
About Schmidt 
Above the Rim 
Abraham Lincoln, Vampire Hunter 
Absolutely Fabulous: The Movie 
Absolute Power 
Accepted 
The Accountant 
Ace Ventura: Pet Detective 
Ace Ventura: When Nature Calls 
A.C.O.D. 
Across the Universe 
Action Point 
Act of Valor 
Adaptation 
Addams Family Values 
The Adjustment Bureau 
Admission 
Adrift 
Adult Beginners 
Adventureland 
The Adventures of Elmo In Grouchland 
The Adventures of Ociee Nash 
The Adventures of Pluto Nash 
The Adventures of Rocky and Bullwinkle 
The Adventures of Shark Boy & Lava Girl in 3-D 
The Adventures of the Great Mouse Detective 
The Adventures of Tintin 
Aeon Flux 
Affliction 
African Cats 
Afternoon Delight 
After the Sunset 
Against the Ropes 
Agent Cody Banks 
Agent Cody Banks 2: Destination London 
The Age of Adaline 
The Age of Innocence 
A.I.: Artificial Intelligence 
Ain't Them Bodies Saints 
Airborne 
Air Bud 
Air Force One 
Airheads 
The Air Up There 
Akeelah and the 

In [49]:
# # Concatenating all the csv files
# alphabet = list(string.ascii_lowercase)
# pd.concat([pd.read_csv(f'movies_db/movies_db/KIM_movies_{letter}.csv') for letter in alphabet]).to_csv('KIM_movies.csv', index=False)

In [78]:
# Load the combined dataset (KIM ratings plus the TMDB columns) and preview the first rows.
movies_df = pd.read_csv('movies_db/KIM_movies.csv')
movies_df.head()

Unnamed: 0,movie_title,movie_year,movie_rating,sex_nudity,violence_gore,language,movie_description,movie_genre,genre_1,genre_2,overview,vote_average,run_time
0,Abandon,2002,PG-13,4,4,4,Katie Holmes stars as a college student who's ...,Drama,Drama,Mystery,A psychological thriller about a senior at one...,4.728,99.0
1,Abduction,2011,PG-13,4,5,5,After a teenager (Taylor Lautner) discovers th...,Thriller,Thriller,Action,A young man sets out to uncover the truth abou...,5.869,106.0
2,About A Boy,2002,PG-13,3,3,5,"A superficial, shallow and wealthy man (Hugh G...",Drama,Drama,Comedy,"Will Freeman is a good-looking, smooth-talking...",6.655,101.0
3,About Last Night,2014,R,7,2,10,"Two couples (Keven Hart and Regina Hall, and M...",Comedy,Comedy,Drama,A man and woman meet and try to have a romanti...,6.161,113.0
4,About Schmidt,2002,R,6,3,5,Warren Schmidt (Jack Nicholson) reaches retire...,Drama,Drama,Comedy,A recently retired man embarks on a journey to...,6.826,125.0


In [79]:
# List the columns of the combined dataset.
movies_df.columns

Index(['movie_title', 'movie_year', 'movie_rating', 'sex_nudity',
       'violence_gore', 'language', 'movie_description', 'movie_genre',
       'genre_1', 'genre_2', 'overview', 'vote_average', 'run_time'],
      dtype='object')

In [80]:
# (rows, columns) of the combined dataset.
movies_df.shape

(4692, 13)

In [74]:
# Saving cleaned up data to a csv file
# NOTE(review): this writes ./KIM_movies.csv, whereas the dataset is read from
# movies_db/KIM_movies.csv elsewhere in this notebook — confirm the intended location.
movies_df.to_csv('KIM_movies.csv', index=False)

In [116]:
# Let's retrieve the TMDB genres, overview, vote average and runtime for each movie in the KIM dataset
for letter in ['a']:
    movies_df = pd.read_csv(f'movies_db/KIM_movies_{letter}.csv')
    tmdb_columns = ['genre_1', 'genre_2', 'overview', 'vote_average', 'run_time']
    movie_info = movies_df['movie_title'].apply(lambda x: get_movie_info(TMFB_API_KEY, x))
    # get_movie_info returns a 5-tuple or None. A Series of tuples cannot be assigned
    # to five columns directly, so expand it into a frame first (one None-filled
    # row per failed lookup) and align it on the original index.
    movies_df[tmdb_columns] = pd.DataFrame(
        [info if info else (None,) * len(tmdb_columns) for info in movie_info],
        columns=tmdb_columns,
        index=movies_df.index,
    )
    movies_df.to_csv(f'movies_db/KIM_movies_{letter}.csv', index=False)

In [87]:
# # Reading the KIM movies dataset
# a_movies_df = pd.read_csv('movies_db/KIM_movies_a.csv')
# a_movies_df['genre'] = movies_df.movie_title.apply(lambda x: get_movie_genre(TMFB_API_KEY, x))

In [6]:
# Fetch a single review page to explore its markup by hand.
test_url = 'https://kids-in-mind.com/a/abominable-parents-guide-movie-review-rating.htm'

test = requests.get(test_url)
soup = bs(test.content, 'html.parser')
# Collect every <h1> element on the page.
h1 = soup.find_all('h1')

In [42]:
# Text of the first <h1>: 'Title | Year | MPAA rating | – ratings'.
h1[0].text

'Abominable | 2019 | PG | – 1.2.1'

In [65]:
# Despite its name, `hr` ends up holding the node that directly follows the first <hr>
# of the third content block, not the <hr> tag itself.
# Uses find_all / next_sibling rather than the deprecated findAll / nextSibling aliases,
# matching the rest of the notebook.
hr = soup.find_all('div', class_="et_pb_text_inner")[2].find_all('hr')[0].next_sibling

hr

'\nWhen a teen (voiced by Chloe Bennet) finds a young Yeti (voiced by Joseph Izzo) on top of her home, she and her friends (voiced by Tenzing Norgay Trainor and Albert Tsai) take him on a 2,000-mile trek to find its family. Meanwhile, a billionaire (voiced by Eddie Izzard) tries to steal the Yeti and a zoologist (voiced by Sarah Paulson) pursues it as well, but none of the humans realize the magic they will experience along the way. Also with the voices of Tsai Chin and Michelle Wong. Directed by Jill Culton & Todd Wilderman. [Running Time: 1:37]'