# Most Popular Movies on IMDB
## Import packages

In [12]:
# Import packages
from bs4 import BeautifulSoup
import requests
import time
import os
import warnings
import re
import pandas as pd
from pymongo import MongoClient

## Extract links and store pages

In [100]:
# Download the "Most Popular Movies" chart page and parse it.
# A browser-like User-Agent is sent with the request.
headers = {'User-Agent': 'Mozilla/5.0'}
url = 'https://www.imdb.com/chart/moviemeter/'

# Without a timeout the request can hang indefinitely on a stalled connection
page = requests.get(url, headers=headers, timeout=30)

# Fail loudly on an HTTP error instead of silently parsing an error page
page.raise_for_status()

soup = BeautifulSoup(page.content, 'html.parser')

In [101]:
# Print the html structure
# print(soup.prettify())

In [102]:
# Collect the anchors that carry the movie title links on the chart page
link_selector = 'a.ipc-title-link-wrapper'
links = soup.select(link_selector)

# Number of anchors matched (shown as the cell output)
len(links)

107

In [103]:
# Show the relative href of the first 100 matched anchors, one per line
for anchor in links[:100]:
    print(anchor['href'])

/title/tt14230458/?ref_=chtmvm_t_1
/title/tt15239678/?ref_=chtmvm_t_2
/title/tt13452446/?ref_=chtmvm_t_3
/title/tt15398776/?ref_=chtmvm_t_4
/title/tt1160419/?ref_=chtmvm_t_5
/title/tt7160372/?ref_=chtmvm_t_6
/title/tt1660648/?ref_=chtmvm_t_7
/title/tt21692408/?ref_=chtmvm_t_8
/title/tt17009710/?ref_=chtmvm_t_9
/title/tt14849194/?ref_=chtmvm_t_10
/title/tt23561236/?ref_=chtmvm_t_11
/title/tt6166392/?ref_=chtmvm_t_12
/title/tt8367814/?ref_=chtmvm_t_13
/title/tt0087182/?ref_=chtmvm_t_14
/title/tt5537002/?ref_=chtmvm_t_15
/title/tt26047818/?ref_=chtmvm_t_16
/title/tt11097384/?ref_=chtmvm_t_17
/title/tt19637052/?ref_=chtmvm_t_18
/title/tt1517268/?ref_=chtmvm_t_19
/title/tt11057302/?ref_=chtmvm_t_20
/title/tt15009428/?ref_=chtmvm_t_21
/title/tt3359350/?ref_=chtmvm_t_22
/title/tt11762114/?ref_=chtmvm_t_23
/title/tt15314262/?ref_=chtmvm_t_24
/title/tt23289160/?ref_=chtmvm_t_25
/title/tt13238346/?ref_=chtmvm_t_26
/title/tt17351924/?ref_=chtmvm_t_27
/title/tt26658104/?ref_=chtmvm_t_28
/title/tt1

In [104]:
# Turn every relative href into an absolute IMDb URL.
# NOTE: this covers all matched anchors; later cells slice the first 100.
top100_links = ['https://www.imdb.com' + link['href'] for link in links]

In [105]:
# Display each absolute URL on its own line
for movie_url in top100_links:
    print(movie_url)

https://www.imdb.com/title/tt14230458/?ref_=chtmvm_t_1
https://www.imdb.com/title/tt15239678/?ref_=chtmvm_t_2
https://www.imdb.com/title/tt13452446/?ref_=chtmvm_t_3
https://www.imdb.com/title/tt15398776/?ref_=chtmvm_t_4
https://www.imdb.com/title/tt1160419/?ref_=chtmvm_t_5
https://www.imdb.com/title/tt7160372/?ref_=chtmvm_t_6
https://www.imdb.com/title/tt1660648/?ref_=chtmvm_t_7
https://www.imdb.com/title/tt21692408/?ref_=chtmvm_t_8
https://www.imdb.com/title/tt17009710/?ref_=chtmvm_t_9
https://www.imdb.com/title/tt14849194/?ref_=chtmvm_t_10
https://www.imdb.com/title/tt23561236/?ref_=chtmvm_t_11
https://www.imdb.com/title/tt6166392/?ref_=chtmvm_t_12
https://www.imdb.com/title/tt8367814/?ref_=chtmvm_t_13
https://www.imdb.com/title/tt0087182/?ref_=chtmvm_t_14
https://www.imdb.com/title/tt5537002/?ref_=chtmvm_t_15
https://www.imdb.com/title/tt26047818/?ref_=chtmvm_t_16
https://www.imdb.com/title/tt11097384/?ref_=chtmvm_t_17
https://www.imdb.com/title/tt19637052/?ref_=chtmvm_t_18
https://

In [109]:
# Download the first 100 movie pages and save each one as a local HTML file
# named after the numeric part of its IMDb title id.

# Request headers are the same for every page, so build them once
headers = {'User-Agent': 'Mozilla/5.0'}

# Make sure the target folder exists before writing into it
output_dir = "../Final Project/top100_popular"
os.makedirs(output_dir, exist_ok=True)

# Captures the digits of the title id, e.g. ".../title/tt14230458/..." -> "14230458"
id_pattern = re.compile(r'tt(\d+)/')

for link in top100_links[:100]:

    # A link without a title id cannot be given a file name; skip it
    match = id_pattern.search(link)
    if match is None:
        print(f"Skipping link without an IMDb id: {link}")
        continue
    imdb_id = match.group(1)

    # Pause between two requests
    time.sleep(5)

    # Fetch the movie page; skip it on a network or HTTP error so that an
    # error page is never stored as if it were a movie page
    try:
        response = requests.get(link, headers=headers, timeout=30)
        response.raise_for_status()
    except requests.RequestException as error:
        print(f"Skipping {link}: {error}")
        continue

    # Read the content of the html
    soup = BeautifulSoup(response.content, 'html.parser')

    # Name the html file
    file_name = f"{output_dir}/{imdb_id}.html"

    # Write the prettified content to the html file
    with open(file_name, 'w', encoding='utf-8') as file:
        file.write(soup.prettify())

## Process html pages to extract information

In [2]:
# Folder holding the saved movie pages ("<id>.html" files) that are parsed below
directory = '../Final Project/top100_popular'

In [3]:
# Parse every saved movie page into a flat dictionary of attributes and
# collect the dictionaries in movies_list. Every dictionary carries the same
# keys; a field that cannot be found in the page is stored as None.
# NOTE(review): several selectors use class names such as "sc-bde20123-1" that
# look build-generated -- verify them against the currently saved pages.


def _text_or_none(node):
    """Return the stripped text of a parsed element, or None when it is missing."""
    return node.text.strip() if node is not None else None


# Saved pages are named "<digits>.html"; the digits are used as the movie id
html_name_pattern = re.compile(r'(\d+)\.html')

# Create a list
movies_list = []

# Loop through each file in the directory
for filename in os.listdir(directory):

    # ID -- skip directory entries that are not saved pages (for example
    # bookkeeping files or sub-folders) instead of crashing on them
    match = html_name_pattern.search(filename)
    if match is None:
        continue
    imdb_id = match.group(1)

    # Construct the full file path
    filepath = os.path.join(directory, filename)

    # Read file to string (undecodable bytes are dropped)
    with open(filepath, 'r', encoding='utf-8', errors='ignore') as file:
        html = file.read()

    # Use BeautifulSoup to parse the file content
    soup = BeautifulSoup(html, 'html.parser')

    # Create a dictionary to store movie information
    movie_info = {}
    movie_info['id'] = imdb_id

    # Title -- the last match wins; None when the page has no title element
    titles = soup.select('h1 > span.hero__primary-text')
    movie_info['Title'] = titles[-1].text.strip() if titles else None

    # Year and certificate ('Rating'), picked by position among the matched links.
    # NOTE(review): the positions depend on the page layout and the sample
    # output shows Year as None for the listed movies -- confirm the indices.
    info = soup.select('a.ipc-link.ipc-link--baseAlt.ipc-link--inherit-color')
    if len(info) > 18:
        movie_info['Year'] = info[5].text.strip()
        movie_info['Rating'] = info[6].text.strip()
    elif len(info) > 17:
        movie_info['Year'] = None
        movie_info['Rating'] = info[5].text.strip()
    else:
        movie_info['Year'] = None
        movie_info['Rating'] = None

    # Rating score
    movie_info['Rating score'] = _text_or_none(
        soup.select_one('span.sc-bde20123-1.cMEQkK'))

    # Number of raters
    movie_info['Number of raters'] = _text_or_none(
        soup.select_one('div.sc-bde20123-3.gPVQxL'))

    # Popularity
    movie_info['Popularity rank'] = _text_or_none(
        soup.select_one('div.sc-5f7fb5b4-1.fTREEx'))

    # Genre -- only pages with 2 to 4 chips are interpreted, and the last chip
    # is never stored (presumably it is not a genre -- TODO confirm). Any other
    # chip count leaves all three genre fields as None.
    genre = soup.select('span.ipc-chip__text')
    genre_count = len(genre) - 1 if 2 <= len(genre) <= 4 else 0
    genre_names = [chip.text.strip() for chip in genre[:genre_count]]
    genre_names += [None] * (3 - len(genre_names))
    (movie_info['Genre_1'],
     movie_info['Genre_2'],
     movie_info['Genre_3']) = genre_names

    # Introduction -- the last match wins; None when the page has none
    intro = soup.select('span.sc-466bb6c-2.chnFO')
    movie_info['Introduction'] = intro[-1].text.strip() if intro else None

    # User reviews, critic reviews and Metascore are only stored when all
    # three score elements are present
    user_review = soup.select('span.three-Elements > span.score')
    if len(user_review) > 2:
        movie_info['User reviews'] = user_review[0].text.strip()
        movie_info['Critic reviews'] = user_review[1].text.strip()
        movie_info['Metascore'] = user_review[2].text.strip()
    else:
        movie_info['User reviews'] = None
        movie_info['Critic reviews'] = None
        movie_info['Metascore'] = None

    # Append the movie_info dictionary to movies_list
    movies_list.append(movie_info)

In [4]:
# Show every parsed movie dictionary, one per line
for record in movies_list:
    print(record)

{'id': '14856980', 'Title': 'Atlas', 'Year': None, 'Rating': None, 'Rating score': None, 'Number of raters': None, 'Popularity rank': '39', 'Genre_1': 'Action', 'Genre_2': 'Adventure', 'Genre_3': 'Drama', 'Introduction': 'A bleak-sounding future, where an AI soldier has determined that the only way to end war is to end humanity.', 'User reviews': None, 'Critic reviews': None, 'Metascore': None}
{'id': '22022452', 'Title': 'Inside Out 2', 'Year': None, 'Rating': None, 'Rating score': None, 'Number of raters': None, 'Popularity rank': '57', 'Genre_1': 'Animation', 'Genre_2': 'Adventure', 'Genre_3': 'Comedy', 'Introduction': 'Follow Riley, in her teenage years, encountering new emotions.', 'User reviews': None, 'Critic reviews': None, 'Metascore': None}
{'id': '17279496', 'Title': 'Civil War', 'Year': None, 'Rating': 'R', 'Rating score': '7.4', 'Number of raters': '192', 'Popularity rank': '54', 'Genre_1': 'Action', 'Genre_2': None, 'Genre_3': None, 'Introduction': 'A journey across a dys

## Create Table and Save the File

In [10]:
# Build a DataFrame with one row per parsed movie dictionary
df = pd.DataFrame(data=movies_list)

# Preview the first five rows (shown as the cell output)
df.head(n=5)

Unnamed: 0,id,Title,Year,Rating,Rating score,Number of raters,Popularity rank,Genre_1,Genre_2,Genre_3,Introduction,User reviews,Critic reviews,Metascore
0,14856980,Atlas,,,,,39,Action,Adventure,Drama,"A bleak-sounding future, where an AI soldier h...",,,
1,22022452,Inside Out 2,,,,,57,Animation,Adventure,Comedy,"Follow Riley, in her teenage years, encounteri...",,,
2,17279496,Civil War,,R,7.4,192,54,Action,,,"A journey across a dystopian future America, f...",,,
3,16968450,The Wonderful Story of Henry Sugar,,PG,7.4,66K,84,Short,Adventure,Comedy,"Chronicles a variety of stories, but the main ...",150.0,82.0,85.0
4,14539740,Godzilla x Kong: The New Empire,,PG-13,,,36,Action,Adventure,Sci-Fi,"Two ancient titans, Godzilla and Kong, clash i...",,,


In [11]:
# Write the table to a CSV file, leaving out the DataFrame index column
csv_path = 'top100_popular.csv'
df.to_csv(csv_path, index=False)

## Connect to MongoDB Database

In [13]:
# Connect to the local MongoDB instance (host and port spelled out)
client = MongoClient('localhost', 27017)

# Keep the old name available as an alias instead of opening a second,
# never-used client to the same server
mo_c = client

In [15]:
# Select the database named 'IMDB'
db = client["IMDB"]

# Select the collection that receives one document per movie
collection = db["top100_popular"]

# Insert all movie documents in a single batch. insert_many rejects an empty
# list, so nothing is sent when no movies were parsed.
# NOTE: re-running this cell inserts the same movies again.
if movies_list:
    collection.insert_many(movies_list)

In [16]:
# Show every document currently stored in the collection
for stored_movie in collection.find():
    print(stored_movie)

{'_id': ObjectId('65f9e338dbb0490c671cd47f'), 'id': '14856980', 'Title': 'Atlas', 'Year': None, 'Rating': None, 'Rating score': None, 'Number of raters': None, 'Popularity rank': '39', 'Genre_1': 'Action', 'Genre_2': 'Adventure', 'Genre_3': 'Drama', 'Introduction': 'A bleak-sounding future, where an AI soldier has determined that the only way to end war is to end humanity.', 'User reviews': None, 'Critic reviews': None, 'Metascore': None}
{'_id': ObjectId('65f9e8aadbb0490c671cd480'), 'id': '22022452', 'Title': 'Inside Out 2', 'Year': None, 'Rating': None, 'Rating score': None, 'Number of raters': None, 'Popularity rank': '57', 'Genre_1': 'Animation', 'Genre_2': 'Adventure', 'Genre_3': 'Comedy', 'Introduction': 'Follow Riley, in her teenage years, encountering new emotions.', 'User reviews': None, 'Critic reviews': None, 'Metascore': None}
{'_id': ObjectId('65f9e8aadbb0490c671cd481'), 'id': '17279496', 'Title': 'Civil War', 'Year': None, 'Rating': 'R', 'Rating score': '7.4', 'Number of 