**Data Gathering**

**Example of gathering image data using webcam**

Note: Run this snippet using local jupyter notebook

In [2]:
import cv2
from google.colab.patches import cv2_imshow
key = cv2. waitKey(1)
webcam = cv2.VideoCapture(0)
while True:
    try:
        check, frame = webcam.read()
        print(check) #prints true as long as the webcam is running
        print(frame) #prints matrix values of each framecd
        cv2.imshow("Capturing", frame)
        key = cv2.waitKey(1)
        if key == ord('s'):
            cv2.imwrite(filename='saved_img.jpg', img=frame)
            webcam.release()
            img_new = cv2.imread('saved_img.jpg', cv2.IMREAD_GRAYSCALE)
            img_new = cv2.imshow("Captured Image", img_new)
            cv2.waitKey(1650)
            cv2.destroyAllWindows()
            print("Processing image...")
            img_ = cv2.imread('saved_img.jpg', cv2.IMREAD_ANYCOLOR)
            print("Converting RGB image to grayscale...")
            gray = cv2.cvtColor(img_, cv2.COLOR_BGR2GRAY)
            print("Converted RGB image to grayscale...")
            print("Resizing image to 28x28 scale...")
            img_ = cv2.resize(gray,(28,28))
            print("Resized...")
            img_resized = cv2.imwrite(filename='saved_img-final.jpg', img=img_)
            print("Image saved!")

            break
        elif key == ord('q'):
            print("Turning off camera.")
            webcam.release()
            print("Camera off.")
            print("Program ended.")
            cv2.destroyAllWindows()
            break

    except(KeyboardInterrupt):
        print("Turning off camera.")
        webcam.release()
        print("Camera off.")
        print("Program ended.")
        cv2.destroyAllWindows()

ModuleNotFoundError: No module named 'cv2'

**Example of gathering voice data using microphone**

In [3]:
!pip3 install sounddevice



In [4]:
!pip3 install wavio



In [5]:
!pip3 install scipy



In [6]:
!apt-get install libportaudio2

'apt-get' is not recognized as an internal or external command,
operable program or batch file.


In [7]:
# import required libraries
import sounddevice as sd
from scipy.io.wavfile import write
import wavio as wv

# Sampling frequency
freq = 44100

# Recording duration
duration = 5

# Start recorder with the given values
# of duration and sample frequency
recording = sd.rec(int(duration * freq),
                   samplerate=freq, channels=2)

# Record audio for the given number of seconds
sd.wait()

# This will convert the NumPy array to an audio
# file with the given sampling frequency
write("recording0.wav", freq, recording)

# Convert the NumPy array to audio file
wv.write("recording1.wav", recording, freq, sampwidth=2)

**Web Scraping**

Web scraping, web harvesting, or web data extraction is data scraping used for extracting data from websites. The web scraping software may directly access the World
Wide Web using the Hypertext Transfer Protocol or a web browser. While web scraping can be done manually by a software user, the term typically refers to automated
processes implemented using a bot or web crawler. It is a form of copying in which specific data is gathered and copied from the web, typically into a central local database or
spreadsheet, for later retrieval or analysis.

**Image Scraping using BeautifulSoup and Request**

In [8]:
!pip install bs4



In [9]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


In [10]:
import requests
from bs4 import BeautifulSoup

def getdata(url):
  r = requests.get(url)
  return r.text

htmldata = getdata("https://www.google.com/")
soup = BeautifulSoup(htmldata, 'html.parser')
for item in soup.find_all('img'):
  print(item['src'])

/images/branding/googlelogo/1x/googlelogo_white_background_color_272x92dp.png


In [11]:
pip install selenium

Note: you may need to restart the kernel to use updated packages.


**Image Scraping using Selenium**

In [115]:
!pip install selenium
import sys
sys.path.insert(0,'/usr/lib/chromium-browser/chromedriver')


from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import requests
import shutil
import os
import getpass
import urllib.request
import io
import time
from PIL import Image

user = getpass.getuser()
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')

driver = webdriver.Chrome()

def scroll_to_end(driver):
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    time.sleep(5)#sleep_between_interactions

    
def getImageUrls(name,totalImgs,driver):
    search_url = "https://www.google.com/search?q=cat&tbm=isch&ved=2ahUKEwjNn_Gn7YyFAxU3yDgGHQYQCesQ2-cCegQIABAA&oq=cat&gs_lp=EgNpbWciA2NhdDINEAAYgAQYigUYQxixAzIIEAAYgAQYsQMyDhAAGIAEGIoFGLEDGIMBMggQABiABBixAzILEAAYgAQYsQMYgwEyCBAAGIAEGLEDMggQABiABBixAzIFEAAYgAQyCBAAGIAEGLEDMggQABiABBixA0iqGVCADlilF3AAeACQAQCYAVegAc0CqgEBNLgBA8gBAPgBAYoCC2d3cy13aXotaW1nwgIKEAAYgAQYigUYQ4gGAQ&sclient=img&ei=WBYAZs2TMLeQ4-EPhqCk2A4&bih=568&biw=1251&hl=en"
    driver.get(search_url)
    img_urls = set()
    img_count = 0
    results_start = 0
    
    while(img_count+results_start<totalImgs): #Extract actual images now
        scroll_to_end(driver)
        totalResults = driver.find_elements(By.CLASS_NAME,"Q4LuWd")       
        print('total results:', len(totalResults))
        print(f"Found: {totalResults} search results. Extracting links from{results_start}:{totalResults}")
        for img in totalResults[results_start:totalImgs]:
            img.click()
            time.sleep(5)
            image = driver.find_element(By.CLASS_NAME,'iPVvYb')
            img_urls.add(image.get_attribute('src'))
            print(img_urls)
            img_count=len(img_urls)
            print(img_count)
                    
    return img_urls

def downloadImages(folder_path,file_name,url):
    try:
        image_content = requests.get(url).content
    except Exception as e:
        print(f"ERROR - COULD NOT DOWNLOAD {url} - {e}")
    try:
        image_file = io.BytesIO(image_content)
        image = Image.open(image_file).convert('RGB')
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'wb') as f:
            image.save(f, "JPEG", quality=85)
        print(f"SAVED - {url} - AT: {file_path}")
    except Exception as e:
        print(f"ERROR - COULD NOT SAVE {url} - {e}")
        
def saveInDestFolder(searchNames,destDir,totalImgs,driver):
    for name in list(searchNames):
        path=os.path.join(destDir,name)
        if not os.path.isdir(path):
            os.mkdir(path)
        print('Current Path',path)
        totalLinks=getImageUrls(name,totalImgs,driver)
        print('totalLinks',totalLinks)
        
    if totalLinks is None:
        print('images not found for :',name)
        
    else:
        for i, link in enumerate(totalLinks):
            file_name = f"{i:150}.jpg"
            downloadImages(path,file_name,link)
            
searchNames=['cat']
destDir=f'C:/Users/apuyan/Desktop/HOA 7.2 Webscraping using BeautifulSoup and Requests'
totalImgs=5

saveInDestFolder(searchNames,destDir,totalImgs,driver)

Current Path C:/Users/apuyan/Desktop/HOA 7.2 Webscraping using BeautifulSoup and Requests\cat
total results: 100
Found: [<selenium.webdriver.remote.webelement.WebElement (session="49eb72a05162c1dc011de4203b08786f", element="f.8DE2873D4CF6E70B1A6CF5649BC6F9F3.d.CA0BD5533733909361D2275A54F968E1.e.10")>, <selenium.webdriver.remote.webelement.WebElement (session="49eb72a05162c1dc011de4203b08786f", element="f.8DE2873D4CF6E70B1A6CF5649BC6F9F3.d.CA0BD5533733909361D2275A54F968E1.e.12")>, <selenium.webdriver.remote.webelement.WebElement (session="49eb72a05162c1dc011de4203b08786f", element="f.8DE2873D4CF6E70B1A6CF5649BC6F9F3.d.CA0BD5533733909361D2275A54F968E1.e.14")>, <selenium.webdriver.remote.webelement.WebElement (session="49eb72a05162c1dc011de4203b08786f", element="f.8DE2873D4CF6E70B1A6CF5649BC6F9F3.d.CA0BD5533733909361D2275A54F968E1.e.16")>, <selenium.webdriver.remote.webelement.WebElement (session="49eb72a05162c1dc011de4203b08786f", element="f.8DE2873D4CF6E70B1A6CF5649BC6F9F3.d.CA0BD553373

**Web Scraping of Movies Information using BeautifulSoup**

We want to analyze the distributions of IMDB and Metacritic movie ratings to see if we find anything interesting. To do this, we’ll first scrape data for over 2000 movies.

In [13]:
from requests import get
url = 'https://www.imdb.com/search/title/?release_date=2017-01-01,2017-12-31&sort=num_votes,desc'
agent = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36'}
response = get(url,headers=agent)
print(response.text[:500])

<!DOCTYPE html><html lang="en-US" xmlns:og="http://opengraphprotocol.org/schema/" xmlns:fb="http://www.facebook.com/2008/fbml"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><script>if(typeof uet === 'function'){ uet('bb', 'LoadTitle', {wb: 1}); }</script><script>window.addEventListener('load', (event) => {
        if (typeof window.csa !== 'undefined' && typeof window.csa === 'function') {
            var csaLatencyPlugin = window.csa('Content', {
             


In [14]:
from bs4 import BeautifulSoup
html_soup = BeautifulSoup(response.text, 'html.parser')
headers = {'Accept-Language': 'en-US,en;q=0.8'}
type(html_soup)

bs4.BeautifulSoup

In [15]:
movie_containers = html_soup.find_all('div', class_ = 'sc-ab6fa25a-3 bVYfLY dli-parent')
print(type(movie_containers))
print(len(movie_containers))

<class 'bs4.element.ResultSet'>
50


**First Movie**

Extracting the data for a single movie

We can access the first container, which contains information about a single movie, by using list notation on movie_containers.

In [16]:
first_movie = movie_containers[0]
first_movie

<div class="sc-ab6fa25a-3 bVYfLY dli-parent"><div class="sc-ab6fa25a-2 gOsifL"><div class="sc-e5a25b0f-0 jQjDIb dli-poster-container"><div class="ipc-poster ipc-poster--base ipc-poster--dynamic-width ipc-sub-grid-item ipc-sub-grid-item--span-2" role="group"><div aria-label="add to watchlist" class="ipc-watchlist-ribbon ipc-focusable ipc-watchlist-ribbon--s ipc-watchlist-ribbon--base ipc-watchlist-ribbon--loading ipc-watchlist-ribbon--onImage ipc-poster__watchlist-ribbon" role="button" tabindex="0"><svg class="ipc-watchlist-ribbon__bg" height="34px" role="presentation" viewbox="0 0 24 34" width="24px" xmlns="http://www.w3.org/2000/svg"><polygon class="ipc-watchlist-ribbon__bg-ribbon" fill="#000000" points="24 0 0 0 0 32 12.2436611 26.2926049 24 31.7728343"></polygon><polygon class="ipc-watchlist-ribbon__bg-hover" points="24 0 0 0 0 32 12.2436611 26.2926049 24 31.7728343"></polygon><polygon class="ipc-watchlist-ribbon__bg-shadow" points="24 31.7728343 24 33.7728343 12.2436611 28.2926049 

The name of the movie

In [17]:
first_movie.div

<div class="sc-ab6fa25a-2 gOsifL"><div class="sc-e5a25b0f-0 jQjDIb dli-poster-container"><div class="ipc-poster ipc-poster--base ipc-poster--dynamic-width ipc-sub-grid-item ipc-sub-grid-item--span-2" role="group"><div aria-label="add to watchlist" class="ipc-watchlist-ribbon ipc-focusable ipc-watchlist-ribbon--s ipc-watchlist-ribbon--base ipc-watchlist-ribbon--loading ipc-watchlist-ribbon--onImage ipc-poster__watchlist-ribbon" role="button" tabindex="0"><svg class="ipc-watchlist-ribbon__bg" height="34px" role="presentation" viewbox="0 0 24 34" width="24px" xmlns="http://www.w3.org/2000/svg"><polygon class="ipc-watchlist-ribbon__bg-ribbon" fill="#000000" points="24 0 0 0 0 32 12.2436611 26.2926049 24 31.7728343"></polygon><polygon class="ipc-watchlist-ribbon__bg-hover" points="24 0 0 0 0 32 12.2436611 26.2926049 24 31.7728343"></polygon><polygon class="ipc-watchlist-ribbon__bg-shadow" points="24 31.7728343 24 33.7728343 12.2436611 28.2926049 0 34 0 32 12.2436611 26.2926049"></polygon></

In [18]:
first_movie.a

<a aria-label="View title page for Logan" class="ipc-lockup-overlay ipc-focusable" href="/title/tt3315342/?ref_=sr_i_1"><div class="ipc-lockup-overlay__screen"></div></a>

In [19]:
first_movie.h3

<h3 class="ipc-title__text">1. Logan</h3>

In [20]:
first_movie.h3.a

In [21]:
first_name = first_movie.find('h3',class_='ipc-title__text').text[3:]
first_name

'Logan'

**The year of the movie's release**

In [22]:
first_year = first_movie.find('span',class_='sc-b0691f29-8 ilsLEX dli-title-metadata-item').text
first_year

'2017'

**The IMDB rating**

In [23]:
first_movie.strong

In [28]:
first_imdb = first_movie.find('span',class_='ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating').text[:3]
first_imdb

'8.1'

**The Metascore**

In [30]:
first_mscore = first_movie.find('span',class_='sc-b0901df4-0 bcQdDJ metacritic-score-box').text
first_mscore

'77'

**The number of votes**

In [36]:
first_votes = first_movie.find('span', class_='ipc-rating-star--voteCount').text[1:]
first_votes

'(827K)'

**The script**

In [64]:
#Lists to store the scraped data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []
# Extract data from individual movie container

for container in movie_containers:
# If the movie has a Metascore, then extract:
    if container.find('span',class_='sc-b0901df4-0 bcQdDJ metacritic-score-box') is not None:
        
        name = container.find('h3',class_='ipc-title__text').text[3:]
        names.append(name)
        
        year = container.find('span', class_='sc-b0691f29-8 ilsLEX dli-title-metadata-item').text
        years.append(year)
        
        imdb_rating = container.find('span',class_='ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating').text[:3]
        imdb_ratings.append(imdb_rating)
        
        metascore = int(container.find('span', class_='sc-b0901df4-0 bcQdDJ metacritic-score-box').text)
        metascores.append(metascore)
        
        vote = container.find('span', class_='ipc-rating-star--voteCount').text[1:]
        votes.append(vote)

In [65]:
import pandas as pd
test_df = pd.DataFrame({'movie': names,
'year': years,
'imdb': imdb_ratings,
'metascore': metascores,
'votes': votes
})
print(test_df.info())
test_df

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41 entries, 0 to 40
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   movie      41 non-null     object
 1   year       41 non-null     object
 2   imdb       41 non-null     object
 3   metascore  41 non-null     int64 
 4   votes      41 non-null     object
dtypes: int64(1), object(4)
memory usage: 1.7+ KB
None


Unnamed: 0,movie,year,imdb,metascore,votes
0,Logan,2017,8.1,77,(827K)
1,Thor: Ragnarok,2017,7.9,74,(813K)
2,Guardians of the Galaxy Vol. 2,2017,7.6,67,(756K)
3,Dunkirk,2017,7.8,94,(736K)
4,Spider-Man: Homecoming,2017,7.4,73,(716K)
5,Wonder Woman,2017,7.3,76,(698K)
6,Get Out,2017,7.8,85,(691K)
7,Star Wars: Episode VIII - The Last Jedi,2017,6.9,84,(670K)
8,Blade Runner 2049,2017,8.0,81,(658K)
9,Baby Driver,2017,7.5,86,(605K)


**The script for multiple pages**

In [94]:
from time import time
from time import sleep
from random import randint

from IPython.core.display import clear_output
pages = [ '1','2','3','4','5']
years_url = [ '2017', '2018', '2019', '2020']

# Redeclaring the lists to store data in
names = []
years = []
imdb_ratings = []
metascores = []
votes = []

# Preparing the monitoring of the loop
start_time = time()
requests = 0

#For every year in the interval 2000-2017
for year_url in years_url:

        # Make a get request
        url = f'https://www.imdb.com/search/title?release_date={year_url}-01-01,{year_url}-12-31&sort=num_votes,desc&page=1'
        agent = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"}
        response = get(url,headers = agent)
        print(response.text[:500])
        
        # Pause the loop
        sleep(randint(1,5))
        
        # Monitor the requests
        requests += 1
        elapsed_time = time() - start_time
        print('Request:{}; Frequency: {} requests/s'.format(requests, requests/elapsed_time))
        clear_output(wait = True)
        
        # Throw a warning for non-200 status codes
        if response.status_code != 200:
            print('Request: {}; Status code: {}'.format(requests, response.status_code))
            
        # Break the loop if the number of requests is greater than expected
        if requests > 72:
            print('Number of requests was greater than expected.')
            break
        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'html.parser')
        
        # Select all the 50 movie containers from a single page
        mv_containers = page_html.find_all('div', class_ = 'sc-ab6fa25a-3 bVYfLY dli-parent')
        
        # For every movie of these 50
        for container in mv_containers:
            # If the movie has a Metascore, then:
            if container.find('span', class_ = 'sc-b0901df4-0 bcQdDJ metacritic-score-box') is not None:
                # Scrape the name
                name = container.find('h3',class_='ipc-title__text').text[3:]
                names.append(name)
                
                # Scrape the year
                year = container.find('span', class_ = 'sc-b0691f29-8 ilsLEX dli-title-metadata-item').text
                years.append(year)
                
                # Scrape the IMDB rating
                imdb_rating = container.find('span', class_ = 'ipc-rating-star ipc-rating-star--base ipc-rating-star--imdb ratingGroup--imdb-rating').text[:3]
                imdb_ratings.append(imdb_rating)
                
                # Scrape the Metascore
                metascore = container.find('span', class_ = 'sc-b0901df4-0 bcQdDJ metacritic-score-box').text
                metascores.append(metascore)
                
                # Scrape the number of votes
                vote = container.find('span', class_ = 'ipc-rating-star--voteCount').text[1:]
                votes.append(vote)

<!DOCTYPE html><html lang="en-US" xmlns:og="http://opengraphprotocol.org/schema/" xmlns:fb="http://www.facebook.com/2008/fbml"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width"/><script>if(typeof uet === 'function'){ uet('bb', 'LoadTitle', {wb: 1}); }</script><script>window.addEventListener('load', (event) => {
        if (typeof window.csa !== 'undefined' && typeof window.csa === 'function') {
            var csaLatencyPlugin = window.csa('Content', {
             
Request:4; Frequency: 0.1575987123833381 requests/s


In [95]:
movie_ratings = pd.DataFrame({'movie': names,
'year': years,
'imdb': imdb_ratings,
'metascore': metascores,
'votes': votes
})
print(movie_ratings.info())
movie_ratings.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   movie      157 non-null    object
 1   year       157 non-null    object
 2   imdb       157 non-null    object
 3   metascore  157 non-null    object
 4   votes      157 non-null    object
dtypes: object(5)
memory usage: 6.3+ KB
None


Unnamed: 0,movie,year,imdb,metascore,votes
0,Logan,2017,8.1,77,(827K)
1,Thor: Ragnarok,2017,7.9,74,(813K)
2,Guardians of the Galaxy Vol. 2,2017,7.6,67,(756K)
3,Dunkirk,2017,7.8,94,(736K)
4,Spider-Man: Homecoming,2017,7.4,73,(716K)
5,Wonder Woman,2017,7.3,76,(698K)
6,Get Out,2017,7.8,85,(691K)
7,Star Wars: Episode VIII - The Last Jedi,2017,6.9,84,(670K)
8,Blade Runner 2049,2017,8.0,81,(658K)
9,Baby Driver,2017,7.5,86,(605K)


In [96]:
movie_ratings.tail(10)

Unnamed: 0,movie,year,imdb,metascore,votes
147,The Hunt,2020,6.5,50,(128K)
148,Greyhound,2020,7.0,64,(114K)
149,Hamilton,2020,8.3,88,(112K)
150,Eurovision Song Contest: The Story of Fire Saga,2020,6.5,50,(102K)
151,I'm Thinking of Ending Things,2020,6.6,78,(99K)
152,Project Power,2020,6.0,51,(97K)
153,Spenser Confidential,2020,6.2,49,(97K)
154,Underwater,2020,5.9,48,(97K)
155,Minari,2020,7.4,89,(96K)
156,News of the World,2020,6.8,73,(95K)


In [97]:
movie_ratings.to_csv('movie_ratings.csv')

**Data Preparation**

Data preprocessing

Data Processing is a process of cleaning the raw data i.e. the data is collected in the real world and is converted to a clean data set. In other words, whenever the data is
gathered from different sources it is collected in a raw format and this data isn’t feasible for the analysis. Therefore, certain steps are executed to convert the data into a small
clean data set, this part of the process is called as data preprocessing.

Most of the real-world data is messy, some of these types of data are: 1. Missing data: Missing data can be found when it is not continuously created or due to technical
issues in the application (IOT system). 2. Noisy Data This type of data is also called outliners, this can occur due to human errors (human manually gathering the data) or
some technical problem of the device at the time of collection of data. 3. Inconsistent data: This type of data might be collected due to human errors (mistakes with the
name or values) or duplication of data.

These are some of the basic pre processing techniques that can be used to convert raw data. 1. Conversion of data: As we know that Machine Learning models can only
handle numeric features, hence categorical and ordinal data must be somehow converted into numeric features. 2. Ignoring the missing values: Whenever we encounter
missing data in the data set then we can remove the row or column of data depending on our need. This method is known to be efficient but it shouldn’t be performed if there
are a lot of missing values in the dataset. 3. Filling the missing values: Whenever we encounter missing data in the data set then we can fill the missing data manually, most
commonly the mean, median or highest frequency value is used.

Example of Data Preparation of movie_rating.csv

In [98]:
movie_ratings = pd.read_csv('movie_ratings.csv')

In [99]:
movie_ratings['year'].unique()

array([2017, 2018, 2019, 2020], dtype=int64)

In [100]:
movie_ratings.dtypes

Unnamed: 0      int64
movie          object
year            int64
imdb          float64
metascore       int64
votes          object
dtype: object

In [101]:
movie_ratings['year'] = movie_ratings['year'].astype(int)

In [102]:
movie_ratings['year'].unique()

array([2017, 2018, 2019, 2020])

In [103]:
movie_ratings.dtypes

Unnamed: 0      int64
movie          object
year            int32
imdb          float64
metascore       int64
votes          object
dtype: object

In [104]:
movie_ratings.head(10)

Unnamed: 0.1,Unnamed: 0,movie,year,imdb,metascore,votes
0,0,Logan,2017,8.1,77,(827K)
1,1,Thor: Ragnarok,2017,7.9,74,(813K)
2,2,Guardians of the Galaxy Vol. 2,2017,7.6,67,(756K)
3,3,Dunkirk,2017,7.8,94,(736K)
4,4,Spider-Man: Homecoming,2017,7.4,73,(716K)
5,5,Wonder Woman,2017,7.3,76,(698K)
6,6,Get Out,2017,7.8,85,(691K)
7,7,Star Wars: Episode VIII - The Last Jedi,2017,6.9,84,(670K)
8,8,Blade Runner 2049,2017,8.0,81,(658K)
9,9,Baby Driver,2017,7.5,86,(605K)


In [105]:
movie_ratings.tail(10)

Unnamed: 0.1,Unnamed: 0,movie,year,imdb,metascore,votes
147,147,The Hunt,2020,6.5,50,(128K)
148,148,Greyhound,2020,7.0,64,(114K)
149,149,Hamilton,2020,8.3,88,(112K)
150,150,Eurovision Song Contest: The Story of Fire Saga,2020,6.5,50,(102K)
151,151,I'm Thinking of Ending Things,2020,6.6,78,(99K)
152,152,Project Power,2020,6.0,51,(97K)
153,153,Spenser Confidential,2020,6.2,49,(97K)
154,154,Underwater,2020,5.9,48,(97K)
155,155,Minari,2020,7.4,89,(96K)
156,156,News of the World,2020,6.8,73,(95K)


In [106]:
movie_ratings

Unnamed: 0.1,Unnamed: 0,movie,year,imdb,metascore,votes
0,0,Logan,2017,8.1,77,(827K)
1,1,Thor: Ragnarok,2017,7.9,74,(813K)
2,2,Guardians of the Galaxy Vol. 2,2017,7.6,67,(756K)
3,3,Dunkirk,2017,7.8,94,(736K)
4,4,Spider-Man: Homecoming,2017,7.4,73,(716K)
...,...,...,...,...,...,...
152,152,Project Power,2020,6.0,51,(97K)
153,153,Spenser Confidential,2020,6.2,49,(97K)
154,154,Underwater,2020,5.9,48,(97K)
155,155,Minari,2020,7.4,89,(96K)
