**Created by: Ayoub Touti, Mohamed Aziz Catalan**

# **Data scraping from IMDb**

In [None]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import pandas as pd
from textblob import TextBlob
from requests import get
from time import sleep
from random import randint
from warnings import warn
import matplotlib.pyplot as plt



In [None]:
# `pages` holds the values substituted into the search URL's `start=` parameter.
# With a stop of 20 and a step of 50 the range contains a single value, 1,
# so only the first results page is requested.
first_result, stop_before, page_step = 1, 20, 50
pages = np.arange(first_result, stop_before, page_step)

# Ask for English content. NOTE(review): the original author reported that
# without this header the site answered in Mandarin; this likely depends on
# where the request originates — confirm.
headers = {'Accept-Language': 'en-US,en;q=0.8'}

In [None]:
# Accumulators for the scraped fields: nine independent, initially empty
# lists that the scraping loop appends to (one entry per kept title).
(titles, years, ratings, genres, runtimes,
 imdb_ratings, imdb_ratings_standardized, metascores, votes) = ([] for _ in range(9))


In [None]:
def _span_text(parent, css_class):
    """Return the text of the first <span class=css_class> inside `parent`.

    Returns None when `parent` is None or no such span exists, so that a
    title lacking an optional field (e.g. no certificate) does not abort
    the whole scrape with an AttributeError.
    """
    if parent is None:
        return None
    span = parent.find('span', class_=css_class)
    return span.text if span is not None else None


for page in pages:

    # Search URL for sci-fi titles; `page` is the value of the `start` parameter.
    url = ("https://www.imdb.com/search/title?genres=sci-fi&"
           + "start="
           + str(page)
           + "&explore=title_type,genres&ref_=adv_prv")

    # Send the Accept-Language header from the configuration cell
    # (it was defined there but never attached to the request).
    response = get(url, headers=headers)

    # Wait a random 8-15 seconds after each request (presumably to throttle
    # the scrape and avoid being blocked).
    sleep(randint(8, 15))

    # Warn about, and skip, any page that did not come back with status 200:
    # there is no result listing to parse in that case.
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(url, response.status_code))
        continue

    # Parse the page and collect one container per listed title.
    page_html = BeautifulSoup(response.text, 'html.parser')
    movie_containers = page_html.find_all('div', class_='lister-item mode-advanced')

    for container in movie_containers:

        # Only titles that carry a Metascore block are kept.
        if container.find('div', class_='ratings-metascore') is None:
            continue

        # Read every field BEFORE appending anything, so that a title with
        # missing data can never leave the accumulator lists at different
        # lengths (which would break the DataFrame construction later).
        title = container.h3.a.text
        year = _span_text(container.h3, 'lister-item-year text-muted unbold')
        rating = _span_text(container.p, 'certificate')   # age certificate, may be absent
        genre = _span_text(container.p, 'genre')
        runtime = _span_text(container.p, 'runtime')

        imdb_tag = container.strong
        metascore_tag = container.find('span', class_='metascore')
        votes_tag = container.find('span', attrs={'name': 'nv'})

        # The numeric fields are required; skip the title if any is missing.
        if imdb_tag is None or metascore_tag is None or votes_tag is None:
            warn('Skipping {!r}: missing IMDb rating, metascore or vote count'.format(title))
            continue

        imdb = float(imdb_tag.text)
        m_score = int(metascore_tag.text)
        vote = int(votes_tag['data-value'])

        # All fields available: record the title (optional text fields may be None).
        titles.append(title)
        years.append(year)
        ratings.append(rating)
        genres.append(genre)
        runtimes.append(runtime)
        imdb_ratings.append(imdb)
        metascores.append(m_score)
        votes.append(vote)

In [None]:
# Assemble the scraped lists into one table (one row per kept title).
sci_fi_df = pd.DataFrame({'movie': titles,
                          'year': years,
                          'rating': ratings,
                          'genre': genres,
                          'runtime_min': runtimes,
                          'imdb': imdb_ratings,
                          'metascore': metascores,
                          'votes': votes})

# Keep the four characters before the last one of the scraped year string
# (presumably "(2022)" -> "2022"; strings ending differently, e.g. in
# "Movie)", come out as 'ovie' and are filtered out below).
sci_fi_df['year'] = sci_fi_df['year'].str[-5:-1]

# IMDb rating rescaled by a factor of 10.
sci_fi_df['n_imdb'] = sci_fi_df['imdb'] * 10

# Drop the rows hit by the 'ovie' parsing issue. `.copy()` makes final_df an
# independent frame, so the assignment below neither raises
# SettingWithCopyWarning nor gets written back into an object-dtype column.
final_df = sci_fi_df.loc[sci_fi_df['year'] != 'ovie'].copy()

# Replace the column outright so `year` ends up with a numeric dtype.
final_df['year'] = pd.to_numeric(final_df['year'])

In [None]:
# Display sci_fi_df, i.e. the table before the 'ovie' rows are dropped
# (the filtered, numeric-year version is final_df).
sci_fi_df

Unnamed: 0,movie,year,rating,genre,runtime_min,imdb,metascore,votes,n_imdb
0,Black Panther: Wakanda Forever,2022,PG-13,"\nAction, Adventure, Drama",161 min,7.3,67,91542,73.0
1,Black Adam,2022,PG-13,"\nAction, Adventure, Fantasy",125 min,6.8,41,133827,68.0
2,Black Panther,2018,PG-13,"\nAction, Adventure, Sci-Fi",134 min,7.3,88,764271,73.0
3,Nope,2022,R,"\nHorror, Mystery, Sci-Fi",130 min,6.9,77,161414,69.0
4,Everything Everywhere All at Once,2022,R,"\nAction, Adventure, Comedy",139 min,8.1,81,240086,81.0
5,Thor: Love and Thunder,2022,PG-13,"\nAction, Adventure, Comedy",118 min,6.3,57,312375,63.0
6,Strange World,2022,PG,"\nAnimation, Action, Adventure",102 min,5.8,65,1284,58.0
7,Avatar,2009,PG-13,"\nAction, Adventure, Fantasy",162 min,7.8,83,1246348,78.0
8,Spider-Man: No Way Home,2021,PG-13,"\nAction, Adventure, Fantasy",148 min,8.3,71,745579,83.0
9,Interstellar,2014,PG-13,"\nAdventure, Drama, Sci-Fi",169 min,8.6,74,1810970,86.0


# **Reviews**

In [None]:
# Download the user-review page of title tt6443346 (stored below under the
# column name 'Black Adam'). The Accept-Language header from the
# configuration cell is sent so this request matches the search scrape.
page = requests.get("https://www.imdb.com/title/tt6443346/reviews?ref_=tt_ov_rt",
                    headers=headers)

# A failed request would otherwise silently produce an empty table.
if page.status_code != 200:
    warn('Review request failed; Status code: {}'.format(page.status_code))

soup = BeautifulSoup(page.content, "html.parser")

# Each review body is a <div class="text show-more__control">.
rev = soup.find_all('div', class_='text show-more__control')

# `movies` holds the review texts (the name is kept because later cells read it).
movies = [review.text for review in rev]

# One-column table of the review texts.
df_subset = pd.DataFrame({'Black Adam': movies})

In [None]:
# Display the scraped review texts.
df_subset

Unnamed: 0,Black Adam
0,Was this script the result of a 5th grade clas...
1,Is there ever gonna be a character arc? Terrib...
2,"I hoped ""Black Adam"" would be a decent entry i..."
3,Thought Bland Adam was another misfire for the...
4,"Black Adam is not without its flaws, but i sti..."
5,Black Adam is possibly one of the most generic...
6,I was shocked by how much I disliked Black Ada...
7,Black Adam is the perfect example of how inacc...
8,"This movie is solely for kids and teens, the r..."
9,In the superhero movie genre I would rank this...


# **Sentiments**

In [None]:
# Empty frame that the following cells fill with the reviews and their sentiment labels.
df = pd.DataFrame()

In [None]:
# One row per scraped review text (`movies` is the list built in the Reviews section).
df['Reviews of people']=movies

In [None]:
# Polarity score of every review, in the same order as `movies`, rounded to
# two decimals. TextBlob's polarity lies between -1 and 1.
# NOTE: the name `list` shadows the builtin; it is kept because the
# labelling cell below iterates over it.
list = [round(TextBlob(review).sentiment.polarity, 2) for review in movies]

In [None]:
def _polarity_label(polarity):
    """Map a polarity score to its sentiment label.

    Returns "positive" for scores above 0, "neutre" for a score of exactly 0
    and "negative" otherwise.
    """
    if polarity > 0:
        return "positive"
    if polarity == 0:
        return "neutre"
    return "negative"


# `list` holds the rounded polarity scores computed in the previous cell;
# `sent` gets one label per score, in the same order.
sent = [_polarity_label(score) for score in list]

In [None]:
# Attach the sentiment label of each review; `sent` is in the same order as the reviews.
df['sentiment']=sent

In [None]:
# Display each review next to its sentiment label.
df

Unnamed: 0,Reviews of people,sentiment
0,Was this script the result of a 5th grade clas...,negative
1,Is there ever gonna be a character arc? Terrib...,negative
2,"I hoped ""Black Adam"" would be a decent entry i...",positive
3,Thought Bland Adam was another misfire for the...,negative
4,"Black Adam is not without its flaws, but i sti...",positive
5,Black Adam is possibly one of the most generic...,positive
6,I was shocked by how much I disliked Black Ada...,negative
7,Black Adam is the perfect example of how inacc...,positive
8,"This movie is solely for kids and teens, the r...",negative
9,In the superhero movie genre I would rank this...,positive
