## Scraping Movie Data from imdb 

In [48]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from selenium import webdriver
import time as time
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By

In [2]:
# Instalation
s = Service((r'C:\Users\eli\Desktop\AllMyRepos\Web-Scraping-with-BeautifulSoup-and-Selenium\Project No.9\chromedriver.exe'))
chromeOptions = Options()
chromeOptions.headless = False
driver = webdriver.Chrome(service=s, options=chromeOptions)

In [3]:
# get URL
url = 'https://www.imdb.com/list/ls055592025/'
driver.get(url)

### Results for one item

In [4]:
# soup object
soup = BeautifulSoup(driver.page_source, 'lxml')

In [5]:
# results
results = soup.findAll('div', class_ = 'lister-item mode-detail')
len(results)

100

In [6]:
# title
title = results[0].find('h3', class_ = 'lister-item-header').text.replace('\n', '')
title

'1.The Godfather(1972)'

In [7]:
# information
information = results[0].find('p', class_ = "").text.replace('\n', '')
information

'The aging patriarch of an organized crime dynasty in postwar New York City transfers control of his clandestine empire to his reluctant youngest son.'

In [8]:
# description
description = results[0].find(class_ = 'list-description').find('p')
description

<p>Actors: 5 Stars
Direction: 5 Stars
Screenplay: 5 Stars<br/><br/>Oscars: 3   
Oscar Nominations: 11
BAFTA Awards: 0
BAFTA Nominations: 4
Golden Globes: 6
Golden Globe Nominations: 8</p>

In [9]:
# rating
rating = results[0].find(class_ = 'ipl-rating-star__rating').text
rating

'9.2'

In [10]:
# metascore
metascore = results[0].find('div', class_ = 'inline-block ratings-metascore').span.text.replace(' ', '')
metascore

'100'

In [11]:
# director
director = results[0].findAll('a')[-5].text
director

'Francis Ford Coppola'

In [12]:
# stars
stars_list = []
stars = results[0].findAll('a')[-4 : ]
for star in stars:
    stars_list.append(star.text)
stars_list

['Marlon Brando', 'Al Pacino', 'James Caan', 'Diane Keaton']

### Scroling and collecting results for all items

In [13]:
# lists
titles_ = []
informations_ = []
descriptions_ = []
ratings_ = []
metascores_ = []
directors_ = []
stars_ = []

In [14]:
# Scrolling
last_height = driver.execute_script('return document.documentElement.scrollTop')
while True:
    # Soup Object
    soup = BeautifulSoup(driver.page_source, 'lxml')
    results = soup.findAll('div', class_ = 'lister-item mode-detail')

    for result in results:
        title = result.find('h3', class_ = 'lister-item-header').text.replace('\n', '')
        titles_.append(title)
        information = result.find('p', class_ = "").text.replace('\n', '')
        informations_.append(information)
        description = result.find(class_ = 'list-description').find('p')
        descriptions_.append(description)
        rating = result.find(class_ = 'ipl-rating-star__rating').text
        ratings_.append(rating)
        try:
            metascore = result.find('div', class_ = 'inline-block ratings-metascore').span.text.replace(' ', '')
            metascores_.append(metascore)
        except:
            metascores_.append(np.nan)
        
        director = result.findAll('a')[-5].text
        directors_.append(director)
        stars_list = []
        stars = result.findAll('a')[-4 : ]
        for star in stars:
            stars_list.append(star.text)
        stars_.append(stars_list)

    driver.execute_script("window.scrollBy(0, 10500);") 
    time.sleep(2)
    new_height = driver.execute_script('return document.documentElement.scrollTop;')
    if new_height == last_height:
        break
    last_height = new_height

In [36]:
df_movies = pd.DataFrame({ 'Title' : titles_,'Information': informations_,'Description': descriptions_, 
                           'Rating' : ratings_, 'Metascore' : metascores_,'Director' : directors_,
                           'Stars' : stars_})

In [27]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        600 non-null    object
 1   Information  600 non-null    object
 2   Description  600 non-null    object
 3   Rating       600 non-null    object
 4   Metascore    582 non-null    object
 5   Director     600 non-null    object
 6   Stars        600 non-null    object
dtypes: object(7)
memory usage: 32.9+ KB


In [17]:
df_movies.head()

Unnamed: 0,Title,Information,Description,Rating,Metascore,Director,Stars
0,1.The Godfather(1972),The aging patriarch of an organized crime dyna...,[Actors: 5 Stars\nDirection: 5 Stars\nScreenpl...,9.2,100,Francis Ford Coppola,"[Marlon Brando, Al Pacino, James Caan, Diane K..."
1,2.The Shawshank Redemption(1994),Two imprisoned men bond over a number of years...,[Actors: 4.8 Stars\nDirection: 5 Stars\nScreen...,9.3,81,Frank Darabont,"[Tim Robbins, Morgan Freeman, Bob Gunton, Will..."
2,3.Schindler's List(1993),"In German-occupied Poland during World War II,...",[Actors: 4.9 Stars\nDirection: 5 Stars\nScreen...,9.0,94,Steven Spielberg,"[Liam Neeson, Ralph Fiennes, Ben Kingsley, Car..."
3,4.Raging Bull(1980),"The life of boxer Jake LaMotta, whose violence...",[Actors: 5 Stars\nDirection: 5 Stars\nScreenpl...,8.2,89,Martin Scorsese,"[Robert De Niro, Cathy Moriarty, Joe Pesci, Fr..."
4,5.Casablanca(1942),A cynical expatriate American cafe owner strug...,[Actors: 5 Stars\nDirection: 5 Stars\nScreenpl...,8.5,100,Michael Curtiz,"[Humphrey Bogart, Ingrid Bergman, Paul Henreid..."


### Data Cleaning

In [28]:
df_movies['Description']

0      [Actors: 5 Stars\nDirection: 5 Stars\nScreenpl...
1      [Actors: 4.8 Stars\nDirection: 5 Stars\nScreen...
2      [Actors: 4.9 Stars\nDirection: 5 Stars\nScreen...
3      [Actors: 5 Stars\nDirection: 5 Stars\nScreenpl...
4      [Actors: 5 Stars\nDirection: 5 Stars\nScreenpl...
                             ...                        
595    [Actors: 4.8 Stars\nDirection: 5 Stars\nScreen...
596    [Actors: 4.8 Stars\nDirection: 4.9 Stars\nScre...
597    [*****\nActors: 4.7 Stars\nDirection: 4.7 Star...
598    [Actors: 4.8 Stars\nDirection: 4.9 Stars\nScre...
599    [Actors: 4.6 Stars\nDirection: 4.7 Stars\nScre...
Name: Description, Length: 600, dtype: object

In [37]:
new_description_ = []
chars = ['\n','*', '<p>','</p>', '<br/>', '<\br>', 'Stars']
for index, row in df_movies.iterrows():
    new_description = row['Description']
    for i in chars:
        new_description = str(new_description).replace(i, '')
    new_description_.append(new_description)
new_description_
df_movies["New_description"] = new_description_
df_movies.head()

Unnamed: 0,Title,Information,Description,Rating,Metascore,Director,Stars,New_description
0,1.The Godfather(1972),The aging patriarch of an organized crime dyna...,[Actors: 5 Stars\nDirection: 5 Stars\nScreenpl...,9.2,100,Francis Ford Coppola,"[Marlon Brando, Al Pacino, James Caan, Diane K...",Actors: 5 Direction: 5 Screenplay: 5 Oscars: 3...
1,2.The Shawshank Redemption(1994),Two imprisoned men bond over a number of years...,[Actors: 4.8 Stars\nDirection: 5 Stars\nScreen...,9.3,81,Frank Darabont,"[Tim Robbins, Morgan Freeman, Bob Gunton, Will...",Actors: 4.8 Direction: 5 Screenplay: 4.9 Oscar...
2,3.Schindler's List(1993),"In German-occupied Poland during World War II,...",[Actors: 4.9 Stars\nDirection: 5 Stars\nScreen...,9.0,94,Steven Spielberg,"[Liam Neeson, Ralph Fiennes, Ben Kingsley, Car...",Actors: 4.9 Direction: 5 Screenplay: 5 Oscars:...
3,4.Raging Bull(1980),"The life of boxer Jake LaMotta, whose violence...",[Actors: 5 Stars\nDirection: 5 Stars\nScreenpl...,8.2,89,Martin Scorsese,"[Robert De Niro, Cathy Moriarty, Joe Pesci, Fr...",Actors: 5 Direction: 5 Screenplay: 5 Oscars: 2...
4,5.Casablanca(1942),A cynical expatriate American cafe owner strug...,[Actors: 5 Stars\nDirection: 5 Stars\nScreenpl...,8.5,100,Michael Curtiz,"[Humphrey Bogart, Ingrid Bergman, Paul Henreid...",Actors: 5 Direction: 5 Screenplay: 5 Oscars: 3...


In [38]:
# convert list column into string
df_movies['Stars'] = [','.join(map(str, l)) for l in df_movies['Stars']]
df_movies.head()

Unnamed: 0,Title,Information,Description,Rating,Metascore,Director,Stars,New_description
0,1.The Godfather(1972),The aging patriarch of an organized crime dyna...,[Actors: 5 Stars\nDirection: 5 Stars\nScreenpl...,9.2,100,Francis Ford Coppola,"Marlon Brando,Al Pacino,James Caan,Diane Keaton",Actors: 5 Direction: 5 Screenplay: 5 Oscars: 3...
1,2.The Shawshank Redemption(1994),Two imprisoned men bond over a number of years...,[Actors: 4.8 Stars\nDirection: 5 Stars\nScreen...,9.3,81,Frank Darabont,"Tim Robbins,Morgan Freeman,Bob Gunton,William ...",Actors: 4.8 Direction: 5 Screenplay: 4.9 Oscar...
2,3.Schindler's List(1993),"In German-occupied Poland during World War II,...",[Actors: 4.9 Stars\nDirection: 5 Stars\nScreen...,9.0,94,Steven Spielberg,"Liam Neeson,Ralph Fiennes,Ben Kingsley,Carolin...",Actors: 4.9 Direction: 5 Screenplay: 5 Oscars:...
3,4.Raging Bull(1980),"The life of boxer Jake LaMotta, whose violence...",[Actors: 5 Stars\nDirection: 5 Stars\nScreenpl...,8.2,89,Martin Scorsese,"Robert De Niro,Cathy Moriarty,Joe Pesci,Frank ...",Actors: 5 Direction: 5 Screenplay: 5 Oscars: 2...
4,5.Casablanca(1942),A cynical expatriate American cafe owner strug...,[Actors: 5 Stars\nDirection: 5 Stars\nScreenpl...,8.5,100,Michael Curtiz,"Humphrey Bogart,Ingrid Bergman,Paul Henreid,Cl...",Actors: 5 Direction: 5 Screenplay: 5 Oscars: 3...


In [39]:
df_movies.drop(columns=['Description'], inplace=True)
df_movies.head()

Unnamed: 0,Title,Information,Rating,Metascore,Director,Stars,New_description
0,1.The Godfather(1972),The aging patriarch of an organized crime dyna...,9.2,100,Francis Ford Coppola,"Marlon Brando,Al Pacino,James Caan,Diane Keaton",Actors: 5 Direction: 5 Screenplay: 5 Oscars: 3...
1,2.The Shawshank Redemption(1994),Two imprisoned men bond over a number of years...,9.3,81,Frank Darabont,"Tim Robbins,Morgan Freeman,Bob Gunton,William ...",Actors: 4.8 Direction: 5 Screenplay: 4.9 Oscar...
2,3.Schindler's List(1993),"In German-occupied Poland during World War II,...",9.0,94,Steven Spielberg,"Liam Neeson,Ralph Fiennes,Ben Kingsley,Carolin...",Actors: 4.9 Direction: 5 Screenplay: 5 Oscars:...
3,4.Raging Bull(1980),"The life of boxer Jake LaMotta, whose violence...",8.2,89,Martin Scorsese,"Robert De Niro,Cathy Moriarty,Joe Pesci,Frank ...",Actors: 5 Direction: 5 Screenplay: 5 Oscars: 2...
4,5.Casablanca(1942),A cynical expatriate American cafe owner strug...,8.5,100,Michael Curtiz,"Humphrey Bogart,Ingrid Bergman,Paul Henreid,Cl...",Actors: 5 Direction: 5 Screenplay: 5 Oscars: 3...


In [40]:
# rename column
df_movies = df_movies.rename(columns = {'New_description' : 'Description'})
df_movies.head()

Unnamed: 0,Title,Information,Rating,Metascore,Director,Stars,Description
0,1.The Godfather(1972),The aging patriarch of an organized crime dyna...,9.2,100,Francis Ford Coppola,"Marlon Brando,Al Pacino,James Caan,Diane Keaton",Actors: 5 Direction: 5 Screenplay: 5 Oscars: 3...
1,2.The Shawshank Redemption(1994),Two imprisoned men bond over a number of years...,9.3,81,Frank Darabont,"Tim Robbins,Morgan Freeman,Bob Gunton,William ...",Actors: 4.8 Direction: 5 Screenplay: 4.9 Oscar...
2,3.Schindler's List(1993),"In German-occupied Poland during World War II,...",9.0,94,Steven Spielberg,"Liam Neeson,Ralph Fiennes,Ben Kingsley,Carolin...",Actors: 4.9 Direction: 5 Screenplay: 5 Oscars:...
3,4.Raging Bull(1980),"The life of boxer Jake LaMotta, whose violence...",8.2,89,Martin Scorsese,"Robert De Niro,Cathy Moriarty,Joe Pesci,Frank ...",Actors: 5 Direction: 5 Screenplay: 5 Oscars: 2...
4,5.Casablanca(1942),A cynical expatriate American cafe owner strug...,8.5,100,Michael Curtiz,"Humphrey Bogart,Ingrid Bergman,Paul Henreid,Cl...",Actors: 5 Direction: 5 Screenplay: 5 Oscars: 3...


In [44]:
# convert string columns into numeric
df_movies['Rating'] = pd.to_numeric(df_movies['Rating'], errors = 'coerce')
df_movies['Metascore'] = pd.to_numeric(df_movies['Metascore'], errors = 'coerce')

In [45]:
df_movies.drop_duplicates(keep='first', inplace=True, ignore_index=False)

In [46]:
df_movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 0 to 99
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Title        100 non-null    object 
 1   Information  100 non-null    object 
 2   Rating       100 non-null    float64
 3   Metascore    97 non-null     float64
 4   Director     100 non-null    object 
 5   Stars        100 non-null    object 
 6   Description  100 non-null    object 
dtypes: float64(2), object(5)
memory usage: 6.2+ KB


In [47]:
df_movies.to_excel('top_100_movies_all_time.xlsx', index = False)