## Importing libraries for web scraping
importing pandas to parse and store the information


In [48]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

## Fetching the url containing top rated 50 movies on IMDB

<Response [200]> This output confirms that the url is successfully fetched

In [49]:
url = "https://www.imdb.com/search/title/?groups=top_100&sort=user_rating,desc"
page = requests.get(url)
page

<Response [200]>

## Creating soup object

In [24]:
soup = BeautifulSoup(page.content,"html.parser")
print(soup.prettify())

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime(),pt:'java'};
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <title>
   IMDb "Top 100"
(Sorted by IMDb Rating Descending) - IMDb
  </title>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
  </script>
  <link href="https://www.imdb.com/search/title/?groups=top_100" rel="canonical"/>
  <meta content="http://www.i

## Scraping movie names and its ratings


All the movie names are listed under the tag h3 and class "lister-item-header"

In [28]:
# scrap the movie names
scraped_movies = soup.find_all('h3', class_ = 'lister-item-header')
scraped_movies

[<h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">1.</span>
 <a href="/title/tt0111161/">The Shawshank Redemption</a>
 <span class="lister-item-year text-muted unbold">(1994)</span>
 </h3>,
 <h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">2.</span>
 <a href="/title/tt0068646/">The Godfather</a>
 <span class="lister-item-year text-muted unbold">(1972)</span>
 </h3>,
 <h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">3.</span>
 <a href="/title/tt0468569/">The Dark Knight</a>
 <span class="lister-item-year text-muted unbold">(2008)</span>
 </h3>,
 <h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">4.</span>
 <a href="/title/tt0167260/">The Lord of the Rings: The Return of the King</a>
 <span class="lister-item-year text-muted unbold">(2003)</span>
 </h3>,
 <h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">5.</span>

In [30]:
# Scraping is the act of extraction, for instance from crawling.
# Parsing is basically breaking it down into pieces, constituent parts, or segments.

# parse the movie names
movies = []
for movie in scraped_movies:
        movies.append(movie.get_text().strip())
movies

['1.\nThe Shawshank Redemption\n(1994)',
 '2.\nThe Godfather\n(1972)',
 '3.\nThe Dark Knight\n(2008)',
 '4.\nThe Lord of the Rings: The Return of the King\n(2003)',
 "5.\nSchindler's List\n(1993)",
 '6.\nThe Godfather Part II\n(1974)',
 '7.\n12 Angry Men\n(1957)',
 '8.\nPulp Fiction\n(1994)',
 '9.\nInception\n(2010)',
 '10.\nThe Lord of the Rings: The Two Towers\n(2002)',
 '11.\nFight Club\n(1999)',
 '12.\nThe Lord of the Rings: The Fellowship of the Ring\n(2001)',
 '13.\nForrest Gump\n(1994)',
 '14.\nIl buono, il brutto, il cattivo\n(1966)',
 '15.\nThe Matrix\n(1999)',
 '16.\nGoodfellas\n(1990)',
 '17.\nThe Empire Strikes Back\n(1980)',
 "18.\nOne Flew Over the Cuckoo's Nest\n(1975)",
 '19.\nTop Gun: Maverick\n(2022)',
 '20.\nInterstellar\n(2014)',
 '21.\nCidade de Deus\n(2002)',
 '22.\nSen to Chihiro no kamikakushi\n(2001)',
 '23.\nSaving Private Ryan\n(1998)',
 '24.\nThe Green Mile\n(1999)',
 '25.\nLa vita è bella\n(1997)',
 '26.\nSe7en\n(1995)',
 '27.\nTerminator 2: Judgment Day\n(

In the above output \n is encountered which is unwanted 
Hence we will use replace function in further codes to replace \n with white-space

In [32]:
movies = []
for movie in scraped_movies:
        movie = movie.get_text().replace('\n'," ")
        movie = movie.strip()
        movies.append(movie)
movies

['1. The Shawshank Redemption (1994)',
 '2. The Godfather (1972)',
 '3. The Dark Knight (2008)',
 '4. The Lord of the Rings: The Return of the King (2003)',
 "5. Schindler's List (1993)",
 '6. The Godfather Part II (1974)',
 '7. 12 Angry Men (1957)',
 '8. Pulp Fiction (1994)',
 '9. Inception (2010)',
 '10. The Lord of the Rings: The Two Towers (2002)',
 '11. Fight Club (1999)',
 '12. The Lord of the Rings: The Fellowship of the Ring (2001)',
 '13. Forrest Gump (1994)',
 '14. Il buono, il brutto, il cattivo (1966)',
 '15. The Matrix (1999)',
 '16. Goodfellas (1990)',
 '17. The Empire Strikes Back (1980)',
 "18. One Flew Over the Cuckoo's Nest (1975)",
 '19. Top Gun: Maverick (2022)',
 '20. Interstellar (2014)',
 '21. Cidade de Deus (2002)',
 '22. Sen to Chihiro no kamikakushi (2001)',
 '23. Saving Private Ryan (1998)',
 '24. The Green Mile (1999)',
 '25. La vita è bella (1997)',
 '26. Se7en (1995)',
 '27. Terminator 2: Judgment Day (1991)',
 '28. The Silence of the Lambs (1991)',
 '29. 

In [34]:
scraped_ratings = soup.find_all('div', class_="inline-block ratings-imdb-rating")
scraped_ratings

[<div class="inline-block ratings-imdb-rating" data-value="9.3" name="ir">
 <span class="global-sprite rating-star imdb-rating"></span>
 <strong>9.3</strong>
 </div>,
 <div class="inline-block ratings-imdb-rating" data-value="9.2" name="ir">
 <span class="global-sprite rating-star imdb-rating"></span>
 <strong>9.2</strong>
 </div>,
 <div class="inline-block ratings-imdb-rating" data-value="9" name="ir">
 <span class="global-sprite rating-star imdb-rating"></span>
 <strong>9.0</strong>
 </div>,
 <div class="inline-block ratings-imdb-rating" data-value="9" name="ir">
 <span class="global-sprite rating-star imdb-rating"></span>
 <strong>9.0</strong>
 </div>,
 <div class="inline-block ratings-imdb-rating" data-value="9" name="ir">
 <span class="global-sprite rating-star imdb-rating"></span>
 <strong>9.0</strong>
 </div>,
 <div class="inline-block ratings-imdb-rating" data-value="9" name="ir">
 <span class="global-sprite rating-star imdb-rating"></span>
 <strong>9.0</strong>
 </div>,
 <div 

In [36]:
ratings = []
for rating in scraped_ratings:
    rating = rating.get_text().replace('\n'," ")
    ratings.append(rating)
ratings    
    

['  9.3 ',
 '  9.2 ',
 '  9.0 ',
 '  9.0 ',
 '  9.0 ',
 '  9.0 ',
 '  9.0 ',
 '  8.9 ',
 '  8.8 ',
 '  8.8 ',
 '  8.8 ',
 '  8.8 ',
 '  8.8 ',
 '  8.8 ',
 '  8.7 ',
 '  8.7 ',
 '  8.7 ',
 '  8.7 ',
 '  8.6 ',
 '  8.6 ',
 '  8.6 ',
 '  8.6 ',
 '  8.6 ',
 '  8.6 ',
 '  8.6 ',
 '  8.6 ',
 '  8.6 ',
 '  8.6 ',
 '  8.6 ',
 '  8.6 ',
 '  8.6 ',
 '  8.6 ',
 '  8.5 ',
 '  8.5 ',
 '  8.5 ',
 '  8.5 ',
 '  8.5 ',
 '  8.5 ',
 '  8.5 ',
 '  8.5 ',
 '  8.5 ',
 '  8.5 ',
 '  8.5 ',
 '  8.5 ',
 '  8.5 ',
 '  8.5 ',
 '  8.5 ',
 '  8.5 ',
 '  8.5 ',
 '  8.5 ']

## Storing the Scraped Data

In [37]:
data = pd.DataFrame()
data['Movie Names'] = movies
data['Ratings'] = ratings
data.head()

Unnamed: 0,Movie Names,Ratings
0,1. The Shawshank Redemption (1994),9.3
1,2. The Godfather (1972),9.2
2,3. The Dark Knight (2008),9.0
3,4. The Lord of the Rings: The Return of the Ki...,9.0
4,5. Schindler's List (1993),9.0


## Storing the data in .csv file

In [39]:
# index = False ignores the index of the data frame
data.to_csv('IMDB Top 50 Movies.csv', index = False)