### Web scraping - IMDB using BeautifulSoup

In [1]:
import pandas as pd
import requests  # used to requests URL
from bs4 import BeautifulSoup
#BeautifulSoup (bs4) is a python library that's dependent on other libraries to function properly.

### Request page source from URl

In [19]:
url='https://www.imdb.com/chart/top/'

In [20]:
page = requests.get(url) 
page
# validating the page before moving on - 200 responses 

<Response [200]>

In [21]:
# display the page source code
page.content

b'\n\n\n<!DOCTYPE html>\n<html\n    xmlns:og="http://ogp.me/ns#"\n    xmlns:fb="http://www.facebook.com/2008/fbml">\n    <head>\n         \n\n        <meta charset="utf-8">\n        <meta http-equiv="X-UA-Compatible" content="IE=edge">\n\n    \n    \n    \n\n    \n    \n    \n\n\n\n\n        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:\'java\'};</script>\n\n<script>\n    if (typeof uet == \'function\') {\n      uet("bb", "LoadTitle", {wb: 1});\n    }\n</script>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>\n        <title>Top 250 Movies - IMDb</title>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>\n<script>\n    if (typeof uet == \'function\') {\n      uet("be", "LoadTitle", {wb: 1});\n    }\n</script>\n<script>\n    if (typeof uex == \'function\') {\n      uex("ld", "LoadTitle", {wb: 1});\n    }\n

In [22]:
soup=BeautifulSoup(page.content,'html.parser')
# depend on the page we should select the parser
print(soup.prettify()) #it remove the newline and display in the form of html

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime(),pt:'java'};
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <title>
   Top 250 Movies - IMDb
  </title>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   if (typeof uex == 'function') {
      uex("ld", "LoadTitle", {wb: 1});
    }
  </script>
  <link href="https://www.imdb.com/chart/top/" rel="canonical"/>
  <meta content="http://w

In [23]:
scraped_movie=soup.find_all('td',class_='titleColumn')
scraped_movie
# note id (td) and class in website itself

[<td class="titleColumn">
       1.
       <a href="/title/tt0111161/" title="Frank Darabont (dir.), Tim Robbins, Morgan Freeman">The Shawshank Redemption</a>
 <span class="secondaryInfo">(1994)</span>
 </td>,
 <td class="titleColumn">
       2.
       <a href="/title/tt0068646/" title="Francis Ford Coppola (dir.), Marlon Brando, Al Pacino">The Godfather</a>
 <span class="secondaryInfo">(1972)</span>
 </td>,
 <td class="titleColumn">
       3.
       <a href="/title/tt0071562/" title="Francis Ford Coppola (dir.), Al Pacino, Robert De Niro">The Godfather: Part II</a>
 <span class="secondaryInfo">(1974)</span>
 </td>,
 <td class="titleColumn">
       4.
       <a href="/title/tt0468569/" title="Christopher Nolan (dir.), Christian Bale, Heath Ledger">The Dark Knight</a>
 <span class="secondaryInfo">(2008)</span>
 </td>,
 <td class="titleColumn">
       5.
       <a href="/title/tt0050083/" title="Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb">12 Angry Men</a>
 <span class="secondaryInfo">

In [24]:
#parse movie name
movies=[]
for movie in scraped_movie:
    movie=movie.get_text().replace('\n','') # replacing newline
    movie=movie.strip(' ') # to remove all white spaces, tabs and newline
    movies.append(movie)
movies

['1.      The Shawshank Redemption(1994)',
 '2.      The Godfather(1972)',
 '3.      The Godfather: Part II(1974)',
 '4.      The Dark Knight(2008)',
 '5.      12 Angry Men(1957)',
 "6.      Schindler's List(1993)",
 '7.      The Lord of the Rings: The Return of the King(2003)',
 '8.      Pulp Fiction(1994)',
 '9.      Il buono, il brutto, il cattivo(1966)',
 '10.      The Lord of the Rings: The Fellowship of the Ring(2001)',
 '11.      Fight Club(1999)',
 '12.      Forrest Gump(1994)',
 '13.      Inception(2010)',
 '14.      The Lord of the Rings: The Two Towers(2002)',
 '15.      Star Wars: Episode V - The Empire Strikes Back(1980)',
 '16.      The Matrix(1999)',
 '17.      Goodfellas(1990)',
 "18.      One Flew Over the Cuckoo's Nest(1975)",
 '19.      Shichinin no samurai(1954)',
 '20.      Se7en(1995)',
 '21.      The Silence of the Lambs(1991)',
 '22.      Cidade de Deus(2002)',
 '23.      La vita è bella(1997)',
 "24.      It's a Wonderful Life(1946)",
 '25.      Star Wars(1977)

In [25]:
# scrap rating for movies
scraped_ratings=soup.find_all('td',class_='ratingColumn imdbRating')
scraped_ratings

[<td class="ratingColumn imdbRating">
 <strong title="9.2 based on 2,497,591 user ratings">9.2</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="9.1 based on 1,722,208 user ratings">9.1</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="9.0 based on 1,195,379 user ratings">9.0</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="9.0 based on 2,448,129 user ratings">9.0</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.9 based on 737,802 user ratings">8.9</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.9 based on 1,278,655 user ratings">8.9</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.9 based on 1,726,340 user ratings">8.9</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.8 based on 1,928,269 user ratings">8.8</strong>
 </td>,
 <td class="ratingColumn imdbRating">
 <strong title="8.8 based on 724,141 user ratings">8.8</strong>
 </t

In [26]:
ratings=[]
for rating in scraped_ratings:
    rating=rating.get_text().replace('\n','')
    rating=rating.strip(' ')
    ratings.append(rating)
ratings

['9.2',
 '9.1',
 '9.0',
 '9.0',
 '8.9',
 '8.9',
 '8.9',
 '8.8',
 '8.8',
 '8.8',
 '8.8',
 '8.7',
 '8.7',
 '8.7',
 '8.7',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.4',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.3',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',
 '8.2',


### Storing Scarped Data

In [27]:
data=pd.DataFrame()
data['Movie Name'] = movies
data['Ratings']=ratings
data.head()

Unnamed: 0,Movie Name,Ratings
0,1. The Shawshank Redemption(1994),9.2
1,2. The Godfather(1972),9.1
2,3. The Godfather: Part II(1974),9.0
3,4. The Dark Knight(2008),9.0
4,5. 12 Angry Men(1957),8.9


In [28]:
data.to_csv('IMDB Top Movies.csv',index=False)

### Extracting Reviews from flipkart website - BeautifulScoup

In [1]:
import requests
from bs4 import BeautifulSoup

In [2]:
url='https://www.flipkart.com/birde-combo-pack-2-sports-shoes-running-men/product-reviews/itma4b1034f357f3?pid=SHOGYNMSJZTYFMM9&lid=LSTSHOGYNMSJZTYFMM94HXZWH&marketplace=FLIPKART'

In [3]:
page = requests.get(url)
page

<Response [200]>

In [4]:
page.text

'<!doctype html><html lang="en"><head><link href="https://rukminim1.flixcart.com" rel="preconnect"/><link rel="stylesheet" href="//static-assets-web.flixcart.com/www/linchpin/fk-cp-zion/css/app.chunk.8a1772.css"/><meta http-equiv="Content-type" content="text/html; charset=utf-8"/><meta http-equiv="X-UA-Compatible" content="IE=Edge"/><meta property="fb:page_id" content="102988293558"/><meta property="fb:admins" content="658873552,624500995,100000233612389"/><meta name="robots" content="noodp"/><link rel="shortcut icon" href="https://static-assets-web.flixcart.com/www/promos/new/20150528-140547-favicon-retina.ico"/><link type="application/opensearchdescription+xml" rel="search" href="/osdd.xml?v=2"/><meta property="og:type" content="website"/><meta name="og_site_name" property="og:site_name" content="Flipkart.com"/><link rel="apple-touch-icon" sizes="57x57" href="/apple-touch-icon-57x57.png"/><link rel="apple-touch-icon" sizes="72x72" href="/apple-touch-icon-72x72.png"/><link rel="apple-

In [6]:
soup=BeautifulSoup(page.text,'html.parser')
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <link href="https://rukminim1.flixcart.com" rel="preconnect"/>
  <link href="//static-assets-web.flixcart.com/www/linchpin/fk-cp-zion/css/app.chunk.8a1772.css" rel="stylesheet"/>
  <meta content="text/html; charset=utf-8" http-equiv="Content-type"/>
  <meta content="IE=Edge" http-equiv="X-UA-Compatible"/>
  <meta content="102988293558" property="fb:page_id"/>
  <meta content="658873552,624500995,100000233612389" property="fb:admins"/>
  <meta content="noodp" name="robots"/>
  <link href="https://static-assets-web.flixcart.com/www/promos/new/20150528-140547-favicon-retina.ico" rel="shortcut icon"/>
  <link href="/osdd.xml?v=2" rel="search" type="application/opensearchdescription+xml"/>
  <meta content="website" property="og:type"/>
  <meta content="Flipkart.com" name="og_site_name" property="og:site_name"/>
  <link href="/apple-touch-icon-57x57.png" rel="apple-touch-icon" sizes="57x57"/>
  <link href="/apple-touch-icon-72x72.png" rel="apple-tou

In [7]:
reviews=soup.find_all('div',class_='_6K-7Co')

In [8]:
reviews

[<div class="_6K-7Co">Very nice 🙂</div>,
 <div class="_6K-7Co">Order now nice shoes</div>,
 <div class="_6K-7Co">Good quality</div>,
 <div class="_6K-7Co">Nice product... Thnxx to flipcart</div>,
 <div class="_6K-7Co">Good product</div>,
 <div class="_6K-7Co">Good product</div>,
 <div class="_6K-7Co">Awesome product</div>,
 <div class="_6K-7Co">Good product</div>,
 <div class="_6K-7Co">Black shoes not good</div>,
 <div class="_6K-7Co">Wonderful shoes so comfortable to wear.<br/>Made my day.<br/>Thanks Birdie</div>]

In [9]:
review=[]
for item in reviews:
    item=item.get_text().replace('\n','')
    item=item.strip(' ')
    review.append(item)

In [10]:
review

['Very nice 🙂',
 'Order now nice shoes',
 'Good quality',
 'Nice product... Thnxx to flipcart',
 'Good product',
 'Good product',
 'Awesome product',
 'Good product',
 'Black shoes not good',
 'Wonderful shoes so comfortable to wear.Made my day.Thanks Birdie']

In [11]:
ratings=soup.find_all('div',class_='_3LWZlK _1BLPMq _3B8WaH')
ratings

[<div class="_3LWZlK _1BLPMq _3B8WaH">5</div>,
 <div class="_3LWZlK _1BLPMq _3B8WaH">5</div>,
 <div class="_3LWZlK _1BLPMq _3B8WaH">4<img class="_1wB99o" src="data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHdpZHRoPSIxMyIgaGVpZ2h0PSIxMiI+PHBhdGggZmlsbD0iI0ZGRiIgZD0iTTYuNSA5LjQzOWwtMy42NzQgMi4yMy45NC00LjI2LTMuMjEtMi44ODMgNC4yNTQtLjQwNEw2L

In [16]:
rating=[]
for item in ratings:
    item=item.get_text().replace('\n','')
    item=item.strip(' ')
    rating.append(item)

In [17]:
rating

['5', '5', '4', '5', '5', '5', '4', '3', '5']

In [18]:
import pandas as pd
data=pd.DataFrame()
data['Review'] = review
data['Ratings']= rating
data.head()

ValueError: Length of values (9) does not match length of index (10)