<a href="https://colab.research.google.com/github/varshathejes/web-scraping-beautiful-soup/blob/main/web_scraping.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 # Web Scraping using BeautifulSoup

In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Browser-like User-Agent header sent with every request.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36'}

PAGE_SIZE = 100        # titles requested per results page (the `count` query parameter)
PAGE_COUNT = 10        # 10 pages x 100 titles covers the whole top_1000 group
REQUEST_TIMEOUT = 30   # seconds to wait for a response before giving up


def _text_or_none(parent, selector, strip=False):
    """Return the text of the first element matching `selector` under `parent`.

    Returns None when no element matches or when its text is empty.
    When `strip` is true, surrounding whitespace is removed first.
    """
    node = parent.select_one(selector)
    if node is None:
        return None
    text = node.text.strip() if strip else node.text
    return text or None


data = []  # one dict per scraped title
for page_number in range(1, PAGE_COUNT + 1):
    # `start` is the 1-based rank of the first title on the page: 1, 101, ..., 901.
    # Using page_number * 100 instead skips ranks 1-99 and leaves only a single
    # title on the last page (901 rows instead of 1000).
    start = (page_number - 1) * PAGE_SIZE + 1
    url = f'https://www.imdb.com/search/title/?count={PAGE_SIZE}&groups=top_1000&sort=user_rating&start={start}'
    page = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of silently parsing an error page into zero rows.
    page.raise_for_status()
    soup = BeautifulSoup(page.text, 'html.parser')

    for item in soup.select('.lister-item'):
        # Director and star links are told apart by the marker embedded in their href.
        director_links = item.select('a[href*="_dr_"]')
        star_links = item.select('a[href*="_st_"]')

        # All <span name="nv"> values of the entry; the first one is the vote count.
        nv_spans = item.find_all("span", attrs={"name": "nv"})
        # Gross is the first later value that starts with '$'. Taking position 1 blindly
        # can pick up a different value when gross is absent.
        # NOTE(review): presumably that other value is the "Top 250" rank (e.g. '#135') - confirm.
        gross = next((span.text for span in nv_spans[1:] if span.text.startswith("$")), "")

        rating_box = item.select_one('.ratings-imdb-rating')
        rating_tag = rating_box.strong if rating_box is not None else None

        data.append({
            'title': item.h3.a.text,
            'url': 'https://www.imdb.com' + item.a['href'],
            'director': [link.text for link in director_links] if director_links else None,
            'stars': [link.text for link in star_links] if star_links else None,
            'certificate': _text_or_none(item, '.certificate'),
            'runtime': _text_or_none(item, '.runtime'),
            'genre': _text_or_none(item, '.genre', strip=True),
            'imdb_rating': (rating_tag.text or None) if rating_tag is not None else None,
            'metascore': _text_or_none(item, '.metascore', strip=True),
            'Year': _text_or_none(item, '.text-muted', strip=True),
            'votes': nv_spans[0].text.replace(",", "") if nv_spans else "",
            'gross': gross,
        })



In [None]:
# Turn the list of scraped records into a table (one row per title).
df = pd.DataFrame(data)
# Preview the last rows of the table.
df.tail()

Unnamed: 0,title,url,director,stars,certificate,runtime,genre,imdb_rating,metascore,Year,votes,gross
896,Les yeux sans visage,https://www.imdb.com/title/tt0053459/?ref_=adv...,[Georges Franju],"[Pierre Brasseur, Alida Valli, Juliette Maynie...",Not Rated,90 min,"Drama, Horror",7.6,90.0,(1960),32785,$0.05M
897,The Odd Couple,https://www.imdb.com/title/tt0063374/?ref_=adv...,[Gene Saks],"[Jack Lemmon, Walter Matthau, John Fiedler, He...",,105 min,Comedy,7.6,86.0,(1968),35825,$44.53M
898,Shine,https://www.imdb.com/title/tt0117631/?ref_=adv...,[Scott Hicks],"[Geoffrey Rush, Armin Mueller-Stahl, Justin Br...",Not Rated,105 min,"Biography, Drama, Music",7.6,87.0,(1996),54965,$35.81M
899,The Invisible Man,https://www.imdb.com/title/tt0024184/?ref_=adv...,[James Whale],"[Claude Rains, Gloria Stuart, William Harrigan...",,71 min,"Horror, Sci-Fi",7.6,87.0,(1933),36955,
900,Celda 211,https://www.imdb.com/title/tt1242422/?ref_=adv...,[Daniel Monzón],"[Luis Tosar, Alberto Ammann, Antonio Resines, ...",A,113 min,"Action, Adventure, Crime",7.6,,(2009),68692,


In [None]:
# (rows, columns) of the scraped table.
df.shape

(901, 12)

In [None]:
# List the distinct certificate labels present before consolidation.
df.certificate.unique()

array(['UA', 'A', '15+', 'U', '7', '18', 'G', None, 'R', 'PG', 'M/PG',
       '16', 'Not Rated', 'Unrated', 'PG-13', 'UA 16+', '(Banned)', 'U/A',
       '13'], dtype=object)

Certificate mapping applied in the next cell:

- 15+, 16, PG-13, 13, UA 16+, U/A → UA
- 7, G, PG, M/PG → U
- 18, R, Not Rated, (Banned), Unrated → A

In [None]:
# Collapse the scraped certificate labels into three buckets: 'UA', 'U' and 'A'.
# Buckets are applied in the order listed; labels not listed (and None) are left untouched.
certificate_groups = {
    'UA': ['15+', '16', 'PG-13', '13', 'UA 16+', 'U/A'],
    'U': ['7', 'G', 'PG', 'M/PG'],
    'A': ['18', 'R', 'Not Rated', '(Banned)', 'Unrated'],
}
for bucket, labels in certificate_groups.items():
    df['certificate'] = df['certificate'].replace(labels, bucket)


In [None]:
# Check which certificate labels remain after the replacements.
df.certificate.unique()

array(['UA', 'A', 'U', None], dtype=object)

In [None]:
# Collapse each film's list of director names into one display string.
# - Rows scraped without a director hold None; they are left missing instead of
#   letting str.join raise TypeError.
# - Only lists are joined, so re-running this cell does not split an already
#   joined name into single characters.
# - Names are separated by ', ' so co-directors stay distinguishable.
df['director'] = df['director'].apply(
    lambda names: ', '.join(names) if isinstance(names, list) else names
)

In [None]:
# Expand each row's list of star names into separate columns (0, 1, 2, ...).
df1 = df.stars.apply(pd.Series)
# Display the expanded table.
df1

Unnamed: 0,0,1,2,3
0,Jim Carrey,Kate Winslet,Tom Wilkinson,Gerry Robert Byrne
1,Al Pacino,Michelle Pfeiffer,Steven Bauer,Mary Elizabeth Mastrantonio
2,Harvey Keitel,Tim Roth,Michael Madsen,Chris Penn
3,Malcolm McDowell,Patrick Magee,Michael Bates,Warren Clarke
4,Keir Dullea,Gary Lockwood,William Sylvester,Daniel Richter
...,...,...,...,...
896,Pierre Brasseur,Alida Valli,Juliette Mayniel,Alexandre Rignault
897,Jack Lemmon,Walter Matthau,John Fiedler,Herb Edelman
898,Geoffrey Rush,Armin Mueller-Stahl,Justin Braine,Sonia Todd
899,Claude Rains,Gloria Stuart,William Harrigan,Henry Travers


In [None]:
# Copy the first four expanded star columns back onto the main table as
# 'stars 1' .. 'stars 4', then remove the original list-valued column.
for position in range(4):
    df[f'stars {position + 1}'] = df1[position]
df = df.drop(columns='stars')

In [None]:
# The title link is not needed further on; remove the column.
df = df.drop(columns=['url'])

In [None]:
# Preview the first rows after the column changes so far.
df.head()

Unnamed: 0,title,director,certificate,runtime,genre,imdb_rating,metascore,Year,votes,gross,stars 1,stars 2,stars 3,stars 4
0,Eternal Sunshine of the Spotless Mind,Michel Gondry,UA,108 min,"Drama, Romance, Sci-Fi",8.3,89,(2004),1019399,$34.40M,Jim Carrey,Kate Winslet,Tom Wilkinson,Gerry Robert Byrne
1,Scarface,Brian De Palma,A,170 min,"Crime, Drama",8.3,65,(1983),853108,$45.60M,Al Pacino,Michelle Pfeiffer,Steven Bauer,Mary Elizabeth Mastrantonio
2,Reservoir Dogs,Quentin Tarantino,UA,99 min,"Crime, Thriller",8.3,79,(1992),1030252,$2.83M,Harvey Keitel,Tim Roth,Michael Madsen,Chris Penn
3,A Clockwork Orange,Stanley Kubrick,A,136 min,"Crime, Sci-Fi",8.3,77,(1971),838644,$6.21M,Malcolm McDowell,Patrick Magee,Michael Bates,Warren Clarke
4,2001: A Space Odyssey,Stanley Kubrick,U,149 min,"Adventure, Sci-Fi",8.3,84,(1968),676754,$56.95M,Keir Dullea,Gary Lockwood,William Sylvester,Daniel Richter


In [None]:
def run(col):
    """Strip the ``$`` prefix and ``M`` suffix from gross strings.

    Parameters
    ----------
    col : iterable
        Raw gross values such as ``'$34.40M'``; ``''`` marks a missing value.

    Returns
    -------
    list of str
        The text between ``$`` and ``M`` for each entry, in input order
        (``'$34.40M'`` -> ``'34.40'``). Any entry that is not a string of the
        form ``$...M`` maps to ``''``.
    """
    # Trimming the first and last character by position turns an unrelated
    # value such as '#135' into '13', which then looks like a valid gross.
    # Only values that really carry the '$...M' wrapper are unwrapped.
    return [
        value[1:-1]
        if isinstance(value, str) and len(value) > 2 and value.startswith('$') and value.endswith('M')
        else ''
        for value in col
    ]
# Clean the gross column with run(); the result is kept in `temp`.
temp = run(df.gross)
# Show a short preview instead of printing every entry of the list.
temp[:10]


['34.40', '45.60', '2.83', '6.21', '56.95', '0.69', '46.36', '191.80', '309.13', '293.00', '159.60', '5.32', '44.82', '33.23', '6.86', '415.00', '', '', '13', '1.59', '3.20', '8.82', '1.24', '11', '13.28', '18.60', '7.10', '0.03', '0.06', '5.72', '1.22', '', '12.39', '0.33', '', '6.60', '5.45', '0.54', '0.90', '', '', '804.75', '116.90', '128.01', '402.45', '28.26', '74.28', '85.08', '159.23', '293.51', '125.62', '206.85', '30.33', '13', '40.22', '70.10', '42.44', '83.01', '197.17', '4.71', '13.78', '356.46', '170.74', '37.63', '198.68', '380.84', '3.90', '13.66', '70.51', '64.62', '5.51', '101.16', '8.49', '25.00', '1.23', '6.39', '15', '23.38', '12.10', '47.70', '4.14', '44.91', '20.19', '', '', '15.00', '', '17', '0.01', '0.74', '0.10', '0.01', '5.01', '1.37', '14', '20', '', '0.93', '0.08', '1.22', '19', '0.02', '3.11', '22', '24', '', '23', '0.98', '4.19', '1.63', '0.07', '0.04', '381.01', '169.71', '61.00', '117.24', '167.77', '164.62', '59.10', '154.06', '54.51', '67.21', '52.29

In [None]:
# Remove the runtime column from the table.
df = df.drop(columns=['runtime'])

In [None]:
# Store the cleaned gross values (the list built by run()) in the gross column.
df['gross']=temp

In [None]:
def string(col):
    """Extract the four-digit year from year labels such as ``'(2004)'``.

    Parameters
    ----------
    col : iterable of str
        Year labels as scraped, e.g. ``'(2004)'``.

    Returns
    -------
    list of str
        The first run of four digits found in each label, in input order.
        Labels with extra text before the year (e.g. ``'(I) (2017)'``) are
        handled as well. When a label contains no four-digit run, the
        characters at positions 1-4 are returned, as the fixed slice did.
    """
    import re  # local import: `re` is not imported in the notebook's import cell

    year_pattern = re.compile(r'[0-9]{4}')
    list1 = []
    for label in col:
        found = year_pattern.search(label)
        # A fixed [1:5] slice would return 'I) (' for '(I) (2017)'.
        list1.append(found.group() if found else label[1:5])
    return list1
# Apply string() to the Year column and keep the resulting list in `temp1`.
temp1 = string(df.Year)

In [None]:
# Remove the raw Year column from the table.
df = df.drop(columns=['Year'])

In [None]:
# Store the year strings held in `temp1` as the Year column.
df['Year'] = temp1

In [None]:
# Preview the first rows of the table after the gross and Year changes.
df.head()

Unnamed: 0,title,director,certificate,genre,imdb_rating,metascore,votes,stars 1,stars 2,stars 3,stars 4,gross,Year
0,Eternal Sunshine of the Spotless Mind,Michel Gondry,UA,"Drama, Romance, Sci-Fi",8.3,89,1019399,Jim Carrey,Kate Winslet,Tom Wilkinson,Gerry Robert Byrne,34.4,2004
1,Scarface,Brian De Palma,A,"Crime, Drama",8.3,65,853108,Al Pacino,Michelle Pfeiffer,Steven Bauer,Mary Elizabeth Mastrantonio,45.6,1983
2,Reservoir Dogs,Quentin Tarantino,UA,"Crime, Thriller",8.3,79,1030252,Harvey Keitel,Tim Roth,Michael Madsen,Chris Penn,2.83,1992
3,A Clockwork Orange,Stanley Kubrick,A,"Crime, Sci-Fi",8.3,77,838644,Malcolm McDowell,Patrick Magee,Michael Bates,Warren Clarke,6.21,1971
4,2001: A Space Odyssey,Stanley Kubrick,U,"Adventure, Sci-Fi",8.3,84,676754,Keir Dullea,Gary Lockwood,William Sylvester,Daniel Richter,56.95,1968


In [None]:
# Load the movie table from a CSV file in the working directory.
# NOTE(review): no visible cell writes 'imbd1.csv', so a fresh Restart & Run All
# depends on this file already existing - confirm where it is produced.
# This also rebinds `data`, which earlier held the list of scraped records.
data = pd.read_csv('imbd1.csv')

In [None]:
# Store Year as integers.
data['Year'] = data['Year'].astype(int)

In [None]:
# Show the dtype of every column in the loaded table.
data.dtypes

title           object
director        object
certificate     object
genre           object
imdb_rating    float64
metascore        int64
votes            int64
stars 1         object
stars 2         object
stars 3         object
stars 4         object
runtime        float64
Year             int64
dtype: object

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each distinct genre string with an integer label.
# `encoder` is kept so the learned classes stay available after this cell.
column_to_encode = 'genre'
encoder = LabelEncoder()
data[column_to_encode] = encoder.fit_transform(data[column_to_encode])

In [None]:
# Display the genre column after label encoding.
data['genre']

0       36
1      157
2      184
3      111
4      165
      ... 
896    181
897    129
898    166
899    129
900      3
Name: genre, Length: 901, dtype: int32