## Below is an explanation of the whole source code. You may directly skip to the last code cell if you wish to execute all of the code at once.

In [1]:
import builtins

import requests
from bs4 import BeautifulSoup

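### The next cell stores the user's answer in a variable named `input`, which shadows Python's built-in `input()` function for the rest of the kernel session. If a later call to `input()` fails with a `TypeError`, restart the Jupyter kernel before taking user input again.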

In [2]:
# Read the title to search for. The answer is stored under the name `input`
# because the next cell reads it under that name. Calling the built-in through
# `builtins` keeps this cell re-runnable even after the plain name `input`
# has been rebound to a string in this kernel.
input = builtins.input("Enter your value: ")

Enter your value: Game Of Thrones


In [3]:
# Query IMDb's title search (s=tt) with the text the user entered.
# Passing the values through `params` lets requests percent-encode spaces,
# '&', '#', etc. instead of splicing raw text (plus a stray trailing space)
# into the URL.
response = requests.get(
    'http://www.imdb.com/find',
    params={'ref_': 'nv_sr_fn', 'q': input, 's': 'tt'},
    timeout=30,  # fail instead of hanging forever on a stalled connection
)
# Surface HTTP errors here rather than as a confusing parse failure later.
response.raise_for_status()

### You can use the approach below if you don't want to take user input, i.e. if you only want to search for one fixed movie. In that case, replace 'Stranger Things' with whatever series/movie you want.

In [4]:
#response = requests.get('http://www.imdb.com/find?ref_=nv_sr_fn&q= Stranger Things ! &s=tt')


In [5]:
# Parse the search-results page with the lxml parser.
soup = BeautifulSoup(response.content, 'lxml')
# The results sit in the table with class "findList". Only the first row
# (the top search hit) is used; the href of its first link is the path of
# the title page.
table = soup.find('table', class_='findList')
first_hit_link = table.tr.a
movieid = first_hit_link['href']
movieid

'/title/tt0944947/'

#### The 1st soup object is used to extract the movie id from the IMDb search page. The 2nd soup object is used to extract data from the IMDb title page whose URL is built from that movieid.

In [6]:
# Build the title-page URL from the site root and the relative path in movieid.
movielink = "http://www.imdb.com" + movieid
# Fetch the page source. The timeout avoids hanging on a stalled connection,
# and raise_for_status() stops here on an HTTP error instead of parsing an
# error page and failing in a later cell.
moviepage = requests.get(movielink, timeout=30)
moviepage.raise_for_status()
# A second soup object holds the parsed title page; the following cells all
# read their data from soup2.
soup2 = BeautifulSoup(moviepage.content, 'lxml')


In [7]:
# Title
# The <h1> text contains non-breaking spaces (\xa0, i.e. &nbsp;) and padding
# whitespace: drop the former first, then trim the latter.
uncleaned = soup2.find('div', class_ = 'title_wrapper').h1.text.replace('\xa0','')
title = uncleaned.strip()
title

'Game of Thrones'

## Some values, such as imdbrating, votes and metascore, may not be present on the actual IMDb page. I have therefore used try/except to handle such cases and print an appropriate message at the end. A counter is initialized to 1 and is set to zero in the except branch, indicating that the value does not occur on the IMDb page.

In [8]:
#imdb rating
# imdbcounter stays 1 when a rating was parsed and is set to 0 otherwise.
imdbcounter = 1
# Bind the name up front so it exists even when the page shows no rating.
imdb_rating = None
try:
    imdb_rating = float(soup2.select('.ratingValue span')[0].text)
except (IndexError, ValueError):
    # IndexError: the selector matched nothing.
    # ValueError: the matched text is not a number.
    # Anything else (e.g. soup2 not defined yet) is a real error and is
    # allowed to surface instead of being reported as "no rating".
    imdbcounter = 0
if imdbcounter != 0:
    print(imdb_rating)
else:
    print("imdb rating is not given on imdb page")

9.3


In [9]:
#metascore
# TV series have no Metascore block, so the lookup may fail. Any failure, or
# a score that parses to 0, leaves metascore as None.
try:
    score_text = soup2.select('.metacriticScore span')[0].text
    metascore = float(score_text) or None
except Exception:
    metascore = None
metascore

#### If you see cast details on imdb page, an inspect tells us that they are stored in two classes-odd and even. Thus we perform extracting of casts having class="odd" and then those having class="even". The results are stored in separate lists and then concatenated together to form final list of casts

In [10]:
# Extracting cast values
def _cast_names(rows):
    """Return the stripped link text found in each of the given table rows.

    For every row, the first <td> that has no class attribute is inspected and
    the text of its first <a> is collected. Rows that lack such a cell, lack a
    link inside it, or whose link has no single text string are skipped
    instead of raising AttributeError.
    """
    names = []
    for row in rows:
        cell = row.find('td', attrs={'class': None})
        link = cell.a if cell is not None else None
        if link is None or link.string is None:
            continue
        names.append(link.string.strip())
    return names


# Rows with class="odd" and class="even" are collected separately; the final
# list is the odd-row names followed by the even-row names.
oddnames = _cast_names(soup2.find_all('tr', class_='odd'))
evennames = _cast_names(soup2.find_all('tr', class_='even'))

cast = oddnames + evennames
cast

['Peter Dinklage',
 'Emilia Clarke',
 'Sophie Turner',
 'Nikolaj Coster-Waldau',
 'John Bradley',
 'Conleth Hill',
 'Gwendoline Christie',
 'Isaac Hempstead Wright',
 'Nathalie Emmanuel',
 'Daniel Portman',
 'Ben Crompton',
 'Julian Glover',
 'Charles Dance',
 'Lena Headey',
 'Kit Harington',
 'Maisie Williams',
 'Iain Glen',
 'Alfie Allen',
 'Liam Cunningham',
 'Aidan Gillen',
 'Rory McCann',
 'Jerome Flynn',
 'Jacob Anderson',
 'Kristofer Hivju',
 'Carice van Houten',
 'Hannah Murray']

#### Director details in imdb page are given as Director/Directors/Creator/Creators. Some shows have used the term creators whereas some have used director. Thus if we encounter any such word-> We extract the values

In [11]:
# Directors
# Only the first "credit_summary_item" block is inspected. Its links are
# collected when its <h4> heading is one of the director/creator labels.
directors = []
director_container = soup2.find('div', class_='credit_summary_item')
# Look the heading up once; either lookup may return None on pages without
# a credits block, in which case directors stays empty.
heading = director_container.find('h4') if director_container is not None else None

if heading is not None and heading.string in ('Director:', 'Creator:', 'Creators:', 'Directors:'):
    directors = [link.string for link in director_container.find_all('a')]

directors
    



['David Benioff', 'D.B. Weiss']

### Now we find the country of production for the movie. For some movies, there may be more than one country. Thus we append all countries in a list

In [12]:
#country
# Every link inside the #titleDetails block whose href mentions
# 'country_of_origin' contributes its text to the list.
country = []
titledetails = soup2.find('div', id='titleDetails')
titleheading = titledetails.find_all('a')
for detail_link in titleheading:
    if 'country_of_origin' in detail_link['href']:
        country.append(detail_link.string)
country

['USA', 'UK']

### Language tells the languages, the movie is available in. There may be more than one language for a movie( due to dubbing), thus we store the results in a list

In [13]:
#language
# Same scan as for countries, but keeping the links whose href mentions
# 'primary_language'.
languages = []
titledetails2 = soup2.find('div', id='titleDetails')
titleheading2 = titledetails2.find_all('a')
for detail_link in titleheading2:
    if 'primary_language' in detail_link['href']:
        languages.append(detail_link.string)
languages

['English']

###  Below is a way to find according to an attribute in tag-> consider that as dictionary

### The number of votes on an IMDb page is written in the form '250,000'. This can't be treated as an integer value until we replace ',' with ''. Also, some movies may not have votes mentioned on the IMDb page; hence the try/except block.

In [14]:
#Total number of votes
# votecounter is 1 while the vote count is usable and 0 once parsing fails.
votecounter = 1
try:
    # The count is the text of the <span itemprop="ratingCount"> element.
    votes = soup2.find('span', attrs={'itemprop': 'ratingCount'}).string
    # Strip the thousands separators before converting to an integer.
    votes = votes.replace(',', "")
    votes = int(votes)
except Exception:
    votecounter = 0
if votecounter == 0:
    print("Number of votes are not given on imdb page")
else:
    print(votes)


1685619


#### next_sibling is used to get the next element at the same level of the tree. You can't find the parent or the children of a node through this attribute, but you can find its siblings. next_element works in a similar way, except that it returns whatever was parsed immediately after this tag. It may or may not be the same as next_sibling, depending on the next parsed element.

In [15]:
# release date
# The date text is the node that directly follows the "Release Date:" <h4>
# inside a .txt-block element.
dateheading = soup2.select('.txt-block h4')
for heading in dateheading:
    if heading.string == 'Release Date:':
        print(heading.next_sibling.strip())
        


22 August 2011 (India)


#### A movie or a show can belong to multiple genres-> Hence the list 

In [16]:
# genres
# Genre names are the texts of the links in the title's subtext line whose
# href contains 'genres'.
wrapper = soup2.select_one('.title_wrapper .subtext')
links = wrapper.find_all('a')
genres = [link.string for link in links if 'genres' in link['href']]

genres

['Action', 'Adventure', 'Drama']

### Two concepts of duration are used-> The code just below tells run time of a movie or avg run time of an episode in a tv show. Whereas the code below this cell, tells the run time of a movie or number of seasons in a tv show. So both of these cells have their separate usage

In [17]:
#duration-> This considers avg duration shown on imdb page for series
# The value is the text of the <time> element in the title's subtext line.
wrapper = soup2.select_one('.title_wrapper .subtext')
time_tag = wrapper.find('time')
duration = time_tag.string.strip()
duration

'57min'

#### On an IMDb page, a TV show always has "TV" as the first word of its release-info text. Thus we can use an if block which checks whether that text starts with "TV" or not. If it does, we extract the number of seasons; otherwise it is a movie and we extract its runtime. So this code behaves differently depending on the type: Movie or TV Show.

In [18]:
# Type, Duration/Seasons
# durncounter is 1 while a duration/season value is available and is set to 0
# once a lookup fails.
durncounter = 1
wrapper = soup2.select_one('.title_wrapper .subtext')
links = wrapper.find_all('a')
# Start from an empty string so a page without a release-info link is treated
# as a movie instead of raising NameError (or reusing a value left over from
# an earlier run in the same kernel).
value = ''
for link in links:
    if 'releaseinfo' in link.get('href', ''):
        value = (link.string or '').strip()
# NOTE: `type` shadows the built-in, but the certificate cell reads this name,
# so it is kept.
if value.startswith('TV '):
    type = 'TV Series'
    # The first link of the seasons navigation is used as the season count.
    season_links = soup2.select('.seasons-and-year-nav a')
    if season_links and season_links[0].string:
        seasons = season_links[0].string
        duration = seasons + " seasons"
    else:
        durncounter = 0
else:
    type = 'Movie'
    try:
        duration = wrapper.find('time').string.strip()
    except AttributeError:
        # No <time> element, or it has no single text string.
        durncounter = 0


if (type == 'Movie') and durncounter != 0:
    print("The duration is", duration)
elif (type == 'TV Series') and durncounter != 0:
    print("The number of seasons are ", duration)
else:
    print(" Duration is not given on imdb page")


The number of seasons are  8 seasons


#### On IMDB page, a storyline or short summary of the plot is given for movie/tv shows. However it may also happen that no such plot is given for a movie. Thus if our extracted storyline is not an empty string, we print it.

#### Also, some storylines have links (anchor tags) placed inside them. In that case a plain .string won't work, so we use get_text() instead.


In [20]:
# Storyline
# Some storylines contain links, so get_text() is used to gather the text of
# the whole span rather than a single .string.
story_node = soup2.select_one('#titleStoryLine div span')
# select_one returns None when the page has no storyline span; treat that the
# same as an empty description so the "not given" branch is reachable.
story = story_node.get_text().strip() if story_node is not None else ''
if story != '':
    print("\nHere is a description of the storyline - \n", story)
else:
    print("\nStory description is not given on imdb page")
    


Here is a description of the storyline - 
 In the mythical continent of Westeros, several powerful families fight for control of the Seven Kingdoms. As conflict erupts in the kingdoms of men, an ancient enemy rises once again to threaten them all. Meanwhile, the last heirs of a recently usurped dynasty plot to take back their homeland from across the Narrow Sea.


### certificate tells the certificate or censor rating given to a movie/tv show.

In [21]:
# Certificate
# The certificate is read from the node that comes right after the opening of
# the title's subtext element; an empty string means none is shown.
wrapper = soup2.select_one('.title_wrapper .subtext')
first_node = wrapper.next_element
certificate = first_node.strip()
if certificate == '':
    print("Certificate details are not given on the imdb page")
else:
    print("\nThe {} has been given {} certificate".format(type,certificate))



The TV Series has been given A certificate


### COMBINING ALL THE CODE -> You need to restart the kernel to enter the movie/tv series everytime you execute this block 

In [1]:
from bs4 import BeautifulSoup
import requests
import builtins

# Call the built-in through `builtins` and store the answer under a name that
# does not shadow it, so this cell can be re-run without restarting the kernel
# even if the plain name `input` has been rebound to a string.
query = builtins.input("Enter your value: ")

# Title search (s=tt). `params` makes requests percent-encode the query
# instead of splicing raw text into the URL; the timeout and status check
# turn network/HTTP problems into immediate, explicit errors.
response = requests.get(
    'http://www.imdb.com/find',
    params={'ref_': 'nv_sr_fn', 'q': query, 's': 'tt'},
    timeout=30,
)
response.raise_for_status()

# Take the first link of the first row of the "findList" results table.
soup = BeautifulSoup(response.content, 'lxml')
table = soup.find('table', class_='findList')
if table is None or table.tr is None or table.tr.a is None:
    # Without this check a search with no hits fails with an opaque
    # AttributeError on None.
    raise LookupError("No IMDb title results found for {!r}".format(query))
movieid = table.tr.a['href']

# Fetch and parse the title page; every section below reads from soup2.
movielink = "http://www.imdb.com" + movieid
moviepage = requests.get(movielink, timeout=30)
moviepage.raise_for_status()
soup2 = BeautifulSoup(moviepage.content, 'lxml')


#title of movie/show
# Remove the non-breaking spaces (\xa0) from the <h1> text, then trim the
# surrounding whitespace.
uncleaned = soup2.find('div', class_ = 'title_wrapper').h1.text.replace('\xa0','')
title = uncleaned.strip()
print("Title is - {}".format(title))

# Type, Duration/Seasons
# durncounter is 1 while a duration/season value is available and is set to 0
# once a lookup fails.
durncounter = 1
wrapper = soup2.select_one('.title_wrapper .subtext')
links = wrapper.find_all('a')
# Start from an empty string so a page without a release-info link is treated
# as a movie instead of raising NameError (or reusing a value left over from
# an earlier run in the same kernel).
value = ''
for link in links:
    if 'releaseinfo' in link.get('href', ''):
        value = (link.string or '').strip()
# NOTE: `type` shadows the built-in, but the sections below read this name in
# their messages, so it is kept.
if value.startswith('TV '):
    type = 'TV Series'
    # The first link of the seasons navigation is used as the season count.
    season_links = soup2.select('.seasons-and-year-nav a')
    if season_links and season_links[0].string:
        seasons = season_links[0].string
        duration = seasons + " seasons"
    else:
        durncounter = 0
else:
    type = 'Movie'
    try:
        duration = wrapper.find('time').string.strip()
    except AttributeError:
        # No <time> element, or it has no single text string.
        durncounter = 0


if (type == 'Movie') and durncounter != 0:
    print("\nThe duration is", duration)
elif (type == 'TV Series') and durncounter != 0:
    print("\nThe number of seasons are ", duration)
else:
    print("\nDuration is not given on imdb page")

#imdb rating
# imdbcounter is 1 when a rating was parsed and 0 when any step failed.
imdbcounter = 1
try:
    rating_text = soup2.select('.ratingValue span')[0].text
    imdb_rating = float(rating_text)
except Exception:
    imdbcounter = 0
if imdbcounter == 0:
    print("\nimdb rating is not given on imdb page")
else:
    print("\nThe imdb rating of the {} is = {}".format(type,imdb_rating))


#Total number of votes
# votecounter is 1 while the vote count is usable and 0 once parsing fails.
votecounter = 1
try:
    # The count is the text of the <span itemprop="ratingCount"> element.
    votes = soup2.find('span', attrs={'itemprop': 'ratingCount'}).string
    # Strip the thousands separators before converting to an integer.
    votes = votes.replace(',', "")
    votes = int(votes)
except Exception:
    votecounter = 0
if votecounter == 0:
    print("\nNumber of votes are not given on imdb page")
else:
    print("\nThe imdb rating is calculating on the basis of {} mumber of votes".format( votes))

#metascore
# Any failure while locating or parsing the score is reported as "not
# available"; a score that parses to 0 is stored as None.
try:
    score_text = soup2.select('.metacriticScore span')[0].text
    metascore = float(score_text) or None
    print("\nMetascore - ",metascore)
except Exception:
    metascore = None
    print("\nMetascore is not available in imdb site")

# cast values
# Rows with class="odd" and class="even" are scanned by one shared loop. For
# each row, the first <td> without a class attribute is inspected and the text
# of its first link is collected. The final list is the odd-row names followed
# by the even-row names.
oddnames = []
evennames = []
for row_class, names in (('odd', oddnames), ('even', evennames)):
    for row in soup2.find_all('tr', class_=row_class):
        cell = row.find('td', attrs={'class': None})
        link = cell.a if cell is not None else None
        # Skip rows with no such cell, no link, or a link without a single
        # text string, instead of raising AttributeError.
        if link is None or link.string is None:
            continue
        names.append(link.string.strip())

cast = oddnames + evennames
print("\nCast is as follows",cast)

# Extracting directors
# Only the first "credit_summary_item" block is inspected. Its links are
# collected when its <h4> heading is one of the director/creator labels.
directors = []
director_container = soup2.find('div', class_='credit_summary_item')
# Look the heading up once; either lookup may return None on pages without
# a credits block, which is reported as missing information.
heading = director_container.find('h4') if director_container is not None else None

if heading is not None and heading.string in ('Director:', 'Creator:', 'Creators:', 'Directors:'):
    directors = [link.string for link in director_container.find_all('a')]
    print("\nDirectors are", directors)
else:
    print("\nDirector information not given in imdb site")

# Country of production
# Every link inside the #titleDetails block whose href mentions
# 'country_of_origin' contributes its text to the list.
country = []
titledetails = soup2.find('div', id='titleDetails')
titleheading = titledetails.find_all('a')
for detail_link in titleheading:
    if 'country_of_origin' in detail_link['href']:
        country.append(detail_link.string)
print("\nProduction Countries-", country)
        
# Language
# Same scan as for countries, but keeping the links whose href mentions
# 'primary_language'.
languages = []
titledetails2 = soup2.find('div', id='titleDetails')
titleheading2 = titledetails2.find_all('a')
for detail_link in titleheading2:
    if 'primary_language' in detail_link['href']:
        languages.append(detail_link.string)
print("\n{} is available in Languages- {}".format(type, languages))

# release date
# The date text is the node that directly follows the "Release Date:" <h4>
# inside a .txt-block element.
# Bind the name first so a page without that heading produces a message
# instead of a NameError at the print below.
releasedate = None
dateheading = soup2.select('.txt-block h4')
for heading in dateheading:
    if heading.string == 'Release Date:':
        releasedate = heading.next_sibling.strip()
if releasedate is not None:
    print("\n{} was released in- {}".format(type, releasedate))
else:
    print("\nRelease date is not given on imdb page")

        
# genres
# Genre names are the texts of the links in the title's subtext line whose
# href contains 'genres'.
wrapper = soup2.select_one('.title_wrapper .subtext')
links = wrapper.find_all('a')
genres = [link.string for link in links if 'genres' in link['href']]
print("\nFollowing are the genres of the {} - {}".format(type,genres))

# Certificate
# The certificate is read from the node that comes right after the opening of
# the title's subtext element; an empty string means none is shown.
wrapper = soup2.select_one('.title_wrapper .subtext')
first_node = wrapper.next_element
certificate = first_node.strip()
if certificate == '':
    print("\nCertificate details are not given on the imdb page")
else:
    print("\nThe {} has been given {} certificate".format(type,certificate))
    
# Storyline
# Some storylines contain links, so get_text() is used to gather the text of
# the whole span rather than a single .string.
story_node = soup2.select_one('#titleStoryLine div span')
# select_one returns None when the page has no storyline span; treat that the
# same as an empty description so the "not given" branch is reachable.
story = story_node.get_text().strip() if story_node is not None else ''
if story != '':
    print("\nHere is a description of the storyline - \n", story)
else:
    print("\nStory description is not given on imdb page")
    

Enter your value: Game Of Thrones
Title is - Game of Thrones

The number of seasons are  8 seasons

The imdb rating of the TV Series is = 9.3

The imdb rating is calculating on the basis of 1685619 mumber of votes

Metascore is not available in imdb site

Cast is as follows ['Peter Dinklage', 'Emilia Clarke', 'Sophie Turner', 'Nikolaj Coster-Waldau', 'John Bradley', 'Conleth Hill', 'Gwendoline Christie', 'Isaac Hempstead Wright', 'Nathalie Emmanuel', 'Daniel Portman', 'Ben Crompton', 'Julian Glover', 'Charles Dance', 'Lena Headey', 'Kit Harington', 'Maisie Williams', 'Iain Glen', 'Alfie Allen', 'Liam Cunningham', 'Aidan Gillen', 'Rory McCann', 'Jerome Flynn', 'Jacob Anderson', 'Kristofer Hivju', 'Carice van Houten', 'Hannah Murray']

Directors are ['David Benioff', 'D.B. Weiss']

Production Countries- ['USA', 'UK']

TV Series is available in Languages- ['English']

TV Series was released in- 22 August 2011 (India)

Following are the genres of the TV Series - ['Action', 'Adventure', 'Dr