## Crawl popular websites & create a database of indian movie celebrities with their images and personality traits

## Reading the web page into Python


In [4]:

import re
import csv
import requests
from bs4 import BeautifulSoup
url=('https://www.imdb.com/list/ls068010962/')

In [5]:
resp = requests.get(url)

## Parsing the HTML using Beautiful Soup



In [6]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(resp.text, 'html.parser')

In [7]:
results = soup.find_all('div', attrs={'class':'lister-item-content'})

In [8]:
len(results)

100

In [9]:
results[0:3]

[<div class="lister-item-content">
 <h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">1. </span>
 <a href="/name/nm0451148"> Aamir Khan
 </a> </h3>
 <p class="text-muted text-small">
                         Actor <span class="ghost">|</span>
 <a href="/title/tt0986264/"> Taare Zameen Par
 </a> </p>
 <p>
     Aamir is no doubt one of the most dedicated actors in this world. With his recent success in India and China markets combined he has got the title of World's Biggest Superstar. He went through rapid transformations in his body structure for his films to bring out the reality factor in his ...                </p>
 </div>, <div class="lister-item-content">
 <h3 class="lister-item-header">
 <span class="lister-item-index unbold text-primary">2. </span>
 <a href="/name/nm0474774"> Akshay Kumar
 </a> </h3>
 <p class="text-muted text-small">
                         Actor <span class="ghost">|</span>
 <a href="/title/tt0242519/"> Hera Pheri
 </a> </p>


## Extracting the date



In [10]:
first_result = results[0]
first_result

<div class="lister-item-content">
<h3 class="lister-item-header">
<span class="lister-item-index unbold text-primary">1. </span>
<a href="/name/nm0451148"> Aamir Khan
</a> </h3>
<p class="text-muted text-small">
                        Actor <span class="ghost">|</span>
<a href="/title/tt0986264/"> Taare Zameen Par
</a> </p>
<p>
    Aamir is no doubt one of the most dedicated actors in this world. With his recent success in India and China markets combined he has got the title of World's Biggest Superstar. He went through rapid transformations in his body structure for his films to bring out the reality factor in his ...                </p>
</div>

In [11]:
first_result.find('a')

<a href="/name/nm0451148"> Aamir Khan
</a>

In [12]:
first_result.find('a').text

' Aamir Khan\n'

In [13]:
first_result.find('a').text[0:-1]

' Aamir Khan'

## Extracting the lmage url


In [14]:
result2=soup.find('img', {'src':re.compile('.jpg')})['src']

## Extracting the Actor/actress

In [15]:
#first_result.find('p',attrs={'class':'text-muted text-small',}).text.strip()[0:5]


In [16]:
first_result.find('p').text[1:-1].strip()[0:7].replace("|"," ")

'Actor  '

In [17]:
first_result.find('p').text[1:-1].strip()[9:]

'Taare Zameen Par'

## Building the dataset

Now that we've figured out how to extract the four components of `first_result`, we can **create a loop to repeat this process** on all 116 `results`. We'll store the output in a **list of tuples** called `records`:

In [18]:
records = []
for result in results:
    Name = result.find('a').text[0:-1]
    Image_url = soup.find('img', {'src':re.compile('.jpg')})['src']
    celebrity =result.find('p').text[1:-1].strip()[0:7].replace("|"," ")
    movie = result.find('p').text[1:-1].strip()[9:]
    records.append((Name, Image_url, celebrity, movie))

In [19]:
len(records)

100

In [20]:
records[0:3]

[(' Aamir Khan',
  'https://m.media-amazon.com/images/M/MV5BMjAwMjk3NDUzN15BMl5BanBnXkFtZTcwNjI4MTY0NA@@._V1_UX140_CR0,0,140,209_AL_.jpg',
  'Actor  ',
  'Taare Zameen Par'),
 (' Akshay Kumar',
  'https://m.media-amazon.com/images/M/MV5BMjAwMjk3NDUzN15BMl5BanBnXkFtZTcwNjI4MTY0NA@@._V1_UX140_CR0,0,140,209_AL_.jpg',
  'Actor  ',
  'Hera Pheri'),
 (' Ajay Devgn',
  'https://m.media-amazon.com/images/M/MV5BMjAwMjk3NDUzN15BMl5BanBnXkFtZTcwNjI4MTY0NA@@._V1_UX140_CR0,0,140,209_AL_.jpg',
  'Actor  ',
  'Shivaay')]

## Applying a tabular data structure


In [23]:
import pandas as pd
df = pd.DataFrame(records, columns=[Name, Image_url, celebrity, movie])

In [24]:
df.head()

Unnamed: 0,Navin Nischol,"https://m.media-amazon.com/images/M/MV5BMjAwMjk3NDUzN15BMl5BanBnXkFtZTcwNjI4MTY0NA@@._V1_UX140_CR0,0,140,209_AL_.jpg",Actor,Zorro
0,Aamir Khan,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actor,Taare Zameen Par
1,Akshay Kumar,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actor,Hera Pheri
2,Ajay Devgn,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actor,Shivaay
3,Amjad Khan,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actor,Sholay
4,Amitabh Bachchan,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actor,Black


In [25]:
df.loc[0]

 Navin Nischol                                                                                                                                                 Aamir Khan
https://m.media-amazon.com/images/M/MV5BMjAwMjk3NDUzN15BMl5BanBnXkFtZTcwNjI4MTY0NA@@._V1_UX140_CR0,0,140,209_AL_.jpg    https://m.media-amazon.com/images/M/MV5BMjAwMj...
Actor                                                                                                                                                             Actor  
Zorro                                                                                                                                                    Taare Zameen Par
Name: 0, dtype: object

In [26]:
df=df.rename(columns={" Navin Nischol":"Name","https://m.media-amazon.com/images/M/MV5BMjAwMjk3NDUzN15BMl5BanBnXkFtZTcwNjI4MTY0NA@@._V1_UX140_CR0,0,140,209_AL_.jpg":"Image_url","Zorro":"Movie Name"})

In [27]:
df.head()

Unnamed: 0,Name,Image_url,Actor,Movie Name
0,Aamir Khan,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actor,Taare Zameen Par
1,Akshay Kumar,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actor,Hera Pheri
2,Ajay Devgn,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actor,Shivaay
3,Amjad Khan,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actor,Sholay
4,Amitabh Bachchan,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actor,Black


In [28]:
df.tail()

Unnamed: 0,Name,Image_url,Actor,Movie Name
95,Nargis,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actress,\n Raat Aur Din
96,Nirupa Roy,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actress,\n Chhaya
97,Nutan,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actress,\n Seema
98,Neeraj Kabi,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actor,Ship of Theseus
99,Navin Nischol,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actor,Zorro


## Exporting the dataset to a CSV file


In [29]:
df.to_csv('FinalOutput.csv', index=False, encoding='utf-8')

In [30]:
df = pd.read_csv('FinalOutput.csv')
df.head()

Unnamed: 0,Name,Image_url,Actor,Movie Name
0,Aamir Khan,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actor,Taare Zameen Par
1,Akshay Kumar,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actor,Hera Pheri
2,Ajay Devgn,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actor,Shivaay
3,Amjad Khan,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actor,Sholay
4,Amitabh Bachchan,https://m.media-amazon.com/images/M/MV5BMjAwMj...,Actor,Black
