# Solution Key

In [2]:
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

### Problem 1: Write a Python program to display all the header tags from 'en.wikipedia.org/wiki/Main_Page'

In [81]:
# Fetch the Wikipedia main page.
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports an HTTP error here instead of letting an error page be
# parsed further down as if it were the article.
res = requests.get('https://en.wikipedia.org/wiki/Main_Page', timeout=30)
res.raise_for_status()


In [3]:
# Inspecting the page shows several headings with no common text to search for, so each
# kind of header (<title>, <h1>, <h2>, <h3>) is located separately.
# One <h2> and one <h3> have no class of their own, so they are reached through a
# neighbouring tag that does have an attribute (<div id="mw-navigation"> and <label>).

# Name the parser explicitly: without it BeautifulSoup picks whichever parser happens to be
# installed (and emits a warning), so the parse tree could differ between machines.
soup = BeautifulSoup(res.text, 'html.parser')

title = soup.find('title')

# The <h1> is located through its 'firstHeading' class
heading_1 = soup.find(class_='firstHeading')

# <h2> tags that share the 'mp-h2' class
heading_2 = soup.find_all(class_="mp-h2")
# <div> that wraps the <h2> without a class; the <h2> itself is read when the list is built
heading_2_no_class = soup.find('div', id='mw-navigation')

# <h3> tags that share the 'vector-menu-heading' class
heading_3 = soup.find_all(class_='vector-menu-heading')
# First <label> on the page, used to reach the heading text that has no class
heading_3_no_class = soup.find('label')

In [4]:
# Gather every heading text into one list, in page order.
heading_list = [
    # The page title, 'Wikipedia, the free encyclopedia'
    title.text,
    # The single <h1> entry
    heading_1.text,
    # Every <h2> that carries a class
    *(tag.text for tag in heading_2),
    # The <h2> without a class, reached through its enclosing <div>
    heading_2_no_class.h2.get_text(),
    # Every <h3> that carries a class
    *(tag.text for tag in heading_3),
    # The <h3> entry without a class, read from the neighbouring <label>
    heading_3_no_class.get_text(),
]

In [5]:
# Show the raw headings. Some entries still carry leading spaces, trailing newlines,
# extra 'expanded'/'collapsed' lines or a non-breaking space ('\xa0'); the next cell removes them.
heading_list

['Wikipedia, the free encyclopedia',
 'Main Page',
 "From today's featured article",
 'Did you know\xa0...',
 'In the news',
 'On this day',
 "From today's featured list",
 "Today's featured picture",
 'Other areas of Wikipedia',
 "Wikipedia's sister projects",
 'Wikipedia languages',
 'Navigation menu',
 ' Personal tools\n',
 ' Namespaces\n',
 ' Variants\nexpanded\ncollapsed\n',
 ' Views\n',
 ' More\nexpanded\ncollapsed\n',
 ' Navigation\n',
 ' Contribute\n',
 ' Tools\n',
 ' Print/export\n',
 ' In other projects\n',
 ' Languages\n',
 'Search']

In [6]:
# Clean every raw heading: trim surrounding whitespace, keep only the first line
# (some menu headings are followed by 'expanded'/'collapsed' lines) and drop the
# non-breaking space that appears in 'Did you know\xa0...'.
heading_list_final = []
for raw_heading in heading_list:
    # strip() removes leading/trailing whitespace, including the trailing newline
    heading = raw_heading.strip()

    # split('\n', 1) cuts at the first newline and [0] keeps the text before it
    heading = heading.split('\n', 1)[0]

    # Remove the non-breaking space unconditionally. The earlier guard
    # `if i.find('\xa0'):` did not test for containment: str.find returns -1 (truthy)
    # when the character is absent and 0 (falsy) when it is the first character.
    # replace() is already a no-op when there is nothing to replace.
    heading = heading.replace('\xa0', '')

    # Finally add the cleaned heading to the new list
    heading_list_final.append(heading)

In [7]:
# Display the cleaned headings
heading_list_final

['Wikipedia, the free encyclopedia',
 'Main Page',
 "From today's featured article",
 'Did you know...',
 'In the news',
 'On this day',
 "From today's featured list",
 "Today's featured picture",
 'Other areas of Wikipedia',
 "Wikipedia's sister projects",
 'Wikipedia languages',
 'Navigation menu',
 'Personal tools',
 'Namespaces',
 'Variants',
 'Views',
 'More',
 'Navigation',
 'Contribute',
 'Tools',
 'Print/export',
 'In other projects',
 'Languages',
 'Search']

In [16]:
# Put the cleaned headings into a one-column table and left-align both the cells and the
# column header. Note that after `.style` the name headers_df refers to a pandas Styler
# (a display object), not to a plain DataFrame.
headers_df = (
    pd.DataFrame(heading_list_final, columns=['Headers'])
    .style
    .set_properties(**{'text-align':'left'})
    .set_table_styles([dict(selector='th', props=[('text-align', 'left')])])
)
headers_df

Unnamed: 0,Headers
0,"Wikipedia, the free encyclopedia"
1,Main Page
2,From today's featured article
3,Did you know...
4,In the news
5,On this day
6,From today's featured list
7,Today's featured picture
8,Other areas of Wikipedia
9,Wikipedia's sister projects


### Problem 2: Write a python program to display IMDB’s Top rated 100 movies’ data.

In [86]:
# The top 100 is served as two result pages of 50 movies each, so both are fetched.
# timeout keeps the cell from hanging on a stalled connection; raise_for_status() stops
# here on an HTTP error, which would otherwise be parsed like a normal result page.
page = requests.get('https://www.imdb.com/search/title/?groups=top_100&sort=user_rating,desc', timeout=30)
page.raise_for_status()
# Naming the parser avoids depending on whichever parser happens to be installed
soup = BeautifulSoup(page.content, 'html.parser')


page_2 = requests.get('https://www.imdb.com/search/title/?groups=top_100&sort=user_rating,desc&start=51&ref_=adv_nxt', timeout=30)
page_2.raise_for_status()
soup_2 = BeautifulSoup(page_2.content, 'html.parser')

In [87]:
# Each movie title is the text of a link inside an <h3 class="lister-item-header">
storage = soup.find_all('h3', class_='lister-item-header')        # movies 1-50
storage_2 = soup_2.find_all('h3', class_='lister-item-header')    # movies 51-100

# Walk page 1 first and page 2 second so the ranking order is kept
name_list = [
    link.text
    for header in [*storage, *storage_2]
    for link in header.find_all('a')
]

len(name_list)

100

In [88]:
# The release year is shown in a <span> inside each movie's header
storage = soup.find_all('h3', class_='lister-item-header')
storage_2 = soup_2.find_all('h3', class_='lister-item-header')

# Page 1: the span text is normally '(2007)', but it can carry a prefix, as the page-2 entry
# displayed at the end of this cell shows. Cutting off the brackets and calling int() raises
# ValueError on such text, so the four-digit year is extracted instead.
year_list = [
    int(re.search(r'\d{4}', span.text).group())
    for header in storage
    for span in header.find_all('span', class_='lister-item-year text-muted unbold')
]

# Page 2: keep the raw text (brackets removed) in a separate list for now;
# it is converted to integers and added to year_list in the following cell
year_list_2 = [
    span.text[1:-1]
    for header in storage_2
    for span in header.find_all('span', class_='lister-item-year text-muted unbold')
]

# Example of an entry that cannot be passed to int() directly
year_list_2[10]

'I) (2017'

In [89]:
# Reduce every page-2 entry to its four-digit year instead of patching one hard-coded
# position. An entry such as 'I) (2017' becomes '2017' and a clean '2007' is left as is,
# so this keeps working when the ranking shifts or when more than one entry has a prefix,
# and it is safe to apply to values that were already cleaned.
year_list_2 = [re.search(r'\d{4}', raw_year).group() for raw_year in year_list_2]

# Convert every page-2 year to an integer and append it to the page-1 years.
# NOTE: this grows year_list in place, so re-run the previous cell before running this one again.
year_list.extend(int(year) for year in year_list_2)

len(year_list)

100

In [90]:
# The IMDB rating is the <strong> text inside each rating <div>
storage = soup.find_all('div', class_='inline-block ratings-imdb-rating')
storage_2 = soup_2.find_all('div', class_='inline-block ratings-imdb-rating')

# Page 1 followed by page 2, each rating converted from string to float
rating_list = [
    float(score.text)
    for rating_box in [*storage, *storage_2]
    for score in rating_box.find_all('strong')
]

len(rating_list)

100

In [91]:
# name_list, rating_list and year_list have the same length, so they can be combined
# column by column into one DataFrame
imdb_top_100 = pd.DataFrame({
    'Name': name_list,
    'IMDB Rating': rating_list,
    'Year of Release': year_list,
})

# Start the index at 1 so it matches the ranking (The Shawshank Redemption is number 1)
imdb_top_100.index += 1

imdb_top_100

Unnamed: 0,Name,IMDB Rating,Year of Release
1,The Shawshank Redemption,9.3,1994
2,The Godfather,9.2,1972
3,The Dark Knight,9.0,2008
4,The Godfather: Part II,9.0,1974
5,12 Angry Men,9.0,1957
...,...,...,...
96,North by Northwest,8.3,1959
97,Vertigo,8.3,1958
98,Singin' in the Rain,8.3,1952
99,Citizen Kane,8.3,1941


### Problem 3: Write a python program to display IMDB’s Top rated 100 Indian movies’ data.
##### Note: The question mentions IMDB’s Top rated 100 Indian movies, not top Hindi movies

In [44]:
# Fetch the chart of top rated Indian movies.
# timeout keeps the cell from hanging on a stalled connection; raise_for_status() stops
# here on an HTTP error, which would otherwise be parsed like the real chart page.
page = requests.get('https://www.imdb.com/india/top-rated-indian-movies/', timeout=30)
page.raise_for_status()
# Naming the parser avoids depending on whichever parser happens to be installed
soup = BeautifulSoup(page.content, 'html.parser')

In [48]:
# Each movie title is the link text inside a <td class="titleColumn">
storage = soup.find_all('td', class_ = 'titleColumn')

# Only the first 100 movies of the chart are needed
name_list = [
    link.text
    for title_cell in storage[:100]
    for link in title_cell.find_all('a')
]

len(name_list)

100

In [51]:
# The release year is the <span> text inside the same <td class="titleColumn">
storage = soup.find_all('td', class_ = 'titleColumn')

# Only the first 100 movies are needed; [1:-1] drops the bracket at each end of the
# span text before it is converted to an integer
year_list = [
    int(span.text[1:-1])
    for title_cell in storage[:100]
    for span in title_cell.find_all('span')
]

len(year_list)

100

In [56]:
# The rating is the <strong> text inside each <td class="ratingColumn imdbRating">
storage = soup.find_all('td', class_ = 'ratingColumn imdbRating')

# Only the first 100 movies are needed; each rating is converted to float
rating_list = [
    float(score.text)
    for rating_cell in storage[:100]
    for score in rating_cell.find_all('strong')
]

len(rating_list)

100

In [57]:
# name_list, rating_list and year_list have the same length, so they can be combined
# column by column into one DataFrame
imdb_Indian = pd.DataFrame({
    'Name': name_list,
    'IMDB Rating': rating_list,
    'Year of Release': year_list,
})

# Start the index at 1 so it matches the ranking of the movies
imdb_Indian.index += 1

imdb_Indian

Unnamed: 0,Name,IMDB Rating,Year of Release
1,Nayakan,8.5,1987
2,Anbe Sivam,8.5,2003
3,Pariyerum Perumal,8.5,2018
4,C/o Kancharapalem,8.5,2018
5,Golmaal,8.5,1979
...,...,...,...
96,Rang De Basanti,8.1,2006
97,OMG: Oh My God!,8.1,2012
98,Roja,8.1,1992
99,Uri: The Surgical Strike,8.1,2019


### Problem 4: Write a Python program to scrape the book name, author name, genre and book review of any 5 books from www.bookpage.com

In [59]:
# Fetch the BookPage reviews listing.
# timeout keeps the cell from hanging on a stalled connection; raise_for_status() stops
# here on an HTTP error, which would otherwise be parsed like the real listing page.
page = requests.get('https://bookpage.com/reviews', timeout=30)
page.raise_for_status()
# Naming the parser avoids depending on whichever parser happens to be installed
soup = BeautifulSoup(page.content, 'html.parser')

In [79]:
# Each book title is the link text inside an <h4 class="italic">
storage = soup.find_all('h4', class_ = 'italic')

# Only 5 books are needed; the star symbol shown next to some titles is removed
book_list = [
    link.text.replace('★', '')
    for title_heading in storage[:5]
    for link in title_heading.find_all('a')
]

len(book_list)

5

In [77]:
# The author line is the text of each <p class="sans bold">
storage = soup.find_all('p', class_ = 'sans bold')

# Only 5 books are needed; newline characters inside the text are removed
author_list = [paragraph.text.replace('\n', '') for paragraph in storage[:5]]

len(author_list)

5

In [66]:
# The genres of a book are the links inside its <p class="genre-links hidden-phone">
storage = soup.find_all('p', class_ = 'genre-links hidden-phone')

# Only 5 books are needed. A book can have several genres, so the result is a nested
# list: one inner list of genre names per book
genre_list = [
    [link.text for link in genre_paragraph.find_all('a')]
    for genre_paragraph in storage[:5]
]

genre_list

[["Children's", 'Picture Book'],
 ['Nonfiction', 'History', 'American History'],
 ['Fiction', 'Literary Fiction'],
 ['Mystery & Suspense', 'Mystery'],
 ['Audio', 'Nonfiction', 'Essays']]

In [70]:
# The link to the full review sits inside each <div class="read-full">
storage = soup.find_all('div', class_ = 'read-full')

# Only 5 books are needed; the href of each link is stored as found in the page
review_list = [
    link.get('href')
    for review_box in storage[:5]
    for link in review_box.find_all('a')
]

len(review_list)

5

In [80]:
# The four lists (book, author, genre, review link) have the same length, so they can be
# combined column by column into one DataFrame
book_review = pd.DataFrame({
    'Book': book_list,
    'Author': author_list,
    'Genre': genre_list,
    'Review Link': review_list,
})

# Start the index at 1 instead of 0
book_review.index += 1

book_review

Unnamed: 0,Book,Author,Genre,Review Link
1,Keep Your Head Up,"Aliya King Neil, Charly Palmer","[Children's, Picture Book]",/reviews/26702-aliya-king-neil-keep-your-head-...
2,The Cause,Joseph J. Ellis,"[Nonfiction, History, American History]",/reviews/26645-joseph-j-ellis-cause-nonfiction
3,The Book of Form and Emptiness,Ruth Ozeki,"[Fiction, Literary Fiction]",/reviews/26665-ruth-ozeki-book-form-emptiness-...
4,Daughter of the Morning Star,Craig Johnson,"[Mystery & Suspense, Mystery]",/reviews/26715-craig-johnson-daughter-morning-...
5,Carry On,"John Lewis, Don Cheadle","[Audio, Nonfiction, Essays]",/reviews/26652-john-lewis-carry-audio
