-
-
Notifications
You must be signed in to change notification settings - Fork 1.1k
/
Copy pathscraper.py
86 lines (74 loc) · 3.5 KB
/
scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import requests
from bs4 import BeautifulSoup as bs
import argparse
# CLI definition: one required flag `--t` carrying the movie title to search for.
parser = argparse.ArgumentParser(description='IMDB Scraper')
parser.add_argument('--t', action='store', type=str, required=True,
                    help='Enter the title of the movie')
# Base id url is used when the title id is known
base_id = "https://www.imdb.com/title"
# base url is used when the user gives a title to search for
base = "https://www.imdb.com/find?s=tt&q="
def get_info(soup):
    """Extract movie details from an IMDb title-page soup and print them.

    Collects title, year, rating, genre, plot, and the text-block details
    (release date, country, language, and the money figures).  The dict is
    printed only when more than four fields were found — the original
    "enough data to be worth showing" heuristic.

    Parameters:
        soup: parsed IMDb title page (BeautifulSoup object).

    Prints to stdout; returns None.
    """
    info = {}
    # Maps a txt-block heading to the key it should be stored under.
    money_keys = {
        "Budget:": "budget",
        "Cumulative Worldwide Gross:": "gross",
        "Gross USA:": "gross_usa",
        "Opening Weekend USA:": "opening_week_usa",
    }
    try:
        info["title"] = soup.find(
            'div', attrs={"class": "title_wrapper"}).h1.get_text(strip=True)
        info["year"] = soup.find(
            'span', attrs={"id": "titleYear"}).a.get_text(strip=True)
        info["rating"] = soup.find(
            'span', attrs={"itemprop": "ratingValue"}).get_text(strip=True)
        subtext = soup.find("div", attrs={"class": "subtext"})
        info["genre"] = subtext.a.get_text(strip=True)
        article = soup.find('div', attrs={"id": "titleStoryLine"})
        info["plot"] = article.find(
            'div', attrs={"class": "canwrap"}).p.span.get_text(strip=True)
        details = soup.find('div', attrs={"id": "titleDetails"})
        blocks = details.findAll('div', attrs={"class": "txt-block"})
        for block in blocks:
            heading = block.h4.get_text(strip=True)
            if heading == "Release Date:":
                info["date"] = block.get_text(strip=True).replace(
                    "See more»", '').replace(heading, '')
            elif heading == "Country:":
                info["country"] = block.a.get_text(strip=True)
            # BUG FIX: the original compared against "Language" (no trailing
            # colon) while get_text(strip=True) keeps the colon, so the
            # language field could never match.
            elif heading == "Language:":
                info["language"] = block.a.get_text(strip=True)
            elif heading in money_keys:
                info[money_keys[heading]] = block.get_text(
                    strip=True).replace(heading, '')
    except AttributeError:
        # A missing page element makes one of the chained lookups return
        # None (so the next attribute access raises AttributeError).  Keep
        # whatever was collected before the failure instead of crashing —
        # but report when nothing at all was scraped.  The original used a
        # bare `except:` with an `assert`, which is stripped under `-O`.
        if not info:
            print("No info found")
            return
    if len(info) > 4:
        print(info, end="\n\n\n")
def find_movie(query):
    """Search IMDb for `query` and print details for the top five matches.

    Parameters:
        query: free-text movie title typed by the user.

    Fetches the IMDb find page, takes the first five result rows, then
    fetches each title page and hands it to get_info().  Prints
    "No results found" when the search returns nothing.
    """
    resp = requests.get(base + query)
    # lxml is used for speed; if lxml is not installed, replace 'lxml'
    # with 'html.parser'.
    results_page = bs(resp.text, 'lxml')
    # IMDb returns roughly 150-200 rows per query; keep only the top five.
    movie_list = results_page.findAll("tr", attrs={"class": "findResult"})[0:5]
    if not movie_list:
        print("No results found")
        return
    for movie in movie_list:
        # The result row's href looks like "/title/tt0111161/..."; slicing
        # off the first six characters ("/title") leaves the path fragment
        # to append to base_id.
        title_id = movie.find(
            'td', attrs={"class": "result_text"}).a.attrs["href"][6:]
        # Original computed `url = base_id + title_id` but then rebuilt the
        # same string for the request; build it once and use it.
        title_resp = requests.get(base_id + title_id)
        get_info(bs(title_resp.text, 'lxml'))
# Script entry point: parse the CLI flags and kick off the search.
if __name__ == "__main__":
    cli_args = parser.parse_args()
    find_movie(cli_args.t)