Skip to content

Commit

Permalink
#3 Renamed get_full_credits to get_full_crew and created a combined c…
Browse files Browse the repository at this point in the history
…ast and crew method
  • Loading branch information
zembrodt committed Nov 20, 2019
1 parent c228f2a commit 7303a65
Show file tree
Hide file tree
Showing 2 changed files with 360 additions and 109 deletions.
94 changes: 55 additions & 39 deletions pymdb/scraper.py
Expand Up @@ -89,13 +89,30 @@ def get_title(self, title_id, include_taglines=False):
worldwide_gross = None

# Get title text
title_text_node = tree.css_first('div.title_wrapper > h1')
if title_text_node:
# Remove title year
title_year_node = title_text_node.css_first('span#titleYear')
if title_year_node:
title_year_node.decompose()
display_title = title_text_node.text().strip()
title_node = tree.css_first('div.title_wrapper')
if title_node:
display_title_node = title_node.css_first('h1')
if display_title_node:
# Remove title year
title_year_node = display_title_node.css_first('span#titleYear')
if title_year_node:
title_year_node.decompose()
display_title = display_title_node.text().strip()
title_info_node = title_node.css_first('div.subtext')
if title_info_node:
# If this is a TV series, get the year the show ended
for link_node in title_info_node.css('a'):
if 'href' in link_node.attributes and 'releaseinfo' in link_node.attributes['href']:
series_dates_match = re.search(r'[\d]{4}[-–][\d]{4}', link_node.text())
if series_dates_match:
end_year_split = re.sub(r'[-–]', '\t', series_dates_match.group(0)).split('\t')
if len(end_year_split) > 1:
end_year = end_year_split[1]
break

# Get MPAA Rating
title_info_node.strip_tags(['span', 'a', 'time'])
rating = re.sub(r'(\s|,)*', '', title_info_node.text()).strip()

# Get title parent (if TV episode)
title_parent_node = tree.css_first('div.titleParent > a')
Expand Down Expand Up @@ -123,24 +140,6 @@ def get_title(self, title_id, include_taglines=False):
# TODO: should a Tagline object be created that stores the note for each tagline separately?
taglines.append(tagline_node.text().strip())

title_node = tree.css_first('div.title_block > div > div > div.title_wrapper > div.subtext')
if title_node:
# If this is a TV series, get the year the show ended
for link_node in title_node.css('a'):
if 'href' in link_node.attributes and 'releaseinfo' in link_node.attributes['href']:
series_dates_match = re.search(r'[\d]{4}[-–][\d]{4}', link_node.text())
if series_dates_match:
end_year_split = re.sub(r'[-–]', '\t', series_dates_match.group(0)).split('\t')
if len(end_year_split) > 1:
end_year = end_year_split[1]
break

# Get MPAA Rating
title_node.strip_tags(['span', 'a', 'time'])
rating_node = tree.css_first('div.titleBar > div.title_wrapper > div.subtext')
if rating_node:
rating = re.sub(r'(\s|,)*', '', rating_node.text()).strip()

# Parse through text blocks
text_block_nodes = tree.css('div#titleDetails > div.txt-block')
for text_block_node in text_block_nodes:
Expand Down Expand Up @@ -197,7 +196,6 @@ def get_title(self, title_id, include_taglines=False):
#characters = []
#for c_node in character_nodes:
# characters.append(c_node.text().strip())
#cast_members.append(f'{cast_member_name} ({cast_member_id}): {", ".join(characters)}')
cast_members.append(cast_member_id)

# Get season and episode numbers if TV episode
Expand Down Expand Up @@ -300,20 +298,14 @@ def get_full_cast(self, title_id, include_episodes=False):

episode_nodes = episodes_tree.css('div.filmo-episodes')
for episode_node in episode_nodes:
episode_id = None
episode_id = get_title_id(episode_node.css_first('a'))
episode_year = None
episode_credit = None

episode_title_info = episode_node.css_first('a')
if episode_title_info and 'href' in episode_title_info:
episode_id_match = re.search(r'tt\d+', episode_title_info.attributes['href'])
if episode_id_match:
episode_id = episode_id_match.group(0)

episode_info = episode_node.text().strip().split('...')
if len(episode_info) > 1:
episode_year_info = episode_info[0]
episode_credit = '...'.join(episode_info[1:])
episode_credit = '...'.join(episode_info[1:]).strip()
else:
episode_year_info, = episode_info

Expand Down Expand Up @@ -350,17 +342,17 @@ def get_full_cast(self, title_id, include_episodes=False):
episode_year_end=episode_year_end
)

def get_full_credits(self, title_id):
"""Scrapes the full list of credited people for a title, not including actors.
def get_full_crew(self, title_id):
"""Scrapes the full list of credited crew people for a title, not including actors.
Will scrape all the credited members of a title, without the actors. For example, this will
Will scrape all the credited crew members of a title, without the actors. For example, this will
include all directors, writers, producers, cinematographers, etc.
Args:
title_id (:obj:`str`): The title's ID used by IMDb prefixed with `tt`.
Yields:
:class:`~.models.title.CreditScrape`: An object for each credited member in the title.
:class:`~.models.title.CreditScrape`: An object for each credited crew member in the title.
Raises:
HTTPError: If the request failed.
Expand Down Expand Up @@ -430,6 +422,30 @@ def get_full_credits(self, title_id):
)
found_title = False # only because we use continue when set to True for now...

def get_full_credits(self, title_id, include_episodes=False):
"""Scrapes the full list of credited people for a title.
Will scrape all the cast and crew for a title by returning both
:obj:`~.scraper.PyMDbScraper.get_full_cast` and :obj:`~.scraper.PyMDbScraper.get_full_crew` as a single generator.
An optional argument `include_episodes` will also scrape each episode an actor is in
if the title is a TV series.
Args:
title_id (:obj:`str`): The title's ID used by IMDb prefixed with `tt`.
include_episodes (:obj:`bool`, optional): Specify if individual episodes of a
TV series should also be scraped.
Yields:
:class:`~.models.title.CreditScrape`: An object for each credited crew member in the title.
Raises:
HTTPError: If the request failed.
"""
for cast_member in self.get_full_cast(title_id, include_episodes=include_episodes):
yield cast_member
for crew_member in self.get_full_crew(title_id):
yield crew_member

def get_name(self, name_id, include_known_for_titles=False):
"""Scrapes detailed information from a person's personal IMDb web page.
Expand All @@ -438,7 +454,7 @@ def get_name(self, name_id, include_known_for_titles=False):
Args:
name_id (:obj:`str`): The person's ID used by IMDb prefixed with `nm`.
include_known_for_titles (:obj:`bool`, optional): Determines if an optional request should
include_known_for_titles (:obj:`bool`, optional): Determines if an second request should
be sent to get the known for titles on a person's default IMDb page.
Returns:
Expand Down

0 comments on commit 7303a65

Please sign in to comment.