Adding numerical data to the dataset for each actor or director:
* Number of movies the actor has appeared in or the director has directed
* Average rating across those movies
* Number of award wins
* Number of award nominations

In [None]:
def _fetch_soup(url):
    """Download *url* and return it parsed with BeautifulSoup, or None on a network error."""
    page_request = Request(url=url, headers={'User-Agent': 'Mozilla/5.0'})
    try:
        # The context manager closes the HTTP response even if read() fails.
        with urlopen(page_request) as response:
            html = response.read()
    except OSError as err:
        # Assuming Request/urlopen come from urllib.request, HTTPError and
        # URLError are OSError subclasses. One unreachable page should not
        # abort a scrape that has already processed many rows.
        print(f'Could not fetch {url}: {err}')
        return None
    return BeautifulSoup(html, 'html.parser')


def _parse_awards(awards_soup):
    """Return (wins, nominations) read from an awards page, or (None, None) if absent."""
    summary = awards_soup.find('div', {'class': 'desc'})
    if summary is None:
        return None, None

    tokens = summary.text.split()
    try:
        # Both values are parsed before either is returned, so a short or
        # malformed summary line can never yield wins without nominations.
        return get_value(tokens[2]), get_value(tokens[5])
    except (AttributeError, IndexError):
        return None, None


def _collect_ratings(movie_soup, cutoff_year):
    """Return the ratings of all listed movies released before *cutoff_year*."""
    ratings = []
    for movie in movie_soup.find_all('div', {'class': 'lister-item mode-detail'}):
        # Each missing element only skips the current movie; ratings that
        # were already collected for this person are kept.
        year_tag = movie.find('span', {'class': 'lister-item-year'})
        if year_tag is None:
            continue

        # NOTE(review): only a plain four-digit year is accepted, so year
        # ranges or disambiguated years are skipped -- confirm this is intended.
        year_raw = year_tag.text.strip('()')
        if not re.match(r'^\d{4}$', year_raw) or int(year_raw) >= cutoff_year:
            continue

        ratings_bar = movie.find('div', {'class': 'ratings-bar'})
        if ratings_bar is None:
            continue

        rating = ratings_bar.find('div', {'class': 'inline-block ratings-imdb-rating'})
        if rating is None:
            continue

        try:
            ratings.append(float(rating['data-value']))
        except (KeyError, ValueError):
            # Rating element without a usable numeric value: skip it.
            continue
    return ratings


def searching_IMDb_person(name_column, dataset, person_type, cutoff_year=2023):
    """
    Update the dataset by adding new columns for a director/actor:
    number of movies, average rating of these movies, number of awards
    won and number of nominations.

    The columns are named ``{name_column}_num_wins``,
    ``{name_column}_num_nominations``, ``{name_column}_num_movies`` and
    ``{name_column}_avg_rating``. Only the first filmography results page
    (``page=1`` in the request URL) is fetched, so the movie count and
    average cover only the titles listed on that page.

    Parameters:
    name_column : the column containing the person's name
    dataset : dataset (modified in place)
    person_type : specify if person is actor or director due to different
        search filters applied
    cutoff_year : only movies released before this year contribute a
        rating (default 2023)

    Returns:
    The same ``dataset`` object with the new columns filled in. When a
    person cannot be found or a page cannot be fetched, wins/nominations
    are None, the movie count is 0 and the average rating is NaN.
    """
    # 1. Instantiating imdb
    ia = imdb.IMDb()

    # 2. For each row of the df, extract the name for specific column
    for index, row in dataset.iterrows():
        name = row[name_column]
        person_id = None

        try:
            matches = ia.search_person(name)
            if matches:
                # The first search hit is taken as the person.
                person_id = matches[0].getID()
        except Exception as err:
            # Best-effort lookup: a failed search leaves the row without
            # data, but Ctrl-C / SystemExit are no longer swallowed.
            print(f'IMDb lookup failed for {name!r}: {err}')

        # Defaults used whenever the person or a page is unavailable.
        wins = None
        nominations = None
        ratings_list = []

        if person_id is not None:
            awards_soup = _fetch_soup(
                f'https://www.imdb.com/name/nm{person_id}/awards/?ref_=nm_awd'
            )
            if awards_soup is not None:
                wins, nominations = _parse_awards(awards_soup)

            movie_soup = _fetch_soup(
                f'https://www.imdb.com/filmosearch/?explore=title_type&role=nm{person_id}&ref_=filmo_ref_job_typ&sort=release_date,asc&mode=detail&page=1&job_type={person_type}'
            )
            if movie_soup is not None:
                ratings_list = _collect_ratings(movie_soup, cutoff_year)

        # Create new columns and append data to appropriate column
        dataset.loc[index, f'{name_column}_num_wins'] = wins
        dataset.loc[index, f'{name_column}_num_nominations'] = nominations
        dataset.loc[index, f'{name_column}_num_movies'] = len(ratings_list)
        # np.mean([]) would also give NaN but emits a RuntimeWarning.
        dataset.loc[index, f'{name_column}_avg_rating'] = (
            np.mean(ratings_list) if ratings_list else np.nan
        )

        print(f'{index}. {name}')

    return dataset

In [None]:
# Add IMDb statistics for the people named in test_df's 'Supporting' column,
# using the 'actor' job-type filter.
searching_IMDb_person(
    name_column='Supporting',
    dataset=test_df,
    person_type='actor',
)