# Get a list of famous actors
- The CSV file comes from this GitHub repo: https://github.com/cckuqui/IMDB-analysis/tree/master/Original%20Data
- The specific data file: https://github.com/cckuqui/IMDB-analysis/blob/master/Original%20Data/IMDb_names.csv


To do:
- Place the file `IMDb_names.csv` in the folder `./data/imdb_actors_dataset/`, relative to the root of the main repo.

In [None]:
import os
import requests
from tqdm import tqdm
import pandas as pd
import numpy as np


In [None]:
# Raw-download URL of the IMDb names CSV on GitHub
file_url = "https://github.com/cckuqui/IMDB-analysis/raw/master/Original%20Data/IMDb_names.csv"
# Local destination of the CSV (relative path, resolved against the working directory)
filename = "../../data/imdb_actors_dataset/IMDb_names.csv"

# Gate for the download cell: the file is only fetched when this is True
download_file = True

In [None]:
if download_file:
    # Create the directory if it doesn't exist
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    # Stream the download so the file is written chunk by chunk instead of
    # being held in memory. The context manager releases the connection even
    # if writing fails, and the timeout stops a stalled connection from
    # hanging the notebook indefinitely.
    with requests.get(file_url, stream=True, timeout=30) as response:
        # Fail loudly on an HTTP error instead of saving the error page as the CSV
        response.raise_for_status()

        # Falls back to 0 when the server sends no content-length header
        total_size = int(response.headers.get('content-length', 0))
        block_size = 1024  # 1 Kilobyte
        with open(filename, 'wb') as file, tqdm(
            desc=filename,
            total=total_size,
            unit='iB',
            unit_scale=True,
            unit_divisor=1024,
        ) as bar:
            for data in response.iter_content(block_size):
                file.write(data)
                bar.update(len(data))

    print(f"File downloaded and saved to {filename}")


../../data/imdb_actors_dataset/IMDb_names.csv: 89.6MiB [00:06, 13.7MiB/s]                           

File downloaded and saved to ../../data/imdb_actors_dataset/IMDb_names.csv





In [None]:

# CSV file
filename = "../../data/imdb_actors_dataset/IMDb_names.csv"


# List the files that sit in the same folder as the CSV
import os
print("Files in ../../data/imdb_actors_dataset:")
for entry in os.listdir("../../data/imdb_actors_dataset"):
    print(entry)

# Stop right here with a clear error when the CSV is missing, rather than
# printing a warning and then failing a second time inside pd.read_csv
if not os.path.isfile(filename):
    raise FileNotFoundError(f"File '{filename}' not found")


# Load this file as a DataFrame
df = pd.read_csv(filename)

# Function to filter based on primary profession
def filter_profession(profession, row):
    """Return True when *profession* is one of the row's primary professions.

    The 'primary_profession' field is read as a comma-separated string and
    compared entry by entry (exact match, no whitespace trimming). Any
    non-string value (for example a missing value) yields False.
    """
    raw_value = row['primary_profession']
    if not isinstance(raw_value, str):
        return False
    listed = raw_value.split(',')
    return profession in listed

# Function to filter based on minimum number of known titles
def filter_titles(min_titles, row):
    """Return True when the row lists at least *min_titles* known titles.

    The 'known_for_titles' field is read as a comma-separated string and its
    entries are counted. Any non-string value yields False.
    """
    raw_value = row['known_for_titles']
    if not isinstance(raw_value, str):
        return False
    title_count = len(raw_value.split(','))
    return title_count >= min_titles

# Function to filter based on having at least two names and a total length of at least 8 characters
def filter_two_names(row):
    """Return True when the row's name looks like a full name.

    A name qualifies when it is a string made of at least two
    whitespace-separated words and is at least 8 characters long in total
    (spaces included). Any non-string value yields False.
    """
    full_name = row['name']
    if not isinstance(full_name, str):
        return False
    word_count = len(full_name.split())
    return word_count >= 2 and len(full_name) >= 8

# Extract actor and actress names.
#
# Each row-wise filter is evaluated once and the resulting boolean masks are
# reused for all four selections, instead of re-running df.apply for every
# combination.
has_two_names = df.apply(filter_two_names, axis=1)
is_actor = df.apply(lambda row: filter_profession('actor', row), axis=1)
is_actress = df.apply(lambda row: filter_profession('actress', row), axis=1)

actor_names = df[is_actor & has_two_names]['name'].values
actress_names = df[is_actress & has_two_names]['name'].values

# Option to filter based on minimum number of known titles
minimal_titles = 3  # You can adjust this number as needed
has_min_titles = df.apply(lambda row: filter_titles(minimal_titles, row), axis=1)

actor_names_filtered = df[is_actor & has_min_titles & has_two_names]['name'].values
actress_names_filtered = df[is_actress & has_min_titles & has_two_names]['name'].values

# Write the four name arrays to .npy files, then read them straight back
result_path = "../../data/imdb_actors_dataset/"

names_by_file = {
    'actor_names.npy': actor_names,
    'actress_names.npy': actress_names,
    'actor_names_filtered.npy': actor_names_filtered,
    'actress_names_filtered.npy': actress_names_filtered,
}
for npy_file, names_to_save in names_by_file.items():
    np.save(result_path + npy_file, names_to_save)

# Reload in the same order the files were written.
# NOTE(review): allow_pickle=True is presumably needed because the arrays hold
# Python string objects -- confirm; only load .npy files from a trusted source.
(
    loaded_actor_names,
    loaded_actress_names,
    loaded_actor_names_filtered,
    loaded_actress_names_filtered,
) = [np.load(result_path + npy_file, allow_pickle=True) for npy_file in names_by_file]

loaded_by_label = [
    ("Actors", loaded_actor_names),
    ("Actresses", loaded_actress_names),
    ("Filtered Actors", loaded_actor_names_filtered),
    ("Filtered Actresses", loaded_actress_names_filtered),
]

# Show a small sample of each reloaded array
for label, loaded in loaded_by_label:
    print(f"{label} (first 5):", loaded[:5])

# Confirm that every reloaded object is a numpy array
print("\nType checks:")
for label, loaded in loaded_by_label:
    print(f"{label}:", type(loaded))

# Sanity check: the two-name filter should leave no single-word actor names
single_word_names = [
    candidate for candidate in loaded_actor_names if len(candidate.split()) == 1
]
print("\nSingle-word names found:", single_word_names[:10] or "None")

Files in ../../data/imdb_actors_dataset:
IMDb_names.csv
Actors (first 5): ['Fred Astaire' 'John Belushi' 'Ingmar Bergman' 'Humphrey Bogart'
 'Marlon Brando']
Actresses (first 5): ['Lauren Bacall' 'Brigitte Bardot' 'Ingrid Bergman' 'Bette Davis'
 'Doris Day']
Filtered Actors (first 5): ['Fred Astaire' 'John Belushi' 'Ingmar Bergman' 'Humphrey Bogart'
 'Marlon Brando']
Filtered Actresses (first 5): ['Lauren Bacall' 'Brigitte Bardot' 'Ingrid Bergman' 'Bette Davis'
 'Doris Day']

Type checks:
Actors: <class 'numpy.ndarray'>
Actresses: <class 'numpy.ndarray'>
Filtered Actors: <class 'numpy.ndarray'>
Filtered Actresses: <class 'numpy.ndarray'>

Single-word names found: None


In [4]:
# Find the actor name with the fewest characters (the first one wins on ties)
name_lengths = [len(candidate) for candidate in actor_names]
shortest_name_actor = actor_names[name_lengths.index(min(name_lengths))]
shortest_name_actor

'Jude Law'

In [5]:
# Number of names in each list: (actors, actresses, filtered actors, filtered actresses)
tuple(
    len(names)
    for names in (
        loaded_actor_names,
        loaded_actress_names,
        loaded_actor_names_filtered,
        loaded_actress_names_filtered,
    )
)

(59280, 32482, 52536, 28266)

In [6]:
# Actor names that pass the basic filters but not the minimum-known-titles
# filter. np.setdiff1d expects array-likes and already returns the sorted,
# de-duplicated difference. A Python set would be wrapped as one single object
# element, so the arrays are passed directly instead of being converted to sets.
unfamour_actors = np.setdiff1d(loaded_actor_names, loaded_actor_names_filtered)

In [7]:
# loaded_actor_names_filtered[:10]

In [8]:
# unfamour_actors