In [9]:
import pickle
from multiprocessing.pool import Pool

import imdb
import numpy as np
import pandas as pd

# Read the pickled collection of actor identifiers, then build a
# single-column frame ("nconst") holding them in sorted order.
with open("data/actors.pkl", "rb") as f:
    actor_ids = pickle.load(f)

df = pd.DataFrame(sorted(actor_ids), columns=["nconst"])

def get_country(nconst):
    """Look up the birth country of one person on IMDb.

    Args:
        nconst: Person identifier whose first two characters are a prefix
            (e.g. ``"nm0000001"``); only the part after the prefix is
            passed to ``get_person``.

    Returns:
        The text after the last comma of the person's birth place, with
        surrounding whitespace removed, or ``None`` when no birth place is
        recorded for the person.
    """
    ia = imdb.IMDb()
    # Drop the two-character prefix; get_person expects the bare ID.
    person = ia.get_person(nconst[2:])

    # Not every person has birth data. Fall back to None instead of raising
    # KeyError, so one incomplete record cannot abort a whole batch.
    birth_info = person.data.get("birth info") or {}
    birth_place = birth_info.get("birth place")
    if not birth_place:
        return None

    # The birth place is a comma-separated string; the last component is
    # taken as the country.
    country = birth_place.split(",")[-1].strip()
    print(country)
    return country

def get_countries(df):
    """Add a ``birth_country`` column to ``df`` and return the same frame.

    Each value of the ``nconst`` column is passed to ``get_country``; the
    results are stored in a new ``birth_country`` column. The frame is
    modified in place.
    """
    birth_countries = df["nconst"].apply(get_country)
    df["birth_country"] = birth_countries
    return df

def parallelize_dataframe(df, func, n_cores=32):
    """Apply ``func`` to row-wise chunks of ``df`` in a process pool.

    Args:
        df: Frame to process.
        func: Callable taking a DataFrame chunk and returning a DataFrame.
            It is sent to worker processes, so it must be picklable
            (e.g. a module-level function).
        n_cores: Upper bound on the number of chunks / worker processes.

    Returns:
        The per-chunk results concatenated in the original row order.
    """
    # Never start more workers than there are rows, but always keep one
    # chunk so an empty frame still round-trips through ``func``.
    n_chunks = max(1, min(n_cores, len(df)))

    # Split row *positions* rather than the frame itself: this keeps
    # np.array_split on a plain ndarray (recent pandas versions warn when it
    # is applied to a DataFrame) and yields the same contiguous chunks.
    positions = np.array_split(np.arange(len(df)), n_chunks)
    df_split = [df.iloc[idx] for idx in positions]

    # The context manager tears the pool down even if a worker raises.
    with Pool(n_chunks) as pool:
        results = pool.map(func, df_split)
    return pd.concat(results)

# Run get_countries over the actor frame through parallelize_dataframe
# (using its default n_cores) and rebind df to the combined result.
df = parallelize_dataframe(df, get_countries)
# Persist the frame, including the new birth_country column, as a pickle.
df.to_pickle("data/birth_countries.pkl")

# Bare expression: as the last statement of a notebook cell this displays df.
df

Sweden
UK
USA
USA
USA
USA
USA
USA
France
USA


Unnamed: 0,nconst,birth_country
0,nm0000001,USA
1,nm0000002,USA
2,nm0000003,France
3,nm0000006,Sweden
4,nm0000007,USA
5,nm0000008,USA
6,nm0000009,UK
7,nm0000010,USA
8,nm0000011,USA
9,nm0000012,USA
